但是蛮多小伙伴在公众号推文以及后台留言表示明明是可以做更多啊,不应该是仅仅是数据挖掘故事的引子(也可以是故事本身),我们这里就以WGCNA为例来说明。是河北工业大学硕士学位论文,标题是:《 乳腺癌潜在生物标志物的挖掘和分析 》,作者从 TCGA 中获取乳腺癌的基因表达数据和临床数据,并根据临床分期信息将 乳腺癌基因表达数据划分为癌旁样本和癌症4个阶段(I-IV)样本。在乳腺癌各阶段分别挖掘到 13、9、13、16 个可能与乳 腺癌相关且具有特定功能的共表达模块,详细的步骤是:
限于公众号篇幅(2万字),我这里不可能带领大家复现出这个河北工业大学硕士学位论文的全部图表。我们还是承接前面的:你研究的基因凭什么重要(这才是数据挖掘的用武之地),从肾癌数据集里面的任意选取一个stage的病人表达量矩阵,走wgcna给大家看看!
WGCNA分析全称加权基因共表达网络分析,可以用来鉴定高度协同变化的基因集。相较于使用阈值严格筛选差异表达基因,WGCNA可以选定几千到上万个变化的基因构建加权网络,筛选显著变化的模块,同表型进行关联分析。咱们可以进一步从显著变化的模块中,筛选出自身感兴趣的模块与模块基因进行后续分析。
### 1.清空环境,加载WGNA分析所需要的包
rm(list = ls()) ## 魔幻操作,一键清空~
library(stringr)
library(GEOquery)
library(AnnoProbe)
library(limma)
library(ggplot2)
library(tinyplanet)
library(tinyarray)
library(data.table)
library(dplyr)
library(stringr)
library(AnnoProbe)
library(WGCNA)
### 2.下载原始数据(KIRC对应的表达矩阵与临床信息)
proj = "TCGA-KIRC"
if(!file.exists(paste0(proj, ".htseq_counts.tsv.gz"))){
download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".htseq_counts.tsv.gz"),destfile = paste0(proj,".htseq_counts.tsv.gz"))
download.file(url = paste0("https://gdc.xenahubs.net/download/",proj, ".GDC_phenotype.tsv.gz"),destfile = paste0(proj,".GDC_phenotype.tsv.gz"))
}
### 3.整理表达矩阵与临床信息(为后续WGCNA分析服务)
#### 3.1整理表达矩阵(获取行名是基因名,列名是样本名的表达矩阵)
dat = read.table("TCGA-KIRC.htseq_counts.tsv.gz",check.names = F,row.names = 1,header = T) #表达矩阵
head(rownames(dat)) #EnsembleID带有小数点(版本号),不能直接转换
## [1] "ENSG00000000003.13" "ENSG00000000005.5" "ENSG00000000419.11"
## [4] "ENSG00000000457.12" "ENSG00000000460.15" "ENSG00000000938.11"
rownames(dat) = str_split(rownames(dat),"\\.",simplify = T)[,1] #去除小数点
head(rownames(dat))
## [1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457"
## [5] "ENSG00000000460" "ENSG00000000938"
re = annoGene(rownames(dat),ID_type = "ENSEMBL");head(re)
## SYMBOL biotypes ENSEMBL chr start
## 1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869
## 2 WASH7P unprocessed_pseudogene ENSG00000227232 chr1 14404
## 3 MIR6859-1 miRNA ENSG00000278267 chr1 17369
## 4 MIR1302-2HG lncRNA ENSG00000243485 chr1 29554
## 6 FAM138A lncRNA ENSG00000237613 chr1 34554
## 7 OR4G4P unprocessed_pseudogene ENSG00000268020 chr1 52473
## end
## 1 14409
## 2 29570
## 3 17436
## 4 31109
## 6 36081
## 7 53312
dat = trans_array(dat,ids = re,from = "ENSEMBL",to = "SYMBOL")
dat[1:4,1:4]
## TCGA-BP-4342-01A TCGA-A3-3387-01A TCGA-BP-4167-01A TCGA-B8-4620-01A
## DDX11L1 1.000000 0.000000 0.000000 0.000000
## WASH7P 6.285402 6.129283 4.643856 4.954196
## MIR6859-1 0.000000 2.000000 1.000000 1.584963
## MIR1302-2HG 1.584963 0.000000 0.000000 0.000000
#### 3.2整理临床信息(为后续WGCNA提供临床信息(性状),进行模块与性状分析)
#### 此处的核心是让挑选出来的样本既有表达量信息又具有临床信息
pda = read.delim("TCGA-KIRC.GDC_phenotype.tsv.gz",fill = T,header = T,sep = "\t")
rownames(pda)=pda$submitter_id.samples
pd=pda[rownames(pda)%in%colnames(dat),]
table(pda$tumor_stage.diagnoses) #查看不同样本的stage期
##
## not reported stage i stage ii stage iii stage iv
## 4 481 102 237 161
pd=pd[pd$tumor_stage.diagnoses=="stage i",] #选取stage i期
#### 3.3结合临床信息整理用于WGCNA分析的表达矩阵
exp=dat[,colnames(dat)%in%rownames(pd)] #取出stage i期的表达矩阵
exp=as.data.frame(exp)
exp=exp[match(rownames(pd),colnames(exp))] #match匹配一下
identical(rownames(pd),colnames(exp))
## [1] TRUE
#### 3.4设计design
### 挑选自身感兴趣的3种临床性状,为后续WGCNA的模块与性状分析做准备
### 整理成样本性状矩阵,性状间的分组信息用0与1区分
### 看样本3中不同临床性状的整体分布
table(pd$laterality)
##
## Bilateral Left Right
## 1 137 156
table(pd$gender.demographic)
##
## female male
## 112 182
table(pd$vital_status.demographic)
##
## Alive Dead
## 246 48
### 将3组临床性状整理成WGCNA能够识别的样子
laterality= ifelse(pd$laterality=="Left",0,ifelse(pd$laterality=="Right",1,2))
gender=ifelse(pd$gender.demographic=="female",0,1)
survival=ifelse(pd$vital_status.demographic=="Alive",0,1)
### 合并,构建样本性状矩阵
design=t(rbind(laterality,gender,survival) )
rownames(design)=rownames(pd)
#### 3.5.保存后续用于WGCNA分析的三个关键信息
save(exp,pd,design,file = "01.data.Rdata")
### 1清空环境,加载包,加载WGCNA分析的关键信息
rm(list = ls())
options(stringsAsFactors = F)
load("01.data.Rdata")
### 2过滤低权重基因,选取中位数绝对值偏差最大的前5000个基因
gsg = goodSamplesGenes(exp, verbose = 3)
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK;table(!is.na(exp))
## [1] TRUE
##
## TRUE
## 16615116
dat0 <- exp
dat0 <- t(dat0[order(apply(dat0,1,mad), decreasing = T)[1:5000],])
### 3绘制样本聚类树p1,看是否有显著离群样本
sampleTree = hclust(dist(dat0), method = "average");
par(cex = 0.6); #设置图的整体大小(字体和线条结构)
par(mar = c(0,6,6,0)) #设置边距
plot(sampleTree,
main = "Sample clustering to detect outliers",
sub="",
xlab="",
cex.lab = 1.5,
cex.axis = 1.5,
cex.main = 1.5)
### 294例样本展示可能有些太密集,可按需调整哦
### 4确定最佳软阈值
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(dat0,
powerVector = powers,
verbose = 5)
## pickSoftThreshold: will use block size 5000.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 5000 of 5000
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.0939 -0.719 0.908 735.000 7.08e+02 1450.00
## 2 2 0.5140 -1.220 0.930 190.000 1.66e+02 584.00
## 3 3 0.7820 -1.430 0.965 67.100 5.06e+01 290.00
## 4 4 0.8610 -1.610 0.985 29.100 1.81e+01 182.00
## 5 5 0.8840 -1.720 0.988 14.600 7.06e+00 128.00
## 6 6 0.8930 -1.740 0.986 8.210 2.96e+00 95.00
## 7 7 0.8940 -1.710 0.985 5.020 1.33e+00 73.70
## 8 8 0.8500 -1.750 0.964 3.280 6.31e-01 58.80
## 9 9 0.8600 -1.690 0.968 2.260 3.19e-01 48.00
## 10 10 0.8640 -1.660 0.973 1.620 1.71e-01 39.80
## 11 12 0.8590 -1.610 0.969 0.912 5.13e-02 28.30
## 12 14 0.8400 -1.570 0.944 0.562 1.67e-02 20.80
## 13 16 0.3430 -2.110 0.348 0.369 5.60e-03 15.60
## 14 18 0.3380 -2.040 0.333 0.254 1.91e-03 11.90
## 15 20 0.9500 -1.330 0.986 0.181 6.59e-04 9.21
po <- sft$powerEstimate;po
## [1] 4
### 5绘制无尺度网络拟合度R^2以及平均连接度与最优软阈值曲线
#### 5.1设置整体绘图参数
par(mfrow = c(1,2))
cex1 = 0.85
#### 5.2绘制无尺度网络拟合度R^2与最优软阈值曲线
plot(sft$fitIndices[,1],
-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",
type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1],
-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
abline(h=0.85,col="red") #添加最优软阈值截断线,h对应R^2截断点
#### 5.3接着绘制平均连接度与最优软阈值曲线
plot(sft$fitIndices[,1],
sft$fitIndices[,5],
xlab="Soft Threshold (power)",
ylab="Mean Connectivity",
type="n",main = paste("Mean connectivity"))
text(sft$fitIndices[,1],
sft$fitIndices[,5],
labels=powers, cex=cex1,col="red")
dev.off() #得到p2
## null device
## 1
### 6一步构建动态模块网络net
#### 6.1构建动态模块网络net
cor <- WGCNA::cor #许多数据分析的包中含有cor函数,为避免冲突,指定cor为WGCNA的cor
net = blockwiseModules(dat0, power = po,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,saveTOMFileBase = "femaleMouseTOM",
verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 1 into file femaleMouseTOM-block.1.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 313 genes from module 1 because their KME is too low.
## ..removing 166 genes from module 2 because their KME is too low.
## ..removing 50 genes from module 3 because their KME is too low.
## ..removing 55 genes from module 4 because their KME is too low.
## ..removing 39 genes from module 5 because their KME is too low.
## ..removing 13 genes from module 6 because their KME is too low.
## ..removing 12 genes from module 7 because their KME is too low.
## ..removing 16 genes from module 8 because their KME is too low.
## ..removing 8 genes from module 9 because their KME is too low.
## ..removing 11 genes from module 11 because their KME is too low.
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.25
## Calculating new MEs...
cor<-stats::cor #用完WGCNA::cor记得还回去,后续还要用常规的呢
#### 6.2提取net主要信息,并将模块的数字标签转换为要用于可视化的颜色
class(net)
## [1] "list"
names(net)
## [1] "colors" "unmergedColors" "MEs" "goodSamples"
## [5] "goodGenes" "dendrograms" "TOMFiles" "blockGenes"
## [9] "blocks" "MEsOK"
table(net$colors)
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 758 1495 1043 402 326 268 192 167 136 101 66 46
moduleLabels = net$colors
moduleColors = labels2colors(net$colors) #将数字标签转换为颜色
MEs = net$MEs;
#### 6.3准备基因聚类数据
geneTree = net$dendrograms[[1]]; #基因聚类树
save(net,MEs, moduleLabels, moduleColors, geneTree,
file = "02_networkConstruction.RData")
#### 6.4将基因聚类数据与模块信息可视化(p3)
par(cex = 0.6)
par(mar = c(0,6,6,0),mai = c(0.1,0.7,0.5,0.1))
plotDendroAndColors(geneTree,
moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05) #得到p3:基因聚类树带着模块映射的图
### 1.清空环境,加载数据
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load(file = "01.data.Rdata")
load(file = "02_networkConstruction.RData")
### 2.模块与性状数据整理
dat <- exp
dat <- t(dat[order(apply(dat,1,mad), decreasing = T)[1:5000],])
nGenes = ncol(dat)
nSamples = nrow(dat)
MEs0 = moduleEigengenes(dat, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
MEs[1:6,1:6]
## MEblack MEbrown MEblue MEred MEgreenyellow
## TCGA-A3-3359-01A 0.01527725 0.05102101 -0.051241734 -0.07411841 -0.07858036
## TCGA-B0-4838-01A -0.02316250 0.01543678 -0.033803761 -0.06102517 -0.07791229
## TCGA-BP-5001-01A 0.05544412 0.06796056 -0.004793421 -0.02957076 -0.05963580
## TCGA-BP-4801-01A -0.01758141 0.01536496 -0.024371428 0.07648171 0.04522826
## TCGA-B0-5691-01A 0.06550518 0.01137653 0.055187467 0.08366529 -0.07631647
## TCGA-B0-5691-11A 0.19695564 0.08518558 0.155494131 0.13350277 -0.06740704
## MEpurple
## TCGA-A3-3359-01A -0.03536669
## TCGA-B0-4838-01A -0.05212159
## TCGA-BP-5001-01A -0.03414709
## TCGA-BP-4801-01A -0.03684990
## TCGA-B0-5691-01A -0.02349843
## TCGA-B0-5691-11A 0.07123953
moduleTraitCor = cor(MEs, design, use = "p")# 计算MEs和分组间的相关性
head(moduleTraitCor) #输出结果为行名为ME颜色,列名为样本分组的相关性矩阵
## laterality gender survival
## MEblack -0.107069618 -0.03225902 -0.04778869
## MEbrown -0.036836611 -0.16291239 -0.07300059
## MEblue -0.099677881 0.07791799 0.02850250
## MEred 0.005283112 0.03366532 0.01440820
## MEgreenyellow 0.014728904 0.93067180 -0.02245226
## MEpurple 0.017240952 0.04124978 -0.01589141
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, #计算相关性P值
nSamples)
head(moduleTraitPvalue)
## laterality gender survival
## MEblack 0.06675588 5.816926e-01 0.4142821
## MEbrown 0.52925796 5.107150e-03 0.2120190
## MEblue 0.08799131 1.827452e-01 0.6264475
## MEred 0.92812730 5.653292e-01 0.8056749
## MEgreenyellow 0.80143849 1.535088e-129 0.7014348
## MEpurple 0.76846504 4.810758e-01 0.7861316
### 3.绘制模块与前面挑选的3种临床性状的相关性热图数据准备
sizeGrWindow(10,6)
textMatrix = paste(signif(moduleTraitCor, 2), # 矩阵标签标记相关性数值与p值
"\n(",signif(moduleTraitPvalue, 1),
")",
sep = "");
dim(textMatrix) = dim(moduleTraitCor)
textMatrix[1:6,1:3]
## [,1] [,2] [,3]
## [1,] "-0.11\n(0.07)" "-0.032\n(0.6)" "-0.048\n(0.4)"
## [2,] "-0.037\n(0.5)" "-0.16\n(0.005)" "-0.073\n(0.2)"
## [3,] "-0.1\n(0.09)" "0.078\n(0.2)" "0.029\n(0.6)"
## [4,] "0.0053\n(0.9)" "0.034\n(0.6)" "0.014\n(0.8)"
## [5,] "0.015\n(0.8)" "0.93\n(2e-129)" "-0.022\n(0.7)"
## [6,] "0.017\n(0.8)" "0.041\n(0.5)" "-0.016\n(0.8)"
design <- as.data.frame(design)
### 4.设置整体绘图参数
par(cex = 0.6);
par(mai = c(0.1,0.7,0.5,0.1))
par(mar = c(3, 9, 3, 1));
### 5.绘制模块与三种临床性状的相关性热图(p4)
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(design),
xLabelsAngle = 0,
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix, #作为热图矩阵标签显示
setStdMargins = FALSE,
cex.text = 1,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
### 1加载模块间相关性分析所需要的包
library(ggcorrplot)
library(corrplot)
library(ggcor)
### 2.整理数据
colnames(MEs0) <- substring(names(MEs0), 3) #简化MEs0列名
corr <- round(cor(MEs0), 1) #round四舍五入到指定的小数位数,和上面的signif 都为四舍五入的意思
head(corr[, 1:6])
## black blue brown green greenyellow grey
## black 1.0 0.6 0.6 -0.2 0.1 0.2
## blue 0.6 1.0 -0.2 -0.6 0.1 0.2
## brown 0.6 -0.2 1.0 0.2 -0.1 0.1
## green -0.2 -0.6 0.2 1.0 0.1 0.4
## greenyellow 0.1 0.1 -0.1 0.1 1.0 0.3
## grey 0.2 0.2 0.1 0.4 0.3 1.0
p.mat <- cor_pmat(MEs0) #使用ggplot2计算相关性矩阵
head(p.mat[, 1:4])
## black blue brown green
## black 0.000000e+00 6.215622e-26 4.117893e-31 5.135812e-05
## blue 6.215622e-26 0.000000e+00 6.405195e-05 6.295464e-28
## brown 4.117893e-31 6.405195e-05 0.000000e+00 1.604418e-05
## green 5.135812e-05 6.295464e-28 1.604418e-05 0.000000e+00
## greenyellow 2.283503e-01 3.163347e-02 2.511907e-01 7.242471e-02
## grey 4.009967e-04 1.084679e-03 9.622369e-02 8.677879e-14
### 3.绘制相关性图形p5
corrplot(corr,method = "square",type = "upper") #显示正方形,右上部分
### 4.对分析结果按r和p值重新赋值
mantel <- mantel_test(MEs0, MEs0,
spec.select = list(Times = 1:7,
Dissection = 1:7),
mantel.fun = "mantel.randtest") %>%
mutate(r = cut(r,
breaks = c(-Inf, -0.6-0.6, Inf),
labels = c("> 0.6 or < -0.6", "-0.6-0.6"),
right = FALSE),
p.value = cut(p.value,
breaks = c(-Inf, 0.01, 0.05, Inf),
labels = c("< 0.01", "0.01-0.05", ">=0.05"),
right = FALSE))
### 5.绘制重新赋值后的相关性图p6
p6 <- quickcor(MEs0, type = "upper") +
geom_square(inherit.aes = TRUE) +
add_link(mantel,
mapping = aes(colour = p.value, size = r),
diag.label = TRUE) +
scale_size_manual(values = c(1, 3, 5)) +
geom_diag_label(angle = 0)+
remove_axis("y")+
scale_color_manual(values=c("#FF8000","#56B4E9","#808A87"))
p6
### 关键三步
#1挑选感兴趣高权重基因(本文选择了权重非常显著的三个,如下);
#2利用在线数据库DGIdb获得基因药物互作表格;
#3.可视化基因药物与模块的关系
### 1.清空环境,加载包,加载数据
rm(list = ls())
#BiocManager::install("ggalluvial")
library(ggalluvial)
load("01.data.Rdata")
load("02_networkConstruction.RData")
### 2.构建模型颜色与基因矩阵
dat <- exp
dat <- t(dat[order(apply(dat,1,mad), decreasing = T)[1:5000],])
genes_all <- data.frame(module=moduleColors,
genes=colnames(dat))
head(genes_all) #从中选择差异模块的3个尤为显著基因,我这三个是从前30个中选的
## module genes
## 1 brown PLG
## 2 grey AC019117.3
## 3 black MYH8
## 4 yellow SORCS3
## 5 grey GRIA4
## 6 grey AC079466.1
gene <- c("PLG","RBP4","PAH") #基于这几个基因,下载基因药物互作数据
### 3从DGB数据库下载基因药物互作数据,放置于R工作路径读取
dgidb <- read.csv(file = "dgidb_export_2022-07-28 .tsv",
comment.char = "!",
header = T,
sep = "\t")
### 4.匹配基因对应的模块颜色
module <- c("brown","brown","black")
### 5.桑基图绘图数据准备---需变成长数据
gene_module <- data.frame(gene,module)
gene_drug <- data.frame(gene = dgidb$gene,
drug = dgidb$drug)
Tit <- inner_join(gene_drug,gene_module,by = "gene")
Tit <- Tit[,c("drug","gene","module")]
head(Tit)
## drug gene module
## 1 UROKINASE PLG brown
## 2 ANISTREPLASE PLG brown
## 3 ALTEPLASE PLG brown
## 4 DESMOTEPLASE PLG brown
## 5 NYSTATIN PLG brown
## 6 APROTININ PLG brown
tit_long <- to_lodes_form(data.frame(Tit),
key = "Demographic",
axes = 1:3)
### 6.绘制基因药物互作以及包含模块的桑基图
p7 <- ggplot(tit_long,
aes(x = Demographic, stratum = stratum, alluvium = alluvium,
fill=stratum, label = stratum)) +
geom_flow() +
geom_stratum(alpha = 3,width = 1/2) +
geom_text(stat = "stratum", size = 2.5) +
theme(legend.position = "none",
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(size =15,colour = "black",),
axis.title = element_blank()) +
scale_x_discrete(position = "top")
p7
拿性别相关模块里面的全部的基因进行染色体注释,看看是哪一条性染色体。
我们《生信技能树》这些年有很多关于WGCNA的实战细节建议分享,见:
联系客服