这是一篇review,里面有几个图,我们重现了
这次 jimmy交给我了学徒任务,只能硬着头皮上,主要是完成一个海报的两个突,真的是指哪打哪!
海报在线链接:http://www.nature.com/nrclinonc/posters/breastcancer/
需要重现的图有:
marker基因差异表达与PAM50分型的热图:
somatic突变图谱与PAM50分型
文章中提到了数据来自于引用文献:https://www.nature.com/articles/nature11412
Agilent mRNA expression microarrays (n = 547)
Affymetrix 6.0 single nucleotide polymorphism (SNP) arrays (n = 773)
下载TCGA_xena中的BRCA数据:
根据这个推文里面说到的数据下载方法:https://mp.weixin.qq.com/s/Yz5sl-T3ITvLEFnv5H47wQ
安捷伦芯片数据:AgilentG4502A_07_3 (n=597) TCGA hub
临床数据:Phenotypes (n=1,247) TCGA hub
以及somatic mutations数据:来自于R包 TCGAmutations
具体下载方法请看后文
# 清空环境
rm(list = ls())
options(stringsAsFactors = F)
# 读取临床数据
phe = read.table('BRCA_clinicalMatrix.gz',header = T,sep = '\t',quote = '')
table(phe$PAM50Call_RNAseq)
# 取出sampleID和PAM50Call_RNAseq两列
phe = phe[c('sampleID','PAM50Call_RNAseq')]
rownames(phe) = phe$sampleID
table(phe$PAM50Call_RNAseq)
# 过滤掉Normal和空白分组
phe = phe[phe$PAM50Call_RNAseq != 'Normal',]
phe = phe[phe$PAM50Call_RNAseq != '',]
phe = phe[order(phe$PAM50Call_RNAseq),]
# 读取表达矩阵
dat <- read.table('AgilentG4502A_07_3.gz',
header = T,
sep = '\t',
quote = '',
row.names = 1)
dat[1:4,1:4]
colnames(dat) = gsub('\\.', '-', colnames(dat))
# 矩阵取小,取出有临床数据phe的pam50的样本
table(colnames(dat) %in% rownames(phe))
phe = phe[rownames(phe) %in% colnames(dat),]
dat = dat[,rownames(phe)]
# 取出感兴趣的基因
gene=c('ESR1','PGR','GATA3','XBP1','FOXA1',
'ERBB2','GRB7',
'EGFR','FOXC1','ID4','KIT','MYC',
'MKI67','MYBL2','AURKA','BUB1','CENPF',
'KRT8','KRT18','KRT5','KRT6A')
marker_dat = dat[gene,]
# 热图基因的注释
gene_list = data.frame(group = c(rep('Luminal',5),
rep('HER2',2),
rep('Basal',5),
rep('Proliferation',5),
rep('Keratin',4)),
row.names = gene)
# 热图样本注释
pam50_list = phe[colnames(dat),][,2]
sample_group = data.frame(pam50_list = pam50_list,
row.names = colnames(dat))
# 用pheatmap画热图,添加注释
pheatmap::pheatmap(marker_dat,cluster_cols = F,
cluster_rows = F,
cutree_rows = 5,
show_colnames =F,
show_rownames = T,
annotation_col = sample_group,
annotation_row = gene_list,
filename = 'TCGA_BRCA_gene_AgilentG.png')
# 清空环境
rm(list = ls())
options(stringsAsFactors = F)
# 切换镜像安装R包
options(repos='https://mirrors.ustc.edu.cn/CRAN')
options(BioC_mirror='https://mirrors.ustc.edu.cn/bioc')
# 安装maftools包
BiocManager::install('maftools')
# 安装TCGAmutations包,对网络要求较高,而且下载很久,可能下载失败
devtools::install_github(repo = 'PoisonAlien/TCGAmutations')
library(TCGAmutations)
library(maftools)
# TCGAmutations自带数据,可以用`?tcga_load`查看帮助文档
# 这里我们直接加载BRCA突变数据,读进来就是一个名为tcga_brca_mc3的maf对象
tcga_load(study = 'BRCA')
# 看看数据的整体情况
laml = tcga_brca_mc3
laml@data = laml@data[!grepl('^MT-', laml@data$Hugo_Symbol), ]
laml@data$t_vaf = (laml@data$t_alt_count / laml@data$t_depth)
unique(laml@data$Tumor_Sample_Barcode)
getSampleSummary(laml)
getGeneSummary(laml)
getFields(laml)
# 因为对象自带临床数据,可以拿到IHC分型,先统计一下
table(laml@clinical.data$breast_carcinoma_estrogen_receptor_status)
table(laml@clinical.data$breast_carcinoma_progesterone_receptor_status)
table(laml@clinical.data$lab_proc_her2_neu_immunohistochemistry_receptor_status)
# 然后把er,pr,her2的IHC分型做成一个数据框
# 这里需要注意一点,临床分型样本名所在的列名需要与maf对象laml保持一致
dat = data.frame(
Tumor_Sample_Barcode = laml@clinical.data$Tumor_Sample_Barcode,
er = laml@clinical.data$breast_carcinoma_estrogen_receptor_status,
pr = laml@clinical.data$breast_carcinoma_progesterone_receptor_status,
her2 = laml@clinical.data$lab_proc_her2_neu_immunohistochemistry_receptor_status)
# 把一些阴阳性不确定的样本过滤掉
dat = dat[with(dat,er %in% c('Negative','Positive')) &
with(dat,pr %in% c('Negative','Positive')) &
with(dat,her2 %in% c('Negative','Positive')),]
# 由er,pr,her2的阴阳性拿到IHC分型
dat$sub[(dat$er=='Positive' | dat$pr=='Positive') & dat$her2=='Negative'] = 'HR+/HER2–'
dat$sub[(dat$er=='Positive' | dat$pr=='Positive') & dat$her2=='Positive'] = 'HR+/HER2+'
dat$sub[(dat$er=='Negative' & dat$pr=='Negative') & dat$her2=='Positive'] = 'HER2'
dat$sub[(dat$er=='Negative' & dat$pr=='Negative') & dat$her2=='Negative'] = 'TNBC'
table(dat$sub)
# 最后用maftools的oncoplot画突变全景图,
# marker基因突变全景图
gene = c('PIK3CA','CDH1','GATA3','MAP3K1','MAP2K4','FOXA1','AKT1','PTEN','TP53')
oncoplot(maf = laml,
genes = gene,
fill = T,
fontSize = 0.8 ,
annotationDat = dat,
clinicalFeatures = 'sub',
sortByAnnotation = T,
GeneOrderSort = T,
keepGeneOrder = T,
showTumorSampleBarcodes = F)
本文排版奇差无比,图片也重复的很不美观,我都不想把它放入学徒数据挖掘列表,不过至少知识点和数据处理本身的代码都ok啦,希望对大家有所帮助!
联系客服