芯片的处理流程一般就是:数据读入——数据过滤——数据校正——下游分析。
读文章拿到测序数据
本次讲解用到的数据来自文章The relationship between DNA methylation, genetic and expression inter-individual variation in untransformed human fibroblast
从文章里面找到数据存放地址如下:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE52025
直接下载即可:
我下载的数据如下:
接着我们需要跟样本分组,这里我是按照GM和WG分组比较:
library(limma)
library(minfi)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(IlluminaHumanMethylation450kmanifest)
library(RColorBrewer)
library(missMethyl)
library(matrixStats)
library(minfiData)
rgSet <>'GSE52025/idat')
head(sampleNames(rgSet))
sampleNames(rgSet) <>substr(sampleNames(rgSet),'1', '10' )
建立样本分组信息,分组信息我是基于下图对应关系,构建了pD对象如下:
pD对象我是自己都进去整理的,部分代码如下:
pD <>
sampleNames(rgSet) <>
pD$group=substr(rownames(pD),'1', '2' )
detP - detectionP(rgSet)
pal - brewer.pal(8,'Dark2')
par(mfrow=c(1,2))
barplot(colMeans(detP), col=pal[factor(pD$group)], las=2,
cex.names=0.8, ylab='Mean detection p-values')
abline(h=0.05,col='red')
legend('topleft', legend=levels(factor(pD$group)), fill=pal,
bg='white')
barplot(colMeans(detP), col=pal[factor(pD$group)], las=2,
cex.names=0.8, ylim=c(0,0.002), ylab='Mean detection p-values')
abline(h=0.05,col='red')
legend('topleft', legend=levels(factor(pD$group)), fill=pal,
bg='white')
keep - colMeans(detP) <>.01
rgSet - rgSet[,keep]
rgSet
detP - detP[,keep]
dim(detP)
mSetSq <>
par(mfrow=c(1,2))
densityPlot(rgSet, sampGroups=pD$group,main='Raw', legend=FALSE)
legend('top', legend = levels(factor(pD$group)),
text.col=brewer.pal(8,'Dark2'))
densityPlot(getBeta(mSetSq), sampGroups=pD$group,
main='Normalized', legend=FALSE)
legend('top', legend = levels(factor(pD$group)),
text.col=brewer.pal(8,'Dark2'))
detP - detP[match(featureNames(mSetSq),rownames(detP)),]
keep <><>0.01) == ncol(mSetSq)
table(keep)
mSetSqFlt <>q[keep,]
mSetSqFlt
ann450k = getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
head(ann450k)
keep <>in% ann450k$Name[ann450k$chr %in% c('chrX','chrY')])
table(keep)
mSetSqFlt <>
mSetSqFlt <>
mSetSqFlt
网址:http://www.sickkids.ca/MS-Office-Files/Research/Weksberg%20Lab/48639-non-specific-probes-Illumina450k.xlsx
下载下来读进去即可
xReactiveProbes <>read.csv('C:/Users/Administrator/Downloads/48639-non-specific-probes-Illumina450k.csv',sep=',', stringsAsFactors=FALSE)
keep <>in% xReactiveProbes$TargetID)
table(keep)
mSetSqFlt <>
mSetSqFlt
一旦数据被过滤和标准化,重新检查MDS图以查看样本之间的关系是否发生了变化通常是有用的
par(mfrow=c(1,2))
plotMDS(getM(mSetSqFlt), top=1000, gene.selection='common',
col=pal[factor(pD$group)], cex=0.8)
legend('right', legend=levels(factor(pD$group)), text.col=pal,
cex=0.65, bg='white')
plotMDS(getM(mSetSqFlt), top=1000, gene.selection='common',
col=pal[factor(pD$group)])
legend('right', legend=levels(factor(pD$group)), text.col=pal,
cex=0.7, bg='white')
一旦数据被过滤和标准化,重新检查MDS图以查看样本之间的关系是否发生了变化通常是有用的
par(mfrow=c(1,3))
# Examine higher dimensions to look at other sources of variation
plotMDS(getM(mSetSqFlt), top=1000, gene.selection='common',
col=pal[factor(pD$group)], dim=c(1,3))
legend('right', legend=levels(factor(pD$group)), text.col=pal,
cex=0.7, bg='white')
plotMDS(getM(mSetSqFlt), top=1000, gene.selection='common',
col=pal[factor(pD$group)], dim=c(2,3))
legend('topright', legend=levels(factor(pD$group)), text.col=pal,
cex=0.7, bg='white')
plotMDS(getM(mSetSqFlt), top=1000, gene.selection='common',
col=pal[factor(pD$group)], dim=c(3,4))
legend('right', legend=levels(factor(pD$group)), text.col=pal,
cex=0.7, bg='white')
下一步是计算m值和beta值。m值具有更好的统计特性,因此更适合用于甲基化数据的统计分析,而beta值易于解释,因此更适合于显示数据
mVals - getM(mSetSqFlt)
head(mVals[,1:5])
bVals - getBeta(mSetSqFlt)
head(bVals[,1:5])
绘制密度曲线
par(mfrow=c(1,2))
densityPlot(bVals, sampGroups=pD$group, main='Beta values',
legend=FALSE, xlab='Beta values')
legend('top', legend = levels(factor(pD$group)),
text.col=brewer.pal(8,'Dark2'))
densityPlot(mVals, sampGroups=pD$group, main='M-values',
legend=FALSE, xlab='M values')
legend('topleft', legend = levels(factor(pD$group)),
text.col=brewer.pal(8,'Dark2'))
group <>group)
# use the above to create a design matrix
design <>0+group, data=pD)
colnames(design) <>group))
fit <>
# create a contrast matrix for specific comparisons
contMatrix <>
levels=design)
contMatrix
# fit the contrasts
fit2 <>
fit2 <>
ann450kSub - ann450k[match(rownames(mVals),ann450k$Name), c(1:4,12:19,24:ncol(ann450k))]
DMPs - topTable(fit2, num=Inf, coef=1, genelist=ann450kSub)
head(DMPs)
绘制cpg表达
par(mfrow=c(2,2))
purrr::map(rownames(DMPs)[1:4], function(cpg){
plotCpg(bVals, cpg=cpg, pheno=pD$group, ylab = 'Beta values')
})
sigCpGs <>$Name[DMPs$adj.P.Val0.05]
# First 10 significant CpGs
sigCpGs[1:10]
# Get all the CpG sites used in the analysis to form the background
all <>$Name
# Total number of CpG sites tested
length(all)
par(mfrow=c(1,1))
gst - gometh(sig.cpg=sigCpGs, all.cpg=all, plot.bias=TRUE)
#Top 10 GO categories
topGO(gst, number=10)
kegg - gometh(sig.cpg = sigCpGs, all.cpg = all, collection = 'KEGG', prior.prob=TRUE)
topKEGG(kegg)
如果大家发现有错误,欢迎留言或者发送我邮箱372479584@qq.com
参考:
http://master.bioconductor.org/packages/release/workflows/vignettes/methylationArrayAnalysis/inst/doc/methylationArrayAnalysis.html
http://www.bioinfo-scrounger.com/archives/476
联系客服