因为是表观联合分析,所以全文分成两部分,ChIP-seq和RNA-Seq首先是各自独立处理,然后是合并分析
手动安装aspera,我用conda安装老报错,于是改用手动下载然后直接安装即可。
wget https://ak-delivery04-mul.dhe.ibm.com/sar/CMA/OSA/09cne/0/ibm-aspera-connect-3.11.0.5-linux-g2.12-64.tar.gz
一开始在官网下载最新版本,下载后解压然后安装,结果在~/.aspera/connect/etc路径下没有找到openssh文件,尝试了很多办法均无法解决,于是换了个版本下载,安装后就有了openssh文件,目前用的是3.11的版本
然后再EBI官网上找到GSE130032的aspera下载路径,选择'fastq_aspera’,下载tsv文件
一共8个文件,每个文件都是双端测序,中间用;进行分隔,把它做成一个sh脚本,方便aspera批量下载
脚本如下,比较笨,但是简洁直观
需要输入openssh文件路径,aspera下载路径,以及下载存放路径
aspera下载路径,就是上图中的tsv文件里的路径,注意需要在前面加上'era-fasp@'
下载完成后即可得到16个文件。_1和_2代表双端测序的两端的结果。
这里SRR8928701和SRR8928702是同一个样品测2次的结果,直接用 cat合并SRR8928701_1.fastq.gz和SRR8928702_1.fastq.gz即可。
将下载的数据一一进行合并,最终得到8个fastq.gz
使用hisat2进行比对, 用samtools将sam文件转为bam文件,得到4个bam文件
采用featureCounts进行定量,得到4个count文件
报错预警——比对的结果没问题,但是featurecounts定量结果successfully assigned alignments是0
检查半天才发现,hisat2的索引用的是grcm38的(在hisat2官网下载的),定量用的gtf文件是grcm39,版本不一样导致,用hisat2重新对grcm39的参考基因组构建索引后,再重新定量就可以了!
我把得到的4个featurecount定量结果直接merge起来,就得到一个矩阵
#加载R包
library(DESeq2)
library(tibble)
library(pheatmap)
library(data.table)
library(stringr)
library(dplyr)
library(clusterProfiler)
library(org.Mm.eg.db)
library(VennDiagram)
rm(list = ls())
#读取数据
ct1 <- read.table('SRR89287012.count',header = T)[,c(1,7)]
ct2 <- read.table('SRR89287034.count',header = T)[,c(1,7)]
ES1 <- read.table('SRR89287056.count',header = T)[,c(1,7)]
ES2 <- read.table('SRR89287078.count',header = T)[,c(1,7)]
count_df <- merge(ES1,ES2,by='Geneid')
count_df <- merge(count_df,ct1,by='Geneid')
count_df <- merge(count_df,ct2,by='Geneid')
colnames(count_df) <- c('ID','ES1','ES2','CT1','CT2')
count_df <- column_to_rownames(count_df,'ID')
# DESeq2进行差异筛选,用count矩阵即可
group <- factor(c('ES','ES','CT','CT'),levels = c('CT','ES'))
coldata <- data.frame(row.names = colnames(count_df),group=group)
dds <- DESeqDataSetFromMatrix(countData = count_df,colData = coldata,design = ~group)
dds2 <- DESeq(dds)
res <- results(dds2)
res_df <- as.data.frame(res)
#差异筛选
res_df$change <- ifelse(res_df$pvalue<0.05 & res_df$log2FoldChange>1,'up',
ifelse(res_df$pvalue<0.05 & res_df$log2FoldChange<1,'down','nochange'))
deg <- res_df[which(res_df$change=='up' | res_df$change=='down'),]
deg_genes <- rownames(deg)
差异分析完成,看一下差异热图。热图需要表达矩阵,这里把count转换成TPM再画热图
# count 转换成TPM
gene_length <- fread('mm_gene_length.txt')
gene_length <- as.data.frame(gene_length)
gene_length <- gene_length[!duplicated(gene_length$id),]
mm_gene_length就是一个2列的表格,gene name和gene length,这个结果可以从gtf文件里自行获取
exp <- as.data.frame(count_df)
exp$id <- unlist(lapply(rownames(exp),function(x){str_split(x,'\\.',simplify = T)[,1]}))
merge <- left_join(x=exp,y=gene_length,by='id')
merge <- na.omit(merge)
merge <- merge[!duplicated(merge$id),]
rownames(merge) <- merge$id
merge <- merge[,-5]
# 计算TPM
kb <- merge$length / 1000
rpk <- merge[,-5] / kb
tpm <- t(t(rpk) / colSums(rpk) * 1E6)
exp <- tpm
# GO功能注释
up_genes <- rownames(res_df[which(res_df$change=='up'),])
down_genes <- rownames(res_df[which(res_df$change=='down'),])
up_enrich <- enrichGO(up_genes,OrgDb = org.Mm.eg.db,keyType = 'SYMBOL',ont = 'BP')
down_enrich <- enrichGO(down_genes,OrgDb = org.Mm.eg.db,keyType = 'SYMBOL',ont = 'BP')
barplot(up_enrich)
barplot(down_enrich)
#构建bwa索引
bwa index -p gcrm39 GCF_000001635.27_GRCm39_genomic.fna.gz
#把重复采集的两个文件进行合并,合并后再压缩
zcat SRR8903672.fastq.gz SRR8903673.fastq.gz > GSM3723395.fastq
gzip GSM3723395.fastq
#bwa进行比对,sam转bam
bwa mem -t 8 ~/mouse_database/gcrm39 GSM3723395.fastq.gz > GSM3723395.bwa.sam
samtools sort -@ 6 GSM3723395.bwa.sam -o GSM3723395.bwa.bam
#MACS2进行peak calling
macs2 callpeak -t GSM3723395.bwa.bam -c GSM3723397.bwa.bam -g mm -B -n rep1
采用chipseeker包里的readPeakFile函数读取上游分析得到的narrowpeak文件,或者读取bed文件,得到的都是一个Large GRange对象,这两个文件读进来后,内容基本上都是一样
library(ChIPseeker)
library(org.Mm.eg.db)
library(GenomicFeatures)
library(plotrix)
library(ggplot2)
rep1_narropeak <- readPeakFile('rep1_peaks.narrowPeak')
rep2_narropeak <- readPeakFile('rep2_peaks.narrowPeak')
file_list <- list('rep1_peaks.narrowPeak','rep2_peaks.narrowPeak')
除此之外,还需要一个TxDB数据库,这个可以自己构建,用GTF文件或者GFF文件
我在上游比对时用的是 GCF_000001635.27_GRCm39_genomic.fna.gz,那么这里构建TxDB就用GCF_000001635.27_GRCm39_genomic.gtf.gz这个文件,都是在NCBI上下载的
txdb <- makeTxDbFromGFF('../GCF_000001635.27_GRCm39_genomic.gtf.gz')
先看Venn图
vennplot(list(rep1=rep1_narropeak,rep2=rep2_narropeak),by='gplots')
对拿到的peak进行注释,可以单个文件单个文件的注释,也可以写成列表的形式多个文件一起注释
# peak注释
rep1_peakAnno <- annotatePeak(rep1_narropeak, tssRegion=c(-3000, 3000),
TxDb=txdb, annoDb="org.Mm.eg.db")
rep2_peakAnno <- annotatePeak(rep2_narropeak, tssRegion=c(-3000, 3000),
TxDb=txdb, annoDb="org.Mm.eg.db")
peakAnnoList <- lapply(file_list, annotatePeak,TxDb=txdb,tssRegion=c(-3000, 3000))
# 饼图 展示peak落在各个区域的占比
plotAnnoPie(rep1_peakAnno)
plotAnnoPie(rep2_peakAnno)
换一种展示方式,rep1和rep2间的一致性还是不错的
plotAnnoBar(peakAnnoList)
原文中是根据到TSS的距离划分成3类
这里可以根据GRanges对象里的distanceToTSS这个属性进行划分,里面包含了所有peak到TSS的距离
rep1_peakAnno@anno$distanceToTSS
pie_data1 <- ifelse(rep1_peakAnno@anno$distanceToTSS>(-1000) & rep1_peakAnno@anno$distanceToTSS<400,'Proximal',
ifelse(rep1_peakAnno@anno$distanceToTSS>(-100000) & rep1_peakAnno@anno$distanceToTSS<(-1000),'Distal upstream',
ifelse(rep1_peakAnno@anno$distanceToTSS>400 & rep1_peakAnno@anno$distanceToTSS<100000,'Distal downstream','none')))
freq <- table(pie_data1) / length(pie_data1)
freq <- freq[-3]
label <- c('Distal downstream(48%)','Distal upstream(36%)','Proximal(10%)')
pie3D(x = freq,labels = label,col = c('red','green','yellow'),radius=2,labelcex=1)
pie_data2 <- ifelse(rep2_peakAnno@anno$distanceToTSS>(-1000) & rep2_peakAnno@anno$distanceToTSS<400,'Proximal',
ifelse(rep2_peakAnno@anno$distanceToTSS>(-100000) & rep2_peakAnno@anno$distanceToTSS<(-1000),'Distal upstream',
ifelse(rep2_peakAnno@anno$distanceToTSS>400 & rep2_peakAnno@anno$distanceToTSS<100000,'Distal downstream','none')))
freq <- table(pie_data2) / length(pie_data2)
freq <- freq[-3]
label <- c('Distal downstream(49%)','Distal upstream(37%)','Proximal(9%)')
pie3D(x = freq,labels = label,col = c('red','green','yellow'),radius=2,labelcex=1)
Motif分析
原文中分别对TSS近端和远端的motif进行分析,用的是SeqPos,这里我用MEME进行分析
MEME要求的输入是等长的核酸序列,首先需要找到近端和远端peak上下游1000bp的核酸序列
可以通过如下这个dataFrame来进行数据处理
View(as.data.frame(peakAnno@anno))
每一行是一个peak,首先把近端和远端的peak筛选出来
Distal_index <- which(pie_data1=='Distal downstream' | pie_data1=='Distal upstream')
Proximal_index <- which(pie_data1=='Proximal')
Distal_df <- as.data.frame(peakAnno@anno)[Distal_index,]
Proximal_df <- as.data.frame(peakAnno@anno)[Proximal_index,]
然后构建出一个bed文件,文件中包含染色体信息,peak起始位点,peak终止位点。我找到了peak的中点,然后选取上下游各1000bp的长度
get_bed <- function(peakAnno_df,output){
df <- data.frame(row.names = 1:nrow(peakAnno_df))
df$name <- peakAnno_df[,'seqnames']
df$start <- as.integer((peakAnno_df[,'end']+peakAnno_df[,'start'])*0.5)-1000
df$start <- ifelse(df$start<0,0,df$start)
df$end <- as.integer((peakAnno_df[,'end']+peakAnno_df[,'start'])*0.5)+1000
df$peak <- peakAnno_df[,'V4']
df$intensity <- peakAnno_df[,'V9']
# 默认quote=T,输出的字符串会带"",bedtools getfasta会报错
write.table(df,output,row.names = F,col.names = F,sep = '\t',quote = F)
}
get_bed(Distal_df,'distal.bed')
get_bed(Proximal_df,'proximal.bed')
得到的bed文件是这样
然后根据这个bed文件,找到参考基因序列(GCF_000001635.27_GRCm39_genomic.fna.gz)中的碱基序列
当然,可以再R中读取参考基因组文件进行筛选,但是想必比较耗时,我还是在linux上用bedtools来完成这个操作
这样最终可以得到2个文件,distal.fa和proximal.fa,然后再MEME中进行分析
对proximal进行分析,找到了32个motif
shared_up_genes <- intersect(rep1_peakAnno@anno$geneId,up_genes)
shared_down_genes <- intersect(rep1_peakAnno@anno$geneId,down_genes)
venn.diagram(x=list('chipseq'=rep1_peakAnno@anno$geneId,'rnaseq_up'=up_genes),filename = 'venn1.png',
scaled = F, alpha=0.5,cex=1,fill=c('#FFFFCC','#CCFFFF'),cat.dist=0.05,cat.pos = 0)
venn.diagram(x=list('chipseq'=rep1_peakAnno@anno$geneId,'rnaseq_down'=down_genes),filename = 'venn2.png',
scaled = F, alpha=0.5,cex=1,fill=c('#FFFFCC','#CCFFFF'),cat.dist=0.05,cat.pos = 0)
rep1_df_up <- rep1_df[rep1_df$geneId %in% shared_up_genes,]
pie_data_up <- ifelse(rep1_df_up$distanceToTSS>(-1000) & rep1_df_up$distanceToTSS<400,'Proximal',
ifelse(rep1_df_up$distanceToTSS>(-100000) & rep1_df_up$distanceToTSS<(-1000),'Distal upstream',
ifelse(rep1_df_up$distanceToTSS>400 & rep1_df_up$distanceToTSS<100000,'Distal downstream','none')))
freq <- table(pie_data_up) / length(pie_data_up)
freq <- freq[-3]
label <- c('Distal downstream(56%)','Distal upstream(32%)','Proximal(11%)')
pie3D(x = freq,labels = label,col = c('red','green','yellow'),radius=1,labelcex=1)
rep1_df_down <- rep1_df[rep1_df$geneId %in% shared_down_genes,]
pie_data_down <- ifelse(rep1_df_down$distanceToTSS>(-1000) & rep1_df_down$distanceToTSS<400,'Proximal',
ifelse(rep1_df_down$distanceToTSS>(-100000) & rep1_df_down$distanceToTSS<(-1000),'Distal upstream',
ifelse(rep1_df_down$distanceToTSS>400 & rep1_df_down$distanceToTSS<100000,'Distal downstream','none')))
freq <- table(pie_data_down) / length(pie_data_down)
freq <- freq[-3]
label <- c('Distal downstream(54%)','Distal upstream(33%)','Proximal(10%)')
pie3D(x = freq,labels = label,col = c('red','green','yellow'),radius=1,labelcex=1)
up_gene_peak_df <- rep1_df[rep1_df$geneId %in% up_genes,]
down_gene_peak_df <- rep1_df[rep1_df$geneId %in% down_genes,]
get_bed <- function(peakAnno_df,output){
df <- data.frame(row.names = 1:nrow(peakAnno_df))
df$name <- peakAnno_df[,'seqnames']
df$start <- as.integer((peakAnno_df[,'end']+peakAnno_df[,'start'])*0.5)-1000
df$start <- ifelse(df$start<0,0,df$start)
df$end <- as.integer((peakAnno_df[,'end']+peakAnno_df[,'start'])*0.5)+1000
df$peak <- peakAnno_df[,'V4']
df$intensity <- peakAnno_df[,'V9']
write.table(df,output,row.names = F,col.names = F,sep = '\t',quote = F)
}
get_bed(up_gene_peak_df,'up_gene.bed')
get_bed(down_gene_peak_df,'down_gene.bed')
rnaseq上调gene和chipseq取交集,得到的gene的peak富集到的motif
rnaseq下调gene和chipseq取交集,得到的gene的peak富集到的motif
联系客服