我们的生信入门班和数据挖掘线上直播课程已经有了三年多的历史,培养了一波又一波优秀的生信人才。本期分享的内容不是课堂上讲的,是学员在我们的帮助下探索出来的。
(生信技能树优秀学员可乐同学)
马拉松授课线上直播课程,最近的一期是下周一开课哦:
准备分析转录组测序数据集(GSE111842)时发现只有RAW数据,具体是单个样本的count数据,需要先合并制作行名为gene_id,列名为样本的表达矩阵,然后进行id转换。自己写代码的时候是每个文件独立读取,效率很低,幸得生信技能树jimmy老师、小洁老师指导,使用jimmy老师推荐的代码,探索解决问题。最后是小洁老师的答案。
rm(list=ls())
filest <- list.files(pattern="GSM")
exp<- read.table(filest[1],row.names = 1)
for (i in 2:length(filest)) {
exp <- cbind(exp, read.table(filest[i],row.names = 1))
}
head(exp)
## V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2
## ENSG00000223972.5 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0
## ENSG00000227232.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## ENSG00000278267.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## ENSG00000243485.4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## ENSG00000237613.2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## ENSG00000268020.3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2 V2
## ENSG00000223972.5 0 0 1 3 0 6 0 0 0 0 0 0 15 0
## ENSG00000227232.5 0 0 3 0 0 5 0 0 17 0 0 0 0 0
## ENSG00000278267.1 0 0 0 0 0 2 0 0 0 0 0 0 0 0
## ENSG00000243485.4 0 0 2 0 0 3 28 2 0 0 0 0 0 0
## ENSG00000237613.2 0 0 0 0 0 3 0 0 0 0 0 0 0 0
## ENSG00000268020.3 0 0 0 0 0 0 0 0 6 0 0 0 0 0
exp <- cbind(rownames(exp),data.frame(exp,row.names=NULL))
names(exp) <- c("gene_id",paste0('BLOOD',1:6),paste0('CTC',1:16), paste0('Primary Tumor',1:12))
head(exp)
## gene_id BLOOD1 BLOOD2 BLOOD3 BLOOD4 BLOOD5 BLOOD6 CTC1 CTC2 CTC3
## 1 ENSG00000223972.5 0 0 0 0 0 0 0 0 0
## 2 ENSG00000227232.5 0 0 0 0 0 0 0 0 0
## 3 ENSG00000278267.1 0 0 0 0 0 0 0 0 0
## 4 ENSG00000243485.4 0 0 0 0 0 0 0 0 0
## 5 ENSG00000237613.2 0 0 0 0 0 0 0 0 0
## 6 ENSG00000268020.3 0 0 0 0 0 0 0 0 0
## CTC4 CTC5 CTC6 CTC7 CTC8 CTC9 CTC10 CTC11 CTC12 CTC13 CTC14 CTC15 CTC16
## 1 0 0 0 3 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## Primary Tumor1 Primary Tumor2 Primary Tumor3 Primary Tumor4 Primary Tumor5
## 1 1 3 0 6 0
## 2 3 0 0 5 0
## 3 0 0 0 2 0
## 4 2 0 0 3 28
## 5 0 0 0 3 0
## 6 0 0 0 0 0
## Primary Tumor6 Primary Tumor7 Primary Tumor8 Primary Tumor9 Primary Tumor10
## 1 0 0 0 0 0
## 2 0 17 0 0 0
## 3 0 0 0 0 0
## 4 2 0 0 0 0
## 5 0 0 0 0 0
## 6 0 6 0 0 0
## Primary Tumor11 Primary Tumor12
## 1 15 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
library(tidyr)
library(rtracklayer)
exp<- tidyr :: separate(exp, gene_id,into = c('gene_id','junk'),sep='\\.')
exp<- exp[,-2]
head(exp)
## gene_id BLOOD1 BLOOD2 BLOOD3 BLOOD4 BLOOD5 BLOOD6 CTC1 CTC2 CTC3 CTC4
## 1 ENSG00000223972 0 0 0 0 0 0 0 0 0 0
## 2 ENSG00000227232 0 0 0 0 0 0 0 0 0 0
## 3 ENSG00000278267 0 0 0 0 0 0 0 0 0 0
## 4 ENSG00000243485 0 0 0 0 0 0 0 0 0 0
## 5 ENSG00000237613 0 0 0 0 0 0 0 0 0 0
## 6 ENSG00000268020 0 0 0 0 0 0 0 0 0 0
## CTC5 CTC6 CTC7 CTC8 CTC9 CTC10 CTC11 CTC12 CTC13 CTC14 CTC15 CTC16
## 1 0 0 3 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## Primary Tumor1 Primary Tumor2 Primary Tumor3 Primary Tumor4 Primary Tumor5
## 1 1 3 0 6 0
## 2 3 0 0 5 0
## 3 0 0 0 2 0
## 4 2 0 0 3 28
## 5 0 0 0 3 0
## 6 0 0 0 0 0
## Primary Tumor6 Primary Tumor7 Primary Tumor8 Primary Tumor9 Primary Tumor10
## 1 0 0 0 0 0
## 2 0 17 0 0 0
## 3 0 0 0 0 0
## 4 2 0 0 0 0
## 5 0 0 0 0 0
## 6 0 6 0 0 0
## Primary Tumor11 Primary Tumor12
## 1 15 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
proj <- 'GSE111842'
library(stringr)
library(AnnoProbe)
library(org.Hs.eg.db)
library(usethis)
library(devtools)
ids=annoGene(exp$gene_id,'ENSEMBL','human',out_file ='convert.csv')
head(ids)
## SYMBOL biotypes ENSEMBL chr start
## 1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869
## 2 WASH7P unprocessed_pseudogene ENSG00000227232 chr1 14404
## 3 MIR6859-1 miRNA ENSG00000278267 chr1 17369
## 4 MIR1302-2HG lncRNA ENSG00000243485 chr1 29554
## 6 FAM138A lncRNA ENSG00000237613 chr1 34554
## 7 OR4G4P unprocessed_pseudogene ENSG00000268020 chr1 52473
## end
## 1 14409
## 2 29570
## 3 17436
## 4 31109
## 6 36081
## 7 53312
ids=ids[!duplicated(ids$SYMBOL),]
ids=ids[!duplicated(ids$ENSEMBL),]
rownames(exp)=exp$gene_id
exp= exp[match(ids$ENSEMBL,rownames(exp)),]
rownames(exp) = ids$SYMBOL
exp=exp[,-1]
head(exp)
## BLOOD1 BLOOD2 BLOOD3 BLOOD4 BLOOD5 BLOOD6 CTC1 CTC2 CTC3 CTC4 CTC5
## DDX11L1 0 0 0 0 0 0 0 0 0 0 0
## WASH7P 0 0 0 0 0 0 0 0 0 0 0
## MIR6859-1 0 0 0 0 0 0 0 0 0 0 0
## MIR1302-2HG 0 0 0 0 0 0 0 0 0 0 0
## FAM138A 0 0 0 0 0 0 0 0 0 0 0
## OR4G4P 0 0 0 0 0 0 0 0 0 0 0
## CTC6 CTC7 CTC8 CTC9 CTC10 CTC11 CTC12 CTC13 CTC14 CTC15 CTC16
## DDX11L1 0 3 0 0 0 0 0 0 0 0 0
## WASH7P 0 0 0 0 0 0 0 0 0 0 0
## MIR6859-1 0 0 0 0 0 0 0 0 0 0 0
## MIR1302-2HG 0 0 0 0 0 0 0 0 0 0 0
## FAM138A 0 0 0 0 0 0 0 0 0 0 0
## OR4G4P 0 0 0 0 0 0 0 0 0 0 0
## Primary Tumor1 Primary Tumor2 Primary Tumor3 Primary Tumor4
## DDX11L1 1 3 0 6
## WASH7P 3 0 0 5
## MIR6859-1 0 0 0 2
## MIR1302-2HG 2 0 0 3
## FAM138A 0 0 0 3
## OR4G4P 0 0 0 0
## Primary Tumor5 Primary Tumor6 Primary Tumor7 Primary Tumor8
## DDX11L1 0 0 0 0
## WASH7P 0 0 17 0
## MIR6859-1 0 0 0 0
## MIR1302-2HG 28 2 0 0
## FAM138A 0 0 0 0
## OR4G4P 0 0 6 0
## Primary Tumor9 Primary Tumor10 Primary Tumor11 Primary Tumor12
## DDX11L1 0 0 15 0
## WASH7P 0 0 0 0
## MIR6859-1 0 0 0 0
## MIR1302-2HG 0 0 0 0
## FAM138A 0 0 0 0
## OR4G4P 0 0 0 0
save(exp,proj,file='data_matrix.Rdata')
rm(list = ls())
f = dir(pattern = "^GSM")
f
## [1] "GSM3040918_BLOOD_36581.readCounts.txt.gz"
## [2] "GSM3040919_BLOOD_36683.readCounts.txt.gz"
## [3] "GSM3040920_BLOOD_36828.readCounts.txt.gz"
## [4] "GSM3040921_BLOOD_47934.readCounts.txt.gz"
## [5] "GSM3040922_BLOOD_58029.readCounts.txt.gz"
## [6] "GSM3040923_BLOOD_68172.readCounts.txt.gz"
## [7] "GSM3040924_CTC_36581.readCounts.txt.gz"
## [8] "GSM3040925_CTC_36683.readCounts.txt.gz"
## [9] "GSM3040926_CTC_36828.readCounts.txt.gz"
## [10] "GSM3040927_CTC_47934.readCounts.txt.gz"
## [11] "GSM3040928_CTC_58029.readCounts.txt.gz"
## [12] "GSM3040929_CTC_68172.readCounts.txt.gz"
## [13] "GSM3040930_CTC_31176.readCounts.txt.gz"
## [14] "GSM3040931_CTC_36285.readCounts.txt.gz"
## [15] "GSM3040932_CTC_36515.readCounts.txt.gz"
## [16] "GSM3040933_CTC_37232.readCounts.txt.gz"
## [17] "GSM3040934_CTC_37289.readCounts.txt.gz"
## [18] "GSM3040935_CTC_68052.readCounts.txt.gz"
## [19] "GSM3040936_CTC_78322.readCounts.txt.gz"
## [20] "GSM3040937_CTC_78486.readCounts.txt.gz"
## [21] "GSM3040938_CTC_78858.readCounts.txt.gz"
## [22] "GSM3040939_CTC_78956.readCounts.txt.gz"
## [23] "GSM3040940_TUMOR_31176.readCounts.txt.gz"
## [24] "GSM3040941_TUMOR_36285.readCounts.txt.gz"
## [25] "GSM3040942_TUMOR_36515.readCounts.txt.gz"
## [26] "GSM3040943_TUMOR_36581.readCounts.txt.gz"
## [27] "GSM3040944_TUMOR_36683.readCounts.txt.gz"
## [28] "GSM3040945_TUMOR_36828.readCounts.txt.gz"
## [29] "GSM3040946_TUMOR_37232.readCounts.txt.gz"
## [30] "GSM3040947_TUMOR_37289.readCounts.txt.gz"
## [31] "GSM3040948_TUMOR_47934.readCounts.txt.gz"
## [32] "GSM3040949_TUMOR_68052.readCounts.txt.gz"
## [33] "GSM3040950_TUMOR_78322.readCounts.txt.gz"
## [34] "GSM3040951_TUMOR_78858.readCounts.txt.gz"
library(stringr)
Group = str_split(f,"_",simplify = T)[,2]
Group
## [1] "BLOOD" "BLOOD" "BLOOD" "BLOOD" "BLOOD" "BLOOD" "CTC" "CTC" "CTC"
## [10] "CTC" "CTC" "CTC" "CTC" "CTC" "CTC" "CTC" "CTC" "CTC"
## [19] "CTC" "CTC" "CTC" "CTC" "TUMOR" "TUMOR" "TUMOR" "TUMOR" "TUMOR"
## [28] "TUMOR" "TUMOR" "TUMOR" "TUMOR" "TUMOR" "TUMOR" "TUMOR"
result = list()
for(i in 1:length(f)){
result[[i]] = read.table(f[[i]],row.names = 1)
colnames(result[[i]]) = str_remove(f[[i]],".readCounts.txt.gz")
}
result = do.call(cbind,result)
表达矩阵行名ID转换
exp = as.matrix(result)
library(AnnoProbe)
rownames(exp) = str_split(rownames(exp),"\\.",simplify = T)[,1];head(rownames(exp))
## [1] "ENSG00000223972" "ENSG00000227232" "ENSG00000278267" "ENSG00000243485"
## [5] "ENSG00000237613" "ENSG00000268020"
re = annoGene(rownames(exp),ID_type = "ENSEMBL");head(re)
## SYMBOL biotypes ENSEMBL chr start
## 1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869
## 2 WASH7P unprocessed_pseudogene ENSG00000227232 chr1 14404
## 3 MIR6859-1 miRNA ENSG00000278267 chr1 17369
## 4 MIR1302-2HG lncRNA ENSG00000243485 chr1 29554
## 6 FAM138A lncRNA ENSG00000237613 chr1 34554
## 7 OR4G4P unprocessed_pseudogene ENSG00000268020 chr1 52473
## end
## 1 14409
## 2 29570
## 3 17436
## 4 31109
## 6 36081
## 7 53312
library(tinyarray)
exp = trans_array(exp,ids = re,from = "ENSEMBL",to = "SYMBOL")
exp[1:4,1:4]
## GSM3040918_BLOOD_36581 GSM3040919_BLOOD_36683
## DDX11L1 0 0
## WASH7P 0 0
## MIR6859-1 0 0
## MIR1302-2HG 0 0
## GSM3040920_BLOOD_36828 GSM3040921_BLOOD_47934
## DDX11L1 0 0
## WASH7P 0 0
## MIR6859-1 0 0
## MIR1302-2HG 0 0
再次感谢jimmy老师和小洁老师,以及生信技能树里的优秀学员们。
联系客服