这里提供了用于构建 airway 实验数据包中 RangedSummarizedExperiment 对象的代码。该实验的引用文献如下:
Himes BE, Jiang X, Wagner P, Hu R, Wang Q, Klanderman B, Whitaker RM, Duan Q, Lasky-Su J, Nikolos C, Jester W, Johnson M, Panettieri R Jr, Tantisira KG, Weiss ST, Lu Q. “RNA-Seq转录组分析发现CRISPLD2是一种调节气道平滑肌细胞中细胞因子功能的糖皮质激素响应基因。” PLoS One. 2014年6月13日;9(6):e99625. PMID: 24926665. GEO: GSE52778.
以下摘自论文摘要,是对气道平滑肌(ASM)细胞系RNA-Seq实验的简要描述:“我们使用高通量测序方法RNA-Seq,描述了四种原代人ASM细胞系经地塞米松(一种强效合成糖皮质激素;1微摩尔,处理18小时)处理后的转录组变化。”
下面的代码块从GEO下载的系列矩阵(series matrix)文件中获取样本信息。然后对各列进行解析,并添加名称更短、带有因子水平的新列。
# Load GEOquery (provides getGEO) and airway (ships the raw input files)
# without printing their startup messages.
suppressPackageStartupMessages(library("GEOquery"))
suppressPackageStartupMessages(library("airway"))

# Directory of extra data files installed with the airway package.
dir <- system.file("extdata", package = "airway")

# Parse the GEO series matrix file stored in that directory.
geofile <- file.path(dir, "GSE52778_series_matrix.txt")
gse <- getGEO(filename = geofile)
## Parsed with column specification:
## cols(
##   ID_REF = col_character(),
##   GSM1275862 = col_character(),
##   GSM1275863 = col_character(),
##   GSM1275864 = col_character(),
##   GSM1275865 = col_character(),
##   GSM1275866 = col_character(),
##   GSM1275867 = col_character(),
##   GSM1275868 = col_character(),
##   GSM1275869 = col_character(),
##   GSM1275870 = col_character(),
##   GSM1275871 = col_character(),
##   GSM1275872 = col_character(),
##   GSM1275873 = col_character(),
##   GSM1275874 = col_character(),
##   GSM1275875 = col_character(),
##   GSM1275876 = col_character(),
##   GSM1275877 = col_character()
## )
## File stored at:
## /tmp/RtmpxJ7Nsk/GPL11154.soft
# Keep only the phenotype columns whose names contain "ch1" and give them
# short names.
pdata <- pData(gse)[, grepl("ch1", names(pData(gse)))]
names(pdata) <- c("treatment", "tissue", "ercc_mix", "cell", "celltype")

# Strip the "treatment: " / "cell line: " prefixes from the raw values.
pdataclean <- data.frame(
  treatment = sub("treatment: (.*)", "\\1", pdata$treatment),
  cell = sub("cell line: (.*)", "\\1", pdata$cell),
  row.names = rownames(pdata)
)

# Flag each sample as treated ("trt") or untreated ("untrt") per drug.
# Matching is case-insensitive and unpadded so the flag does not depend on
# the capitalisation or surrounding spaces of the treatment label.
# NOTE(review): assumes the treatment labels contain "dex" / "albut" as
# substrings -- confirm against the series matrix file.
pdataclean$dex <- ifelse(
  grepl("dex", pdataclean$treatment, ignore.case = TRUE), "trt", "untrt"
)
pdataclean$albut <- ifelse(
  grepl("albut", pdataclean$treatment, ignore.case = TRUE), "trt", "untrt"
)

# Expose the row names as a column so they can serve as a merge key, then
# drop the free-text treatment column.
pdataclean$SampleName <- rownames(pdataclean)
pdataclean$treatment <- NULL
将GEO样本信息与SRA运行编号(run id)关联起来的信息,是在SRA网站上通过“Send to: File”按钮下载的。
# Read the SRA run table stored alongside the series matrix file.
# `dir` is the airway extdata directory defined earlier in this script.
srafile <- file.path(dir, "SraRunInfo_SRP033351.csv")
srp <- read.csv(srafile)

# Keep only the columns needed to link runs to samples.
sra_columns <- c(
  "Run", "avgLength", "Experiment", "Sample", "BioSample", "SampleName"
)
srpsmall <- srp[, sra_columns]
将这两个data.frame合并,然后只保留未经沙丁胺醇(albuterol)处理的样本(经沙丁胺醇处理的样本未纳入该论文的分析)。
# Join the cleaned GEO sample table with the SRA run table on SampleName.
coldata <- merge(pdataclean, srpsmall, by = "SampleName")

# Index rows by SRA run accession.
rownames(coldata) <- coldata$Run

# Keep only rows whose albut flag is "untrt"; the flag is then constant,
# so the column is dropped.
coldata <- coldata[coldata$albut == "untrt", ]
coldata$albut <- NULL

# Print the resulting sample table.
coldata
## SampleName cell编号## SRR1039508 GSM1275862组织:人气道平滑肌细胞untrt ## SRR1039509 GSM1275863组织:人气道平滑肌细胞untrt ## SRR1039510 GSM1275864组织:人气道平滑肌细胞untrt ## SRR1039511 GSM1275865组织:人气道平滑肌细胞untrt ## SRR1039513 GSM1275867组织:人气道平滑肌细胞untrt ## SRR1039514 GSM1275868组织:人气道平滑肌细胞untrt ## # SRR1039515 GSM1275869组织:人气道平滑肌细胞untrt ## SRR1039516 GSM1275870组织:人气道平滑肌细胞untrt ## SRR1039517 GSM1275871组织:人气道平滑肌细胞untrt ## SRR1039518 GSM1275872组织:人气道平滑肌细胞untrt ## SRR1039519 GSM1275873组织:人气道平滑肌细胞untrt # SRR1039520 GSM1275874组织:人体气道平滑肌细胞untrt ## SRR1039521 GSM1275875组织:人体气道平滑肌细胞untrt ## SRR1039522 GSM1275876组织:人体气道平滑肌细胞untrt ## SRR1039523 GSM1275877组织:SRR1039508 SRR1039508 126 SRX384345 SRS508568 SAMN02422669 ## SRR1039509 SRR1039509 126 SRX384346 SRS508567 SAMN02422675 ## SRR1039510 SRR1039510 126 SRX384347 SRS508570 SAMN02422668 ## SRR1039511 SRR1039511 SRX384348 srr508569 SAMN02422667 ## SRR1039511 SRR1039511 126 SRX384349 SRS508571 SAMN02422678 ## SRR1039513 SRR1039513 87 SRX384350 srr1039572 SRR1039514 SRR1039514 126 SRX384351Srs508574 samn02422681 ## srr1039515 srr1039515 114 srx384352 srs508573 samn02422671 ## srr1039516 srr1039516 srx384353 srs508575 samn02422682 ## srr1039517 srr1039517 126 srx384354 srs508576 samn02422673 ## srr1039518 srr1039518 126 srx384355 srs508578 samn02422679 ## srr1039519 srr1039519 107 srx384356 srs508577 samn02422672 ## srr1039519 srr1039519 srx384356 srs508577 samn02422683 ## srr1039521 srr1039521 98 srx384358 srs508580 srr1039522 srr1039522 125 srx384322Srs508582 samn02422680 ## srr1039523 srr1039523 126 srx384360 srs508581 samn02422674
最后,将样本表保存为CSV文件以供将来参考。该文件包含在本包的 inst/extdata 目录中。
# Save the sample table (row names included) to a CSV file in the working
# directory.
write.csv(coldata, file = "sample_table.csv")
创建了一个包含SRA运行编号的文件:files。该文件用于通过 wget 从SRA下载测序读段(reads)。下面的命令使用SRA Toolkit从 .sra 文件中提取FASTQ文件:
# For every run accession listed in `files`, extract FASTQ files from the
# matching .sra archive, running 7 jobs at a time via GNU parallel.
# --split-files writes each read of a spot to its own output file.
cat files | parallel -j 7 fastq-dump --split-files {}.sra
使用STAR比对工具将读段比对到GRCh37,注释来自Ensembl第75版(release 75)。
# Align the paired FASTQ files of every run listed in `files` with STAR,
# using the genome index under ../STAR and 12 threads per run.
# Output files are written with the prefix aligned/<run>.
for f in $(cat files); do
  STAR --genomeDir ../STAR/ENSEMBL.homo_sapiens.release-75 \
    --readFilesIn "fastq/${f}_1.fastq" "fastq/${f}_2.fastq" \
    --runThreadN 12 --outFileNamePrefix "aligned/${f}."
done
SAMtools用于生成BAM文件。
# Convert each run's SAM alignment into BAM (7 jobs at a time).
# -S: input is SAM, -b: output BAM; the result is aligned/<run>.bam.
cat files | parallel -j 7 samtools view -bS aligned/{}.Aligned.out.sam -o aligned/{}.bam
从Biomart获取了智人(Homo sapiens)Ensembl基因的转录本数据库。
library("GenomicFeatures")

# Build a transcript database for the human Ensembl gene set from BioMart.
# NOTE(review): newer GenomicFeatures releases appear to provide this as
# makeTxDbFromBiomart(); the call is kept as originally run -- confirm
# before re-executing.
txdb <- makeTranscriptDbFromBiomart(
  biomart = "ensembl",
  dataset = "hsapiens_gene_ensembl"
)

# Group exons by gene; these groups are the features used for counting.
exonsByGene <- exonsBy(txdb, by = "gene")
BAM文件通过来自SRA的SRR编号来指定。将yieldSize设为200万条读段,以限制读段计数过程中使用的内存。
# Reload the sample table; its row names are the SRA run accessions.
sampleTable <- read.csv("sample_table.csv", row.names = 1)

# Build the BAM paths as aligned/<run>.bam. The extension is pasted onto
# the run id first: passing ".bam" as its own file.path() component would
# give aligned/<run>/.bam, because file.path() puts a separator between
# every component.
fls <- file.path("aligned", paste0(rownames(sampleTable), ".bam"))

library("Rsamtools")
# yieldSize is the number of records read per chunk.
bamLst <- BamFileList(fls, yieldSize = 2000000)
下面的 summarizeOverlaps 调用将8个双端(paired-end)测序的BAM文件分发给8个worker。每个worker最多占用16 GB内存,总耗时50分钟。
library("BiocParallel")
# Register a multicore backend with 8 workers.
register(MulticoreParam(workers = 8))

library("GenomicAlignments")
# Count reads overlapping the exons of each gene. Reads are paired-end
# (singleEnd = FALSE) and strand is ignored when matching reads to features.
airway <- summarizeOverlaps(
  features = exonsByGene,
  reads = bamLst,
  mode = "Union",
  singleEnd = FALSE,
  ignore.strand = TRUE,
  fragments = TRUE
)
然后将样本信息作为列数据(colData)添加。
# Attach the sample table as the column (per-sample) data of the object.
colData(airway) <- DataFrame(sampleTable)
最后,我们根据PubMed ID获取MIAME信息并将其附加到对象上。
library("annotate")

# Fetch the MIAME record of the publication from its PubMed ID.
miame <- list(pmid2MIAME("24926665"))
miame[[1]]@url <- "http://www.ncbi.nlm.nih.gov/pubmed/24926665"

# R CMD check does not like non-ASCII characters in data objects or in
# vignettes, so the Greek letters in the abstract are spelled out. The
# patterns use \u escapes so this source stays ASCII-only.
# NOTE(review): the first pattern covers both the micro sign (U+00B5) and
# Greek small mu (U+03BC); confirm which one the abstract contains.
miame[[1]]@abstract <- gsub("[\u00b5\u03bc]", "micro", abstract(miame[[1]]))
miame[[1]]@abstract <- gsub("\u03b2", "beta", abstract(miame[[1]]))

# Store the record as the object's metadata and save the finished object.
metadata(airway) <- miame
save(airway, file = "airway.RData")
下面我们打印本实验数据包所提供的 airway 对象的一些基本汇总统计信息。
# Load the packaged data set and print its summary.
library("airway")
data(airway)
airway
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
# Show the per-sample annotation as a plain data.frame.
as.data.frame(colData(airway))
## SampleName cell dex albut Run avgLength实验## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 ## SRR1039512 GSM1275866 N052611 trt untrt SRR1039512 126 SRX384349 ## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 # SRR1039516 GSM1275870 N080611 trt untrt SRR1039516 # SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 srx1039554 ## SRR1039517## SRR1039508 SRS508568 SAMN02422669 ## SRR1039509 SRS508567 SAMN02422675 ## SRR1039512 SRS508571 SAMN02422678 ## SRR1039513 SRS508572 SAMN02422682 ## SRR1039517 SRS508575 SAMN02422673 ## SRR1039520 SRS508579 SAMN02422683 ## SRR1039521 SRS508580 SAMN02422677
# Column totals of the count matrix, expressed in millions, summarised
# across samples.
summary(colSums(assay(airway)) / 1e6)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   15.16   19.05   20.90   21.94   24.67   30.82
# Print the metadata attached to the row ranges of the object.
metadata(rowRanges(airway))
## [1] "TranscriptDb" ## ## $genomeInfo$ '配套包' ## [1]" genome features " ## ## $genomeInfo$ '数据源' ## [1]"BioMart" ## ## $genomeInfo$有机体##[1]"智人" ## ## $genomeInfo$ '资源URL ' ## [1] "www.biomart.org:80" ## ## $genomeInfo$ ' BioMart数据库' ## [1]"ensembl" ## ## $genomeInfo$ ' BioMart数据库版本' ## [1]"ensembl GENES 75 (SANGER UK)"## ## $genomeInfo$ ' BioMart数据集' ## [1]"hsapiens_gene_ensembl" ## ## $genomeInfo$ ' BioMart数据集描述' ##[1]"智人基因(GRCh37.p13)"## ## $genomeInfo$ ' BioMart数据集版本' ## [1]"GRCh37。p13" ## ## $genomeInfo$ '完整数据集' ##[1]"是" ## ## $genomeInfo$ ' miRBase build ID ' ## [1] NA ## ## $genomeInfo$转录t_nrow ## [1] "215647" ## ## $genomeInfo$exon_nrow ## [1] "745593" ## ## $genomeInfo$cds_nrow ## [1] "537555" ## ## $genomeInfo$ ' Db created by ' ## [1] "GenomicFeatures package from Bioconductor" ## ## $genomeInfo$ ' Creation time ' ## [1] "2014-07-10 14:55:55 -0400 (Thu, 10 Jul 2014)"## ## $genomeInfo$“创建时的基因组特征版本”##[1]“1.17.9”## ## $genomeInfo$“创建时的RSQLite版本”##[1]“0.11.4”## ## $genomeInfo$DBSCHEMAVERSION ##[1]“1.0”
# Record the R version and the attached/loaded packages for this session.
sessionInfo()
## R version 3.5.1 Patched (2018-07-12 r74967) ##平台:x86_64-pc-linux-gnu(64位)##运行在Ubuntu 16.04.5 LTS下## ##矩阵产品:默认## BLAS: /home/biocbuild/bbs-3.8-bioc/R/lib/libRblas。/home/biocbuild/bbs-3.8-bioc/R/lib/libRlapack。所以## ## locale: ## [1] LC_CTYPE=en_US。UTF-8 LC_NUMERIC= c# # [3] LC_TIME=en_US。UTF-8 LC_COLLATE= c# # [5] LC_MONETARY=en_US。utf - 8 LC_MESSAGES = en_US。UTF-8 ## [7] LC_PAPER=en_US。UTF-8 LC_NAME= c# # [9] LC_ADDRESS=C lc_phone = c# # [11] LC_MEASUREMENT=en_US。UTF-8 LC_IDENTIFICATION=C ## ##附加的基本包:## [1]stats4并行统计图形grDevices utils数据集##[8]方法基础## ##其他附加包:[1] bdrcpp_0.2.2 airway_1.2.0 ## [5] SummarizedExperiment_1.12.0 DelayedArray_0.8.0 ## [5] BiocParallel_1.16.0 matrixstats_0.0.54.0 ## [7] GenomicRanges_1.34.0 GenomeInfoDb_1.18.0 ## [9] IRanges_2.16.0 S4Vectors_0.20.0 ## [13] BiocGenerics_0.28.0 ## ##通过命名空间加载(并且没有附加):## [1] Rcpp_0.12.19 pillar_1.3.0 compiler_3.5.1 ## [4] bindr_0.1.1 XVector_0.22.0 bitops_1.0-6 ## [7] tools_3.5.1 zlibbioc_1.28.0 lattice_0.20-35 ## [10] evaluate_0.12 tibble_1.4.2 pkgconfig_2.0.2 ## [13] rlang_0.3.0.1 Matrix_1.2-14 GenomeInfoDbData_1.2.0 ## [16] dplyr_0.7.7 string_1 .3.1 xml2_1.2.0 ## [19] knitr_1.20 hms_0.4.2 grid_3.5.1 ## [22] tidyselect_0.2.5 glue_1.3.0 r6_2 .1.1 ## [28] purrr_0.2.5 magrittr_1.5 assertthat_0.2.0 ## [31]stringi_1.2.4 RCurl_1.95-4.11 crayon_1.3.4