创建<code>tidybulk</code> tibble。

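<!-- badges: start -->
[![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing)
<!-- badges: end -->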

在本文中，我们通过一些示例展示 tidybulk/tidyverse 与 base R 在编码方式上的差异。我们观察到，赋值语句的数量减少了 10 倍以上，代码行数减少了 2 倍以上。

创建 `tidybulk` tibble。

```r
tt = counts_mini %>% tidybulk(sample, transcript, count)
```

聚合重复的`转录本`

TidyTranscriptomics

```r
tt.aggr = tt %>% aggregate_duplicates()
```

Base R

```r
temp = data.frame(
  symbol = dge_list$genes$symbol,
  dge_list$counts
)
dge_list.nr <- by(temp, temp$symbol,
  function(df)
    if(length(df[1,1])>0)
      matrixStats:::colSums(as.matrix(df[,-1]))
)
dge_list.nr <- do.call("rbind", dge_list.nr)
colnames(dge_list.nr) <- colnames(dge_list)
```

缩放`计数`

TidyTranscriptomics

```r
tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance()
```

Base R

```r
library(edgeR)

dgList <- DGEList(count_m=x,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
[...]
dgList <- calcNormFactors(dgList, method="TMM")
norm_counts.table <- cpm(dgList)
```

筛选`高变异转录本`

我们可能希望识别并筛选出高变异的转录本。

TidyTranscriptomics

```r
tt.norm.variable = tt.norm %>% keep_variable()
```

Base R

```r
library(edgeR)

x = norm_counts.table

s <- rowMeans((x-rowMeans(x))^2)
o <- order(s,decreasing=TRUE)
x <- x[o[1L:top],,drop=FALSE]

norm_counts.table = norm_counts.table[rownames(x)]

norm_counts.table$cell_type = tidybulk::counts[
  match(
    tidybulk::counts$sample,
    rownames(norm_counts.table)
  ),
  "Cell type"
]
```

降低`维度`

TidyTranscriptomics

```r
tt.norm.MDS = tt.norm %>% reduce_dimensions(method="MDS", .dims = 2)
```

Base R

```r
library(limma)

count_m_log = log(count_m + 1)
cmds = limma::plotMDS(ndim = .dims, plot = FALSE)

cmds = cmds %$%
  cmdscale.out %>%
  setNames(sprintf("Dim%s", 1:6))

cmds$cell_type = tidybulk::counts[
  match(tidybulk::counts$sample, rownames(cmds)),
  "Cell type"
]
```

主成分分析

TidyTranscriptomics

```r
tt.norm.PCA = tt.norm %>% reduce_dimensions(method="PCA", .dims = 2)
```

Base R

```r
count_m_log = log(count_m + 1)
pc = count_m_log %>% prcomp(scale = TRUE)
variance = pc$sdev^2
variance = (variance / sum(variance))[1:6]
pc$cell_type = counts[
  match(counts$sample, rownames(pc)),
  "Cell type"
]
```

tSNE

TidyTranscriptomics

```r
tt.norm.tSNE =
  breast_tcga_mini %>%
  tidybulk(sample, ens, count_scaled) %>%
  identify_abundant() %>%
  reduce_dimensions(
    method = "tSNE",
    perplexity=10,
    pca_scale =TRUE
  )
```

Base R

```r
count_m_log = log(count_m + 1)

tsne = Rtsne::Rtsne(
  t(count_m_log),
  perplexity=10,
  pca_scale =TRUE
)$Y
tsne$cell_type = tidybulk::counts[
  match(tidybulk::counts$sample, rownames(tsne)),
  "Cell type"
]
```

旋转`维度`

TidyTranscriptomics

```r
tt.norm.MDS.rotated =
  tt.norm.MDS %>%
  rotate_dimensions(`Dim1`, `Dim2`, rotation_degrees = 45, action="get")
```

Base R

```r
rotation = function(m, d) {
  r = d * pi / 180
  ((bind_rows(
    c(`1` = cos(r), `2` = -sin(r)),
    c(`1` = sin(r), `2` = cos(r))
  ) %>% as_matrix) %*% m)
}
mds_r = pca %>% rotation(rotation_degrees)
mds_r$cell_type = counts[
  match(counts$sample, rownames(mds_r)),
  "Cell type"
]
```

检验`差异丰度`

TidyTranscriptomics

```r
tt.de =
  tt %>%
  test_differential_abundance( ~ condition, action="get")
tt.de
```

Base R

```r
library(edgeR)

dgList <- DGEList(counts=counts_m,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
design <- model.matrix(~group)
dgList <- estimateDisp(dgList,design)
fit <- glmQLFit(dgList,design)
qlf <- glmQLFTest(fit,coef=2)
topTags(qlf, n=Inf)
```

调整`计数`

TidyTranscriptomics

```r
tt.norm.adj =
  tt.norm %>% adjust_abundance( ~ condition + time)
```

Base R

```r
library(sva)

count_m_log = log(count_m + 1)

design =
  model.matrix(
    object = ~ condition + time,
    data = annotation
  )

count_m_log.sva =
  ComBat(
    batch = design[,2],
    mod = design,
    ...
  )

count_m_log.sva =
  ceiling(exp(count_m_log.sva) -1)
count_m_log.sva$cell_type = counts[
  match(counts$sample, rownames(count_m_log.sva)),
  "Cell type"
]
```

反卷积`细胞类型组成`

TidyTranscriptomics

```r
tt.cibersort =
  tt %>%
  deconvolve_cellularity(action="get", cores=1)
```

Base R

```r
source("CIBERSORT.R")
count_m %>% write.table("mixture_file.txt")
results <- CIBERSORT(
  "sig_matrix_file.txt",
  "mixture_file.txt",
  perm=100, QN=TRUE
)
results$cell_type = tidybulk::counts[
  match(tidybulk::counts$sample, rownames(results)),
  "Cell type"
]
```

对`样本`进行聚类

k-means

TidyTranscriptomics

```r
tt.norm.cluster = tt.norm.MDS %>%
  cluster_elements(method="kmeans", centers = 2, action="get")
```

Base R

```r
count_m_log = log(count_m + 1)

k = kmeans(count_m_log, iter.max = 1000, ...)
cluster = k$cluster

cluster$cell_type = tidybulk::counts[
  match(tidybulk::counts$sample, rownames(cluster)),
  c("Cell type", "Dim1", "Dim2")
]
```

SNN

TidyTranscriptomics

```r
tt.norm.SNN =
  tt.norm.tSNE %>%
  cluster_elements(method = "SNN")
```

Base R

```r
library(Seurat)

snn = CreateSeuratObject(count_m)
snn = ScaleData(
  snn, display.progress = TRUE,
  num.cores=4, do.par = TRUE
)
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = RunPCA(snn, npcs = 30)
snn = FindNeighbors(snn)
snn = FindClusters(snn, method = "igraph", ...)
snn = snn[["seurat_clusters"]]

snn$cell_type = tidybulk::counts[
  match(tidybulk::counts$sample, rownames(snn)),
  c("Cell type", "Dim1", "Dim2")
]
```

去除`冗余`转录本

TidyTranscriptomics

```r
tt.norm.non_redundant =
  tt.norm.MDS %>%
  remove_redundancy(method = "correlation")
```

Base R

```r
library(widyr)

.data.correlated =
  pairwise_cor(
    counts,
    sample,
    transcript,
    rc,
    sort = TRUE,
    diag = FALSE,
    upper = FALSE
  ) %>%
  filter(correlation > correlation_threshold) %>%
  distinct(item1) %>%
  rename(!!.element := item1)

# Return non redundant data frame
counts %>% anti_join(.data.correlated) %>%
  spread(sample, rc, - transcript) %>%
  left_join(annotation)
```

绘制`热图`

TidyTranscriptomics

```r
tt.norm.MDS %>%

  # filter lowly abundant
  keep_abundant() %>%

  # extract 500 most variable genes
  keep_variable( .abundance = count_scaled, top = 500) %>%

  # create heatmap
  as_tibble() %>%
  heatmap(sample, transcript, count_scaled, transform = log1p) %>%
  add_tile(`Cell type`)
```

Base R

```r
# Example taken from airway dataset from BioC2020 workshop.
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$`Cell type`)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colors <- c("#440154FF", "#21908CFF", "#fefada")
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colors, trace="none", ColSideColors=col.group, scale="row")
```

绘制`密度图`

TidyTranscriptomics

```r
# Example taken from airway dataset from BioC2020 workshop.
airway %>%
  tidybulk() %>%
  identify_abundant() %>%
  scale_abundance() %>%
  pivot_longer(cols = starts_with("counts"), names_to = "source", values_to = "abundance") %>%
  filter(!lowly_abundant) %>%
  ggplot(aes(x=abundance + 1, color=sample)) +
  geom_density() +
  facet_wrap(~source) +
  scale_x_log10()
```

Base R

```r
# Example taken from airway dataset from BioC2020 workshop.
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$dex)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colors <- c("#440154FF", "#21908CFF", "#fefada")
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colors, trace="none", ColSideColors=col.group, scale="row")
```

附录

```r
sessionInfo()
```

```
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.12-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.12-bioc/R/lib/libRlapack.so
##
## locale:
##  [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
##  [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
##  [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
##  [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
##  [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidybulk_1.2.1 ggrepel_0.9.1 ggplot2_3.3.3 magrittr_2.0.1 tibble_3.1.0
## [6] tidyr_1.1.3 dplyr_1.0.5 knitr_1.31
##
## loaded via a namespace (and not attached):
##   [1] backports_1.2.1 tidytext_0.3.0
##   [3] plyr_1.8.6 igraph_1.2.6
##   [5] lazyeval_0.2.2 splines_4.0.4
##   [7] BiocParallel_1.24.1 listenv_0.8.0
##   [9] SnowballC_0.7.0 scattermore_0.7
##  [11] GenomeInfoDb_1.26.6 sva_3.38.0
##  [13] digest_0.6.27 htmltools_0.5.1.1
##  [15] fansi_0.4.2 memoise_2.0.0
##  [17] tensor_1.5 cluster_2.1.1
##  [19] ROCR_1.0-11 limma_3.46.0
##  [21] globals_0.14.0 readr_1.4.0
##  [23] annotate_1.68.0 matrixStats_0.58.0
##  [25] spatstat.sparse_2.0-0 colorspace_2.0-0
##  [27] blob_1.2.1 xfun_0.22
##  [29] crayon_1.4.1 RCurl_1.98-1.3
##  [31] jsonlite_1.7.2 genefilter_1.72.1
##  [33] spatstat.data_2.1-0 survival_3.2-10
##  [35] zoo_1.8-9 glue_1.4.2
##  [37] polyclip_1.10-0 gtable_0.3.0
##  [39] zlibbioc_1.36.0 XVector_0.30.0
##  [41] leiden_0.3.7 DelayedArray_0.16.3
##  [43] future.apply_1.7.0 BiocGenerics_0.36.0
##  [45] abind_1.4-5 scales_1.1.1
##  [47] DBI_1.1.1 edgeR_3.32.1
##  [49] miniUI_0.1.1.1 Rcpp_1.0.6
##  [51] widyr_0.1.3 viridisLite_0.3.0
##  [53] xtable_1.8-4 reticulate_1.18
##  [55] spatstat.core_2.0-0 bit_4.0.4
##  [57] proxy_0.4-25 preprocessCore_1.52.1
##  [59] stats4_4.0.4 htmlwidgets_1.5.3
##  [61] httr_1.4.2 RColorBrewer_1.1-2
##  [63] ellipsis_0.3.1 Seurat_4.0.1
##  [65] ica_1.0-2 pkgconfig_2.0.3
##  [67] XML_3.99-0.6 uwot_0.1.10
##  [69] deldir_0.2-10 locfit_1.5-9.4
##  [71] utf8_1.2.1 tidyselect_1.1.0
##  [73] rlang_0.4.10 reshape2_1.4.4
##  [77] munsell_0.5.0 tools_4.0.4
##  [79] cachem_1.0.4 cli_2.4.0
##  [81] generics_0.1.0 RSQLite_2.2.5
##  [83] broom_0.7.6 ggridges_0.5.3
##  [85] evaluate_0.14 stringr_1.4.0
##  [87] fastmap_1.1.0 goftest_1.2-2
##  [89] bit64_4.0.5 fitdistrplus_1.1-3
##  [91] purrr_0.3.4 RANN_2.6.1
##  [93] pbapply_1.4-3 future_1.21.0
##  [95] nlme_3.1-152 mime_0.10
##  [97] tokenizers_0.2.1 debugme_1.1.0
##  [99] compiler_4.0.4 rstudioapi_0.13
## [101] plotly_4.9.3 png_0.1-7
## [105] stringi_1.5.3 ps_1.6.0
## [107] lattice_0.20-41 Matrix_1.3-2
## [109] vctrs_0.3.7 pillar_1.5.1
## [111] lifecycle_1.0.0 spatstat.geom_2.0-1
## [113] lmtest_0.9-38 RcppAnnoy_0.0.18
## [115] data.table_1.14.0 cowplot_1.1.1
## [117] bitops_1.0-6 irlba_2.3.3
## [119] httpuv_1.5.5 patchwork_1.1.1
## [121] GenomicRanges_1.42.0 R6_2.5.0
## [123] promises_1.2.0.1 KernSmooth_2.23-18
## [125] gridExtra_2.3 janeaustenr_0.1.5
## [127] IRanges_2.24.1 parallelly_1.24.0
## [129] codetools_0.2-18 MASS_7.3-53.1
## [131] assertthat_0.2.1 SummarizedExperiment_1.20.0
## [133] withr_2.4.1 SeuratObject_4.0.0
## [135] sctransform_0.3.2 S4Vectors_0.28.1
## [137] GenomeInfoDbData_1.2.4 mgcv_1.8-34
## [139] parallel_4.0.4 hms_1.0.0
## [141] rpart_4.1-15 grid_4.0.4
## [143] class_7.3-18 MatrixGenerics_1.2.1
## [145] Rtsne_0.15 Biobase_2.50.0
## [147] shiny_1.6.0
```

创建 tidybulk tibble。

聚合重复的转录本

缩放计数

筛选高变异转录本

降低维度

主成分分析

tSNE

旋转维度

检验差异丰度

调整计数

反卷积细胞类型组成

对样本进行聚类