This file gives the answer to the document “Practical statistical analysis of RNA-Seq data” using the R packages DESeq2 (version 1.4.5), mixOmics (version 5.0-1), RColorBrewer(version 1.0-5) and HTSFilter (version 1.6.0).
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
library(RColorBrewer)
library(mixOmics)
## Loading required package: MASS
## Loading required package: lattice
##
## Attaching package: 'mixOmics'
##
## The following object is masked from 'package:GenomicRanges':
##
## map
##
## The following object is masked from 'package:IRanges':
##
## map
library(HTSFilter)
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## No methods found in "IRanges" for requests: mcols
Properly set the directory from which files are imported:
directory <- "RNAseq_data/count_table_files/"
dir(directory)
## [1] "count_table.tsv" "pasilla_design.txt"
Exercise 3.1 Read the files:
rawCountTable <- read.table(paste0(directory,"count_table.tsv"), header=TRUE,
row.names=1)
sampleInfo <- read.table(paste0(directory,"pasilla_design.txt"), header=TRUE,
row.names=1)
Exercise 3.2 Have a look at the count data:
head(rawCountTable)
## untreated1 untreated2 untreated3 untreated4 treated1 treated2
## FBgn0000003 0 0 0 0 0 0
## FBgn0000008 92 161 76 70 140 88
## FBgn0000014 5 1 0 0 4 0
## FBgn0000015 0 2 1 2 1 0
## FBgn0000017 4664 8714 3564 3150 6205 3072
## FBgn0000018 583 761 245 310 722 299
## treated3
## FBgn0000003 1
## FBgn0000008 70
## FBgn0000014 0
## FBgn0000015 0
## FBgn0000017 3334
## FBgn0000018 308
nrow(rawCountTable)
## [1] 14599
14599 genes are included in this file.
Exercise 3.3 Have a look at the sample information and order the count table in the same way that the sample information:
sampleInfo
## type number.of.lanes total.number.of.reads exon.counts
## treated1 single-read 5 35158667 15679615
## treated2 paired-end 2 12242535 (x2) 15620018
## treated3 paired-end 2 12443664 (x2) 12733865
## untreated1 single-read 2 17812866 14924838
## untreated2 single-read 6 34284521 20764558
## untreated3 paired-end 2 10542625 (x2) 10283129
## untreated4 paired-end 2 12214974 (x2) 11653031
rawCountTable <- rawCountTable[,match(rownames(sampleInfo),
colnames(rawCountTable))]
head(rawCountTable)
## treated1 treated2 treated3 untreated1 untreated2 untreated3
## FBgn0000003 0 0 1 0 0 0
## FBgn0000008 140 88 70 92 161 76
## FBgn0000014 4 0 0 5 1 0
## FBgn0000015 1 0 0 0 2 1
## FBgn0000017 6205 3072 3334 4664 8714 3564
## FBgn0000018 722 299 308 583 761 245
## untreated4
## FBgn0000003 0
## FBgn0000008 70
## FBgn0000014 0
## FBgn0000015 2
## FBgn0000017 3150
## FBgn0000018 310
Exercise 3.4 Create the ‘condition’ column
sampleInfo$condition <- substr(rownames(sampleInfo), 1,
nchar(rownames(sampleInfo))-1)
sampleInfo$condition[sampleInfo$condition=="untreated"] <- "control"
sampleInfo$condition <- factor(sampleInfo$condition)
sampleInfo
## type number.of.lanes total.number.of.reads exon.counts
## treated1 single-read 5 35158667 15679615
## treated2 paired-end 2 12242535 (x2) 15620018
## treated3 paired-end 2 12443664 (x2) 12733865
## untreated1 single-read 2 17812866 14924838
## untreated2 single-read 6 34284521 20764558
## untreated3 paired-end 2 10542625 (x2) 10283129
## untreated4 paired-end 2 12214974 (x2) 11653031
## condition
## treated1 treated
## treated2 treated
## treated3 treated
## untreated1 control
## untreated2 control
## untreated3 control
## untreated4 control
Exercise 3.5 Create a ‘DESeqDataSet’ object
ddsFull <- DESeqDataSetFromMatrix(as.matrix(rawCountTable), sampleInfo,
formula(~condition))
ddsFull
## class: DESeqDataSet
## dim: 14599 7
## exptData(0):
## assays(1): counts
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
## FBgn0261575
## rowData metadata column names(0):
## colnames(7): treated1 treated2 ... untreated3 untreated4
## colData names(5): type number.of.lanes total.number.of.reads
## exon.counts condition
Exercise 3.6 List all files in directory
directory <- "RNAseq_data/separate_files/"
sampleFiles <- list.files(directory)
sampleFiles
## [1] "pasilla_design.txt" "treated1fb.txt" "treated2fb.txt"
## [4] "treated3fb.txt" "untreated1fb.txt" "untreated2fb.txt"
## [7] "untreated3fb.txt" "untreated4fb.txt"
Exercise 3.7 Create an object with file informations
keptFiles <- sampleFiles[-1]
sampleName <- sapply(keptFiles, function(afile)
substr(afile, 1, nchar(afile)-6))
condition<- sapply(keptFiles, function(afile)
substr(afile, 1, nchar(afile)-7))
fileInfo <- data.frame(sampleName = sampleName, sampleFiles = keptFiles,
condition = condition)
rownames(fileInfo) <- NULL
fileInfo
## sampleName sampleFiles condition
## 1 treated1 treated1fb.txt treated
## 2 treated2 treated2fb.txt treated
## 3 treated3 treated3fb.txt treated
## 4 untreated1 untreated1fb.txt untreated
## 5 untreated2 untreated2fb.txt untreated
## 6 untreated3 untreated3fb.txt untreated
## 7 untreated4 untreated4fb.txt untreated
Exercise 3.8 Construct a ‘DESeqDataSet’ object
ddsHTSeq <- DESeqDataSetFromHTSeqCount(fileInfo, directory, formula(~condition))
ddsHTSeq
## class: DESeqDataSet
## dim: 70463 7
## exptData(0):
## assays(1): counts
## rownames(70463): FBgn0000003:001 FBgn0000008:001 ...
## FBgn0261575:001 FBgn0261575:002
## rowData metadata column names(0):
## colnames(7): treated1 treated2 ... untreated3 untreated4
## colData names(1): condition
Exercise 3.9 Select the subset of paire-end samples
dds <- subset(ddsFull, select=colData(ddsFull)$type=="paired-end")
dds
## class: DESeqDataSet
## dim: 14599 4
## exptData(0):
## assays(1): counts
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
## FBgn0261575
## rowData metadata column names(0):
## colnames(4): treated2 treated3 untreated3 untreated4
## colData names(5): type number.of.lanes total.number.of.reads
## exon.counts condition
colData(dds)
## DataFrame with 4 rows and 5 columns
## type number.of.lanes total.number.of.reads exon.counts
## <factor> <integer> <factor> <integer>
## treated2 paired-end 2 12242535 (x2) 15620018
## treated3 paired-end 2 12443664 (x2) 12733865
## untreated3 paired-end 2 10542625 (x2) 10283129
## untreated4 paired-end 2 12214974 (x2) 11653031
## condition
## <factor>
## treated2 treated
## treated3 treated
## untreated3 control
## untreated4 control
Exercise 3.10 Extract pseudo-counts (ie \(\log_2(K+1)\))
pseudoCounts <- log2(counts(dds)+1)
head(pseudoCounts)
## treated2 treated3 untreated3 untreated4
## FBgn0000003 0.000000 1.000000 0.000000 0.000000
## FBgn0000008 6.475733 6.149747 6.266787 6.149747
## FBgn0000014 0.000000 0.000000 0.000000 0.000000
## FBgn0000015 0.000000 0.000000 1.000000 1.584963
## FBgn0000017 11.585432 11.703471 11.799686 11.621594
## FBgn0000018 8.228819 8.271463 7.942515 8.280771
Exercise 3.11 Histogram for pseudo-counts (sample treated2
)
hist(pseudoCounts[,"treated2"])
Exercise 3.12 Boxplot for pseudo-counts
boxplot(pseudoCounts, col="gray")
Exercise 3.13 MA-plots between control or treated samples
par(mfrow=c(1,2))
## treated2 vs treated3
# A values
avalues <- (pseudoCounts[,1] + pseudoCounts[,2])/2
# M values
mvalues <- (pseudoCounts[,1] - pseudoCounts[,2])
plot(avalues, mvalues, xlab="A", ylab="M", pch=19, main="treated")
abline(h=0, col="red")
## untreated3 vs untreated4
# A values
avalues <- (pseudoCounts[,3] + pseudoCounts[,4])/2
# M values
mvalues <- (pseudoCounts[,3] - pseudoCounts[,4])
plot(avalues, mvalues, xlab="A", ylab="M", pch=19, main="control")
abline(h=0, col="red")
Exercise 3.14 PCA for pseudo-counts
vsd <- varianceStabilizingTransformation(dds)
vsd
## class: SummarizedExperiment
## dim: 14599 4
## exptData(0):
## assays(1): ''
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
## FBgn0261575
## rowData metadata column names(6): baseMean baseVar ... dispFit
## dispFit.1
## colnames(4): treated2 treated3 untreated3 untreated4
## colData names(6): type number.of.lanes ... condition sizeFactor
plotPCA(vsd)
Exercise 3.15 heatmap for pseudo-counts (using mixOmics
package)
sampleDists <- as.matrix(dist(t(pseudoCounts)))
sampleDists
## treated2 treated3 untreated3 untreated4
## treated2 0.00000 60.42701 75.96033 70.65302
## treated3 60.42701 0.00000 80.03024 71.71797
## untreated3 75.96033 80.03024 0.00000 65.97926
## untreated4 70.65302 71.71797 65.97926 0.00000
cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(16)
cim(sampleDists, col=cimColor, symkey=FALSE)
Exercise 3.16 Run the DESeq2 analysis
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds
## class: DESeqDataSet
## dim: 14599 4
## exptData(0):
## assays(3): counts mu cooks
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
## FBgn0261575
## rowData metadata column names(28): baseMean baseVar ... deviance
## maxCooks
## colnames(4): treated2 treated3 untreated3 untreated4
## colData names(6): type number.of.lanes ... condition sizeFactor
Exercise 3.17 Extract the results
res <- results(dds)
res
## log2 fold change (MAP): condition treated vs control
## Wald test p-value: condition treated vs control
## DataFrame with 14599 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## FBgn0000003 0.2242980 0.01999754 0.03841923 0.5205085 0.60270918
## FBgn0000008 76.2956431 -0.04663587 0.21614576 -0.2157612 0.82917388
## FBgn0000014 0.0000000 NA NA NA NA
## FBgn0000015 0.7810873 -0.07124212 0.07009119 -1.0164204 0.30942925
## FBgn0000017 3298.6821506 -0.24384591 0.12034584 -2.0262098 0.04274329
## ... ... ... ... ... ...
## FBgn0261571 0.000000 NA NA NA NA
## FBgn0261572 5.272899 -0.19531772 0.1595563 -1.2241302 0.2209031
## FBgn0261573 1728.419843 0.04936833 0.1099932 0.4488310 0.6535536
## FBgn0261574 3129.036923 -0.03913704 0.1289667 -0.3034662 0.7615346
## FBgn0261575 2.659185 0.05090077 0.1244492 0.4090084 0.6825335
## padj
## <numeric>
## FBgn0000003 NA
## FBgn0000008 0.9566162
## FBgn0000014 NA
## FBgn0000015 NA
## FBgn0000017 0.2353693
## ... ...
## FBgn0261571 NA
## FBgn0261572 NA
## FBgn0261573 0.9027184
## FBgn0261574 0.9398855
## FBgn0261575 NA
Exercise 3.18 Obtain information on the meaning of the columns
mcols(res)
## DataFrame with 6 rows and 2 columns
## type description
## <character> <character>
## 1 intermediate mean of normalized counts for all samples
## 2 results log2 fold change (MAP): condition treated vs control
## 3 results standard error: condition treated vs control
## 4 results Wald statistic: condition treated vs control
## 5 results Wald test p-value: condition treated vs control
## 6 results BH adjusted p-values
Exercise 3.19 Count the number of significant genes at level 1%
sum(res$padj < 0.01, na.rm=TRUE)
## [1] 509
Exercise 3.20 Extract significant genes and sort them by the strongest down regulation
sigDownReg <- res[!is.na(res$padj), ]
sigDownReg <- sigDownReg[sigDownReg$padj < 0.01, ]
sigDownReg <- sigDownReg[order(sigDownReg$log2FoldChange),]
sigDownReg
## log2 fold change (MAP): condition treated vs control
## Wald test p-value: condition treated vs control
## DataFrame with 509 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## FBgn0039155 463.4369 -3.668239 0.16451365 -22.29748 3.909282e-110
## FBgn0039827 188.5927 -3.009043 0.19731233 -15.25015 1.642499e-52
## FBgn0003360 2544.2512 -2.795779 0.10893830 -25.66387 2.960163e-145
## FBgn0034736 162.0375 -2.565874 0.19789037 -12.96614 1.903771e-38
## FBgn0026562 36449.4713 -2.339838 0.09897541 -23.64060 1.474641e-123
## ... ... ... ... ... ...
## FBgn0003501 232.3835 1.718844 0.1751360 9.814335 9.768012e-23
## FBgn0051092 128.8251 1.777316 0.1987698 8.941578 3.836522e-19
## FBgn0000071 302.3697 2.163293 0.1634766 13.233045 5.654891e-40
## FBgn0035189 197.4689 2.238500 0.1934138 11.573630 5.605945e-31
## FBgn0025111 1340.2282 2.722749 0.1171002 23.251458 1.375043e-119
## padj
## <numeric>
## FBgn0039155 7.362156e-107
## FBgn0039827 1.546618e-49
## FBgn0003360 2.229891e-141
## FBgn0034736 9.560737e-36
## FBgn0026562 5.554234e-120
## ... ...
## FBgn0003501 2.452748e-20
## FBgn0051092 7.225130e-17
## FBgn0000071 3.276792e-37
## FBgn0035189 2.346088e-28
## FBgn0025111 3.452733e-116
Exercise 3.21 Extract significant genes and sort them by the strongest up regulation
sigUpReg <- res[!is.na(res$padj), ]
sigUpReg <- sigUpReg[sigUpReg$padj < 0.01, ]
sigUpReg <- sigUpReg[order(sigUpReg$log2FoldChange, decreasing=TRUE),]
sigUpReg
## log2 fold change (MAP): condition treated vs control
## Wald test p-value: condition treated vs control
## DataFrame with 509 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## FBgn0025111 1340.2282 2.722749 0.1171002 23.251458 1.375043e-119
## FBgn0035189 197.4689 2.238500 0.1934138 11.573630 5.605945e-31
## FBgn0000071 302.3697 2.163293 0.1634766 13.233045 5.654891e-40
## FBgn0051092 128.8251 1.777316 0.1987698 8.941578 3.836522e-19
## FBgn0003501 232.3835 1.718844 0.1751360 9.814335 9.768012e-23
## ... ... ... ... ... ...
## FBgn0026562 36449.4713 -2.339838 0.09897541 -23.64060 1.474641e-123
## FBgn0034736 162.0375 -2.565874 0.19789037 -12.96614 1.903771e-38
## FBgn0003360 2544.2512 -2.795779 0.10893830 -25.66387 2.960163e-145
## FBgn0039827 188.5927 -3.009043 0.19731233 -15.25015 1.642499e-52
## FBgn0039155 463.4369 -3.668239 0.16451365 -22.29748 3.909282e-110
## padj
## <numeric>
## FBgn0025111 3.452733e-116
## FBgn0035189 2.346088e-28
## FBgn0000071 3.276792e-37
## FBgn0051092 7.225130e-17
## FBgn0003501 2.452748e-20
## ... ...
## FBgn0026562 5.554234e-120
## FBgn0034736 9.560737e-36
## FBgn0003360 2.229891e-141
## FBgn0039827 1.546618e-49
## FBgn0039155 7.362156e-107
Exercise 3.22 Create permanent storage of results
write.csv(sigDownReg, file="sigDownReg-deseq.csv")
write.csv(sigUpReg, file="sigUpReg-deseq.csv")
Exercise 3.23 Plot a histogram of unadjusted p-values after filtering
hist(res$pvalue, breaks=50)
Exercise 3.24 Create a MA plot showing differentially expressed genes
plotMA(res, alpha=0.01)
Exercise 3.25 Create a Volcano plot
volcanoData <- cbind(res$log2FoldChange, -log10(res$padj))
volcanoData <- na.omit(volcanoData)
colnames(volcanoData) <- c("logFC", "negLogPval")
head(volcanoData)
## logFC negLogPval
## [1,] -0.04663587 0.019262290
## [2,] -0.24384591 0.628250140
## [3,] -0.04161106 0.024127703
## [4,] -0.00870916 0.006284238
## [5,] 0.48382518 5.875357944
## [6,] 0.72477623 13.492690129
plot(volcanoData, pch=19, cex=0.5)
Exercise 3.26 Transform the normalized counts for variance stabilization
vsnd <- varianceStabilizingTransformation(dds, blind=FALSE)
vsnd
## class: SummarizedExperiment
## dim: 14599 4
## exptData(0):
## assays(1): ''
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
## FBgn0261575
## rowData metadata column names(28): baseMean baseVar ... deviance
## maxCooks
## colnames(4): treated2 treated3 untreated3 untreated4
## colData names(6): type number.of.lanes ... condition sizeFactor
Exercise 3.27 Extract the transformed data
head(assay(vsnd), 10)
## treated2 treated3 untreated3 untreated4
## FBgn0000003 7.467258 7.569963 7.467258 7.467258
## FBgn0000008 8.454053 8.314503 8.459439 8.355754
## FBgn0000014 7.467258 7.467258 7.467258 7.467258
## FBgn0000015 7.467258 7.467258 7.583294 7.619755
## FBgn0000017 11.709561 11.703802 12.112722 11.757305
## FBgn0000018 9.213338 9.169608 9.181377 9.250923
## FBgn0000022 7.467258 7.467258 7.467258 7.467258
## FBgn0000024 7.750574 7.696720 7.668130 7.653985
## FBgn0000028 7.574489 7.569963 7.467258 7.467258
## FBgn0000032 9.970716 9.968425 10.005398 9.947107
selY <- assay(vsnd)[!is.na(res$pval), ]
selY <- selY[res$pval[!is.na(res$pval)] < 0.01,]
cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)[255:1]
cim(t(selY), col=cimColor, symkey=FALSE)