R Code Examples - RNA-Seq
For each example, I start with a data frame of count data that looks like the following.
|
NPL32
|
NPL34
|
NPL44
|
NPL46
|
NPL56
|
NPL61
|
PL20
|
PL31
|
PL43
|
PL55
|
PL59
|
PL62
|
comp0_c0_seq1
|
1073920
|
1228123
|
5254066
|
1371825
|
967934
|
677390
|
232806
|
1123876
|
2613486
|
630659
|
1231571
|
229342
|
comp1_c0_seq1
|
1918518
|
1726845
|
4206394
|
1314385
|
615831
|
939729
|
343003
|
1570057
|
1481815
|
1227947
|
1273835
|
420144
|
comp2_c0_seq1
|
171037
|
243930
|
712047
|
646413
|
469036
|
166717
|
129706
|
629589
|
1289674
|
564959
|
332828
|
116855
|
comp5_c0_seq1
|
147703
|
41802
|
781225
|
261413
|
299237
|
101581
|
73395
|
175481
|
433382
|
79492
|
304055
|
97796
|
comp5_c1_seq1
|
121391
|
43070
|
656968
|
231221
|
305144
|
81390
|
50360
|
152298
|
430000
|
77287
|
252271
|
62325
|
comp6_c0_seq1
|
342865
|
294385
|
638089
|
228506
|
102413
|
290270
|
121084
|
301103
|
341995
|
234032
|
224490
|
213252
|
DESeq
# Getting the data and setting up the conditional factor vector
deseq.sums.countData_l <- sums.count.data[,c(1:6, 13:18)]
deseq.sums.conds_l <- factor(c(rep("NPL", 6), rep("PL", 6)))
# Creating the DESeq data object
deseq.sums.cds_l <- newCountDataSet(countData = deseq.sums.countData_l, conditions = deseq.sums.conds_l)
# Normalizing
deseq.sums.cds_l <- estimateSizeFactors(deseq.sums.cds_l)
# Doing the variance calculation
deseq.sums.cds_l <- estimateDispersions(deseq.sums.cds_l)
# Running the test and getting the results
deseq.sums.res_l <- nbinomTest(deseq.sums.cds_l, "NPL", "PL")
DESeq2
# Extracting the columns I want
deseq2.sums.countData_l <- sums.count.data[,c(1:6, 13:18)]
# Setting up the dataframe that describes the data
deseq2.sums.colData_l <- data.frame(condition=factor(c(rep("NPL", 6), rep("PL", 6))),
type=factor(rep("single-read", 12)))
rownames(deseq2.sums.colData_l) <- colnames(deseq2.sums.countData_l)
# Creating the DESeq2 data object
deseq2.sums.dds_l <- DESeqDataSetFromMatrix(countData = deseq2.sums.countData_l,
colData = deseq2.sums.colData_l,
design = ~ condition)
# Running the analysis, getting the results and sorting them
deseq2.sums.dds_l <- DESeq(deseq2.sums.dds_l)
deseq2.sums.res_l <- results(deseq2.sums.dds_l)
deseq2.sums.res_l <- deseq2.sums.res_l[order(rownames(deseq2.sums.res_l)), ]
edgeR
# Creating the edgeR data object
edger.dgel_l <- DGEList(counts=deseq2.sums.countData_l, group=factor(c(rep(1,6), rep(2,6))))
# Normalizing the data
edger.dgel_l <- calcNormFactors(edger.dgel_l)
# Some other stuff I'm not sure about
edger.dgel_l <- estimateCommonDisp(edger.dgel_l)
edger.dgel_l <- estimateTagwiseDisp(edger.dgel_l)
# Running the tests
edger.et_l <- exactTest(edger.dgel_l)
# Getting the results and sorting them
edger.res_l <- topTags(edger.et_l, n=nrow, sort.by="none")
edger.res_l$table <- edger.res_l$table[order(rownames(edger.res_l$table)), ]
baySeq
# Setting up a 'mini' cluster using the snow library, using 4 cores on the current computer
cl <- makeCluster(c(rep("localhost", 4)), type="SOCK")
# Setting up the data object and description
bayseq.cd_l <- new("countData", data=as.matrix(sums.count.data[,c(1:6, 13:18)]),
replicates=c(rep("NPL", 6), rep("PL", 6)),
groups=list(NDE=rep(1,12), DE=c(rep(1,6), rep(2,6))))
# Normalizing
libsizes(bayseq.cd_l) <- getLibsizes(bayseq.cd_l)
# Adding in annotation
bayseq.cd_l@annotation <- data.frame(name=rownames(sums.count.data))
# Running the Tests, getting results and sorting them
bayseq.cd_l <- getPriors.NB(bayseq.cd_l, samplesize=10000, estimation="QL", cl=cl)
bayseq.cd_l <- getLikelihoods.NB(bayseq.cd_l, pET='BIC', cl=cl)
bayseq.res_l <- topCounts(bayseq.cd_l, group="DE", number=nrow(sums.count.data))
bayseq.res_l <- bayseq.res_l[order(rownames(bayseq.res_l)), ]
Plots
tmp <- deseq2.sums.res_l
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="MA-Plot of NPL vs PL with DESeq2 Significant Results (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.sums.res_l[!is.na(deseq2.sums.res_l$padj) & deseq2.sums.res_l$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")