Assesing BGI Data Quality - 2.27.14

#load reqired packages
library("ggplot2")
library("reshape2")
library("scales")
library("RColorBrewer")

#__Set the working directory__
#setwd("~/Desktop/bgidata")
setwd("~/Dropbox-GoetzLab/Dropbox/Public/Andy/knitr Documents/bgidata_quality_assesment_2.26.14")

#__Read in the data that I extracted from process_radtags log files. "melt" the data frames so that they can be plotted in ggplot__

#list.files()

Anofim_BGI_1.fq.gz.barcode.stats <- read.delim("BGI_Run_CleanedReads/Anofim_BGI_1.fq.gz.barcode.stats")
Anofim_BGI_2.fq.gz.barcode.stats <- read.delim("BGI_Run_CleanedReads/Anofim_BGI_2.fq.gz.barcode.stats")
Anofim_BGI_3.fq.gz.barcode.stats <- read.delim("BGI_Run_CleanedReads/Anofim_BGI_3.fq.gz.barcode.stats")
Anofim_BGI_4.fq.gz.barcode.stats <- read.delim("BGI_Run_CleanedReads/Anofim_BGI_4.fq.gz.barcode.stats")
Anofim_BGI_5.fq.gz.barcode.stats <- read.delim("BGI_Run_CleanedReads/Anofim_BGI_5.fq.gz.barcode.stats")
Anofim_BGI_6.fq.gz.barcode.stats <- read.delim("BGI_Run_CleanedReads/Anofim_BGI_6.fq.gz.barcode.stats")
Anofim_BGI_7.fq.gz.barcode.stats <- read.delim("BGI_Run_CleanedReads/Anofim_BGI_7.fq.gz.barcode.stats")
Anofim_test1_SBFI.barcode.stats <- read.delim("Previous_Runs/Anofim_test1_SBFI.barcode.stats")
Anofim_test1_PSTI.barcode.stats <- read.delim("Previous_Runs/Anofim_test1_PSTI.barcode.stats")
Anofim_test2.barcode.stats <- read.delim("Previous_Runs/Anofim_test2.barcode.stats")
AnofimSBFI_1.2.3.4.barcode.stats <- read.delim("Previous_Runs/AnofimSBFI_1.2.3.4.barcode.stats")
miSeqRun1.barcode.stats <- read.delim("Previous_Runs/miSeqRun1.barcode.stats")
miSeqRun2.barcode.stats <- read.delim("Previous_Runs/miSeqRun2.barcode.stats")

#raw reads
Anofim_BGI_1.Raw <- read.delim("BGI_Run_RawReads/1.barcode.stats")
Anofim_BGI_2.Raw <- read.delim("BGI_Run_RawReads/2.barcode.stats")
Anofim_BGI_3.Raw <- read.delim("BGI_Run_RawReads/3.barcode.stats")
Anofim_BGI_4.Raw <- read.delim("BGI_Run_RawReads/4.barcode.stats")
Anofim_BGI_5.Raw <- read.delim("BGI_Run_RawReads/5.barcode.stats")
Anofim_BGI_6.Raw <- read.delim("BGI_Run_RawReads/6.barcode.stats")
Anofim_BGI_7.Raw <- read.delim("BGI_Run_RawReads/7.barcode.stats")


raw.lane.stats <- read.delim("BGI_Run_RawReads/lane.stats")
lane.stats <- read.delim("BGI_Run_CleanedReads/lane.stats")
prev.lane.stats <- read.delim("Previous_Runs/prev.lane.stats",col.names=names(lane.stats))
lane.stats <- rbind(raw.lane.stats,lane.stats,prev.lane.stats)


#melt the dataframes
Anofim_BGI_1.fq.gz.barcode.stats <- melt(Anofim_BGI_1.fq.gz.barcode.stats,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_2.fq.gz.barcode.stats <- melt(Anofim_BGI_2.fq.gz.barcode.stats,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_3.fq.gz.barcode.stats <- melt(Anofim_BGI_3.fq.gz.barcode.stats,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_4.fq.gz.barcode.stats <- melt(Anofim_BGI_4.fq.gz.barcode.stats,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_5.fq.gz.barcode.stats <- melt(Anofim_BGI_5.fq.gz.barcode.stats,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_6.fq.gz.barcode.stats <- melt(Anofim_BGI_6.fq.gz.barcode.stats,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_7.fq.gz.barcode.stats <- melt(Anofim_BGI_7.fq.gz.barcode.stats,id.vars=1,measure.vars=c(5:3))
Anofim_test1_PSTI.barcode.stats <- melt(Anofim_test1_PSTI.barcode.stats,id.vars=1,measure.vars=c(5:3))
Anofim_test1_SBFI.barcode.stats <- melt(Anofim_test1_SBFI.barcode.stats,id.vars=1,measure.vars=c(5:3))
Anofim_test2.barcode.stats <- melt(Anofim_test2.barcode.stats,id.vars=1,measure.vars=c(5:3))
AnofimSBFI_1.2.3.4.barcode.stats <- melt(AnofimSBFI_1.2.3.4.barcode.stats,id.vars=1,measure.vars=c(5:3))
miSeqRun1.barcode.stats <- melt(miSeqRun1.barcode.stats,id.vars=1,measure.vars=c(5:3))
miSeqRun2.barcode.stats <- melt(miSeqRun2.barcode.stats,id.vars=1,measure.vars=c(5:3))


Anofim_BGI_1.Raw <- melt(Anofim_BGI_1.Raw,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_2.Raw <- melt(Anofim_BGI_2.Raw,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_3.Raw <- melt(Anofim_BGI_3.Raw,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_4.Raw <- melt(Anofim_BGI_4.Raw,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_5.Raw <- melt(Anofim_BGI_5.Raw,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_6.Raw <- melt(Anofim_BGI_6.Raw,id.vars=1,measure.vars=c(5:3))
Anofim_BGI_7.Raw <- melt(Anofim_BGI_7.Raw,id.vars=1,measure.vars=c(5:3))

lane.stats <- melt(lane.stats,id.vars=1,measure.vars=c(2:5))

Lane performance

## Error in eval(expr, envir, enclos): object 'Tile' not found

Barcode stats for BGI run

plot of chunk unnamed-chunk-5

## Warning: Removed 1 rows containing missing values (position_stack).

plot of chunk unnamed-chunk-5

## Warning: Removed 1 rows containing missing values (position_stack).

plot of chunk unnamed-chunk-5

Barcode stats for the hiSeq runs at University of Oregon

plot of chunk unnamed-chunk-6

Barcode stats for miSeq runs

ggplot() + geom_bar(data = miSeqRun1.barcode.stats, aes(x=Barcode,y=value,fill=variable),color="#444546",stat="identity") + 
  theme(axis.text.x=element_text(angle=90)) +
  scale_fill_manual(values=c("#397486","#192730","#D94B57")) +
  scale_y_continuous(labels = comma,limits=c(0,8000000)) +
  labs(title="Lane - miSeqRun1 - KMNJHrun_S1 ",y="Number of Reads")

plot of chunk unnamed-chunk-7

ggplot() + geom_bar(data = miSeqRun2.barcode.stats, aes(x=Barcode,y=value,fill=variable),color="#444546",stat="identity") + 
  theme(axis.text.x=element_text(angle=90)) +
  scale_fill_manual(values=c("#397486","#192730","#D94B57")) +
  scale_y_continuous(labels = comma,limits=c(0,8000000)) +
  labs(title="Lane - miSeqRun2 - sablefishPstI1",y="Number of Reads")

plot of chunk unnamed-chunk-7

Plot of the values for read depth vs percent utilized reads

prev.ustacks.stats <- read.delim("Previous_Runs/prev.ustacks.stats.txt")
cleaned.ustacks.stats <- read.delim("BGI_Run_CleanedReads/ustacks.stats.txt")
raw.ustacks.stats <- read.delim("BGI_Run_RawReads/ustacks.stats.txt")

PE.pt.labs <- prev.ustacks.stats[prev.ustacks.stats$Lane=="PE_Run_PSTI",]

ggplot() +  geom_point(data=prev.ustacks.stats,aes(x=Mean.Coverage.Depth,y=PercentUtilized,fill=Lane),pch=21,color="black",size=3,alpha=0.8) + 
  scale_fill_brewer(palette="Dark2") +
  labs(title="Runs at University of Oregon",x="Mean Coverage Depth",y="% Utilized reads") +
  geom_text(data=PE.pt.labs,aes(x=Mean.Coverage.Depth,y=PercentUtilized,label=Sample),angle=-45,hjust=0, vjust=0.5,size=4,position="dodge")

plot of chunk unnamed-chunk-8

ggplot() + geom_point(data=cleaned.ustacks.stats,aes(x=Mean.Coverage.Depth,y=PercentUtilized,fill=Lane),pch=21,color="black",size=3,alpha=0.8) + 
  scale_fill_brewer(palette="Dark2") +
  labs(title="BGI - 'Cleaned' Reads",x="Mean Coverage Depth",y="% Utilized reads")

plot of chunk unnamed-chunk-8

ggplot() +  geom_point(data=raw.ustacks.stats,aes(x=Mean.Coverage.Depth,y=PercentUtilized,fill=Lane),pch=21,color="black",size=3,alpha=0.8) + 
  scale_fill_brewer(palette="Dark2") +
  labs(title="BGI - Raw Reads",x="Mean Coverage Depth",y="% Utilized reads")

plot of chunk unnamed-chunk-8 looks like lane #2 from BGI didn't really perform very well, but the others seem to look good. This may resolve itself if it had something to do with how BGI processed the run.

Combining the MiSeq Runs with Anofim_Test2 library #1

Comparing with the depth of the samples sequenced for the paired end library because they were sequenced at sufficient depth.

Anofim_test2.barcode.stats <- read.delim("Previous_Runs/Anofim_test2.barcode.stats")
miSeqRun1.barcode.stats <- read.delim("Previous_Runs/miSeqRun1.barcode.stats")
miSeqRun2.barcode.stats <- read.delim("Previous_Runs/miSeqRun2.barcode.stats")

miSeqCombined <- merge(miSeqRun1.barcode.stats,miSeqRun2.barcode.stats, by="Barcode")

Total <- apply(miSeqCombined[,c(2,6)],1,sum)
No.RadTag <- apply(miSeqCombined[,c(3,7)],1,sum)
Low.Quality <- apply(miSeqCombined[,c(4,8)],1,sum)
Retained <- apply(miSeqCombined[,c(5,9)],1,sum)
Barcode <- as.character(miSeqCombined$Barcode)

miSeqCombined <- data.frame(cbind(Barcode,Total,No.RadTag,Low.Quality,Retained))
miSeqCombined[,2] <- as.numeric(as.character(miSeqCombined[,2]))
miSeqCombined[,3] <- as.numeric(as.character(miSeqCombined[,3]))
miSeqCombined[,4] <- as.numeric(as.character(miSeqCombined[,4]))
miSeqCombined[,5] <- as.numeric(as.character(miSeqCombined[,5]))

miSeqCombined <- merge(miSeqCombined,Anofim_test2.barcode.stats, by="Barcode")


Total <- apply(miSeqCombined[,c(2,6)],1,sum)
No.RadTag <- apply(miSeqCombined[,c(3,7)],1,sum)
Low.Quality <- apply(miSeqCombined[,c(4,8)],1,sum)
Retained <- apply(miSeqCombined[,c(5,9)],1,sum)
Barcode <- as.character(miSeqCombined$Barcode)

miSeqCombined <- data.frame(cbind(Barcode,Total,No.RadTag,Low.Quality,Retained))

miSeqCombined.barcode.stats <- melt(miSeqCombined,id.vars=1,measure.vars=c(5:3))

## Warning: attributes are not identical across measure variables; they will
## be dropped

miSeqCombined.barcode.stats$value <- as.numeric(miSeqCombined.barcode.stats$value)

PE.barcode.stats <- read.delim("Previous_Runs/PE.barcode.stats")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : incomplete final line found by readTableHeader on
## 'Previous_Runs/PE.barcode.stats'

PE.barcode.stats <- melt(PE.barcode.stats,id.vars=1,measure.vars=c(5:3))

plot of chunk unnamed-chunk-11 It appears that we might have to sequence at much greater depth to actually get useable data from the pstI samples

Updated summary stats from ustacks

When samples sequenced more than once are combined into a single file, namely the miSeq runs combined with Anofim_Test2 plot of chunk unnamed-chunk-12

comb.ustacks.stats <- read.delim("Previous_Runs/comb.ustacks.txt")

comb.ustacks.stats <- comb.ustacks.stats[comb.ustacks.stats$Lane!="Combined",]
comb.ustacks.stats <- comb.ustacks.stats[comb.ustacks.stats$Lane!="Anofim_Test_1",]
comb.ustacks.stats <- comb.ustacks.stats[comb.ustacks.stats$Lane!="Anofim_Test_2",]
comb.ustacks.stats <- comb.ustacks.stats[comb.ustacks.stats$Lane!="Anofim_Test_2 & MiSeq",]
comb.ustacks.stats <- comb.ustacks.stats[comb.ustacks.stats$Lane!="PE_4.29.12",]

comb.ustacks.stats <- comb.ustacks.stats


ggplot() + geom_point(data=comb.ustacks.stats,aes(x=Total.Reads,y=Mean.Coverage.Depth,fill=Lane),stat="identity",pch=21,color="black",alpha=0.8)

plot of chunk unnamed-chunk-13

mean.reads.lane <- mean(c(prev.lane.stats[c(1,5,6),6],raw.lane.stats[c(1,3,4,5,6,7),6]))

mean.reads.lane / mean(comb.ustacks.stats[comb.ustacks.stats$Mean.Coverage.Depth==25,4])

## [1] 86.22766

mean.reads.lane / mean(comb.ustacks.stats[comb.ustacks.stats$Mean.Coverage.Depth==20,4])

## [1] 115.9562

mean.reads.lane / mean(comb.ustacks.stats[comb.ustacks.stats$Mean.Coverage.Depth==10,4])

## [1] 255.1679