## Let's try to do in R what I was doing in Excel for TE's.
##
## Steps:
##   1. Read the coordinates table and drop its first four rows.
##   2. Remove self-hits: rows whose two intervals and two tags are identical.
##   3. For mirrored rows (row A's first interval/tag equals row B's second
##      interval/tag and vice versa) keep only the row that is seen first.
##   4. Write the reduced table to "halfsize_coordinates_file.csv".
##
## Column legend (names as produced by read.csv for this file):
##   start1 = X.home.harry.Documents.Bioinformatics.cgigas_alpha_v032.fa.1
##   end1   = X       start2 = X.2      end2 = X.3
##   tag1   = X.10    tag2   = X.11

inputFile <- "C:\\Users\\Harry\\Documents\\coordinates4R.csv"
outputFile <- "halfsize_coordinates_file.csv"

start1Col <- "X.home.harry.Documents.Bioinformatics.cgigas_alpha_v032.fa.1"
requiredCols <- c(start1Col, "X", "X.2", "X.3", "X.10", "X.11")

# Convert a factor, character or numeric column to plain numbers.
# Going through as.character() makes this independent of whether read.csv
# produced factors (R < 4.0 default) or character vectors (R >= 4.0 default).
# Values that are not numbers become NA (with the usual coercion warning).
toNumber <- function(x) {
  if (is.numeric(x)) {
    return(x)
  }
  as.numeric(as.character(x))
}

# stringsAsFactors is set explicitly so the script behaves the same on every
# R version; the columns of TEdata itself are never converted, so the CSV
# written at the end keeps the original text of every field.
TEdata <- read.csv(inputFile, stringsAsFactors = FALSE)
missingCols <- setdiff(requiredCols, names(TEdata))
if (length(missingCols) > 0) {
  stop(
    "Input file lacks expected column(s): ",
    paste(missingCols, collapse = ", "),
    call. = FALSE
  )
}

# Drop the first four rows. Negative indexing (unlike 5:nrow(TEdata)) is
# still correct when the table has fewer than five rows.
TEdata <- TEdata[-seq_len(4), , drop = FALSE]
print(nrow(TEdata))
virginData <- TEdata

start1 <- toNumber(TEdata[[start1Col]])
end1 <- toNumber(TEdata[["X"]])
start2 <- toNumber(TEdata[["X.2"]])
end2 <- toNumber(TEdata[["X.3"]])
tag1 <- as.character(TEdata[["X.10"]])
tag2 <- as.character(TEdata[["X.11"]])

# Keep a row when at least one of end, tag or start differs between its two
# sides. which() drops rows whose test is NA, exactly as subset() would.
notSelfHit <- end1 != end2 | tag1 != tag2 | start1 != start2
keptAfterSelf <- which(notSelfHit)
TEdata <- TEdata[keptAfterSelf, , drop = FALSE]
start1 <- start1[keptAfterSelf]
end1 <- end1[keptAfterSelf]
start2 <- start2[keptAfterSelf]
end2 <- end2[keptAfterSelf]
tag1 <- tag1[keptAfterSelf]
tag2 <- tag2[keptAfterSelf]
print(nrow(TEdata))

# can we make TEdata smaller
# newDat=rep(-1,16)
selfsRemoved <- TEdata
id <- seq_len(nrow(TEdata))
TEdata <- cbind(TEdata, id)

# The easiest way of fixing the problems I was having is to make a new data
# set from the old one.
#
# Each row gets a key describing it as written and a key describing it with
# its two sides swapped. Row B mirrors row A when B's key equals A's swapped
# key. Keys are built from the numeric coordinates, so "100000" in the file
# matches the number 100000 (a direct text-vs-number comparison would turn
# the number into "1e+05" and miss it).
keySep <- "\001"
forwardKey <- paste(start1, end1, tag1, start2, end2, tag2, sep = keySep)
reverseKey <- paste(start2, end2, tag2, start1, end1, tag1, sep = keySep)

# A row with any missing field can neither have nor be a mirror.
incomplete <- is.na(start1) | is.na(end1) | is.na(start2) | is.na(end2) |
  is.na(tag1) | is.na(tag2)
forwardKey[incomplete] <- NA_character_
reverseKey[incomplete] <- NA_character_

# rowsByKey[[k]] holds the ids of all rows sharing one forward key;
# mirrorGroup[i] is the index into rowsByKey of row i's mirrors (NA = none).
rowsByKey <- split(id, forwardKey)
mirrorGroup <- match(reverseKey, names(rowsByKey))

keepRow <- logical(length(id))
isBad <- logical(length(id))
for (i in id) {
  group <- mirrorGroup[i]
  if (is.na(group)) {
    # No mirrored row exists: always keep.
    keepRow[i] <- TRUE
  } else if (!isBad[i]) {
    # First member of a mirrored set to be seen: keep it and mark every one
    # of its mirrors so they are dropped when the loop reaches them.
    keepRow[i] <- TRUE
    isBad[rowsByKey[[group]]] <- TRUE
  }
}

goodData <- id[keepRow]
badData <- id[isBad]

TEdata <- TEdata[keepRow, , drop = FALSE]
# Row count after removing mirrored duplicates.
print(nrow(TEdata))
write.csv(TEdata, file = outputFile)

# # At this point all of the data points that are exactly the same have been
# # removed
# # now we want to remove by some criterion (subject sequence matched >10X)
# # cleanData=cbind(TEdata$X.home.harry.Documents.Bioinformatics.cgigas_alpha_v032.fa.1, TEdata$X,TEdata$X.2,TEdata$X.3,TEdata$X.5, TEdata$X.6, TEdata$X.8, TEdata$X.10, TEdata$X.11, TEdata$id)
# # s1=TEdata$X.10
# data=rep(-1,16)
# First we will look at a particular gene, then we need to just look at one
# location, so we should need three nested loops
# criticalN=10
# for(tag in s1){
#   currentGene=subset(TEdata,X.10==tag)
#   startPositions=levels(currentGene$X.home.harry.Documents.Bioinformatics.cgigas_alpha_v032.fa.1)
#   if(length(currentGene$X.home.harry.Documents.Bioinformatics.cgigas_alpha_v032.fa.1)>=criticalN){
#     for(start in startPositions){
#       currentStart=subset(currentGene,X.home.harry.Documents.Bioinformatics.cgigas_alpha_v032.fa.1==start)
#       endPositions=levels(currentStart$X)
#       if(length(currentStart$X)>=criticalN){
#         for(end in endPositions){
#           currentLocation=subset(currentStart, X==end)
#           repeats=nrow(currentLocation)
#           if(repeats>=criticalN){
#             data=rbind(data,currentLocation)
#           }
#         }
#       }
#     }
#   }
# }
# x=2
# x