#!/usr/bin/perl
# FASTX-toolkit - FASTA/FASTQ preprocessing tools.
# Copyright (C) 2009-2013 A. Gordon (assafgordon@gmail.com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
use strict;
use warnings;
use IO::Handle;
use Data::Dumper;
use Getopt::Long;
use Carp;
##
## This program splits a FASTQ/FASTA file into several smaller files,
## Based on barcode matching.
##
## run with "--help" for usage information
##
## Assaf Gordon , 11sep2008
# Forward declarations
sub load_barcode_file ($);
sub parse_command_line ;
sub match_sequences ;
sub mismatch_count($$) ;
sub print_results;
sub open_and_detect_input_format;
sub read_record;
sub write_record($);
sub usage();
# Global flags and arguments,
# Set by command line argumens
my $barcode_file ;
my $barcodes_at_eol = 0 ;
my $barcodes_at_bol = 0 ;
my $exact_match = 0 ;
my $allow_partial_overlap = 0;
my $allowed_mismatches = 1;
my $newfile_suffix = '';
my $newfile_prefix ;
my $quiet = 0 ;
my $debug = 0 ;
my $fastq_format = 1;
# Global variables
# Populated by 'create_output_files'
my %filenames;
my %files;
my %counts = ( 'unmatched' => 0 );
my $barcodes_length;
my @barcodes;
my $input_file_io;
# The Four lines per record in FASTQ format.
# (when using FASTA format, only the first two are used)
my $seq_name;
my $seq_bases;
my $seq_name2;
my $seq_qualities;
#
# Start of Program
#
parse_command_line ;
load_barcode_file ( $barcode_file ) ;
open_and_detect_input_format;
match_sequences ;
print_results unless $quiet;
#
# End of program
#
sub parse_command_line {
my $help;
usage() if (scalar @ARGV==0);
my $result = GetOptions ( "bcfile=s" => \$barcode_file,
"eol" => \$barcodes_at_eol,
"bol" => \$barcodes_at_bol,
"exact" => \$exact_match,
"prefix=s" => \$newfile_prefix,
"suffix=s" => \$newfile_suffix,
"quiet" => \$quiet,
"partial=i" => \$allow_partial_overlap,
"debug" => \$debug,
"mismatches=i" => \$allowed_mismatches,
"help" => \$help
) ;
usage() if ($help);
die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
if ($barcodes_at_bol == $barcodes_at_eol) {
die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
die "Error: must specify either --eol or --bol\n" ;
}
die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
$allowed_mismatches = 0 if $exact_match;
die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
"max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
exit unless $result;
}
#
# Read the barcode file
#
sub load_barcode_file ($) {
my $filename = shift or croak "Missing barcode file name";
open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
while () {
next if m/^#/;
chomp;
my ($ident, $barcode) = split ;
$barcode = uc($barcode);
# Sanity checks on the barcodes
die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
unless $barcode =~ m/^[AGCT]+$/;
die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
unless $ident =~ m/^\w+$/;
die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
"mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n"
if length($barcode)<=$allowed_mismatches;
$barcodes_length = length($barcode) unless defined $barcodes_length;
die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
unless $barcodes_length == length($barcode);
push @barcodes, [$ident, $barcode];
if ($allow_partial_overlap>0) {
foreach my $i (1 .. $allow_partial_overlap) {
substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
push @barcodes, [$ident, $barcode];
}
}
}
close BCFILE;
if ($debug) {
print STDERR "barcode\tsequence\n";
foreach my $barcoderef (@barcodes) {
my ($ident, $seq) = @{$barcoderef};
print STDERR $ident,"\t", $seq ,"\n";
}
}
}
# Create one output file for each barcode.
# (Also create a file for the dummy 'unmatched' barcode)
sub create_output_files {
my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
$barcodes{'unmatched'} = 1 ;
foreach my $ident (keys %barcodes) {
my $new_filename = $newfile_prefix . $ident . $newfile_suffix;
$filenames{$ident} = $new_filename;
open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
$files{$ident} = $file ;
}
}
sub match_sequences {
my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
$barcodes{'unmatched'} = 1 ;
#reset counters
foreach my $ident ( keys %barcodes ) {
$counts{$ident} = 0;
}
create_output_files;
# Read file FASTQ file
# split accotding to barcodes
while ( read_record ) {
chomp $seq_bases;
print STDERR "sequence $seq_bases: \n" if $debug;
my $best_barcode_mismatches_count = $barcodes_length;
my $best_barcode_ident = undef;
#Try all barcodes, find the one with the lowest mismatch count
foreach my $barcoderef (@barcodes) {
my ($ident, $barcode) = @{$barcoderef};
# Get DNA fragment (in the length of the barcodes)
# The barcode will be tested only against this fragment
# (no point in testing the barcode against the whole sequence)
my $sequence_fragment;
if ($barcodes_at_bol) {
$sequence_fragment = substr $seq_bases, 0, $barcodes_length;
} else {
$sequence_fragment = substr $seq_bases, - $barcodes_length;
}
my $mm = mismatch_count($sequence_fragment, $barcode) ;
# if this is a partial match, add the non-overlap as a mismatch
# (partial barcodes are shorter than the length of the original barcodes)
$mm += ($barcodes_length - length($barcode));
if ( $mm < $best_barcode_mismatches_count ) {
$best_barcode_mismatches_count = $mm ;
$best_barcode_ident = $ident ;
}
}
$best_barcode_ident = 'unmatched'
if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
$counts{$best_barcode_ident}++;
#get the file associated with the matched barcode.
#(note: there's also a file associated with 'unmatched' barcode)
my $file = $files{$best_barcode_ident};
write_record($file);
}
}
#Quickly calculate hamming distance between two strings
#
#NOTE: Strings must be same length.
# returns number of different characters.
#see http://www.perlmonks.org/?node_id=500235
sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) }
sub print_results
{
print "Barcode\tCount\tLocation\n";
my $total = 0 ;
foreach my $ident (sort keys %counts) {
print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n";
$total += $counts{$ident};
}
print "total\t",$total,"\n";
}
sub read_record
{
$seq_name = $input_file_io->getline();
return undef unless defined $seq_name; # End of file?
$seq_bases = $input_file_io->getline();
die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
# If using FASTQ format, read two more lines
if ($fastq_format) {
$seq_name2 = $input_file_io->getline();
die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
$seq_qualities = $input_file_io->getline();
die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
}
return 1;
}
sub write_record($)
{
my $file = shift;
croak "Bad file handle" unless defined $file;
print $file $seq_name;
print $file $seq_bases,"\n";
#if using FASTQ format, write two more lines
if ($fastq_format) {
print $file $seq_name2;
print $file $seq_qualities;
}
}
sub open_and_detect_input_format
{
$input_file_io = new IO::Handle;
die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
# Get the first characeter, and push it back
my $first_char = $input_file_io->getc();
$input_file_io->ungetc(ord $first_char);
if ($first_char eq '>') {
# FASTA format
$fastq_format = 0 ;
print STDERR "Detected FASTA format\n" if $debug;
} elsif ($first_char eq '@') {
# FASTQ format
$fastq_format = 1;
print STDERR "Detected FASTQ format\n" if $debug;
} else {
die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
}
}
sub usage()
{
print<