This Perl script was written by Shichen Wang under the direction of Eduard Akhunov at Kansas State University ()
Input data instructions
Input for the script is a fastq file, which has additional information about Block, Phase and Contig.
Format
@DJB775P1:264:D0M7EACXX:3:1107:7206:67178-1 BL:0 PH:0 CT:td-k45_contig_51016
ATCATGCTAGCTGTAGCTGATCGTAGCTAGCTAGCTAGCTGATCGTAGCTAGCTAGCTAG
+
BHBHBH@H@HBHBHBHBHBHBHBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
Reads that belong to the same contig and have the same number of block (BL) and phase (PH) will be extracted and assembled with Mira.
The final output is "mira_contigs.output.fasta", which contains all the assemblies.
Program below
#!/usr/bin/perl -w
use strict;
#use Parallel::ForkManager;
# multithread version not implemented yet.
my $sUsage = qq(
****************************************************************************************
Modify the \$MIRA_BIN and \$CAP3 to fit your own system before running this script.
Usage:
perl $0 <num_threads> <fastq>
Example:
perl $0 1 phased_elf.fastq
*****************************************************************************************
);
die $sUsage unless @ARGV == 2;
my($num_threads, $fastq) = @ARGV;
# Path for executable mira and cap3
my $MIRA_BIN = "/home/DNA/Tools/mira_3.2.1_prod_linux-gnu_x86_64_static/bin/mira ";
my $CAP3 = "/home/DNA/Tools/CAP3/cap3 ";
open (IN, "$fastq") or die "can't open file $fastq\n";
my $out_file = "mira_contigs.output.fasta";
open (my $out_fh, ">$out_file") or die $!;
my $count_mira = 0;
my @data;
my $line_counter = 0;
my $pre_id;
while(<IN>)
{
next if /^\s+$/;
chomp;
my $line = $_;
$line_counter++;
#print STDERR 'scalar @data: ', scalar @data, "\n";
if($line_counter % 4 == 1)
{
# @DJB775P1:264:D0M7EACXX:3:1107:7206:67178-1 BL:0 PH:0 CT:td-k45_contig_51016
my $id = $1 if $line=~/(BL.*\d+)$/;
$pre_id = $id unless defined $pre_id;
#print STDERR $pre_id, "\t", $id, "\n";
if($id eq $pre_id)
{
push @data, $line;
}
else
{
run_mira($pre_id, @data);
$pre_id = $id;
@data=();
push @data, $line;
}
}
else
{
push @data, $line;
}
if(eof(IN))
{
run_mira($pre_id, @data);
}
}
close IN;
sub run_mira
{
$count_mira++;
print STDERR "Runnig mira times: ", $count_mira, "\n";
my ($id, @data) = @_;
my $tmp_fasta_file = "data.tmp.fasta";
my $tmp_qual_file = "data.tmp.fasta.qual";
generate_files($tmp_fasta_file, $tmp_qual_file, @data);
my $cmd = $MIRA_BIN . "--project=mira_tmp --job=denovo,solexa,est --fasta=" . $tmp_fasta_file;
print STDERR $cmd, "\n";
eval{ system($cmd)}; return if $@;
my $mira_contig = "mira_tmp_assembly/mira_tmp_d_results/mira_tmp_out.padded.fasta";
return unless -e $mira_contig;
die if system($CAP3. $mira_contig);
my @cap_outputs = map{$mira_contig.$_}(".cap.contigs", ".cap.singlets");
processing_fasta_file($id, @cap_outputs);
}
sub generate_files
{
my ($fasta, $qual, @data) = @_;
open(F, ">$fasta") or die;
open(Q, ">$qual") or die;
foreach my $ind(0..$#data)
{
if($ind % 4 == 0)
{
$data[$ind]=~s/\@/>/;
print F $data[$ind], "\n";
print Q $data[$ind], "\n";
}
if($ind % 4 ==1)
{
print F $data[$ind], "\n";
}
if($ind % 4 == 3)
{
my @s = split //, $data[$ind];
my @q=map{ord($_)-33}@s;
print Q join(" ", @q), "\n";
}
}
close F;
close Q;
}
sub processing_fasta_file
{
my $pre_id = shift;
my @files = @_;
my %hash;
my $count = 0;
foreach my $f (@files)
{
open (I, $f) or die;
my $id;
while(<I>)
{
chomp;
if(/>/){$count++; $id="C".$count; next}
$hash{$id} .=$_;
}
close I;
}
foreach (keys %hash)
{
print {$out_fh} ">", $_, " ", $pre_id, "\n", $hash{$_}, "\n";
}
}