Commit a3175750 authored by JCabral's avatar JCabral

adding nml seqtk sample tool

parent f7ad074e
#!/usr/bin/env perl
use strict;
use Bio::SeqIO;
my ($fastaref, $type, $coverage, $outfile, $outfile_rev, $length, $subsample_size, @fastqs, $rv);
$fastaref = $ARGV[0];
shift(@ARGV);
$type = $ARGV[0];
shift(@ARGV);
if ($type eq "single"){
$fastqs[0] = $ARGV[0];
shift(@ARGV);
}else{
$fastqs[0] = $ARGV[0];
shift(@ARGV);
$fastqs[1] = $ARGV[0];
shift(@ARGV);
}
$coverage = $ARGV[0];
shift(@ARGV);
$outfile = $ARGV[0];
shift(@ARGV);
if ($type ne "single"){
$outfile_rev = $ARGV[0];
shift(@ARGV);
}
if (!($coverage)){
$coverage = 50;
}
my $seq_in = Bio::SeqIO->new(
-format => 'fasta',
-file => $fastaref,
);
while ( my $seq = $seq_in->next_seq()) {
$length += $seq->length();
}
my $all_fastq = join (' ' , map { "\"$_\"" } @fastqs);
#one liner from : http://onetipperday.blogspot.ca/2012/05/simple-way-to-get-reads-length.html with some modification by Philip Mabon
my $total=`cat $all_fastq | perl -ne '\$s=<>;<>;<>;chomp(\$s);print length(\$s)."\n";' | sort | uniq -c | perl -e 'while(my \$line=<>){chomp \$line; \$line =~s/^\\s+//;(\$l,\$n)=split/\\s+/,\$line; \$t+= \$l*\$n;} print "\$t\n"'`;
print "Total number of Basepairs: $total\n";
print "Length of Reference: $length\n";
my $rawcoverage = $total/$length;
printf "Raw Coverage: %.3f\n",$rawcoverage;
my $out = $outfile;
my $i = 0;
if($rawcoverage > $coverage){
#need to downsample
#calculate $subsample_size
$subsample_size = $coverage/$rawcoverage;
printf("subsample: %.3f",$subsample_size);
#printf()
my $out = $outfile;
my $i = 0;
foreach my $fastq (@fastqs){
if($i > 0){
$out = $outfile_rev;
}
#seed always set to 42 for reproducibility
my $seqCommand = "seqtk sample -s42 $fastq $subsample_size > $out";
$rv = system($seqCommand);
$i ++;
}
} else {
#no sampling needed, just copy the fastq's to the output
print "Subsampling not required\n";
foreach my $fastq (@fastqs){
if($i > 0){
$out = $outfile_rev;
}
my $copyCommand = "scp $fastq $out";
$rv = system($copyCommand);
$i ++;
}
}
exit $rv >> 8 if $rv >>8;
#return $rv >> 8;
<tool id="seqtk_nml_sample" name="seqTK Sample NML" version="0.0.1">
<description>Runs seqTK sample if raw coverage is above user defined threshold </description>
<requirements>
<requirement type="package" version="1.0-r75-dirty">seqtk</requirement>
<requirement type="package" version="5.18.1">perl</requirement>
<requirement type="package" version="1.6.922">bioperl</requirement>
</requirements>
<command interpreter="perl">
seqTK-nml.pl $fastar
#if $single_or_paired.type == "single"
single
$input_se
#elif $single_or_paired.type == "paired"
paired
$single_or_paired.forward_pe $single_or_paired.reverse_pe
#else
collection
$single_or_paired.fastq_collection.forward $single_or_paired.fastq_collection.reverse
#end if
$coverage $output
#if $single_or_paired.type != "single"
$output_rev
#end if
</command>
<stdio>
<exit_code range="1:" level="fatal" description="Unknown error has occured"/>
</stdio>
<inputs>
<conditional name="single_or_paired">
<param name="type" type="select" label="Read type">
<option value="single">Single-end</option>
<option value="paired">Paired-end</option>
<option value="collection">Collection Paired-end</option>
</param>
<when value="single">
<param name="input_se" type="data" format="fastqsanger" label="Single end read file(s)"/>
</when>
<when value="paired">
<param name="forward_pe" type="data" format="fastqsanger" label="Forward paired-end read file"/>
<param name="reverse_pe" type="data" format="fastqsanger" label="Reverse paired-end read file"/>
</when>
<when value="collection">
<param name="fastq_collection" type="data_collection" label="Paired-end reads collection" optional="false" format="txt" collection_type="paired" />
</when>
</conditional>
<param name="fastar" type="data" label="Fasta Reference File" format="fasta" />
<param name="coverage" type="integer" label="Desired Coverage" value="50" />
</inputs>
<outputs>
<data format="fastqsanger" name="output" label="SubSampled Fastq" />
<data format="fastqsanger" name="output_rev" label="SubSampled Fastq Reverse">
<filter>single_or_paired['type']!="single"</filter>
</data>
</outputs>
<tests>
<test>
</test>
</tests>
<help>
What it does
============
Calculates raw coverage. If the raw coverage is greater than desired coverage, runs seqTK sample to generate downsampled reads.
Usage
=====
**Parameters**
- Fastq reads (single end, paired end, or paired end collection)
- Fasta reference file
**Options**
- Desired coverage (50)
</help>
</tool>
<?xml version="1.0"?>
<tool_dependency>
<package name="perl" version="5.18.1">
<repository name="package_perl_5_18" owner="iuc" />
</package>
<package name="seqtk" version="1.0-r75-dirty">
<repository name="package_seqtk_1_0_r75" owner="iuc" />
</package>
<package name="bioperl" version="1.6.922">
<repository name="package_bioperl_1_6" owner="iuc" />
</package>
</tool_dependency>
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment