#!/usr/bin/env perl if (@ARGV < 1) { print STDERR "\n\tUsage: getMappingStats.pl [options] [sample directory2] ...\n"; print STDERR "\tWill print stats to stdout\n"; print STDERR "\n\tProgram looks for:\n"; print STDERR "\t\t*.lengths file containing trimming stats\n"; print STDERR "\t\t*.bowtie2.log containing bowtie2 mapping stats\n"; print STDERR "\t\t*.final.out containing STAR mapping stats\n"; print STDERR "\n\tOptions:\n"; print STDERR "\t\t-min <#> (minimum length to consider a sequence an adapter-dimer, def. 15)\n"; print STDERR "\t\t-genome (limit analysis to alignment for this genome)\n"; print STDERR "\n"; exit; } $minlen = 15; my $GENOME = ""; $tmpFile = rand() . ".tmp"; my @files = (); for (my $i=0;$i<@ARGV;$i++) { if ($ARGV[$i] eq '-min') { $minlen = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-genome') { $GENOME = $ARGV[++$i]; } else { push(@files, $ARGV[$i]); } } print "Experiment File\tGenome\tTotal reads\t\% Adapter Dimers"; print "\t\% Aligned\t\% Unique Match\t\% Multimappers\t\% unmapped"; print "\tAligner\n"; foreach(@files) { my $file = $_; print "$file"; #print STDERR "FILE: $file"; my $trimFile = ''; my $bowtie2File = ''; my $starFile = ''; my $genome = 'NA'; my $aligner = 'NA'; my $m = $file; my $DIR = "."; my $file2 = $file; if ($m =~ /^(.+)\/([^\/]+)$/) { $DIR = $1; $file2 = $2; } #print STDERR "||$file||\n"; #print STDERR "||$DIR||\t||$file2||\n"; #`sleep 100`; #exit; # $file2 =~ s/\.gz$//; $file2 =~ s/fastq.*$/fastq/; $file2 =~ s/fq.*$/fq/; $file2 =~ s/\.bam$//; $file2 =~ s/\.sam$//; `ls $DIR/$file2* > $tmpFile`; my $error = "ls $m*\n"; open IN, $tmpFile; while () { $error .= $_; chomp; s/\r//g; if ($_ =~ /\.lengths$/) { $trimFile = $_; } elsif ($_ =~ /\.(.*?)\.bowtie2\.log/) { my $cgenome = $1; $cgenome =~ s/^.*\.//; if ($GENOME ne '') { if ($cgenome ne $GENOME) { next; } } $genome = $cgenome; $bowtie2File = $_; } elsif ($_ =~ /\.(.*?)\.hisat2\.log/) { my $cgenome = $1; $cgenome =~ s/^.*\.//; if ($GENOME ne '') { if ($cgenome ne $GENOME) { next; } } $genome = $cgenome; $bowtie2File = $_; } elsif ($_ =~ /\.(.*?)\.Log\.final\.out/ || $_ =~ /\.(.*?)\.STAR\.log/) { my $cgenome = $1; $cgenome =~ s/^.*\.//; if ($GENOME ne '') { if ($cgenome ne $GENOME) { next; } } $genome = $cgenome; $starFile = $_; } } close IN; `rm $tmpFile`; my $adapterDimers = 'NA'; my $total = 0; if ($trimFile ne '') { ($adapterDimers,$goodReads) = readTrimLengthFile($trimFile); if ($goodReads > 0) { $total = $goodReads; } if ($adapterDimers ne 'NA') { $total += $adapterDimers; $adapterDimers /= $total if ($total > 0); $adapterDimers = sprintf("%.2f",$adapterDimers*100) . '%'; } } my ($mt,$um,$mm,$un) = (0,0,0,0); if ($starFile ne '') { ($mt,$um,$mm,$un) = readSTARFile($starFile); $aligner = 'STAR'; } elsif ($bowtie2File ne '') { ($mt,$um,$mm,$un) = readBowtie2File($bowtie2File); $aligner = 'bowtie2'; } else { print STDERR "!!! No alignment log found for $file: !!!\n"; print STDERR "$error"; print STDERR "\n"; } $total = 1 if ($total < 1); $total = $mt if ($total < $mt); print "\t$genome"; print "\t$total"; print "\t$adapterDimers"; $total = 1 if ($total < 0); $p = sprintf("%.2f",($um+$mm)/$total*100) . '%'; print "\t$p"; $p = sprintf("%.2f",($um)/$total*100) . '%'; print "\t$p"; $p = sprintf("%.2f",$mm/$total*100) . '%'; print "\t$p"; $p = sprintf("%.2f",$un/$total*100) . '%'; print "\t$p"; print "\t$aligner\n"; } sub readBowtie2File { my ($file) = @_; open IN, $file or print STDERR "Can't open: $file\n"; my $totalReads = 0; my $uniqReads = 0; my $multiReads = 0; while () { chomp; s/\r//g; if (/(\d+) reads; of these:/) { $totalReads = $1; } elsif (/(\d+) $.*$ aligned exactly 1 time/) { $uniqReads = $1; } elsif (/(\d+) $.*$ aligned >1 times/) { $multiReads = $1; } } close IN; my $unmappedReads = $totalReads - $uniqReads - $multiReads; return ($totalReads, $uniqReads, $multiReads, $unmappedReads); } sub readSTARFile { my ($file) = @_; open IN, $file; #print STDERR "\t$file\n"; my $totalReads = 0; my $uniqReads = 0; my $multiReads = 0; while () { chomp; s/\r//g; if (/Number of input reads \|\s(\d+)/) { $totalReads = $1; } elsif (/Uniquely mapped reads number \|\s(\d+)/) { $uniqReads = $1; } elsif (/Number of reads mapped to multiple loci \|\s(\d+)/) { $multiReads = $1; } } close IN; my $unmappedReads = $totalReads - $uniqReads - $multiReads; #print STDERR "\t$unmappedReads = $totalReads - $uniqReads - $multiReads\n"; return ($totalReads, $uniqReads, $multiReads, $unmappedReads); } sub readTrimLengthFile { my ($file) = @_; my $adapterDimers = 0; my $goodReads = 0; if (open IN, $file) { while () { chomp; s/\r//g; my @line = split /\t/; next if ($line[0] =~ /Length/); if ($line[0] < $minlen) { $adapterDimers += $line[1]; } else { $goodReads += $line[1]; } } close IN; } else { $adapterDimers = 'NA'; } return ($adapterDimers, $goodReads); }