#!/usr/bin/env perl use warnings; use lib "/share/data/umw_biocore/dnext_data/bin/homer//bin"; my $homeDir = "/share/data/umw_biocore/dnext_data/bin/homer/"; # Copyright 2009-2014 Christopher Benner # # This file is part of HOMER # # HOMER is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # HOMER is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. if (@ARGV < 2) { printCMD(); } sub printCMD { print STDERR "\n\tScript to automatically remove reads mapping outside of chromosome coordinates\n"; print STDERR "\n\tUsage: removeOutOfBoundsReads.pl [options]\n"; print STDERR "\n\tAltUsage: removeOutOfBoundsReads.pl [options] > newPeakFile.txt\n"; print STDERR "\n\tOtions:\n"; print STDERR "\t\t-chromSizes (specify the chromosome sizes, default: automatic)\n"; print STDERR "\t\t-force (force calculation of chromsome sizes for genome FASTA files)\n"; print STDERR "\n\t\tchrom.size file is tab delimited: chrsize\n"; print STDERR "\n\t\tFor custom genomes, use removeOutOfBoundsReads.pl none -chromSizes \n"; print STDERR "\n"; exit; } my $dir = $ARGV[0]; my $genome = $ARGV[1]; $dir =~ s/\/$//; my $fileFlag = 0; if (-d $dir) { $fileFlag = 0; } elsif (-e $dir) { $fileFlag = 1; print STDERR "\tTreating input as a peak/BED file...\n"; } my $chromSizeFile = ""; my $forceCalc = 0; for (my $i=2;$i<@ARGV;$i++) { if ($ARGV[$i] eq '-chromSizes') { $chromSizeFile = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-force') { $forceCalc = 1; } elsif ($ARGV[$i] eq '-help') { printCMD(); } elsif ($ARGV[$i] eq '--help') { printCMD(); } else { printCMD(); } } my $rand = sprintf("%d",rand()*1e7); my $tmpFile = $rand . ".tmp"; if ($chromSizeFile eq '') { $chromSizeFile = $homeDir . "/data/genomes/$genome/chrom.sizes"; if (-e $chromSizeFile && $forceCalc == 0) { } else { print STDERR "\t!! Could not find chromosome size file ($chromSizeFile)\n"; print STDERR "\t Generating one from genome sequence files...\n"; `homerTools extract stats \"$homeDir/data/genomes/$genome/\" > "$tmpFile"`; open IN, $tmpFile; open OUT, ">$chromSizeFile"; while () { chomp; my @line = split /\t/; next if ($line[0] eq 'ChrName'); next if ($line[0] eq 'genome'); print OUT "$line[0]\t$line[1]\n"; } close IN; close OUT; `rm "$tmpFile"`; } } my %chrSizes = (); open IN, $chromSizeFile or die "!!! Could not open a valid chromosome sizes file !!!\ntried: $chromSizeFile)\n"; while () { chomp; s/\r//g; my @line = split /\t/; $chrSizes{$line[0]}=$line[1]; } close IN; if ($fileFlag) { my $totalLines = 0; my $totalBad = 0; open IN, $dir; while () { chomp; if (/^#/) { print "$_\n"; next; } my @line = split /\t/; my $cool = 0; if (exists($chrSizes{$line[0]}) && $line[1]=~/\d+/ && $line[2] =~ /\d+/) { #bed file $totalLines++; next if ($line[1] > $chrSizes{$line[0]}); if ($line[2] > $chrSizes{$line[0]}) { $totalBad++; $line[2] = $chrSizes{$line[0]}; } $cool = 1; } elsif (exists($chrSizes{$line[1]}) && $line[2]=~/\d+/ && $line[3] =~ /\d+/) { #peak file $totalLines++; next if ($line[2] > $chrSizes{$line[1]}); if ($line[3] > $chrSizes{$line[1]}) { $totalBad++; $line[3] = $chrSizes{$line[1]}; } $cool = 1; } if ($cool) { print "$line[0]"; for (my $i=1;$i<@line;$i++){ print "\t$line[$i]"; } print "\n"; } } print STDERR "\tTotal of $totalBad of $totalLines were outside of range\n\n"; exit; } `ls -1 "$dir/"*tags.tsv > "$tmpFile"`; open IN, $tmpFile; my %chrFiles = (); while () { chomp; my $f = $_; s/^.*\///; s/\.tags\.tsv//; my $c = $_; $chrFiles{$c} = $f; } close IN; `rm "$tmpFile"`; my @chr = sort {$a cmp $b} keys %chrFiles; foreach(@chr) { my $chr = $_; my $file = $chrFiles{$chr}; print STDERR "\tProcessing $chr:"; if (!exists($chrSizes{$chr})) { print STDERR " !! No limit for $chr !! - removing...\n"; `rm "$file"`; next; } my $limit = $chrSizes{$chr}; my $totalPos = 0; my $badPos = 0; open OUT, ">$tmpFile"; open IN, $file or die "!!! Could not open file ($file) for $chr !!!\n"; while () { my $og = $_; chomp; my @line = split /\t/; $totalPos++; if ($line[2] > $limit || $line[2] < 1) { $badPos++; } else { print OUT $og; } } close OUT; close IN; `mv "$tmpFile" "$file"`; my $DD = $totalPos; $DD = 1 if ($DD < 1); my $percentBad = $badPos/$DD*100; my $v = sprintf("%.3lf",$percentBad); print STDERR " $badPos of $totalPos positions ($v%) out of bounds\n"; }