#!/usr/bin/env perl use warnings; use lib "/share/data/umw_biocore/dnext_data/bin/homer//bin"; my $homeDir = "/share/data/umw_biocore/dnext_data/bin/homer/"; use POSIX; use HomerConfig; my $suffix = ".masked"; my $suffix2 = ".fa"; my $config = HomerConfig::loadConfigFile(); sub printCMD { print STDERR "\n\tUsage: scanMotifGenomeWide.pl [options]\n"; print STDERR "\t\tPossible Genomes:\n"; foreach(keys %{$config->{'GENOMES'}}) { print STDERR "\t\t$_\t$config->{'GENOMES'}->{$_}->{'org'}\t$config->{'GENOMES'}->{$_}->{'directory'}\n"; } print STDERR "\t\t\t-- or --\n"; print STDERR "\t\tCustom: provide the path to genome FASTA files (directory or single file)\n"; print STDERR "\n\tOutput will be sent to stdout\n"; print STDERR "\n\tOptions:\n"; print STDERR "\t\t-5p (report positions centered on the 5' start of the motif)\n"; print STDERR "\t\t-bed (format as a BED file, i.e. for UCSC upload)\n"; print STDERR "\t\t\t-int (round motif scores to nearest integer, use if making bigBed file)\n"; print STDERR "\t\t-homer1 (use the original homer)\n"; print STDERR "\t\t-homer2 (use homer2 instead of the original homer, default)\n"; print STDERR "\t\t-keepAll (keep ALL sites, even ones that overlap, default is to keep one)\n"; print STDERR "\t\t-mask (search for motifs in repeat masked sequence)\n"; print STDERR "\t\t-p <#> (Number of CPUs to use)\n"; print STDERR "\n"; exit; } if (@ARGV < 2) { printCMD(); } $p5flag = 0; $maskFlag = 0; $homer2Flag = 1; $bedFlag = 0; $numCPUs = 1; $intFlag = 0; $keepAllFlag = 0; for (my $i=2;$i<@ARGV;$i++) { if ($ARGV[$i] eq '-5p') { $p5flag = 1; print STDERR "Outputing file centered on the 5' start of the motif\n"; } elsif ($ARGV[$i] eq '-bed') { $bedFlag = 1; } elsif ($ARGV[$i] eq '-mask') { $maskFlag = 1; } elsif ($ARGV[$i] eq '-int') { $intFlag = 1; } elsif ($ARGV[$i] eq '-keepAll') { $keepAllFlag = 1; } elsif ($ARGV[$i] eq '-p') { $numCPUs = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-homer1') { $homer2Flag = 0; print STDERR "\tUsing original homer to scan for motifs\n"; } elsif ($ARGV[$i] eq '-homer2') { $homer2Flag = 1; print STDERR "\tUsing homer2 to scan for motifs\n"; } else { printCMD(); } } $mfile = $ARGV[0]; $genome = $ARGV[1]; $genomeDir = ""; $customGenome = 0; if (!exists($config->{'GENOMES'}->{$genome})) { $customGenome = 1; my $asdf = ""; ($genome,$genomeDir,$asdf) = HomerConfig::parseCustomGenome($genome); } else { $genomeDir = $config->{'GENOMES'}->{$genome}->{'directory'}; } $size = 10000; $tmpFile = rand() . ".tmp"; $tmpFile2 = $tmpFile . ".2"; if ($customGenome==1 && -f $genomeDir) { `ls -1 "$genomeDir" > "$tmpFile"`; } else { `ls -1 "$genomeDir"/*fa* > "$tmpFile"`; } open IN, $tmpFile; @files = (); while () { chomp; s/\r//g; push(@files, $_); } close IN; %idCounts = (); $chr = ''; #print STDERR "Seqfiles: @files\n"; foreach(@files) { my $file = $_; #$file =~ s/(\/.*)?$//; #$file .= "/bioinformatics/homer/data/genomes/mm9/chr11.fa"; open IN, $file or die "Couldn't open $file\n"; my $position = 0; my $totalLength = 0; $chr = ''; my $curSeq = ''; my $startPos = 0; my $justPrinted = 0; #open SEQFILE, ">$tmpFile2"; while () { chomp; s/\r//g; if (/^>/) { /^>(.*?)$/; my $nextchr = $1; if ($chr ne '' && $curSeq ne '') { print SEQFILE "$chr-$startPos\t$curSeq\n"; close SEQFILE; processChr(); } $chr = $nextchr; print STDERR "\n\tProcessing $chr\n"; $position = 0; $totalLength = 0; $curSeq = ''; $startPos = 0; open SEQFILE, ">$tmpFile2"; next; } if ($maskFlag==1) { s/[acgt]/N/g; } else { s/a/A/g; s/c/C/g; s/g/G/g; s/t/T/g; } my $len = length($_); $totalLength += $len; $curSeq .= $_; if ($totalLength > $size) { print SEQFILE "$chr-$startPos\t$curSeq\n"; $totalLength = $len; $startPos = $position; $curSeq = $_; $justPrinted = 1; } else { $justPrinted = 0; } $position += $len; } if ($justPrinted == 0) { print SEQFILE "$chr-$startPos\t$curSeq\n"; } close IN; close SEQFILE; processChr(); } `rm -f "$tmpFile" "$tmpFile2"`; exit; sub processChr { if ($homer2Flag) { `homer2 find -s "$tmpFile2" -m "$mfile" -offset 0 -p $numCPUs > "$tmpFile"`; } else { `homer -s "$tmpFile2" -a FIND -m "$mfile" > "$tmpFile"`; } open INN, "$tmpFile"; my %pos = (); my @sites = (); while () { chomp; my @line = split /\t/; $line[0]=~ /^(.*?)\-(\d+)$/; my $chr = $1; my $gpos = $2; my $pos = $line[1]; my $seq = $line[2]; my $d = 0; my $name = ""; my $score = ""; if ($homer2Flag) { $name = $line[3]; $d = $line[4]; $score = $line[5]; } else { $d = $line[4]; $name = $line[5]; $score = $line[6]; } my $start = $gpos+$pos+1; my $end = $start + length($seq)-1; if ($homer2Flag && ($d eq '-' || $d eq '1') ) { $start -= length($seq)-1; $end -= length($seq)-1; } my $mid = floor(($start+$end)/2); my $pd = $name . "-" . $chr . "-" . $start . "-" . $d; next if (exists($pos{$pd})); $pos{$pd}= 1; if (!exists($idCounts{$name})) { $idCounts{$name} = 1; } if ($p5flag==1) { my $center = $start; if ($d eq '-' || $d eq '1') { $center = $end; } $start = $center -100; $end = $center +100; } if ($start > $end) { next; } my $ss = {s=>$start,e=>$end,d=>$d,seq=>$seq,m=>$mid,n=>$name,ss=>$score,c=>$chr}; push(@sites, $ss); } close INN; @sites = sort {$a->{'c'} cmp $b->{'c'} || $a->{'s'} <=> $b->{'s'}} @sites; my $Nsites = scalar(@sites); my $removed= 0; for (my $i=0;$i<@sites;$i++) { my $m= $sites[$i]->{'m'}; my $bad = 0; my $mlen = length($sites[$i]->{'seq'}); if ($keepAllFlag==0) { for (my $j=$i-1;$j>=0;$j--) { last if ($m - $sites[$j]->{'m'} > $mlen/2); if ($sites[$i]->{'n'} eq $sites[$j]->{'n'} && $sites[$i]->{'c'} eq $sites[$j]->{'c'} && $sites[$i]->{'ss'} < $sites[$j]->{'ss'}) { $bad = 1; last; } } for (my $j=$i+1;$j<@sites;$j++) { last if ($sites[$j]->{'m'}-$m > $mlen/2); if ($sites[$i]->{'n'} eq $sites[$j]->{'n'} && $sites[$i]->{'c'} eq $sites[$j]->{'c'} && $sites[$i]->{'ss'} <= $sites[$j]->{'ss'}) { $bad = 1; last; } } } if ($bad == 0) { my $chr = $sites[$i]->{'c'}; my $start = $sites[$i]->{'s'}; my $end = $sites[$i]->{'e'}; my $dir = $sites[$i]->{'d'}; my $score = $sites[$i]->{'ss'}; my $name = $sites[$i]->{'n'}; my $seq = $sites[$i]->{'seq'}; if ($start < 0) { #print STDERR "\t!!! Error, start=$start < 0\n"; #print STDERR "\t\t$chr\t$start\t$end\t$name\t$score\t$dir\n"; next; } if ($start >= $end) { #print STDERR "\t!!! Error, start>=end: $start $end\n"; #print STDERR "\t\t$chr\t$start\t$end\t$name\t$score\t$dir\n"; next; } if ($bedFlag) { $name =~ s/\/.*$//g; $name =~ s/\s/_/g; if ($intFlag) { $score = floor($score+0.5); } print "$chr\t$start\t$end\t$name\t$score\t$dir\n"; } else { my $id = $idCounts{$name}; print "$name-$id\t$chr\t$start\t$end\t$dir\t$score\t$seq\n"; $idCounts{$name}++; } } else { $removed++; } } print STDERR "\t$chr - total: $Nsites (removed: $removed)\n"; }