#!/usr/bin/env perl use warnings; use lib "/share/data/umw_biocore/dnext_data/bin/homer//bin"; my $homeDir = "/share/data/umw_biocore/dnext_data/bin/homer/"; # Copyright 2009-2014 Christopher Benner # # This file is part of HOMER # # HOMER is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # HOMER is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. use POSIX; use HomerConfig; my $config = HomerConfig::loadConfigFile(); if (@ARGV < 1) { cmdLineOptions(); } sub cmdLineOptions { print STDERR "\n\tProgram preparse the genome for use with motif finding.\n"; print STDERR "\n\tUsage: preparseGenome.pl -size <#> [additional options]\n"; print STDERR "\tOutput files will be placed in $homeDir/data/genomes/[genome]/preparsed/\n"; print STDERR "\t\t*.###.seq *.###.pos *.###.cgfreq *.###.cgbins *.###.gcbins\n"; print STDERR "\tFirst argument must be \n"; print STDERR "\n\tPossible Genomes:\n"; foreach(keys %{$config->{'GENOMES'}}) { print STDERR "\t\t$_\t$config->{'GENOMES'}->{$_}->{'org'}\t$config->{'GENOMES'}->{$_}->{'directory'}\n"; } print STDERR "\t\t\t-- or --\n"; print STDERR "\t\tCustom: provide the path to genome FASTA files (directory or single file)\n"; print STDERR "\t\t\tHeads up: will create the directory \"preparsed/\" in same location.\n"; print STDERR "\n\tBasic options:\n"; print STDERR "\t\t-size <#> (size of fragments to use for preparsing the genome)\n"; print STDERR "\t\t-mask (mask repeats - i.e. lower-case bases)\n"; print STDERR "\t\t-ref (reference position file, default: [genome].tss)\n"; print STDERR "\t\t\tIf no reference file is given or found, random regions will be used\n"; print STDERR "\t\t\tTo force the use of random regions, use \"-ref random\"\n"; print STDERR "\t\t-window <#> (size of window around ref positions to prepare, default=50000)\n"; print STDERR "\t\t-max <#> (maximum number of preparesed fragments to create, default=2e6)\n"; print STDERR "\t\t-minInc <#> (minimum size of region near ref pos to include, default=1000)\n"; print STDERR "\t\t-preparsedDir (alternative directory to place the preparsed output files)\n"; print STDERR "\n"; exit; } my $genome = $ARGV[0]; my $customGenome = 0; my $genomeDir = ""; my $genomeParseDir = ""; my $forceRandom = 0; my $maskFlag = 0; if ($genome =~ s/r$//) { $maskFlag = 1; } if (!exists($config->{'GENOMES'}->{$genome})) { ($genome,$genomeDir,$genomeParseDir) = HomerConfig::parseCustomGenome($genome); } else { $genomeDir = $config->{'GENOMES'}->{$genome}->{'directory'} . "/"; $genomeParseDir = $genomeDir . "preparsed/"; } #print STDERR "\tPreparsed File will be put in: $genomeParseDir\n"; #print STDERR "\tgenome name: $genome\n"; my $defFile = $genomeDir . "$genome.tss"; if ($customGenome == 0) { if (-f $defFile) { } else { #print STDERR "!!! You should update the $genome genome using configureHomer.pl !!!\n"; #$defFile = $genomeDir . "$genome.promoters"; } } my $refFile = $defFile; my $regionSize = 200; my $window = 50000; my $minInclude = 1000; my $maxRegions = 2e6; my $noRefFileFlag = 0; for (my $i=1;$i<@ARGV; $i++) { if ($ARGV[$i] eq '-size') { $regionSize = $ARGV[++$i]; print STDERR "\tpreparse size set to $regionSize\n"; } elsif ($ARGV[$i] eq '-ref') { $refFile = $ARGV[++$i]; if ($refFile eq 'random') { $forceRandom = 1; $noRefFileFlag = 1; print STDERR "\tUsing random positions as reference points\n"; } else { print STDERR "\tPeak file $refFile will be used as reference positions\n"; } } elsif ($ARGV[$i] eq '-preparsedDir') { $genomeParseDir = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-mask') { $maskFlag = 1; } elsif ($ARGV[$i] eq '-window') { $window = $ARGV[++$i]; print STDERR "\tWindow size set to $window\n"; } elsif ($ARGV[$i] eq '-max') { $maxRegions = $ARGV[++$i]; print STDERR "\tMax number of preparsed regions set to $maxRegions\n"; } elsif ($ARGV[$i] eq '-minInc') { $minInclude = $ARGV[++$i]; print STDERR "\tA minimum of $minInclude will be used around reference positions\n"; } else { print STDERR "!!! $ARGV[$i] not recognized as a valid option!\n"; cmdLineOptions(); } } $genomeParseDir .= "/"; my $regionSizeHalf = floor($regionSize/2); print STDERR "\n"; if ($refFile eq $defFile) { unless (-e $refFile) { $noRefFileFlag = 1; print STDERR "\tNo reference file found - using random positions...\n"; } else { print STDERR "\tBy default, using $defFile for reference positions\n"; } } my $outputGenome = $genome; if ($maskFlag) { $outputGenome .= "r"; } print STDERR "\tOutput files will be placed in $genomeParseDir". "$outputGenome.$regionSize.* ...\n"; `mkdir -p "$genomeParseDir"`; if (-w "$genomeParseDir") { } else { print STDERR "!!! Warning: Looks like you do not have permission to write the preparsed\n"; print STDERR "!!! genome files to the following directory:\n"; print STDERR "!!! $genomeParseDir\n"; print STDERR "!!! Consider one of the two options:\n"; print STDERR "!!! 1.) Get your system admin to set the directory to be writeable to you and/or\n"; print STDERR "!!! your group.\n"; print STDERR "!!! -or-\n"; print STDERR "!!! 2.) Use the \"-preparsedDir \" option to specify a directory that\n"; print STDERR "!!! you do have permission to write files to.\n"; print STDERR "\n"; exit; } print STDERR "\tBy default, HOMER will set the the preparsed Directory to be group read/write\n"; `chmod g+rw "$genomeParseDir"`; my $tmpID = rand(); my $tmpFile = $tmpID . ".tmp"; my $tmpFile2 = $tmpID . ".2.tmp"; my $totalTSS = 0; my %tss = (); if ($noRefFileFlag) { my $numRefRegions = 10000; `homerTools extract stats "$genomeDir" > "$tmpFile"`; my %chr = (); open IN, $tmpFile; my $lineNum=0; my $gtotal = 0; while () { $lineNum++; next if ($lineNum < 2); chomp; my @line = split /\t/; if ($line[0] eq 'genome') { $gtotal = $line[1]; } else { $chr{$line[0]}=$line[1]; } } close IN; foreach(keys %chr) { my $chr = $_; my $chrSize = $chr{$chr}; my %c = (); $tss{$chr} = \%c; my $numToGet = floor($numRefRegions*$chrSize/$gtotal); for (my $i=0;$i<$numToGet;$i++) { my $pos = floor(rand()*($chrSize)); my $dir = 0; if (rand() > 0.5) { $dir = 1; } my $bestID = "$chr-$pos-$dir"; $tss{$chr}->{$bestID} = {p=>$pos,d=>$dir}; $totalTSS++; } } } else { open IN, $refFile or die "Could not open reference file $refFile\n"; while () { chomp; s/\r//g; my @line = split /\t/; my $id = $line[0]; my $chr = $line[1]; my $dir = $line[4]; if ($dir eq '-' || $dir eq '1') { $dir = 1; } else { $dir = 0; } my $pos = floor(($line[2]+$line[3])/2); my $bestID = $id . "-$chr-$pos-$dir"; if (!exists($tss{$chr})) { my %c = (); $tss{$chr} = \%c; } $tss{$chr}->{$bestID} = {p=>$pos,d=>$dir}; $totalTSS++; } close IN; } my $posFile = $genomeParseDir . "$outputGenome.$regionSize.pos"; open POS, ">$posFile"; my $totalPossible = $totalTSS*($window/$regionSize); my $successRate = $maxRegions/$totalPossible; foreach(keys %tss) { my $chr= $_; my @tss = sort {$tss{$chr}->{$a}->{'p'} <=> $tss{$chr}->{$b}->{'p'}} keys %{$tss{$chr}}; for (my $i=0;$i<@tss;$i++) { my $gid = $tss[$i]; my $tss = $tss{$chr}->{$tss[$i]}->{'p'}; my $dir = $tss{$chr}->{$tss[$i]}->{'d'}; my $start = $tss-$window; if ($start < 1) { $start = 1; } if ($i>0) { my $ptss = $tss{$chr}->{$tss[$i-1]}->{'p'}; my $midpoint = floor(($ptss+$tss)/2); if ($start < $midpoint) { $start = $midpoint; } } my $end = $tss+$window; if ($i<@tss-1) { my $ntss = $tss{$chr}->{$tss[$i+1]}->{'p'}; my $midpoint = floor(($ntss+$tss)/2); if ($end > $midpoint) { $end = $midpoint; } } for (my $p=$tss;$p>$start;$p-=$regionSize) { my $diff= $p-$tss; if ($dir == 1) { $diff *= -1; } if (abs($diff) > $minInclude) { next if (rand() > $successRate); } my $id = $gid . '+' . $diff; if ($diff < 0) { $id = $gid . $diff; } my $s = $p-$regionSizeHalf; my $e = $p+$regionSizeHalf; my $len = $e-$s; #print TSS "$id\t$diff\n"; print POS "$id\t$chr\t$s\t$e\t$dir\n"; } for (my $p=$tss+$regionSize;$p<$end;$p+=$regionSize) { my $diff= $p-$tss; if ($dir == 1) { $diff *= -1; } if (abs($diff) > $minInclude) { next if (rand() > $successRate); } my $id = $gid . '+' . $diff; if ($diff < 0) { $id = $gid . $diff; } my $s = $p-$regionSizeHalf; my $e = $p+$regionSizeHalf; my $len = $e-$s; #print TSS "$id\t$diff\n"; print POS "$id\t$chr\t$s\t$e\t$dir\n"; } } } close POS; my $seqFile = $genomeParseDir . "$outputGenome.$regionSize.seq"; my $freqFile = $genomeParseDir . "$outputGenome.$regionSize.cgfreq"; my $cgBinsFile = $genomeParseDir . "$outputGenome.$regionSize.cgbins"; my $gcBinsFile = $genomeParseDir . "$outputGenome.$regionSize.gcbins"; #my $tssBinsFile = "$genome.$regionSize.tssbins"; #my $cgtssBinsFile = "$genome.$regionSize.cgtssbins"; #my $gctssBinsFile = "$genome.$regionSize.gctssbins"; print STDERR "\tExtracting sequences\n"; my $mflag = ''; if ($maskFlag) { $mflag = " -mask "; } `homerTools extract "$posFile" "$genomeDir" $mflag > "$tmpFile"`; `cleanUpSequences.pl "$tmpFile" > "$tmpFile2"`; `removePoorSeq.pl "$tmpFile2" > "$seqFile"`; `homerTools freq -gc "$freqFile" -format tsv "$seqFile" > "$tmpFile2"`; `freq2group.pl "$freqFile" > "$cgBinsFile"`; `freq2group.pl "$freqFile" 2 > "$gcBinsFile"`; #`getTSSBins.pl $tssFile > $tssBinsFile`; #`combineBinFiles.pl $cgBinsFile $tssBinsFile > $cgtssBinsFile`; #`combineBinFiles.pl $gcBinsFile $tssBinsFile > $gctssBinsFile`; `rm "$tmpFile" "$tmpFile2"`; print STDERR "\n";