#!/usr/bin/env perl use warnings; use lib "/share/data/umw_biocore/dnext_data/bin/homer//bin"; my $homeDir = "/share/data/umw_biocore/dnext_data/bin/homer/"; # Copyright 2009-2016 Christopher Benner # # This file is part of HOMER # # HOMER is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # HOMER is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. use POSIX; use HomerConfig; use Statistics; my $config = HomerConfig::loadConfigFile(); if (@ARGV < 2) { printCMD(); } my $denovoFile = $ARGV[0]; my $genome = $ARGV[1]; my $refRNAfile = ""; my $repeatFile = ""; my $tssDistance = 500; my $repeatCheckRegion = 100; my $noRepeats = 0; my $minOverlap = 0.1; my $tagDir = ""; my $maxFold = 0; $strand = "+"; $minRPKM = 0.01; $minP = 10; my $tssFile = ""; my $tssRange = 1000; sub printCMD { print STDERR "\n\tProgram to annotate de novo transcripts from GRO-seq data\n"; print STDERR "\n\tUsage: annotateTranscripts.pl [options]\n"; print STDERR "\n\tOptions:\n"; print STDERR "\t\t-noRepeats (skip repeat annotation)\n"; print STDERR "\t\t-minOverlap <#> (Min fraction of overlap to assign transcript as genic, 0.1)\n"; print STDERR "\t\t-promoterSize <#> (to identify TSS antisense transcripts vs. enhancers, default: 500)\n"; print STDERR "\t\t-repeatSize <#> (size from beginning of transcript to use for repeat ann, default: 500)\n"; print STDERR "\t\t-d (to asses rpkm, merge fragments of same gene)\n"; print STDERR "\t\t\t-min <#> (minimum rpkm, default: 0.01)\n"; print STDERR "\t\t\t-minp <#> (minimum number of unique reads, default: 10)\n"; print STDERR "\t\t\t-strand <+|-|both> (strand to search for reads, default: +)\n"; print STDERR "\t\t\t-merge <#> (maximum fold difference for adjacent transcripts to be merged, e.g. 2)\n"; #print STDERR "\t\t-tss (adjust TSS to these called TSS locations)\n"; #print STDERR "\t\t\t-tssRange <#> (TSS reassignment range, default: 1000)\n"; print STDERR "\n"; exit; } for (my $i=2;$i<@ARGV;$i++) { if ($ARGV[$i] eq '-noRepeats') { $noRepeats = 1; } elsif ($ARGV[$i] eq '-promoterSize') { $tssDistance = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-repeatSize') { $repeatCheckRegion = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-minOverlap') { $minOverlap = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-d') { $tagDir = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-strand') { $strand = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-min') { $minRPKM = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-minp') { $minP = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-tss') { $tssFile = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-merge') { $maxFold = $ARGV[++$i]; print STDERR "\tmergeFold = $maxFold\n"; } else { printCMD(); } } if ($tagDir eq '') { $minRPKM = -1e10; $minP = -1e10; } my $genomeDir = ""; my $customGenome = ""; my $organism = 'null'; my $preparsedDir = ""; if (!exists($config->{'GENOMES'}->{$genome})) { $customGenome = $genome; ($genome,$genomeDir,$preparsedDir) = HomerConfig::parseCustomGenome($genome); } else { $genomeDir = $config->{'GENOMES'}->{$genome}->{'directory'} . "/"; $organism = $config->{'GENOMES'}->{$genome}->{'org'}; } my $denovo = openPeakFile($denovoFile); # step 0- reannotate TSS using tss File if ($tssFile ne '') { print STDERR "\tReassigning TSS within range of $tssRange...\n"; my $tssPeaks = openPeakFile($tssFile); } # first step, annotate transcripts relative to RNA file print STDERR "\tAnnotating transcripts to reference genes...\n"; if ($refRNAfile eq '') { $refRNAfile = $genomeDir . '/' . $genome . ".rna"; } unless (-e $refRNAfile) { print STDERR "!!!! Could not open refence RNA file \"$refRNAfile\"\n"; exit; } my $repeatFlag = 0; my $ref = openPeakFile($refRNAfile); annotateTranscripts($denovo, $ref,$repeatFlag); if ($tagDir ne '') { if ($maxFold > 0) { getRPKM($denovo, $tagDir,0); mergeAdjacentTranscripts($denovo,$maxFold); } getRPKM($denovo, $tagDir,0); if ($minP > 0) { getRPKM($denovo, $tagDir,1); } } # second step, annotate transcripts relative to if ($noRepeats == 0) { print STDERR "\tAnnotating non-gene transcripts to possibly repeats...\n"; if ($repeatFile eq '') { $repeatFile = $genomeDir . '/' . $genome . ".repeats"; } unless (-e $repeatFile) { print STDERR "!!!! Could not open repeat annotation file \"$repeatFile\"\n"; exit; } $repeatFlag =1; $ref = openPeakFile($repeatFile); annotateTranscripts($denovo, $ref,$repeatFlag); } printTranscripts($denovo,$organism,"",1); exit; sub mergeAdjacentTranscripts { my ($denovo, $maxFold) = @_; my $eps = 0.001; my $totalTranscripts = 0; my $totalMerged = 0;; foreach(keys %$denovo) { my $c = $_; my %gids = (); foreach(keys %{$denovo->{$c}}) { my $id = $_; $totalTranscripts++; my $gid = $denovo->{$c}->{$id}->{'gid'}; my $type = $denovo->{$c}->{$id}->{'type'}; #my $rpkm = $denovo->{$c}->{$id}->{'rpkm'}; if ($type eq 'Gene') { if (!exists($gids{$gid})) { my @a = (); $gids{$gid} = \@a; } push(@{$gids{$gid}},$id); } } foreach(keys %gids) { my $gid = $_; my @ids = sort {$denovo->{$c}->{$a}->{'d'} cmp $denovo->{$c}->{$b}->{'d'} || $denovo->{$c}->{$a}->{'s'} cmp $denovo->{$c}->{$b}->{'s'}} @{$gids{$gid}}; my $lastGood = 0; for (my $i=1;$i<@ids;$i++) { my $id = $_; if ($denovo->{$c}->{$ids[$i]}->{'d'} ne $denovo->{$c}->{$ids[$lastGood]}->{'d'}) { $lastGood = $i; next; } my $rpkm = $denovo->{$c}->{$ids[$i]}->{'rpkm'}; my $rpkmLast = $denovo->{$c}->{$ids[$lastGood]}->{'rpkm'}; my $fold = ($rpkm + $eps)/($rpkmLast+$eps); if ($fold < $maxFold && 1/$fold < $maxFold) { $denovo->{$c}->{$ids[$lastGood]}->{'e'} = $denovo->{$c}->{$ids[$i]}->{'e'}; #print STDERR "merged $c:$denovo->{$c}->{$ids[$i]}->{'s'}" ."-$denovo->{$c}->{$ids[$i]}->{'e'}\n"; delete $denovo->{$c}->{$ids[$i]}; $totalMerged++; } else { $lastGood = $i; next; } } } } print STDERR "\tMerged $totalMerged out of $totalTranscripts transcripts\n"; } sub assignTSS { my ($transcripts, $denovoFile, $tss, $tssFile, $tssRange) = @_; my $rand = rand(); my $tmpFile = $rand . ".tmp"; my $tmpFile2 = $rand . ".2.tmp"; `adjustPeakFile.pl "$denovoFile" -5p > "$tmpFile"`; `mergePeaks "$tmpFile" "$tssFile"`; } sub getRPKM { my ($denovo, $tagDir,$pc) = @_; print STDERR "\tQuantifying Transcript levels with data in \"$tagDir\"\n"; my $rand = rand(); my $tmpFile = $rand . ".tmp"; my $tmpFile2 = $rand . ".2.tmp"; my $tmpRPKM = $minRPKM; $minRPKM = -1e10; my $tmpMINP = $minP; $minRPKM = -1e10; $minP = -1e10; printTranscripts($denovo,'null',$tmpFile,0); $minRPKM = $tmpRPKM; $minP = $tmpMINP; my $option = ""; if ($pc > 0) { $option = " -pc $pc -noadj "; } else { $option = " -rpkm "; } `analyzeRepeats.pl "$tmpFile" $genome -noCondensing -strand $strand -d "$tagDir" $option > "$tmpFile2"`; open IN, $tmpFile2; while () { chomp; s/\r//g; my @line = split /\t/; my $id = $line[0]; my $chr = $line[1]; my $rpkm = $line[8]; next if (!exists($denovo->{$chr})); next if (!exists($denovo->{$chr}->{$id})); if ($pc > 0) { $denovo->{$chr}->{$id}->{'uniq'} = $rpkm; } else { $denovo->{$chr}->{$id}->{'rpkm'} = $rpkm; } } close IN; `rm "$tmpFile" "$tmpFile2"`; } sub printTranscripts { my ($denovo, $org,$outputFileName, $ucscFlag) = @_; my $emptyAnn = ""; my %ann = (); if ($org ne '' && $org ne 'null') { $emptyAnn = "\t\t\t\t"; my $rand = rand(); my $tmpFile = $rand . ".tmp"; my $tmpFile2 = $rand . ".2.tmp"; open OUT, ">$tmpFile"; foreach(keys %$denovo) { my $c = $_; foreach(values %{$denovo->{$c}}) { my $n = $_->{'gid'}; print OUT "$n\n"; } } `addGeneAnnotation.pl "$tmpFile" $org no > "$tmpFile2"`; open IN, $tmpFile2; while () { chomp; s/\r//g; my @line = split /\t/; my $annStr = "\t$line[5]\t$line[6]\t$line[7]\t$line[8]\t$line[10]"; $ann{$line[1]} = $annStr; } close IN; `rm "$tmpFile" "$tmpFile2"`; } if ($ucscFlag) { open UCSC, ">ucsc.txt"; } my $file = ''; if ($outputFileName ne '') { open OUT, ">$outputFileName"; $file = *OUT; } else { $file = *STDOUT; } print $file "TranscriptID\tchr\tstart\tend\tstrand\tOriginalID\tAnnotatedID\tType"; print $file "\tName\tAlias\tOrf\tDescription\tRefSeqType"; print $file "\tOverlap\tAntisense\tClosest TSS\tRPKM\tUniq Reads\n"; my $totalPossible = 0; my $totalPassed = 0; my %pieChart = (); my %okTypes = (); $okTypes{'Gene'}=1; $okTypes{'Gene-antisense'}=1; $okTypes{'Promoter-antisense'}=1; $okTypes{'Enhancer'}=1; my %unique = (); my @chrs = sort {$a cmp $b} keys %$denovo; foreach(@chrs) { my $c = $_; my @ids = sort {$denovo->{$c}->{$a}->{'s'} <=> $denovo->{$c}->{$b}->{'s'}} keys %{$denovo->{$c}}; for (my $i=0;$i<@ids;$i++) { $totalPossible++; my $id = $ids[$i]; my $start = $denovo->{$c}->{$id}->{'s'}; my $end = $denovo->{$c}->{$id}->{'e'}; my $strand = $denovo->{$c}->{$id}->{'d'}; my $nearest = $denovo->{$c}->{$id}->{'n'}; my $nearestDist = $denovo->{$c}->{$id}->{'ndist'}; my $gid = $denovo->{$c}->{$id}->{'gid'}; my $type = $denovo->{$c}->{$id}->{'type'}; my $rpkm = $denovo->{$c}->{$id}->{'rpkm'}; my $uniq = $denovo->{$c}->{$id}->{'uniq'}; next if ($rpkm < $minRPKM); next if ($uniq < $minP); $totalPassed++; my $nid =''; if ($type eq 'Gene') { $nid = $gid; } elsif ($type eq 'Gene-antisense') { $nid = $gid . "-gene-antisense"; } elsif ($type eq 'Promoter-antisense') { $nid = $gid . '-promoter-antisense'; } elsif ($type eq 'Enhancer') { $nid = $gid . '-enhancer'; } elsif ($type eq 'other') { $nid = $gid . '-other'; } elsif ($type eq 'other') { $nid = $gid . '-other'; } elsif ($type =~ /^Repeat-(.*)/) { $nid = $gid . '-repeat-' . $1; } else { $nid = $id . '-other'; } if (exists($unique{$nid})) { $nid .= "-HOMER" . ++$unique{$nid}; } else { $unique{$nid}=1; } print $file "$nid\t$c\t$start\t$end\t$strand\t$id\t$gid\t$type"; my $pieType = $type; if (!exists($okTypes{$pieType})) { $pieType = 'other'; } $pieChart{$pieType}++; my $ann = $emptyAnn; if (exists($ann{$gid})) { $ann = $ann{$gid}; } print $file "$ann"; my $name = "$id-"; my $sense = ''; my $antisense = ''; my @overlap = sort {$denovo->{$c}->{$id}->{'o'}->{$b} <=> $denovo->{$c}->{$id}->{'o'}->{$a}} keys %{$denovo->{$c}->{$id}->{'o'}}; foreach(@overlap) { my $rid = $_; my $overlap = $denovo->{$c}->{$id}->{'o'}->{$rid}; $name .= $rid . "($overlap),"; if ($overlap > 0) { $sense .= $rid . "($overlap),"; } else { $antisense .= $rid . "($overlap),"; } } print $file "\t$sense\t$antisense\t$nearest($nearestDist)\t$rpkm\t$uniq\n"; $name .= "TSS:$nearest($nearestDist)"; if ($ucscFlag) { print UCSC "$c\t$start\t$end\t$gid-$type\t1\t$strand\n"; } } } if ($minRPKM > 0) { print STDERR "\t$totalPassed of $totalPossible passed RPKM threshold of $minRPKM\n"; } if ($ucscFlag) { close UCSC; } if ($outputFileName ne '') { close OUT; } print STDERR "\n\tAnnotation Summary:\n"; foreach(keys %pieChart) { print STDERR "\t$_\t$pieChart{$_}\n"; } print STDERR "\n"; } sub annotateTranscripts { my ($denovo, $ref, $repeatFlag) = @_; my @chrs = sort {$a cmp $b} keys %$denovo; foreach(@chrs) { my $c = $_; print STDERR "\t\t$c\n"; my @ids = sort {$denovo->{$c}->{$a}->{'s'} <=> $denovo->{$c}->{$b}->{'s'}} keys %{$denovo->{$c}}; my @rids = sort {$ref->{$c}->{$a}->{'s'} <=> $ref->{$c}->{$b}->{'s'}} keys %{$ref->{$c}}; my $refIndex = 0; for (my $i=0;$i<@ids;$i++) { my $id = $ids[$i]; my $start = $denovo->{$c}->{$id}->{'s'}; my $end = $denovo->{$c}->{$id}->{'e'}; my $strand = $denovo->{$c}->{$id}->{'d'}; my $tss = $denovo->{$c}->{$id}->{'tss'}; my $len = $denovo->{$c}->{$id}->{'len'}; if ($repeatFlag) { if ($strand eq '+') { #$start -= $tssDistance; if ($end-$start>$repeatCheckRegion) { $end = $start+$repeatCheckRegion; } } else { #$end += $tssDistance; if ($end-$start>$repeatCheckRegion) { $start = $end-$repeatCheckRegion; } } } my $bestOverlap = 0; my $bestOverlapID = ''; my $overlapFlag = 0; my $tssDist = 0; for (my $j=$refIndex;$j<@rids;$j++) { my $rid = $rids[$j]; my $s = $ref->{$c}->{$rid}->{'s'}; my $e = $ref->{$c}->{$rid}->{'e'}; my $d = $ref->{$c}->{$rid}->{'d'}; my $t = $ref->{$c}->{$rid}->{'tss'}; my $l = $ref->{$c}->{$rid}->{'len'}; my $dist = abs($tss-$t); if ($repeatFlag == 0 && ($dist < $denovo->{$c}->{$id}->{'ndist'})) { $denovo->{$c}->{$id}->{'ndist'} = $dist; $denovo->{$c}->{$id}->{'n'} = $rid; } my $overlap = 0; if ($s <= $end && $e >= $start) { #overlap my $ss = $start; $ss = $s if ($s > $ss); my $ee = $end; $ee = $e if ($e < $ee); $overlap = $ee-$ss; #print STDERR "$id\t$rid\t$overlap\t$l\t$len\n" if ($id eq 'chr2-883-0'); if ($overlap/$l > $minOverlap || $overlap/$len > $minOverlap) { if ($d ne $strand) { $overlap *= -1; } if ((($overlap > 0 || $bestOverlap > 0) && $overlap > $bestOverlap) || ($overlap <= 0 && $bestOverlap <= 0 && $overlap < $bestOverlap)) { if ($repeatFlag == 0 || $overlap > 0) { $bestOverlap = $overlap; $bestOverlapID =$rid; } } if ($repeatFlag == 0) { $denovo->{$c}->{$id}->{'o'}->{$rid}=$overlap; } } $overlapFlag = 1; } else { if ($overlapFlag == 0 && $tss > $t && $j == $refIndex+1) { $refIndex = $j; } if ($denovo->{$c}->{$id}->{'ndist'}+$tss+100 < $s) { last; } } } if ($bestOverlapID ne "") { if ($repeatFlag) { if ($denovo->{$c}->{$id}->{'type'} ne "Gene") { $repeatName = $bestOverlapID; $repeatName =~ s/\-HOMER\d+//; $denovo->{$c}->{$id}->{'type'} = "Repeat-" . $repeatName; if ($bestOverlap < 0) { $denovo->{$c}->{$id}->{'type'} = "Repeat-" . $repeatName . "-antisense"; } } } else { $denovo->{$c}->{$id}->{'gid'} = $bestOverlapID; if ($denovo->{$c}->{$id}->{'o'}->{$bestOverlapID} < 0) { $denovo->{$c}->{$id}->{'type'} = "Gene-antisense"; } else { $denovo->{$c}->{$id}->{'type'} = "Gene"; } } } elsif ($repeatFlag == 0) { $denovo->{$c}->{$id}->{'gid'} = $denovo->{$c}->{$id}->{'n'}; if ($denovo->{$c}->{$id}->{'ndist'} < $tssDistance) { if ($ref->{$c}->{$denovo->{$c}->{$id}->{'n'}}->{'d'} ne $denovo->{$c}->{$id}->{'d'}) { $denovo->{$c}->{$id}->{'type'} = "Promoter-antisense"; } else { $denovo->{$c}->{$id}->{'type'} = "Promoter-sense"; } } else { $denovo->{$c}->{$id}->{'type'} = "Enhancer"; } } } } } sub openPeakFile { my ($file) = @_; my %peaks = (); open IN, $file; while () { chomp; s/\r//g; next if (/^\s*\#/); my @line = split /\t/; next if (@line < 5); next unless ($line[2] =~ /^[\d\.\-\e\+]+$/); next unless ($line[3] =~ /^[\d\.\-\e\+]+$/); my $id = $line[0]; my $chr = $line[1]; my $s = $line[2]; my $e = $line[3]; my $strand = $line[4]; $strand = "+" if ($strand eq 0); $strand = "-" if ($strand eq 1); if (!exists($peaks{$chr})) { my %a = (); } my $tss = $s; if ($strand eq '-') { $tss = $e } my $len = $e-$s; $len = 1 if ($len < 1); my $tid = $id; my $count = 1; while (exists($peaks{$chr}->{$tid})) { $tid .= "-$count"; $count++; } my %overlap = (); my %near = (); my $nearID = ''; $peaks{$chr}->{$tid} = {id=>$id,s=>$s,e=>$e,d=>$strand,c=>$chr,tss=>$tss,o=>\%overlap,n=>$nearID,ndist=>1e10, gid=>'',code=>'',cs=>$s,ce=>$e,len=>$len,rpkm=>0,uniq=>0}; } close IN; return \%peaks; }