#!/usr/bin/env perl use warnings; use lib "/share/data/umw_biocore/dnext_data/bin/homer//bin"; my $homeDir = "/share/data/umw_biocore/dnext_data/bin/homer/"; my $promoterSeqOffset = -2000; my $goDir = $homeDir . "/data/GO/"; my $goFile = $homeDir . "/data/GO/GO.txt"; my $promoterDir = $homeDir . "/data/promoters/"; my $accDir = $homeDir . "/data/accession/"; my $pvalueThresh = 0.05; sub printCMD { print STDERR "\n\tUsage: findGO.pl [options]\n"; print STDERR "\n\tOptions:\n"; print STDERR "\t\t-bg \n"; print STDERR "\t\t-cpu <#> (number of cpus to use)\n"; print STDERR "\t\t-human (convert IDs and run as human [uses homologene])\n"; print STDERR "\t\t-customID (don't convert IDs)\n"; print STDERR "\t\t-ontology [file2.genes] ... (custom ontologies)\n"; print STDERR "\n"; exit; } if (@ARGV < 3) { printCMD(); } my $inputIDfile = $ARGV[0]; my $org = $ARGV[1]; my $outDir = $ARGV[2]; my $bgIDfile = ''; my $maxCPUs = 1; my $humanFlag = 0; my $customIDflag = 0; my @customOntologies = (); for (my $i=3;$i<@ARGV;$i++) { if ($ARGV[$i] eq '-bg') { $bgIDfile = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-p' || $ARGV[$i] eq '-cpu') { $maxCPUs = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-human') { $humanFlag = 1; } elsif ($ARGV[$i] eq '-customID') { $customIDflag = 1; } elsif ($ARGV[$i] eq '-ontology') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^\-/) { $i--; last; } push(@customOntologies, $ARGV[$i]); $i++ } } else { printCMD(); } } $tmpFileFg = rand() . ".fg.tmp"; $tmpFileBg = rand() . ".bg.tmp"; if ($customIDflag) { `cp "$inputIDfile" "$tmpFileFg"`; if ($bgIDfile ne '') { `cp "$bgIDfile" "$tmpFileBg"`; } else { if (-f "$promoterDir/$org.base") { `cp "$promoterDir/$org.base" "$tmpFileBg"`; } } } elsif ($humanFlag == 0) { `convertIDs.pl "$inputIDfile" $org gene > "$tmpFileFg"`; if ($bgIDfile ne '') { `convertIDs.pl "$bgIDfile" $org gene > "$tmpFileBg"`; } else { if (-f "$promoterDir/$org.base.gene") { `cp "$promoterDir/$org.base.gene" "$tmpFileBg"`; } } } else { `convertOrganismID.pl "$inputIDfile" $org human gene > "$tmpFileFg"`; if ($bgIDfile ne '') { `convertOrganismID.pl "$bgIDfile" $org human gene > "$tmpFileBg"`; } else { if (-f "$promoterDir/human.base.gene") { `cp "$promoterDir/human.base.gene" "$tmpFileBg"`; } } $org = 'human'; } %geneNames = (); open IN, $accDir . $org . ".description"; while () { chomp; my @line = split /\t/; my $name = $line[4]; if (@line < 5 || $line[4] eq '-' || $line[4] eq '') { $name = $line[0]; } $geneNames{$line[0]} = $name; } close IN; my @ontologies = (); open IN, $goFile; while () { chomp; s/\r//g; next if (/^#/); my @line = split /\t/; my $go = {name=>$line[0],genes=>$line[1],output=>$line[2],desc=>$line[3],url=>$line[4],linkName=>$line[5],custom=>0}; push(@ontologies,$go); } close IN; foreach(@customOntologies) { my $n = $_; my $nn = $_; $nn =~ s/^.*\///; my $go = {name=>$nn,genes=>$n,output=>"$nn.txt",desc=>"$nn custom ontology",url=>"NA",linkName=>"NA",custom=>1}; push(@ontologies,$go); } `mkdir -p "$outDir"`; my $cpus=0; foreach(@ontologies) { my $go = $_; my $pid = fork(); $cpus++; if ($pid == 0) { #child process my $fullFile = "$goDir/$org." . $go->{'genes'}; if ($go->{'custom'}) { $fullFile = $go->{'genes'}; } `findGOtxt.pl "$fullFile" "$tmpFileBg" "$tmpFileFg" > "$outDir/$go->{'output'}"`; exit(0); } if ($cpus >= $maxCPUs) { wait(); $cpus--; } } my $id = 0; while ($id >= 0) { $id = wait(); } `rm "$tmpFileFg" "$tmpFileBg"`; my %results = (); for (my $i=0;$i<@ontologies;$i++) { my $go = $ontologies[$i]; parseResults("$outDir/$go->{'output'}", $go->{'name'}); } my @go = sort {$results{$a}->{'lp'} <=> $results{$b}->{'lp'}} keys %results; open MAIN, ">$outDir/geneOntology.html"; print MAIN "Gene Ontology Results\n"; print MAIN "

Gene Ontology Enrichment Results

\n"; print MAIN "Homer de novo Motif Enrichment Results
\n"; print MAIN "Known Motif Enrichment Results
\n"; print MAIN "

Text file version of complete results (i.e. open with Excel)\n"; print MAIN "
\n"; foreach(@ontologies) { my $go = $_; print MAIN "
{'output'}\">$go->{'name'}: $go->{'desc'} "; print MAIN "({'url'}\">$go->{'linkName'})
\n"; } print MAIN "
\n"; print MAIN "

Enriched Categories

\n"; print MAIN "\n"; print MAIN ""; print MAIN ""; print MAIN "\n"; foreach(@go) { my $goid = $_; my $pvalue = sprintf("%.3e", $results{$goid}->{'p'}); last if ($pvalue > $pvalueThresh); my $lp = sprintf("%.2f", $results{$goid}->{'lp'}); my $term = $results{$goid}->{'term'}; my $tree = $results{$goid}->{'n'}; my $nterm = $results{$goid}->{'t'}; my $ncommon = $results{$goid}->{'c'}; my $N = $results{$goid}->{'N'}; my $T = $results{$goid}->{'P'}; print MAIN ""; print MAIN "\n"; print MAIN "\n"; } print MAIN "

P-value	ln(P)	Term	GO Tree	GO ID	# of Genes in Term	# of Target Genes in Term	# of Total Genes	# of Target Genes	Common Genes
$pvalue	$lp	$term	$tree	$goid	$nterm	$ncommon	$N	$T	"; my $z = 0; foreach(@{$results{$goid}->{'genes'}}) { print MAIN "," if ($z > 0); $z++; my $name = $_; #if (exists($geneNames{$_})) { # $name = $geneNames{$_}; #} print MAIN "$name"; } print MAIN "

\n"; print MAIN "\n"; close MAIN; sub parseResults { my ($file, $name) = @_; my $c = 0; open TMP, ">$tmpFileFg"; print TMP "TermID\tTerm\tEnrichment\tlogP\tGenes in Term\tTarget Genes in Term\tFraction of Targets in Term\tTotal Target Genes\tTotal Genes\tEntrez Gene IDs\tGene Symbols\n"; open IN, $file; while () { $c++; next if ($c < 2); chomp; my @line = split /\t/; my $term = $line[1]; my $goid = $line[0]; my $pvalue = $line[2]; my $lp = $line[3]; my $numTerm = $line[4]; my $common = $line[5]; my $total = $line[8]; my $pos = $line[7]; my @common = (); if (@line > 9) { my @gids = split /\,/, $line[9]; foreach(@gids) { if (exists($geneNames{$_})) { push(@common, $geneNames{$_}); } else { push(@common, $_); } } } $results{$goid} = {term=>$term, lp=>$lp, p=>$pvalue, t=>$numTerm,c=>$common, N=>$total,P=>$pos,n=>$name,genes=>\@common}; print TMP "$line[0]"; for (my $i=1;$i<@line;$i++) { print TMP "\t$line[$i]"; } if (scalar(@common) > 0) { print TMP "\t$common[0]"; for (my $i=1;$i<@common;$i++) { print TMP ",$common[$i]"; } } print TMP "\n"; } close IN; close TMP; `mv "$tmpFileFg" "$file"`; }

Gene Ontology Enrichment Results

Text file version of complete results (i.e. open with Excel)\n"; print MAIN "\n"; foreach(@ontologies) { my $go = $_; print MAIN "{'output'}\">$go->{'name'}: $go->{'desc'} "; print MAIN "({'url'}\">$go->{'linkName'})\n"; } print MAIN "\n"; print MAIN "

Enriched Categories

Text file version of complete results (i.e. open with Excel)\n"; print MAIN "
\n"; foreach(@ontologies) { my $go = $_; print MAIN "
{'output'}\">$go->{'name'}: $go->{'desc'} "; print MAIN "({'url'}\">$go->{'linkName'})
\n"; } print MAIN "
\n"; print MAIN "