// Copyright 2009, 2010, 2011, 2012 Christopher Benner // // This file is part of HOMER // // HOMER is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // HOMER is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. #include "SeqTag.h" #define BUFFER 10000 void printCMD(); char** addPeakFile2List(char** pfiles, int &numFiles, char* file); void analyzePooledPeaks(TagLibrary* tags, PeakLibrary** p, GenomeOntologyResults* gr, char strand, int startIndex, int stopIndex); int main(int argc, char** argv) { char** peakFiles = NULL; int numPeakFiles = 0; char* controlPeakFile = NULL; char strand = STRAND_BOTH; int maxDistance = 100; long int gsize = DEFAULT_GSIZE; //int bpFlag = 0; if (argc < 2) { printCMD(); } for (int i=1;ireadPeakFile(peakFiles[0], PEAK_READ_MODE_NORMAL); char** pfiles = &(peakFiles[1]); refPeaks->genomeOntology(pfiles, numPeakFiles-1, strand, maxDistance, gsize, controlPeakFile); delete refPeaks; } else { // Tag Directory Version - analyze tag counts in each set relative to expected // To keep things undercontrol, limit ourselves to MAX_PEAKS_AT_A_TIME //fprintf(stderr, "\tPerforming Tag Directory Analysis (%s)\n",peakFiles[0]); TagLibrary* tags = new TagLibrary(peakFiles[0]); tags->readTagDirectory(); tags->setSingleRead(1); TagLibrary* input = NULL; if (controlPeakFile != NULL) { input = new TagLibrary(controlPeakFile); input->readTagDirectory(); input->setSingleRead(1); } FILE* fp = stdout; int currentPeakFile = 0; int currentTotal = 0; int totalPeaks = 0; PeakLibrary**p = new PeakLibrary*[numPeakFiles-1]; GenomeOntologyResults* gr = new GenomeOntologyResults[numPeakFiles-1]; GenomeOntologyResults* grInput = new GenomeOntologyResults[numPeakFiles-1]; for (int i=0;inumPeaks; gr[i].name = pfiles[i]; gr[i].coverage = p[i]->calculateCoverage(); gr[i].numPeaks = p[i]->numPeaks; gr[i].tagCount=0.0; gr[i].numPeaksReporting=0; if (input != NULL) grInput[i].tagCount = 0.0; if (input != NULL) grInput[i].numPeaksReporting = 0; if (currentTotal > 0 && currentTotal+p[i]->numPeaks > MAX_PEAKS_AT_A_TIME) { fprintf(stderr, "\t\tAnalyzing annotations %d to %d of %d (%d total peaks)\n", currentPeakFile+1,i,numPeakFiles-1,currentTotal); analyzePooledPeaks(tags,p,gr,strand,currentPeakFile,i-1); if (input != NULL) { fprintf(stderr, "\t\tAnalyzing annotations %d to %d of %d for Input (%d total peaks)\n", currentPeakFile+1,i,numPeakFiles-1,currentTotal); analyzePooledPeaks(input,p,grInput,strand,currentPeakFile,i-1); } for (int j=currentPeakFile;jnumPeaks; } else { currentTotal += p[i]->numPeaks; } } fprintf(stderr, "\t\tAnalyzing annotations %d to %d of %d (%d total peaks)\n", currentPeakFile+1,numPeakFiles-1,numPeakFiles-1,currentTotal); analyzePooledPeaks(tags,p,gr,strand,currentPeakFile,numPeakFiles-2); if (input != NULL) { fprintf(stderr, "\t\tAnalyzing annotations %d to %d of %d for Input (%d total peaks)\n", currentPeakFile+1,numPeakFiles-1,numPeakFiles-1,currentTotal); analyzePooledPeaks(input,p,grInput,strand,currentPeakFile,numPeakFiles-2); } for (int i=currentPeakFile;itotalTags ); fprintf(fp, "\tExpected Tag Counts(gSize=%.1le)\tLog Fold Enrichment", (double)gsize); fprintf(fp, "\tLog P-value (+ for underrepresented)\tP-value"); if (input != NULL) { fprintf(fp, "\tControl Tag Counts [Total=%le]\tExpected Input Tag Counts",input->totalTags); fprintf(fp, "\tLog Fold Enrichment"); fprintf(fp, "\tLog P-value (+ underrepresented)\tP-value"); fprintf(fp, "\tLog Fold Enrichment (Control vs. Input)"); fprintf(fp, "\tLog P-value (Control vs. Input, + for Input enrichment)"); fprintf(fp, "\tP-value (Control vs. Input)"); } fprintf(fp, "\t%% of total"); fprintf(fp, "\n"); double tbp = tags->totalTags / (double)gsize; double tbpInput = 1; if (input != NULL) { tbpInput = input->totalTags / (double)gsize; } double totalTagsD = tags->totalTags; if (totalTagsD < 1.0) totalTagsD = 1.0; for (int i=0;i ee) { logp = ilogCumulativePoisson((int)vv,ee); pvalue = exp(logp); } else { logp = -1*logCumulativePoisson((int)vv,ee); pvalue = exp(-1*logp); } fprintf(fp, "\t%.1lf\t%.2lf\t%.2lf\t%.2le",expected,lratio,logp,pvalue); if (input != NULL) { expected = tbpInput * gr[i].coverage; ee = expected; vv = grInput[i].tagCount; if (ee < 1) ee = 1.0; if (vv < 1) vv = 1.0; lratio = log(vv/ee); logp = 0.0; pvalue = 1.0; if (vv > ee) { logp = ilogCumulativePoisson((int)vv,ee); pvalue = exp(logp); } else { logp = -1*logCumulativePoisson((int)vv,ee); pvalue = exp(-1*logp); } fprintf(fp, "\t%.1lf\t%.1lf\t%.2lf\t%.2lf\t%.2le",grInput[i].tagCount,expected,lratio,logp,pvalue); //comparison float tC = gr[i].tagCount; float iC = grInput[i].tagCount; if (tags->totalTags > input->totalTags) { tC /= tags->totalTags; tC *= input->totalTags; } else { iC /= input->totalTags; iC *= tags->totalTags; } if (tC < 1) tC = 1; if (iC < 1) iC = 1; lratio = log(tC/iC); logp = 0.0; pvalue = 1.0; if (tC > iC) { logp = ilogCumulativePoisson((int)tC,iC); pvalue = exp(logp); } else { logp = -1*logCumulativePoisson((int)tC,iC); pvalue = exp(-1*logp); } fprintf(fp, "\t%.2lf\t%.2lf\t%.2le",lratio,logp,pvalue); } double total = gr[i].tagCount/totalTagsD; fprintf(fp, "\t%lf%%", total*100.0); fprintf(fp,"\n"); } delete []p; delete []gr; delete tags; if (input != NULL) { delete []grInput; delete input; } } delete []peakFiles; return 0; } void analyzePooledPeaks(TagLibrary* tags, PeakLibrary** p, GenomeOntologyResults* gr, char strand, int startIndex, int stopIndex) { PeakLibrary* pooled = new PeakLibrary(MAX_PEAKS_AT_A_TIME); for (int j=startIndex;j<=stopIndex;j++) { pooled->addPeakLibrary(p[j]); } Doubletable* counts = tags->getPeakTagCounts(pooled,strand); char** keys = counts->keys(); for (int i=0;itotal;i++) { double v = counts->search(keys[i]); int index = 0; int slen = strlen(keys[i]); for (int j=0;j 0) { for (int i=0;i [additional peak/ann files...]\n"); fprintf(stderr, "\n\tCalculates significance of overlap between primary peak file and all others\n"); fprintf(stderr, "\tA tag directory can be given in stead of a peak file to perform an unbiased\n"); fprintf(stderr, "\tanalysis of reads, although for this tags will be counted in annotations instead\n"); fprintf(stderr, "\tof calculating overlaps (different, but still useful)\n"); fprintf(stderr, "\n\t\tOutput is a table of p-values/stats is sent to stdout\n"); fprintf(stderr, "\n\tGeneral Options:\n"); //fprintf(stderr, "\t\t-strand (Only merge/consider peaks on the same strand, default: either strand)\n"); fprintf(stderr, "\t\t-d <#|given> (Maximum distance between peak centers to consider, default: 100)\n"); fprintf(stderr, "\t\t\tUsing \"-d given\" looks for literal overlaps in peak regions, and calculates\n"); fprintf(stderr, "\t\t\tsignificance based on the total overlap in bp between peaks/annotations\n"); fprintf(stderr, "\t\t\tUse \"-d given\" when features have vastly different sizes (i.e. introns vs. peaks)\n"); fprintf(stderr, "\t\t-file (file listing peak files to compare - for lots of peak files)\n"); fprintf(stderr, "\t\t-gsize <#> (Genome size for significance calculations, default: 2e9)\n"); //fprintf(stderr, "\t\t-bp (Overlaps calculated as coverage in bp, otherwise by default overlaps are binary)\n"); fprintf(stderr, "\n"); exit(0); }