// Copyright 2009 - 2014 Christopher Benner // // This file is part of HOMER // // HOMER is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // HOMER is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. #include "SeqTag.h" void printCMD(); void printCircosFiles(char* prefix, int resolution, TagLibrary* tags, PeakLibrary* peaks1, PeakLibrary* peaks2,int,int,int,int,char**,int,char**,int,char**,int,TagLibrary*); double** balanceMatrix(double ** matrix, int size, int size2); int main(int argc, char** argv) { int maxFiles = 1000; char* directory = NULL; char* bgDirectory = NULL; char* customBgFile = NULL; char* customBgBgFile = NULL; char* ifcFile = NULL; int defaultResolution = 1; int resolution = 10000000; int fullModelFlag = 1; char* chr1 = NULL; char* chr2 = NULL; int start1 = -1; int start2 = -1; int iskbFlag = 1; int override = 0; int relativeFlag=0; int balanceFlag = 0; int washuFlag = 0; int superRes = -1; int numCPUs = 1; int vsGenomeFlag = 0; //int boundaryFlag = 0; int boundaryScale = 0; int end1 = 2000000000; int end2 = 2000000000; int minDist = -1; int maxDist = -1; int logFlag = -1; int valueIsLog = 0; int chopifyFlag = 0; int fixedFlag = 0; int forceFlag = 0; int bgOnlyFlag = 0; int randomReadTotal = 100000000; char* randomizeModel = NULL; int siteFlag = 0; char* restrictionSite = NULL; int maxMisMatches = 0; char* poutFile = NULL; char* pout2File = NULL; double normTotal = -2.0; double normArea = -2.0; char** geneFiles = new char*[maxFiles]; char** bedFiles = new char*[maxFiles]; char** tagDirs = new char*[maxFiles]; for (int i=0;i= argc) break; } i--; } else if (strcmp(argv[i],"-d")==0) { i++; while (argv[i][0] != '-') { tagDirs[numTagDirs++] = argv[i++]; if (i >= argc) break; } i--; } else if (strcmp(argv[i],"-g")==0) { i++; while (argv[i][0] != '-') { geneFiles[numGeneFiles++] = argv[i++]; if (i >= argc) break; } i--; } else if (strcmp(argv[i],"-res")==0) { i++; if (strcmp(argv[i],"site") == 0) { siteFlag = 1; } else { sscanf(argv[i],"%d",&resolution); } defaultResolution = 0; } else if (strcmp(argv[i],"-superRes")==0 || strcmp(argv[i],"-win")==0 || strcmp(argv[i],"-window")==0) { i++; if (strcmp(argv[i],"site") == 0) { siteFlag = 1; } else { sscanf(argv[i],"%d",&superRes); if (defaultResolution==1) { resolution = superRes; } } defaultResolution = 0; } else if (strcmp(argv[i],"-hist")==0) { sscanf(argv[++i],"%d",&resolution); defaultResolution = 0; expectMatrixFlag=1; histFlag = 1; } else if (strcmp(argv[i],"-start")==0) { sscanf(argv[++i],"%d",&start1); if (start2 == -1) start2 = start1; } else if (strcmp(argv[i],"-start2")==0) { sscanf(argv[++i],"%d",&start2); } else if (strcmp(argv[i],"-end")==0) { sscanf(argv[++i],"%d",&end1); if (end2 == 2000000000) end2 = end1; } else if (strcmp(argv[i],"-end2")==0) { sscanf(argv[++i],"%d",&end2); } else if (strcmp(argv[i],"-clusterFixed")==0) { //sscanf(argv[++i],"%lf",&clusterThreshold); actionFlag = actionFlag | HIC_MASK_CLUSTER; actionFlag = actionFlag | HIC_MASK_CLUSTERFIXED; } else if (strcmp(argv[i],"-cluster")==0) { //sscanf(argv[++i],"%lf",&clusterThreshold); actionFlag = actionFlag | HIC_MASK_CLUSTER; } else if (strcmp(argv[i],"-corrDepth")==0) { sscanf(argv[++i],"%lf",&minExpReads); } else if (strcmp(argv[i],"-corrIters")==0) { sscanf(argv[++i],"%d",&clusteringIterations); } else if (strcmp(argv[i],"-nomatrix")==0) { actionFlag = actionFlag | HIC_MASK_NO_MATRIX; } else if (strcmp(argv[i],"-pvalue")==0) { sscanf(argv[++i],"%lf",&pvalueThreshold); } else if (strcmp(argv[i],"-zscore")==0) { sscanf(argv[++i],"%lf",&zscoreThreshold); } else if (strcmp(argv[i],"-loci")==0) { lociFile = argv[++i]; //actionFlag = actionFlag | HIC_MASK_LOGPVALUES; //actionFlag = actionFlag | HIC_MASK_LOGPVALUESEXACT; actionFlag = actionFlag | HIC_MASK_LOCISCORE; } else if (strcmp(argv[i],"-interactions")==0) { interactionFile = argv[++i]; actionFlag = actionFlag | HIC_MASK_LOGPVALUES; actionFlag = actionFlag | HIC_MASK_LOGPVALUESEXACT; actionFlag = actionFlag | HIC_MASK_INTERACTIONS; } else if (strcmp(argv[i],"-circos")==0) { //interactionFile = argv[++i]; i++; circosPrefix = argv[i]; actionFlag = actionFlag | HIC_MASK_LOGPVALUES; actionFlag = actionFlag | HIC_MASK_LOGPVALUESEXACT; actionFlag = actionFlag | HIC_MASK_INTERACTIONS; } else if (strcmp(argv[i],"-logp")==0) { actionFlag = actionFlag | HIC_MASK_LOGPVALUES; actionFlag = actionFlag | HIC_MASK_LOGPVALUESEXACT; if (logFlag == -1) logFlag = 1; valueIsLog = 1; } else if (strcmp(argv[i],"-logpn")==0) { actionFlag = actionFlag | HIC_MASK_LOGPVALUES; mask = HIC_MASK_LOGPVALUESEXACT; actionFlag = actionFlag & (~mask); if (logFlag == -1) logFlag = 1; valueIsLog = 1; } else if (strcmp(argv[i],"-corr")==0) { actionFlag = actionFlag | HIC_MASK_CORRELATION; expectMatrixFlag=1; if (actionFlag & HIC_MASK_LOGPVALUES) { } else { //if (logFlag == -1) logFlag = 0; //valueIsLog =0; } } else if (strcmp(argv[i],"-force")==0) { forceFlag = 1; } else if (strcmp(argv[i],"-bgonly")==0) { forceFlag = 1; bgOnlyFlag = 1; iskbFlag = 0; actionFlag = actionFlag | HIC_MASK_NORM_SEQDEPTH; actionFlag = actionFlag | HIC_MASK_NORM_BALANCED_SEQDEPTH; actionFlag = actionFlag | HIC_MASK_NORM_DISTANCE; valueIsLog = 0; if (logFlag == -1) logFlag = 1; } else if (strcmp(argv[i],"-norm")==0 || strcmp(argv[i],"-distNorm")==0 || strcmp(argv[i],"-distanceNorm")==0) { iskbFlag = 0; actionFlag = actionFlag | HIC_MASK_NORM_SEQDEPTH; actionFlag = actionFlag | HIC_MASK_NORM_BALANCED_SEQDEPTH; actionFlag = actionFlag | HIC_MASK_NORM_DISTANCE; valueIsLog = 0; if (logFlag == -1) logFlag = 1; } else if (strcmp(argv[i],"-zscoreNorm")==0) { iskbFlag = 0; actionFlag = actionFlag | HIC_MASK_NORM_SEQDEPTH; actionFlag = actionFlag | HIC_MASK_NORM_BALANCED_SEQDEPTH; actionFlag = actionFlag | HIC_MASK_NORM_DISTANCE; actionFlag = actionFlag | HIC_MASK_NORM_ZSCORE; valueIsLog = 1; if (logFlag == -1) logFlag = 1; } else if (strcmp(argv[i],"-rawAndExpected")==0) { actionFlag = actionFlag | HIC_MASK_NORM_SEQDEPTH; actionFlag = actionFlag | HIC_MASK_NORM_BALANCED_SEQDEPTH; actionFlag = actionFlag | HIC_MASK_NORM_DISTANCE; actionFlag = actionFlag | HIC_MASK_RAWANDEXPECTED; expectMatrixFlag=1; expectMatrixOutputFile = argv[++i]; valueIsLog = 0; if (logFlag == -1) logFlag = 0; } else if (strcmp(argv[i],"-raw")==0) { iskbFlag = 0; mask = HIC_MASK_NORM_SEQDEPTH | HIC_MASK_NORM_DISTANCE | HIC_MASK_NORM_BALANCED_SEQDEPTH; actionFlag = actionFlag & (~mask); valueIsLog = 0; if (logFlag == -1) logFlag = 0; } else if (strcmp(argv[i],"-expected")==0) { actionFlag = actionFlag | HIC_MASK_EXPECTED; valueIsLog = 0; if (logFlag == -1) logFlag = 0; } else if (strcmp(argv[i],"-balance")==0) { balanceFlag = 1; } else if (strcmp(argv[i],"-balanceNorm")==0) { mask = HIC_MASK_NORM_DISTANCE; actionFlag = actionFlag & (~mask); actionFlag = actionFlag | HIC_MASK_NORM_SEQDEPTH; actionFlag = actionFlag | HIC_MASK_NORM_BALANCED_SEQDEPTH; valueIsLog = 0; if (logFlag == -1) logFlag = 0; } else if (strcmp(argv[i],"-simpleNorm")==0 || strcmp(argv[i],"-coverageNorm")==0) { mask = HIC_MASK_NORM_DISTANCE | HIC_MASK_NORM_BALANCED_SEQDEPTH; actionFlag = actionFlag & (~mask); actionFlag = actionFlag | HIC_MASK_NORM_SEQDEPTH; valueIsLog = 0; if (logFlag == -1) logFlag = 0; } else if (strcmp(argv[i],"-p")==0) { peakFile1 = argv[++i]; } else if (strcmp(argv[i],"-p2")==0) { peakFile2 = argv[++i]; } else if (strcmp(argv[i],"-o")==0) { outputFile = argv[++i]; } else { fprintf(stderr, "!!! Couldn't recognize \"%s\" !!!\n", argv[i]); printCMD(); } } } if (logFlag == -1) { logFlag = 0; valueIsLog =0; } if ((actionFlag & HIC_MASK_INTERACTION4CBEDFILE || actionFlag & HIC_MASK_INTERACTIONSTATSFILES) && numCPUs > 1) { //fprintf(stderr, "!!! Warning, options not compatible with -cpu %d, switching to single CPU\n", numCPUs); //numCPUs = 1; } if (peakFile1 == NULL && histFlag) { fprintf(stderr, "!!! Histogram mode only works with a peak file (\"-p \")!!!\n"); exit(0); } if (siteFlag) { resolution = -1; fixedFlag = 1; } else { if (superRes < 0) { superRes = resolution; } } if (iskbFlag || normArea > 0) { iskbFlag = 1; if (normTotal < -1.5) normTotal = 1e9; //default if (normArea < -1.5) normArea = 100000000.0; //default = 100kb^2 if (normArea > 0) { double ratio = ((double)superRes)*((double)superRes)/normArea; normTotal /= ratio; } } //fprintf(stderr, "normTotal=%.1lf\n", normTotal); // params->setRes(resolution, superRes); params->minDist = minDist; params->maxDist = maxDist; params->boundaryScale = boundaryScale; params->logFlag = logFlag; params->minExpReads = minExpReads; params->totalNormReads = normTotal; //Will need to check this for site resolution... if (randomizeModel != NULL) { HiCBgModel* model = new HiCBgModel(superRes,randomizeModel,1); FILE* fp = stdout; if (outputFile != NULL) { fp = fopen(outputFile, "w"); if (fp == NULL) { fprintf(stderr, "!!! Couldn't open %s for writing!!!\n", outputFile); exit(0); } } model->createRandomReads(fp,randomReadTotal); if (fp != stdout) fclose(fp); exit(0); } //fprintf(stderr, "chr1=%s\tstart1=%d\tend1=%d\n", chr1, start1, end1); //fprintf(stderr, "chr2=%s\tstart2=%d\tend2=%d\n", chr2, start2, end2); TagLibrary* tags = NULL; if (strcmp(directory,"none")==0) { //placeholder tags = new TagLibrary(directory); } else { //fprintf(stderr, "directory=%s\n", directory); tags = new TagLibrary(directory); //fprintf(stderr, "flag=%d\n",tags->pairedEndFlag); tags->readTagDirectory(); //fprintf(stderr, "flag=%d\n",tags->pairedEndFlag); tags->setMaxTBP(maxTbp); //fprintf(stderr, "flag=%d\n",tags->pairedEndFlag); //fprintf(stderr, "ASDF=%d\n", tags->pairedEndFlag); if (!tags->pairedEndFlag) { fprintf(stderr, "!!! Tag Directory %s doesn't appear to be a Paired-end Tag Directory!!!\n", directory); exit(0); } tags->numCPUs = numCPUs; tags->minDistHiC = minDist; tags->maxDistHiC = maxDist; } if (siteFlag) { resolution = -1; if (restrictionSite == NULL) { restrictionSite = tags->restrictionSite; if (restrictionSite == NULL) { fprintf(stderr, "!!! Need to specify a restriction site for \"site based\" analysis!!!\n"); exit(0); } else { fprintf(stderr, "\tUsing %s as the restriction site\n", restrictionSite); } } } TagLibrary* bgtags = NULL; if (bgDirectory != NULL) { bgtags = new TagLibrary(bgDirectory); bgtags->readTagDirectory(); bgtags->setMaxTBP(maxTbp); if (!bgtags->pairedEndFlag) { fprintf(stderr, "!!! Background Tag Directory %s doesn't appear to be a Paired-end Tag Directory!!!\n", bgDirectory); exit(0); } bgtags->numCPUs = numCPUs; bgtags->minDistHiC = minDist; bgtags->maxDistHiC = maxDist; } GenomeInteractionLibrary* gil = new GenomeInteractionLibrary(); GenomeInteractionLibrary* gilBg = new GenomeInteractionLibrary(); gil->dlrDistance = dlrDistance; gilBg->dlrDistance = dlrDistance; PeakLibrary* peaks1 = NULL; PeakLibrary* peaks2 = NULL; if (inputInteractions != NULL) { int resEstimate = gil->read(inputInteractions); if (resEstimate == -1) { fprintf(stderr, "!!! Something is wrong with your input interaction file: %s!!!\n", inputInteractions); exit(0); } peaks1 = gil->peakList; if (defaultResolution == 1 && siteFlag == 0) { fprintf(stderr, "\tUsing Estimated Resolution Size: %d\n", resEstimate); resolution = resEstimate; superRes = resEstimate; } else { fprintf(stderr, "\tUsing Given Resolution Size: %d\n", superRes); gil->setInteractionPeakSize(superRes); if (fixedFlag==0) { peaks1->setPeakSize(superRes); } } } else { if (superRes > resolution) { gil->removeOverlap=1; gilBg->removeOverlap=1; } } if (strcmp(directory,"none")==0 && circosPrefix != NULL) { printCircosFiles(circosPrefix,resolution,tags,peaks1,peaks2,start1,end1, start2,end2,geneFiles,numGeneFiles,tagDirs,numTagDirs,bedFiles, numBedFiles,bgtags); //needs to happen after makeHiCMatrix sprintf(fname,"%s.circos.interactions.txt",circosPrefix); FILE* fp = fopen(fname,"w"); gil->printCircos(fp,maxCircosInteractions); fclose(fp); sprintf(fname,"circos -conf %s.circos.conf",circosPrefix); int ok = system(fname); if (ok) { fprintf(stderr, "\t!!! Problem running circos. Command:\n"); fprintf(stderr, "\t\t%s\n",fname); } else { fprintf(stderr, "\tCircos output should be in %s.circos.png\n",circosPrefix); } exit(0); } // prepare file to output interesting interaction reads if (rawbedFile != NULL ) { gil->bedFile = fopen(rawbedFile,"w"); if (gil->bedFile == NULL) { fprintf(stderr, "!!! Could not open file %s for writing!!!\n", rawbedFile); exit(0); } if (actionFlag & HIC_MASK_INTERACTIONBEDFILE && !(actionFlag & HIC_MASK_INTERACTIONTAGFILE)) { fprintf(gil->bedFile,"track name=\"Paired-end reads for %s\" itemRgb=On visibility=3 colorByStrand=\"255,0,0 255,0,0\"\n",directory); } } if (fourCbedFile != NULL ) { gil->fourCbedFile = fopen(fourCbedFile,"w"); if (gil->fourCbedFile == NULL) { fprintf(stderr, "!!! Could not open file %s for writing!!!\n", fourCbedFile); exit(0); } if (actionFlag & HIC_MASK_INTERACTION4CBEDFILE) { fprintf(gil->fourCbedFile,"track name=\"In silico 4C for %s\" type=bedGraph yLineMark=\"0.0\" alwaysZeron=on maxHeightPixels=100:75:11 visibility=full autoScale=on\n",directory); } } // prepare files for regional stats if (actionFlag & HIC_MASK_INTERACTIONSTATSFILES && rawbedFilePrefix != NULL) { if (0 && numCPUs > 1) { //fixed... //fprintf(stderr, "!!! Unfortunately, you cannot use -compactionStats option with multiple CPUs for now\n"); //exit(0); } if (strcmp(rawbedFilePrefix,"auto")==0) { char* name = new char[100000]; sprintf(name, "%s/%s", directory, cleanDirectoryName(directory)); rawbedFilePrefix = name; fprintf(stderr, "\tauto peakStat file prefix set: %s\n", rawbedFilePrefix); } char* filename = new char[100000]; sprintf(filename, "%s.COV.bedGraph", rawbedFilePrefix); FILE* fp = fopen(filename,"w"); if (fp == NULL) { fprintf(stderr, "!!! Could not open %s for writing!!!\n", filename); exit(0); } gil->bedFileCoverage = fp; sprintf(filename, "%s.ICF.bedGraph", rawbedFilePrefix); fp = fopen(filename,"w"); if (fp == NULL) { fprintf(stderr, "!!! Could not open %s for writing!!!\n", filename); exit(0); } gil->bedFileInterFrac = fp; sprintf(filename, "%s.DLR.bedGraph", rawbedFilePrefix); fp = fopen(filename,"w"); if (fp == NULL) { fprintf(stderr, "!!! Could not open %s for writing!!!\n", filename); exit(0); } gil->bedFileLocalFrac = fp; delete []filename; int rd1 = resolution / 1000; int rd2 = superRes / 1000; fprintf(gil->bedFileCoverage,"track name=\"%s COV (Read Coverage) %dx%dkb\" type=bedGraph autoScale=on visibility=2 alwaysZero=on maxHeightPixels=100:75:11\n",directory,rd1,rd2); fprintf(gil->bedFileInterFrac,"track name=\"%s ICF (InterChromosomal interaction Fraction) %dx%dkb\" type=bedGraph autoScale=on visibility=2 alwaysZero=on maxHeightPixels=100:75:11\n",directory,rd1,rd2); fprintf(gil->bedFileLocalFrac,"track name=\"%s DLR (Distal Local (log2) Ratio of interactions, local<%d bp) %dx%dkb\" type=bedGraph autoScale=on visibility=2 alwaysZero=on maxHeightPixels=100:75:11\n",directory,dlrDistance,rd1,rd2); } // prepare file to report significant region stats if (gil != NULL && lociFile != NULL) { gil->lociFile = fopen(lociFile,"w"); if (gil->lociFile == NULL) { fprintf(stderr, "!!! Could not open file %s for writing!!!\n", lociFile); exit(0); } fprintf(gil->lociFile, "PositionID\tchr\tstart\tend\tstrand\tSum(logp)\n"); } // prepare file to output significant interactions if (gil != NULL && (circosPrefix != NULL || interactionFile != NULL)) { gil->recordInteractions = 1; gilBg->recordInteractions = 1; if (interactionFile != NULL) { sprintf(fname,"%s",interactionFile); gil->interactionFile = fopen(fname,"w"); sprintf(fname,"%s.bg.txt",interactionFile); //gilBg->interactionFile = fopen(fname,"w"); } } if (minDist == -1) minDist = resolution/2; gil->resolution = superRes; gilBg->resolution = superRes; gil->threshold = log(pvalueThreshold); gilBg->threshold = log(pvalueThreshold); gil->zscoreThreshold = zscoreThreshold; gilBg->zscoreThreshold = zscoreThreshold; gil->minDist = minDist; gilBg->minDist = minDist; gil->maxDist = maxDist; gilBg->maxDist = maxDist; int peaks1IsGenomeFlag = 0; if (peaks1 == NULL) { if (peakFile1 == NULL) { if (siteFlag) { //stuff is not yet ready //peaks1 = tags->getCoverageRestrictionFragments(chr1,start1,end1,superRes, // restrictionSite,maxMisMatches,genomeDirectory); } else { peaks1 = tags->getCoveragePeaks(chr1,start1,end1,resolution,superRes); } if (chr1 == NULL) { peaks1IsGenomeFlag = 1; } } else { peaks1 = new PeakLibrary(peakFile1,PEAK_READ_MODE_NORMAL); if (chopifyFlag) { int preN = peaks1->numPeaks; PeakLibrary* p = peaks1->getCoveragePeaks(resolution,superRes); delete peaks1; peaks1 = p; fprintf(stderr, "\tChopify: %d peaks chopped into %d peaks separated by %d of size %d\n", preN,peaks1->numPeaks,resolution,superRes); } else if (fixedFlag == 0) { peaks1->setPeakSize(superRes); } } if (peakFile2 == NULL && vsGenomeFlag == 0) { if (chr1 != chr2 || start1 != start2 || end1 != end2) { peaks2 = tags->getCoveragePeaks(chr2,start2,end2,resolution,superRes); } } else { peaks2 = new PeakLibrary(peakFile2,PEAK_READ_MODE_NORMAL); if (chopifyFlag) { int preN = peaks2->numPeaks; PeakLibrary* p = peaks2->getCoveragePeaks(resolution,superRes); delete peaks2; peaks2 = p; fprintf(stderr, "\tChopify: %d peaks chopped into %d peaks separated by %d of size %d\n", preN,peaks2->numPeaks,resolution,superRes); } else if (fixedFlag == 0) { peaks2->setPeakSize(superRes); } } } if (vsGenomeFlag) { if (peaks2 != NULL) delete peaks2; peaks2 = tags->getCoveragePeaks(NULL, -1, 2000000000,resolution,superRes); } Doubletable* chrSizes = tags->getChrSizesFromTags(); params->gsize = (double) tags->gsizeEstimate; if (peaks1 != NULL) peaks1->setMaxChrPositions(chrSizes); if (peaks2 != NULL) peaks2->setMaxChrPositions(chrSizes); if (poutFile != NULL) { FILE* fp = fopen(poutFile, "w"); if (fp==NULL) { fprintf(stderr, "!!! Couldn't open %s for writing peaks !!!\n", poutFile); exit(0); } peaks1->print(fp); fclose(fp); } if (pout2File != NULL && peaks2 != NULL) { FILE* fp = fopen(pout2File, "w"); if (fp==NULL) { fprintf(stderr, "!!! Couldn't open %s for writing 2nd set of peaks !!!\n", pout2File); exit(0); } peaks2->print(fp); fclose(fp); } HiCBgModel* bgModel = NULL; PeakLibrary* modelPeaks1 = NULL; PeakLibrary* modelPeaks2 = NULL; if (customBgFile != NULL) { bgModel = new HiCBgModel(superRes,customBgFile,1+forceFlag); modelPeaks1 = peaks1; modelPeaks2 = peaks2; } else { bgModel = new HiCBgModel(superRes,directory,0); peaks1IsGenomeFlag = 0; } bgModel->stdFilter = stdFilter; bgModel->minFilter = minFilter; bgModel->minExpectFraction = minExpectFraction; bgModel->initializeCurrentRegions(resolution); if (actionFlag & (HIC_MASK_NORM_DISTANCE)) { if (bgModel->ready && forceFlag == 0) { fprintf(stderr, "\tFound Hi-C background model for %d bp resolution (-force to overwrite)\n", superRes); } else { fprintf(stderr, "\tNo Hi-C background model found for %d bp resolution. Creating...\n",superRes); if (superRes != resolution) { fprintf(stderr, "\n\t!!!!Warning... It is HIGHLY recommended that the -res and -superRes are the same\n"); fprintf(stderr, "\twhen generating a background model!!! (consider building the bg model first...)\n\n"); } //tags->getPETagDistribution(2000,300000000,superRes,0); tags->makeHiCBgModel(bgModel,modelPeaks1,modelPeaks2,peaks1IsGenomeFlag,fullModelFlag,params); fprintf(stderr, "\tFinished creating background model.\n\n"); if (bgOnlyFlag) exit(0); } bgModel->scale(); } else { tags->makeHiCBgModelSimple(bgModel,params); if (bgModel->ready && forceFlag == 0) { //fprintf(stderr, "\tFound HiC background model for %d bp resolution (-force to overwrite)\n", superRes); } else { //fprintf(stderr, "\tNo HiC background model found for %d bp resolution. Creating simple model...\n",superRes); //tags->makeHiCBgModelSimple(bgModel,params); } } bgModel->stdFilter = stdFilter; bgModel->minFilter = minFilter; bgModel->minExpectFraction = minExpectFraction; HiCBgModel* bgBgModel = NULL; if (bgDirectory != NULL) { if (customBgBgFile != NULL) { bgBgModel = new HiCBgModel(superRes,customBgBgFile,1); } else { bgBgModel = new HiCBgModel(superRes,bgDirectory,0); } bgBgModel->stdFilter = stdFilter; bgBgModel->minFilter = minFilter; bgBgModel->initializeCurrentRegions(resolution); bgModel->minExpectFraction = minExpectFraction; if (actionFlag & (HIC_MASK_NORM_DISTANCE)) { if (bgBgModel->ready && forceFlag == 0) { fprintf(stderr, "\tFound HiC background model (for bg directory) for %d bp resolution (-force to overwrite)\n", superRes); } else { fprintf(stderr, "\tNo HiC background model (for bg directory) found for %d bp resolution. Creating...\n",superRes); bgtags->makeHiCBgModel(bgBgModel,modelPeaks1,modelPeaks2,peaks1IsGenomeFlag,fullModelFlag,params); fprintf(stderr, "\tFinished creating background model.\n\n"); if (bgOnlyFlag) exit(0); } bgBgModel->scale(); } bgBgModel->bgFlag = 1; //bgBgModel->normalizeVariance(bgModel); } if (histFlag == 0) { //Matrix mode double totalMatrix = 0; int reportX = peaks1->numPeaks; int reportY = 0; if (peaks2 != NULL) { totalMatrix = (double)peaks1->numPeaks*(double)peaks2->numPeaks; reportY = peaks2->numPeaks; } else { totalMatrix = (double)peaks1->numPeaks*(double)peaks1->numPeaks; reportY = peaks1->numPeaks; } double maxMatrixSize = ((double)HIC_MATRIX_LIMIT)*((double)HIC_MATRIX_LIMIT); if (!(actionFlag & HIC_MASK_NO_MATRIX) && relativeFlag==0 && totalMatrix > maxMatrixSize) { fprintf(stderr, "!!! Matrix way to big (%d x %d = %.2le elements) !!!\n", reportX, reportY,totalMatrix); fprintf(stderr, "\n\tTry using \"-nomatrix\" (max is %.2le)\n\n", maxMatrixSize); if (override != 1) { fprintf(stderr, "!!! If you're sure about this, add \"-override\"\n"); exit(0); } } else if (!(actionFlag & HIC_MASK_NO_MATRIX) && relativeFlag != 0) { if (maxDist > 0) { relativeFlag = ((double)maxDist)/((double)resolution); } else { relativeFlag = peaks1->numPeaks; } if (maxMatrixSize < peaks1->numPeaks*relativeFlag) { fprintf(stderr, "!!! When making a relative matrix consider lowering the -maxDist <#> parameter.\n"); fprintf(stderr, "!!! If you're sure about this, add \"-override\"\n"); if (override != 1) { fprintf(stderr, "!!! If you're sure about this, add \"-override\"\n"); exit(0); } } } if (expectMatrixFlag) { int p1 = peaks1->numPeaks; int p2 = p1; if (peaks2 != NULL) p2 = peaks2->numPeaks; if (relativeFlag) { p2 = relativeFlag*2+1; } if (bgModel!=NULL) bgModel->initializeExpectMatrix(p1,p2); if (bgBgModel != NULL) bgBgModel->initializeExpectMatrix(p1,p2); } params->relativeFlag = relativeFlag; double** matrix = tags->makeHiCMatrix(peaks1, peaks2, superRes,bgModel,actionFlag,gil,params); double** bgMatrix = NULL; if (bgtags != NULL) { if (gil->recordInteractions) { gil->recordInteractionBg=1; gil->sortInteractionIndexes(); } if (gil->bedFile != NULL) { if (actionFlag & HIC_MASK_INTERACTIONBEDFILE && !(actionFlag & HIC_MASK_INTERACTIONTAGFILE)) { fprintf(gil->bedFile,"track name=\"Paired-end reads for %s\" itemRgb=On visibility=3 colorByStrand=\"0,0,0 0,0,0\"\n",bgDirectory); } } if (gil->fourCbedFile != NULL) { if (actionFlag & HIC_MASK_INTERACTION4CBEDFILE) { fprintf(gil->fourCbedFile,"track name=\"In silico 4C for %s\" type=bedGraph yLineMark=\"0.0\" alwaysZeron=on maxHeightPixels=100:75:11 visibility=full autoScale=on\n",bgDirectory); } } bgMatrix = bgtags->makeHiCMatrix(peaks1, peaks2, superRes,bgBgModel,actionFlag,gil,params); } if (gil != NULL && gil->recordInteractions && gil->interactionFile != NULL) { gil->print(gil->interactionFile,centerFlag); } int np2 = peaks1->numPeaks; if (peaks2 != NULL) np2 = peaks2->numPeaks; if (relativeFlag) { np2 = relativeFlag*2+1; } // if (balanceFlag) { matrix = balanceMatrix(matrix,peaks1->numPeaks,np2); } if (logFlag==0 && valueIsLog == 1) {// && matrix != NULL) { fprintf(stderr, "\tReporting linear (not log2) ratios\n"); for (int i=0;inumPeaks;i++) { for (int j=0;jnumPeaks;i++) { for (int j=0;jnumPeaks;i++) { for (int j=0;jnumPeaks;i++) { for (int j=0;j= 0) { maxDiffIndex = maxDist/resolution; } double **expectedMatrix = NULL; if (minExpReads > 0.0) { if (bgModel != NULL && bgModel->expectMatrix != NULL) { expectedMatrix = bgModel->expectMatrix; } //actionFlag = actionFlag | HIC_MASK_EXPECTED; //expectedMatrix = tags->makeHiCMatrix(peaks1, peaks2, superRes,bgModel,actionFlag,gil,minExpReads,boundaryScale); //mask = HIC_MASK_EXPECTED; //actionFlag = actionFlag & (~mask); } for (int z=0;z 0) fprintf(stderr, "\tClustering Iteration: %d\n", z+1); matrix = tags->calcCorrelationMatrix(matrix, peaks1->numPeaks,expectedMatrix,minExpReads, maxDiffIndex); } if (0) { //experimental for (int i=0;inumPeaks-2;i++) { double corr = TagLibrary::calcCorrelation(matrix[i],matrix[i+2],peaks1->numPeaks, expectedMatrix[i],expectedMatrix[i+2],minExpReads,maxDiffIndex,i,i+2); fprintf(stdout, "%s\t%d\t%d\t%lf\n", peaks1->peakOrder[i+1]->chr, peaks1->peakOrder[i+1]->start,peaks1->peakOrder[i+1]->end,corr); } exit(0); } if (bgMatrix != NULL) { double **expectedMatrixBg = NULL; if (minExpReads > 0.0) { if (bgBgModel != NULL && bgBgModel->expectMatrix != NULL) { expectedMatrixBg = bgBgModel->expectMatrix; } //actionFlag = actionFlag | HIC_MASK_EXPECTED; //expectedMatrixBg = tags->makeHiCMatrix(peaks1, peaks2, superRes,bgModel,actionFlag,gil,minExpReads,boundaryScale); //mask = HIC_MASK_EXPECTED; //actionFlag = actionFlag & (~mask); } for (int z=0;z 0) fprintf(stderr, "\tClustering Bg Iteration: %d\n", z+1); bgMatrix = tags->calcCorrelationMatrix(bgMatrix, peaks1->numPeaks, expectedMatrixBg,minExpReads,maxDiffIndex); } double* em = NULL; double* emBg = NULL; for (int i=0;inumPeaks;i++) { if (minExpReads > 0.0) { em = expectedMatrix[i]; emBg = expectedMatrixBg[i]; } double corr = TagLibrary::calcCorrelation(matrix[i],bgMatrix[i],peaks1->numPeaks, em,emBg,minExpReads, maxDiffIndex,i,i); fprintf(stdout, "%s\t%lf\n", peaks1->peakOrder[i]->name,corr); } exit(0); /*if (expectedMatrixBg != NULL) { for (int z=0;znumPeaks;z++) { delete [](expectedMatrixBg[z]); } delete []expectedMatrixBg; }*/ expectedMatrixBg=NULL; } /*if (expectedMatrix != NULL) { for (int z=0;znumPeaks;z++) { delete [](expectedMatrix[z]); } delete []expectedMatrix; }*/ expectedMatrix=NULL; } } if (bgMatrix != NULL) { for (int i=0;inumPeaks;i++) { for (int j=0;jnumPeaks, np2); if (relativeFlag) fprintf(stderr, " (relative mode)"); fprintf(stderr, "\n"); FILE* fp = NULL; if (outputFile != NULL) { fp = fopen(outputFile, "w"); if (fp == NULL) { fprintf(stderr, "!!! Could not open %s for writing !!!\n", outputFile); exit(0); } } else { fp = stdout; } if (washuFlag == 0) { fprintf(fp, "HiCMatrix (directory=%s)\tRegions",directory); if (relativeFlag) { for (int i=0;inumPeaks;i++) { fprintf(fp,"\t%s", peaks2->peakOrder[i]->name); } } fprintf(fp,"\n"); } int washuID = 1; int minWashUOffset = -1; if (minDist > 0) { minWashUOffset = minDist/resolution; } for (int i=0;inumPeaks;i++) { if (washuFlag == 0) fprintf(fp,"%s\t%s",peaks1->peakOrder[i]->name,peaks1->peakOrder[i]->name); for (int j=0;j= peaks1->numPeaks) continue; if (abs(j-relativeFlag) < minWashUOffset) continue; if (strcmp(peaks1->peakOrder[i]->chr,peaks1->peakOrder[i]->chr)!=0) continue; //if (peaks1->peakOrder[i]->chr != peaks1->peakOrder[jj]->chr) continue; fprintf(fp,"%s\t%d\t%d\t%s:%d-%d,%.3lf\t%d\t.\n",peaks1->peakOrder[i]->chr, peaks1->peakOrder[i]->start,peaks1->peakOrder[i]->end, peaks1->peakOrder[jj]->chr,peaks1->peakOrder[jj]->start, peaks1->peakOrder[jj]->end,matrix[i][j],washuID++); //fprintf(fp,"%s\t%d\t%d\t%s:%d-%d,%.3lf\t%d\t.\n",peaks1->peakOrder[jj]->chr, // peaks1->peakOrder[jj]->start,peaks1->peakOrder[jj]->end, // peaks1->peakOrder[i]->chr,peaks1->peakOrder[i]->start, // peaks1->peakOrder[i]->end,matrix[i][j],washuID++); } else { if (peaks1 == peaks2 && j == i) continue; //if (peaks1 == peaks2 && j <= i) continue; fprintf(fp,"%s\t%d\t%d\t%s:%d-%d,%.3lf\t%d\t.\n",peaks1->peakOrder[i]->chr, peaks1->peakOrder[i]->start,peaks1->peakOrder[i]->end, peaks2->peakOrder[j]->chr,peaks2->peakOrder[j]->start, peaks2->peakOrder[j]->end,matrix[i][j],washuID++); //fprintf(fp,"%s\t%d\t%d\t%s:%d-%d,%.3lf\t%d\t.\n",peaks2->peakOrder[j]->chr, // peaks2->peakOrder[j]->start,peaks2->peakOrder[j]->end, // peaks1->peakOrder[i]->chr,peaks1->peakOrder[i]->start, // peaks1->peakOrder[i]->end,matrix[i][j],washuID++); if (peaks1 != peaks2) { //fprintf(fp,"%s\t%d\t%d\t%s:%d-%d,%.3le\t%d\t.\n",peaks2->peakOrder[j]->chr, // peaks2->peakOrder[j]->start,peaks2->peakOrder[j]->end, // peaks1->peakOrder[i]->chr,peaks1->peakOrder[i]->start, // peaks1->peakOrder[i]->end,matrix[i][j],washuID++); } } } } if (washuFlag == 0) fprintf(fp,"\n"); } if (outputFile != NULL) { fclose(fp); } if (expectMatrixOutputFile != NULL && bgModel != NULL && bgModel->expectMatrix != NULL) { FILE* fexp = fopen(expectMatrixOutputFile,"w"); if (fexp == NULL) { fprintf(stderr, "!!! Could not open %s for writing !!!\n", expectMatrixOutputFile); exit(0); } double** expectMatrix = bgModel->expectMatrix; fprintf(fexp, "Expected HiCMatrix (directory=%s)\tRegions",directory); if (relativeFlag) { for (int i=0;ipeakOrder[i]->name); } } fprintf(fexp,"\n"); for (int i=0;inumPeaks;i++) { fprintf(fexp,"%s\t%s",peaks1->peakOrder[i]->name,peaks1->peakOrder[i]->name); for (int j=0;j -1.99) { //fprintf(fp,"\t%.3lf",matrix[i][j]); fprintf(fexp,"\t%.3le",expectMatrix[i][j]); } else { fprintf(fexp,"\tNA"); } } fprintf(fexp,"\n"); } fclose(fexp); } //clustering if ((actionFlag & HIC_MASK_CLUSTER) && !(actionFlag & HIC_MASK_NO_MATRIX)) { if (peaks1 != peaks2) { fprintf(stderr, "\tClustering can only be performed with a symetric matrix, skipping...\n"); } else { fprintf(stderr, "\tClustering regions\n"); if (actionFlag & HIC_MASK_LOGPVALUES) { } else { for (int i=0;inumPeaks;i++) { for (int j=0;jnumPeaks;j++) { //matrix[i][j] = exp(matrix[i][j]); matrix[i][j] = -1*matrix[i][j]; } } } int removeBadRegions=1; if (removeBadRegions && bgModel != NULL) { int newNumPeaks = 0; PeakLibrary* newpeaks = new PeakLibrary(); int* map = new int[peaks1->numPeaks]; for (int i=0;inumPeaks;i++) { if (bgModel->minCoverage <= peaks1->peakOrder[i]->v && bgModel->maxCoverage >= peaks1->peakOrder[i]->v) { map[newNumPeaks++] = i; newpeaks->addPeak(peaks1->peakOrder[i]); } } fprintf(stderr, "\tOmitting %d(of %d) peaks due to low/high read depth (>%.1lf std or <%.2lf of average depth)\n", newNumPeaks,peaks1->numPeaks,stdFilter,minFilter); newpeaks->setDefaultPeakOrder(); double** nmatrix = new double*[newNumPeaks]; for (int i=0;inumPeaks]; for (int i=0;inumPeaks;i++) { pnames[i] = peaks1->peakOrder[i]->name; } TreeCluster* cluster = new TreeCluster(matrix,peaks1->numPeaks); if (actionFlag & HIC_MASK_CLUSTERFIXED) { cluster->fixedPositionFlag = 1; } cluster->fixedClusterScore = 0; cluster->cluster(); if (actionFlag & HIC_MASK_LOGPVALUES) { } else { for (int i=0;inumPeaks;i++) { for (int j=0;jnumPeaks;j++) { matrix[i][j] = -1*matrix[i][j]; } } } //cluster->printCDT(outputFile, pnames, matrix); cluster->printCDT(outputFile,1); //cluster->printClusters(outputFile, pnames, clusterThreshold); if (actionFlag & HIC_MASK_CLUSTERFIXED) { if (outputFile != NULL) { sprintf(fname, "%s.clusterSizes.txt", outputFile); } else { sprintf(fname, "out.clusterSizes.txt"); } FILE* fpSizes = fopen(fname,"w"); cluster->printClusterSizes(fpSizes,1000); fclose(fpSizes); } delete cluster; delete []pnames; } } if (matrix != NULL) { for (int i=0;inumPeaks;i++) { delete [](matrix[i]); } delete []matrix; } if (bgMatrix != NULL) { for (int i=0;inumPeaks;i++) { delete [](bgMatrix[i]); } delete []bgMatrix; } if (symmetric) peaks2 = NULL; } if (circosPrefix != NULL) { printCircosFiles(circosPrefix,resolution,tags,peaks1,peaks2,start1,end1, start2,end2,geneFiles,numGeneFiles,tagDirs,numTagDirs,bedFiles, numBedFiles,bgtags); //needs to happen after makeHiCMatrix sprintf(fname,"%s.circos.interactions.txt",circosPrefix); FILE* fp = fopen(fname,"w"); gil->printCircos(fp,maxCircosInteractions); fclose(fp); sprintf(fname,"circos -conf %s.circos.conf",circosPrefix); int ok = system(fname); if (ok) { fprintf(stderr, "\t!!! Problem running circos. Command:\n"); fprintf(stderr, "\t\t%s\n",fname); } else { fprintf(stderr, "\tCircos output should be in %s.circos.png\n",circosPrefix); } } } else { //Histogram Mode if (size <= 0) size = resolution*100; //peaks1 = new PeakLibrary(peakFile1,PEAK_READ_MODE_NORMAL); tags->makeHiCHistogram(outputFile,peaks1, size, resolution,bgModel,actionFlag,params); } if (ifcFile != NULL && bgModel != NULL) { if (strcmp(ifcFile,"auto")==0) { char* name = new char[100000]; sprintf(name, "%s/%s.ifc.tsv", directory, cleanDirectoryName(directory)); ifcFile = name; fprintf(stderr, "\tauto ifc file set to: %s\n", ifcFile); } FILE* fp = fopen(ifcFile, "w"); if (fp==NULL) { fprintf(stderr, "!!! Couldn't open %s for writing ifc file !!!\n", ifcFile); exit(0); } if (1) bgModel->currentRegions->normalizeCurve(); bgModel->printCurrentRegions(fp); fclose(fp); } if (poutFile != NULL) { FILE* fp = fopen(poutFile, "w"); if (fp==NULL) { fprintf(stderr, "!!! Couldn't open %s for writing peaks !!!\n", poutFile); exit(0); } peaks1->print(fp); fclose(fp); } if (pout2File != NULL && peaks2 != NULL) { FILE* fp = fopen(pout2File, "w"); if (fp==NULL) { fprintf(stderr, "!!! Couldn't open %s for writing 2nd set of peaks !!!\n", pout2File); exit(0); } peaks2->print(fp); fclose(fp); } if (gil != NULL) { char* cmd = new char[100000]; if (gil->bedFileCoverage != NULL) { fclose(gil->bedFileCoverage); sprintf(cmd, "(head -n 1 \"%s.COV.bedGraph\" && tail -n +2 \"%s.COV.bedGraph\" | sort -k1,1 -k2,2g ) > \"%s.COV.bedGraph.2\"; mv \"%s.COV.bedGraph.2\" \"%s.COV.bedGraph\"", rawbedFilePrefix,rawbedFilePrefix,rawbedFilePrefix,rawbedFilePrefix,rawbedFilePrefix); (void)system(cmd); } if (gil->bedFileInterFrac != NULL) { fclose(gil->bedFileInterFrac); sprintf(cmd, "(head -n 1 \"%s.ICF.bedGraph\" && tail -n +2 \"%s.ICF.bedGraph\" | sort -k1,1 -k2,2g ) > \"%s.ICF.bedGraph.2\"; mv \"%s.ICF.bedGraph.2\" \"%s.ICF.bedGraph\"", rawbedFilePrefix,rawbedFilePrefix,rawbedFilePrefix,rawbedFilePrefix,rawbedFilePrefix); (void)system(cmd); } if (gil->bedFileLocalFrac != NULL) { fclose(gil->bedFileLocalFrac); sprintf(cmd, "(head -n 1 \"%s.DLR.bedGraph\" && tail -n +2 \"%s.DLR.bedGraph\" | sort -k1,1 -k2,2g ) > \"%s.DLR.bedGraph.2\"; mv \"%s.DLR.bedGraph.2\" \"%s.DLR.bedGraph\"", rawbedFilePrefix,rawbedFilePrefix,rawbedFilePrefix,rawbedFilePrefix,rawbedFilePrefix); (void)system(cmd); } if (gil->fourCbedFile != NULL) { fclose(gil->fourCbedFile); sprintf(cmd, "(head -n 1 \"%s\" && tail -n +2 \"%s\" | sort -k1,1 -k2,2g ) > \"%s.2\"; mv \"%s.2\" \"%s\"", fourCbedFile,fourCbedFile,fourCbedFile,fourCbedFile,fourCbedFile); (void)system(cmd); } //gil->bedFileCoverage; //gil->bedFileInterFrac; //gil->bedFileLocalFrac; } if (gil != NULL && gil->lociFile != NULL) fclose(gil->lociFile); if (gil != NULL && gil->interactionFile != NULL) fclose(gil->interactionFile); if (chrSizes != NULL) delete chrSizes; } double calculateRowSum(double** matrix, int size,int size2,double* sums, double* ratio, int rowOrColFlag); double** balanceMatrix(double ** matrix, int size,int size2) { // if size2 != size, then the matrix is a relative matrix (i.e. diagonal) int relFlag = 0; int halfJ = (int)(floor((double)size2+0.5)/2.0); //fprintf(stderr, "%d x %d (halfJ = %d)\n", size, size2,halfJ); if (size != size2) { relFlag=1; //fprintf(stderr, "\tBalancing a relative matrix\n"); } double dampeningFactor = 0.5; double minError = 1e-7; int maxIter = 100; double** balMatrix = new double*[size]; double* sum = new double[size]; double* norm = new double[size]; double* adjust = new double[size]; for (int i=0;i= 0 && k < size) { normJ = norm[k]; } balMatrix[i][j] = matrix[i][j]*norm[i]*normJ; } else { balMatrix[i][j] = matrix[i][j]*norm[i]*norm[j]; } } } error = calculateRowSum(balMatrix,size,size2,sum,adjust,0); //fprintf(stderr, "\tIter=%d\tError=%lf\n",iteration,error); if (error < minError) break; if (iteration > maxIter) break; } return balMatrix; } double calculateRowSum(double** matrix, int size, int size2,double* sum, double* ratio, int rowOrColFlag) { for (int i=0;i 0.0) { avg += sum[i]; N++; } } if (N > 0) avg /= (double)N; double error = 0.0; for (int i=0;i 0.0) { ratio[i] = avg/sum[i]; error += (sum[i]-avg)*(sum[i]-avg); } } if (N > 0) error /= (double)N; error = sqrt(error); return error; } void printCircosFiles(char* prefix, int resolution, TagLibrary* tags, PeakLibrary* peaks1, PeakLibrary* peaks2, int start1, int end1, int start2, int end2,char** geneFiles, int numGeneFiles, char** tagDirs, int numTagDirs,char** bedFiles, int numBedFiles, TagLibrary* bgtags) { Doubletable* chrSizes = tags->getChrSizesFromTags(); Inttable* chrs = new Inttable(1000); char** keys = peaks1->chrs->keys(); for (int i=0;ichrs->total;i++) { double size = chrSizes->search(keys[i]); if (size > EMPTY_DOUBLE_CHECK) { chrs->insert((int)size,keys[i]); } delete [](keys[i]); } delete []keys; if (peaks2 != NULL && peaks1 != peaks2) { char** keys = peaks2->chrs->keys(); for (int i=0;ichrs->total;i++) { double size = chrSizes->search(keys[i]); if (size > EMPTY_DOUBLE_CHECK) { chrs->insert((int)size,keys[i]); } delete [](keys[i]); } delete []keys; } char** currentChrs = chrs->keys(); qsort(currentChrs,chrs->total,sizeof(char*),&chrcmp); if (chrs->total < 1) { fprintf(stderr, "!!! Something seems to be wrong - no valid chromosomes are available...\n"); } char* fname = new char[10000]; char* fname2 = new char[10000]; sprintf(fname,"%s.circos.conf",prefix); FILE* cfp = fopen(fname,"w"); fprintf(cfp,"# automated circos file generated by HOMER (analyzeHiC)\n"); fprintf(cfp,"\n\n\t<>\n\t<>\n\n\n"); fprintf(cfp,"\n\n\t<>\n\n"); fprintf(cfp,"\n\n"); fprintf(cfp,"\n\n"); fprintf(cfp,"\tdefault = 50u\n"); fprintf(cfp,"\tbreak = 10u\n"); fprintf(cfp,"\taxis_break_at_edge = yes\n"); fprintf(cfp,"\taxis_break = yes\n"); fprintf(cfp,"\taxis_break_style = 2\n"); fprintf(cfp,"\n\t\n"); fprintf(cfp,"\t\tstroke_color = black\n"); fprintf(cfp,"\t\tfill_color = blue\n"); fprintf(cfp,"\t\tthickness = 0.25r\n"); fprintf(cfp,"\t\tstroke_thickness = 2\n"); fprintf(cfp,"\t\n"); fprintf(cfp,"\n\t\n"); fprintf(cfp,"\t\tstroke_color = black\n"); fprintf(cfp,"\t\tthickness = 1.5r\n"); fprintf(cfp,"\t\tstroke_thickness = 3\n"); fprintf(cfp,"\t\n"); fprintf(cfp,"\n"); fprintf(cfp,"\nthickness = 10p\n"); fprintf(cfp,"stroke_thickness = 2\n"); fprintf(cfp,"stroke_color = black\n"); fprintf(cfp,"fill = no\n"); fprintf(cfp,"fill_color = black\n"); fprintf(cfp,"radius = 0.80r\nshow_label = yes\nlabel_with_tag = yes\nlabel_font = default\n"); fprintf(cfp,"label_radius = dims(ideogram,radius) + 0.075r\nlabel_size = 60p\n"); fprintf(cfp,"band_stroke_thinkness = 2\nshow_bands = yes\nfill_bands = yes\n"); fprintf(cfp,"\n\n"); fprintf(cfp,"\n\n"); fprintf(cfp,"\tdir = ./\n"); fprintf(cfp,"\tfile = ./%s.circos.png\n",prefix); fprintf(cfp,"\t24bit = yes\n\tpng = yes\n\tsvg = yes\n\tradius = 1500p\n\tbackground = white\n"); fprintf(cfp,"\tangle_offset = -90\n\tauto_alpha_colors = yes\n\tauto_alpha_steps = 20\n"); fprintf(cfp,"\n"); fprintf(cfp,"\nkaryotype = %s.circos.karyotype.txt\n",prefix); fprintf(cfp,"\nchromosomes_display_default = no\n"); fprintf(cfp,"\nchromosomes_units = %d\n", resolution); fprintf(cfp,"\nchromosomes = "); double totalBp = 0; for (int i=0;itotal;i++) { int curLength = chrs->search(currentChrs[i]); if (i>0) fprintf(cfp,";"); fprintf(cfp,"%s",currentChrs[i]); if (start1 > 1 || end1 < curLength) { int end = end1; if (end > chrs->search(currentChrs[i])) { end = chrs->search(currentChrs[i]); } int chrCurSize = end - start1 + 1; end = end/resolution+1; int start = start1/resolution-1; if (start < 0) start = 0; fprintf(cfp,":%d-%d",start,end); totalBp += (double)(chrCurSize); //fprintf(stderr, "adding %lf %d %d\n",(double)chrCurSize,start, end); } else { totalBp += (double)chrs->search(currentChrs[i]); //fprintf(stderr, "adding %lf\n",(double)chrs->search(currentChrs[i])); } } fprintf(stderr, "\tTotal bp to visualize in Circos: %.0lf\n", totalBp); double maxSizeForNames = 10e6; double extrasRadius = numTagDirs*0.1 + numGeneFiles*0.15 + numBedFiles*0.025; if (totalBp > maxSizeForNames) { extrasRadius = numTagDirs*0.1 + numGeneFiles*0.10 + numBedFiles*0.025; } double defaultInfoSpace = 0.5; double extrasScaleFactor = 1.0; if (extrasRadius > defaultInfoSpace) { extrasScaleFactor = defaultInfoSpace/extrasRadius; } double interactionRadius = 1.0 - extrasRadius*extrasScaleFactor-0.005; double unadj = totalBp/(double)resolution/75.0; int ticRes = (int)(log(unadj)/log(10.0)); int ticThickness = (int)pow(10.0,ticRes); int ticThickness2 = (int)pow(10.0,ticRes+1.0); if (ticThickness < 1) ticThickness = 1; if (ticThickness2 < 10) ticThickness2 = 10; //fprintf(stderr, "multiplier: %d %lf totalbp = %lf %d %d\n",ticRes,unadj,totalBp,ticThickness, ticThickness2); fprintf(cfp,"\nshow_ticks = yes\nshow_tick_labels=yes\n"); fprintf(cfp,"\n\n"); fprintf(cfp,"\tradius = dims(ideogram,radius_outer)\n"); //int multiplyer = (int)(log(ticRes)/log(10.0)); fprintf(cfp,"\tmultiplier = 1e-6\n"); fprintf(cfp,"\t\n"); fprintf(cfp,"\t\tspacing = %du\n",ticThickness); fprintf(cfp,"\t\tsize = 8p\n\t\tthickness = 2p\n\t\tcolor = black\n\t\tshow_label = no\n"); fprintf(cfp,"\t\tlabel_size = 20p\n\t\tformat = %%.3f\n"); fprintf(cfp,"\t\n"); fprintf(cfp,"\t\n"); fprintf(cfp,"\t\tspacing = %du\n",ticThickness2); fprintf(cfp,"\t\tsize = 8p\n\t\tthickness = 2p\n\t\tcolor = black\n\t\tshow_label = yes\n"); fprintf(cfp,"\t\tlabel_size = 40p\n\t\tlabel_offset = 5p\n\t\tformat = %%.3f\n"); fprintf(cfp,"\t\n"); fprintf(cfp,"\n"); fprintf(cfp,"\n\n"); fprintf(cfp,"\n\tz = 5\n"); fprintf(cfp,"\n\tradius = %.2lfr\n",interactionRadius); fprintf(cfp,"\tbezier_radius = 0.4r\n"); fprintf(cfp,"\tbezier_radius_purity = 0.45\n"); fprintf(cfp,"\tcrest = 0.4\n"); fprintf(cfp,"\tperturb = yes\n"); fprintf(cfp,"\n\t\n"); fprintf(cfp,"\t\tshow = yes\n\t\tcolor = black_a8\n\t\tthickness = 2\n\t\trecord_limit=5000\n"); fprintf(cfp,"\t\tfile = %s.circos.interactions.txt\n", prefix); fprintf(cfp,"\t\t\n"); if (bgtags != NULL) { fprintf(cfp,"\t\t\t\n"); fprintf(cfp,"\t\t\t\timportance = 210\n"); fprintf(cfp,"\t\t\t\tcondition = _THICKNESS1_ < 0\n"); fprintf(cfp,"\t\t\t\tthickness = eval(abs(_THICKNESS1_))\n"); fprintf(cfp,"\t\t\t\tcolor = red_a3\n"); fprintf(cfp,"\t\t\t\tz = 10\n"); fprintf(cfp,"\t\t\t\tflow = continue\n"); fprintf(cfp,"\t\t\t\n"); } fprintf(cfp,"\t\t\t\n"); fprintf(cfp,"\t\t\t\timportance = 205\n"); fprintf(cfp,"\t\t\t\tcondition = _THICKNESS1_ > 20\n"); fprintf(cfp,"\t\t\t\tthickness = 20\n"); fprintf(cfp,"\t\t\t\n"); fprintf(cfp,"\t\t\t\n"); fprintf(cfp,"\t\t\t\timportance = 200\n"); fprintf(cfp,"\t\t\t\tcondition = _THICKNESS1_ < -20\n"); fprintf(cfp,"\t\t\t\tthickness = 20\n"); fprintf(cfp,"\t\t\t\n"); fprintf(cfp,"\t\t\t\n"); fprintf(cfp,"\t\t\t\timportance = 195\n"); fprintf(cfp,"\t\t\t\tcondition = _THICKNESS1_ < 0\n"); fprintf(cfp,"\t\t\t\tthickness = eval(abs(_THICKNESS1_))\n"); fprintf(cfp,"\t\t\t\n"); fprintf(cfp,"\t\t\t\n"); fprintf(cfp,"\t\t\t\timportance = 190\n"); fprintf(cfp,"\t\t\t\tcondition = _THICKNESS1_ < 1\n"); fprintf(cfp,"\t\t\t\tthickness = 1\n"); fprintf(cfp,"\t\t\t\n"); fprintf(cfp,"\t\t\n"); fprintf(cfp,"\t\n"); fprintf(cfp,"\n\n"); // plots int maxColors = 6; char** colors = new char*[maxColors]; colors[0] = (char*)"red"; colors[1] = (char*)"green"; colors[2] = (char*)"blue"; colors[3] = (char*)"orange"; colors[4] = (char*)"purple"; colors[5] = (char*)"grey"; int curColor = 0; fprintf(cfp,"\n"); char* command = new char[10000]; int ucscRes = (int) (totalBp / 3e3); if (ucscRes < 1) ucscRes = 1; fprintf(stderr, "\tSetting UCSC resolution at %d\n", ucscRes); char* ucscPos = new char[1000]; strcpy(ucscPos,"genome"); if (chrs->total == 1) { int start11 = 0; if (start1 > start11) start11 = start1; sprintf(ucscPos,"%s:%d-%d",currentChrs[0],start11,end1); } double curRadius = interactionRadius; for (int i=0;i\n"); fprintf(cfp,"\t\tshow = yes\n\t\tz=5\n\t\ttype = tile\n"); fprintf(cfp,"\t\tr0 = %.2lfr\n\t\tr1 = %.2lfr\n",curRadius,curRadius+radiusThickness); curRadius += radiusThickness; fprintf(cfp,"\t\tcolor = %s\n", colors[curColor]); if (++curColor >= maxColors) curColor = 0; fprintf(cfp,"\t\tlayers = 2\n\t\tmargin = 0.2u\n\t\tthickness=15\n\t\tpadding=2\n"); fprintf(cfp,"\t\torientation = out\n\t\tlayers_overflow=hide\n"); //fprintf(cfp,"\t\tbackground = no\n"); fprintf(cfp,"\t\tfile = %s\n", fname); fprintf(cfp,"\t\n"); //------ prep data -------- FILE* fp = fopen(fname,"w"); PeakLibrary* peaks = new PeakLibrary(bedFiles[i],PEAK_READ_MODE_NORMAL); peaks->setDefaultPeakOrder(); for (int j=0;jnumPeaks;j++) { int good = 1; if (chrs->total == 1) { if (strcmp(currentChrs[0],peaks->peakOrder[j]->chr)!=0) good = 0; if (peaks->peakOrder[j]->end < start1) good = 0; if (peaks->peakOrder[j]->start > end1) good = 0; } if (good) { fprintf(fp, "%s %d %d\n", peaks->peakOrder[j]->chr,peaks->peakOrder[j]->start, peaks->peakOrder[j]->end); } } fclose(fp); delete peaks; } curColor = 0; for (int i=0;i\n"); fprintf(cfp,"\t\tshow = yes\n\t\tz=5\n\t\ttype = histogram\n"); fprintf(cfp,"\t\tr0 = %.2lfr\n\t\tr1 = %.2lfr\n",curRadius,curRadius+radiusThickness); curRadius += radiusThickness; fprintf(cfp,"\t\tcolor = %s\n\t\tfill_color = %s\n", colors[curColor],colors[curColor]); if (++curColor >= maxColors) curColor = 0; fprintf(cfp,"\t\tfill_under = yes\n\t\tthickness = 1\n\t\textend_bin = no\n"); //fprintf(cfp,"\t\tbackground = no\n\t\taxis = no\n"); sprintf(fname,"%s.circos.histogram%d.txt", prefix,i+1); fprintf(cfp,"\t\tfile = %s\n\t\tmin = %d\n\t\tmax = %d\n", fname,0,75); fprintf(cfp,"\t\n"); sprintf(command,"makeUCSCfile %s -circos %s -fsize 1e10 -res %d -o %s",tagDirs[i],ucscPos, ucscRes, fname); //fprintf(stderr, "cmd: %s\n", command); (void)system(command); } for (int i=0;i maxSizeForNames) { radiusThickness = extrasScaleFactor * 0.10; } fprintf(cfp,"\t\n"); fprintf(cfp,"\t\tshow = yes\n\t\tz=5\n\t\ttype = tile\n"); fprintf(cfp,"\t\tr0 = %.2lfr\n\t\tr1 = %.2lfr\n",curRadius,curRadius+radiusThickness); curRadius += radiusThickness; fprintf(cfp,"\t\tcolor = purple\n"); fprintf(cfp,"\t\tlayers = 5\n\t\tmargin = 0.2u\n\t\tthickness=15\n\t\tpadding=8\n"); fprintf(cfp,"\t\torientation = out\n\t\tlayers_overflow=hide\n"); //fprintf(cfp,"\t\tbackground = no\n"); fprintf(cfp,"\t\tfile = %s\n", fname); fprintf(cfp,"\t\n"); //------ text section/gene names -------- radiusThickness = extrasScaleFactor * 0.10; fprintf(cfp,"\t\n"); if (totalBp <= maxSizeForNames) { fprintf(cfp,"\t\tshow = yes\n"); } else { fprintf(cfp,"\t\tshow = no\n"); } fprintf(cfp,"\t\tz=5\n\t\ttype = text\n"); fprintf(cfp,"\t\tr0 = %.2lfr\n\t\tr1 = %.2lfr\n",curRadius,curRadius+radiusThickness); if (totalBp <= maxSizeForNames) { curRadius += radiusThickness; } fprintf(cfp,"\t\tcolor = black\n"); fprintf(cfp,"\t\tshow_links = yes\n\t\tlink_dims=4p,4p,8p,4p,4p\n\t\tlink_thickness=4p\n"); fprintf(cfp,"\t\tlink_color = red\n\t\tlabel_size=25p\n\t\tlabel_font=condensed\n"); fprintf(cfp,"\t\tpadding = 0p\n\t\trpadding=0p\n"); fprintf(cfp,"\t\tlabel_snuggle=yes\n\t\tmax_snuggle_distance = 1r\n"); fprintf(cfp,"\t\tsnuggle_sampling = 2\n\t\tsnuggle_tolerance = 0.25r\n"); fprintf(cfp,"\t\tsnuggle_link_overlap_test = yes\n\t\tsnuggle_link_overlap_tolerance = 2p\n"); fprintf(cfp,"\t\tsnuggle_refine = yes\n"); fprintf(cfp,"\t\tfile = %s\n", fname2); fprintf(cfp,"\t\n"); FILE* fp = fopen(fname,"w"); FILE* fp2 = fopen(fname2,"w"); PeakLibrary* peaks = new PeakLibrary(geneFiles[i],PEAK_READ_MODE_NORMAL); peaks->setDefaultPeakOrder(); for (int j=0;jnumPeaks;j++) { int good = 1; if (chrs->total == 1) { if (strcmp(currentChrs[0],peaks->peakOrder[j]->chr)!=0) good = 0; if (peaks->peakOrder[j]->end < start1) good = 0; if (peaks->peakOrder[j]->start > end1) good = 0; } if (good) { fprintf(fp, "%s %d %d\n", peaks->peakOrder[j]->chr,peaks->peakOrder[j]->start, peaks->peakOrder[j]->end); fprintf(fp2, "%s %d %d %s\n", peaks->peakOrder[j]->chr,peaks->peakOrder[j]->start, peaks->peakOrder[j]->end,peaks->peakOrder[j]->name); } } fclose(fp); fclose(fp2); delete peaks; } fprintf(cfp,"\n"); fprintf(cfp,"\n\n<>\n"); fclose(cfp); sprintf(fname,"%s.circos.karyotype.txt",prefix); cfp = fopen(fname,"w"); for (int i=0;itotal;i++) { fprintf(cfp,"chr - %s %s 0 %d %s\n", currentChrs[i],currentChrs[i],chrs->search(currentChrs[i]), currentChrs[i]); delete [](currentChrs[i]); } fclose(cfp); delete []currentChrs; delete chrs; delete chrSizes; delete []fname; delete []command; delete []ucscPos; } void printCMD() { fprintf(stderr, "\n\tUsage: analyzeHiC [options]\n"); fprintf(stderr, "\t\t...\n"); fprintf(stderr, "\n\tResolution Options:\n"); fprintf(stderr, "\t\t-res <#> (Resolution of matrix in bp or use \"-res site\" [see below], default: 10000000)\n"); fprintf(stderr, "\t\t-window <#> (size of region to count tags for overlapping windows, default: same as res)\n"); //fprintf(stderr, "\t\tFull Resolution Options:\n"); //fprintf(stderr, "\t\t\t-res site (to trigger full restriction site resolution analysis)\n"); //fprintf(stderr, "\t\t\t-restrictionSite (restriction site used, -mis <#> to allow mismatches)\n"); fprintf(stderr, "\n\tOptions for specifying the region to analyze:\n"); fprintf(stderr, "\t\t-chr (create matrix on this chromosome, default: whole genome)\n"); fprintf(stderr, "\t\t-start <#> (start matrix at this position, default:0)\n"); fprintf(stderr, "\t\t-end <#> (end matrix at this position, default: no limit)\n"); fprintf(stderr, "\t\t-pos chrN:xxxxxx-yyyyyy (UCSC formatted position instead of -chr/-start/-end)\n"); fprintf(stderr, "\t\t-chr2 , -start2 <#>, -end2 <#>, or -pos2 (Use these positions on the\n"); fprintf(stderr, "\t\t\ty-axis of the matrix. Otherwise the matrix will be sysmetric)\n"); fprintf(stderr, "\t\t-p (specify regions to make matrix, unbalanced, use -p2 )\n"); fprintf(stderr, "\t\t-vsGenome (normally makes a square matrix, this will force 2nd set of peaks to be the genome)\n"); fprintf(stderr, "\t\t-chopify (divide up peaks into regions the size of the resolution, default: use peak midpoints)\n"); //fprintf(stderr, "\t\t-fixed (do not scale the size of the peaks to that of the resolution)\n"); fprintf(stderr, "\t\t-relative (use with -maxDist <#>, outputs diagonal of matrix up to maxDistance)\n"); fprintf(stderr, "\t\t-pout (output peaks used for analysis as a peak file, -pout2 for 2nd set)\n"); fprintf(stderr, "\n\tContact Matrix Options:\n"); fprintf(stderr, "\t\t-ihskb <#> (normalize counts to \"interactions per hundred square kilobases per billion, default)\n"); fprintf(stderr, "\t\t\tUse '-normTotal <#>' and '-normArea <#>' to modify normalization constants, area in bp^2\n"); fprintf(stderr, "\t\t\tUse '-normTotal given' and '-normArea given' to disable normalizations\n"); fprintf(stderr, "\t\t-raw (report raw interaction counts)\n"); fprintf(stderr, "\t\t-coverageNorm (Only adjust reads based on total interactions per region, default)\n"); fprintf(stderr, "\t\t-balance (balance resulting matrix - row/column totals the same)\n"); fprintf(stderr, "\t\t-distNorm (return log2 obs/expected counts normalized for interaction distance)\n"); fprintf(stderr, "\t\t-log | -nolog (force matrix to be linear or log2 transformed)\n"); fprintf(stderr, "\t\t-corr (report Pearson's correlation coeff, use \"-corrIters <#>\" to recursively calculate)\n"); fprintf(stderr, "\t\t\t-corrDepth <#> (merge regions in correlation so that minimum # expected tags per data point)\n"); fprintf(stderr, "\n\tGeneral options:\n"); fprintf(stderr, "\t\t-o (Output file name, default: sent to stdout)\n"); fprintf(stderr, "\t\t-compactionStats (calculates DLR and ICF compaction scores)\n"); fprintf(stderr, "\t\t\t-dlrDistance <#> (Cutoff distance for distal vs. local interactions for DLR, default: 3Mb\n"); fprintf(stderr, "\t\t-ifc (outputs interaction frequence curve for regions, can set to \"auto\")\n"); fprintf(stderr, "\t\t-4C (outputs tags interacting with specified regions)\n"); fprintf(stderr, "\t\t-cpu <#> (number of CPUs to use, default: 1)\n"); fprintf(stderr, "\n\tFilters & Other:\n"); fprintf(stderr, "\t\t-nomatrix (skip matrix creation - use if more than 100,000 loci)\n"); fprintf(stderr, "\t\t-std <#> (# of std deviations from mean interactions per region to exclude, default:8)\n"); fprintf(stderr, "\t\t-min <#> (minimum fraction of average read depth to include in analysis, default: 0.05)\n"); fprintf(stderr, "\t\t-minExpect <#> (minimum fraction of average read depth to use for coverage normalization, default: 0.75)\n"); fprintf(stderr, "\t\t-override (Allow very large matrices to be created... be carful using this)\n"); //--------------------------------------------------------- fprintf(stderr, "\n\tOlder options:\n"); fprintf(stderr, "\n\tBackground Options:\n"); //fprintf(stderr, "\t\t-fullModel (perform exhaustive background model calculations, default: approxmiate)\n"); //fprintf(stderr, "\t\t-quickModel (perform approximate background model calculations, default: approxmiate)\n"); fprintf(stderr, "\t\t-force (force the creation of a fresh genome-wide background model)\n"); fprintf(stderr, "\t\t-bgonly (quit after creating the background model)\n"); fprintf(stderr, "\t\t-createModel (Create custome bg from regions specified, i.e. -p/-pos)\n"); fprintf(stderr, "\t\t-model (Use Custom background model, -modelBg for -ped)\n"); fprintf(stderr, "\t\t-randomize <# reads> (and the output is a PE tag file, initail PE tag directory not used\n"); fprintf(stderr, "\t\t\tUse makeTagDirectory ... -t outputfile to create the new directory)\n"); fprintf(stderr, "\n\tOlder normalization options (often require creation of a background model):\n"); fprintf(stderr, "\t\t-logp (output log p-values from old-style interaction calculations)\n"); fprintf(stderr, "\t\t-expected (report expected interaction counts based on average profile)\n"); fprintf(stderr, "\t\t-rawAndExpected (raw counts sent to stdout)\n"); fprintf(stderr, "\t\t-cluster (cluster regions, uses \"-o\" to name cdt/gtr files, default: out.cdt)\n"); fprintf(stderr, "\t\t-clusterFixed (clusters adjacent regions, good for linear domains)\n"); //fprintf(stderr, "\t\t-zscoreNorm (normalize log ratio by distance dependent stddev, add \"-nolog\" to return linear values)\n"); //fprintf(stderr, "\t\t-normTotal <#> (normalize counts to this total number of HiC reads)\n"); //fprintf(stderr, "\t\t-balanceNorm (similar to -coverageNorm but uses iteratively balanced totals from bgmodel)\n"); //fprintf(stderr, "\t\t\t(# defined the threshold at which to define clusters/domains, i.e. 0.5)\n"); fprintf(stderr, "\n\tOld interaction finding options (see findTADsAndLoops.pl):\n"); //fprintf(stderr, "\t\t-loci (calculate collective p-value of each loci)\n"); fprintf(stderr, "\t\t-interactions (output interactions, add \"-center\" to adjust pos to avg of reads)\n"); fprintf(stderr, "\t\t-pvalue <#> (p-value cutoff for interactions, default 0.001)\n"); fprintf(stderr, "\t\t-zscore <#> (z-score cutoff for interactions, default 1.0)\n"); fprintf(stderr, "\t\t-minDist <#> (Minimum interaction distance, default: resolution/2)\n"); fprintf(stderr, "\t\t-maxDist <#> (Maximum interaction distance, default: -1=none)\n"); fprintf(stderr, "\t\t-boundary <#> (score boundaries at the given scale i.e. 100000)\n"); fprintf(stderr, "\n\tComparing HiC experiments:\n"); fprintf(stderr, "\t\t-ped \n"); fprintf(stderr, "\n\tCreating BED file to view with Wash U Epigenome Browser:\n"); fprintf(stderr, "\t\t-washu (Both matrix and interaction outputs will be in WashH BED format)\n"); fprintf(stderr, "\n\tCreating Circos Diagrams:\n"); fprintf(stderr, "\t\t-circos (creates circos files with the given prefix)\n"); fprintf(stderr, "\t\t-d [tag directory 2] ... (will plot tag densities with circos)\n"); fprintf(stderr, "\t\t-b (similar to visiualization of genes/-g, but no labels)\n"); fprintf(stderr, "\t\t-g (shows gene locations)\n"); fprintf(stderr, "\n\tMaking Histograms:\n"); fprintf(stderr, "\t\t-hist <#> (create a histogram matrix around peak positions, # is the resolution)\n"); fprintf(stderr, "\t\t-size <#> (size of region in histogram, default = 100 * resolution)\n"); fprintf(stderr, "\n\tGiven Interaction Analysis Mode (no matrix is produced):\n"); fprintf(stderr, "\t\t-i (for analyzing specific sets of interactions)\n"); fprintf(stderr, "\t\t-iraw (output raw reads from interactions, or -irawtags )\n"); fprintf(stderr, "\n"); exit(0); }