// Copyright 2009, 2010, 2011, 2012 Christopher Benner // // This file is part of HOMER // // HOMER is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // HOMER is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. #include #include "Hashtable.h" #include "statistics.h" #include #include #include #include #include #include #ifndef MOTIF_H #define MOTIF_H #define MOTIF2_BUFFER 10000100 #define MOTIF_STRAND_POS 0 #define MOTIF_STRAND_NEG 1 #define MOTIF_STRAND_BOTH 2 #define MOTIF_STRAND_SEPARATE 3 #define MOTIF_SCORING_HYPERGEOMETRIC 0 #define MOTIF_SCORING_BINOMIAL 1 #define MOTIF_SCORING_ZSCORE 2 #define MOTIF_MISMATCH_PVALUE_MINIMUM -1.3863 //-0.6931 = 0.50 //-1.3863 = 0.25 //-2.3026 = 0.10 //-2.9957 = 0.05 //-4.6052 = 0.01 //-6.9078 = 0.001 #define MOTIF_MAX_BACKGROUND_PERCENT 0.5 #define MOTIF_SCORE_OLIGOS 0 #define MOTIF_REMOVE_OLIGOS 1 #define MOTIF_MISMATCH_MINIMUM_SKIP 1 #define MOTIF_MISMATCH_MINIMUM_COUNTS 1 #define MOTIF_MAX_SEED_OLIGOS_FRACTION 0.10 #define MOTIF_MAX_SEED_OLIGOS_TOTAL 1e5 #define MAX_OLIGO_NORMALIZATION_ITERATIONS 160 #define MIN_OLIGO_NORMALIZATION_IMPROVEMENT 0.01 #define MOTIF_SEED_MINIMUM_LOGP -10.0 #define OLIGO_SEARCH_KEEP 0 #define OLIGO_SEARCH_REMOVE 1 #define MOTIF_DEFAULT_NUMBER_OF_MOTIFS 25 #define MOTIF_INITIAL_DEGENERACY 0.1 #define MOTIF_PROBABILITY_MINIMUM 0.001 #define MOTIF_IUPAC_THRESHOLD 0.15 #define MOTIFS_NUM_POTENTIAL_MOTIFS 10 #define MOTIF_OPTIMZATION_DIFFERENT_THRESHOLDS 0 #define MOTIFS_MAX_OPT_ITERATIONS 10 #define MOTIF_MAX_HIT_RATE_TO_CONSIDER 0.005 #define MAXIMUM_MOTIF_LENGTH 10000 #define MAXIMUM_DENOVO_MOTIF_LENGTH 50 #define DENOVO_SPEED_AUTOADJUST 0 #define DENOVO_SPEED_CONSTANT 1 #define DENOVO_MASK_FULL 0 #define DENOVO_MASK_QUICK 1 #define SCORER_STANDARD_ZOOPS 0 #define SCORER_APPROXIMATE_ZOOPS 1 #define KNOWN_JUST_SCORE_ZOOPS 0 #define KNOWN_OPTIMIZE_THREASHOLD 1 #define KNOWN_MASK_MOTIFS 2 #define KNOWN_MSCORE_MOTIFS 3 #define KNOWN_FIND_ALL 4 #define MOTIF_OFFSET_NULL -12345678 #define MOTIF_OVERLAP_PERCENT 0.75 #define MOTIF_DEFAULT_REDUCE_PERCENTAGE 0.20 #define HOMER_MINIMUM_SEQ_WEIGHT 0.001 #define DNA_ALPHA 4 #define DOUBLE_EPS 1e-20 #define FLOAT_EPS 1e-10 #define MAX_STAT_CACHE_SIZE 500 #define HOMER_VERBOSE_LEVEL0 0 #define HOMER_VERBOSE_LEVEL1 1 class DeNovoMotifs; class KnownMotifs; class OligoArray; class Oligo; class Motif; class EnrichmentScorer; struct ThreadArgs_findMisMatchEnrichmentThread; struct ThreadArgs_optimizeMotifThresholdThread; struct ThreadArgs_scanMotifThread; class SequenceStats; class OligoArray { public: int* tree; //if tree instances are 0 or negative, they reference oligos, 1=tree root int numTree; int maxTree; int alphaSize; int oligoLength; char* alpha; char* rvAlpha; char* rvMap; char* alphaIndex; char* rvAlphaIndex; double totalTarget; double totalBackground; Oligo** staticOligos; int numOligos; Oligo** activeOligos; int numActiveOligos; int maxOligos; int rvFlag; int maxAutoNormIters; pthread_mutex_t sortMutex; //stack variables for "recursive" looping - save's on new and delete time to keep them here int numCPUs; Oligo*** utilityOligos; int* numUtilityOligos; int** indexStack; int** seqIndex; int** alphaStack; int** positionStack; int** misMatchStack; int** fwdMisMatchStack; int** rvMisMatchStack; double** fwdScoreStack; double** rvScoreStack; OligoArray(); ~OligoArray(); void init(int length, int totalDataSetBp, int alpha, int strandFlag,int numCPUs); void optimizeMotifThreshold(Motif* motif); void optimizeMemoryUsage(); void printOligo(char* seq, FILE* fp); int addOligo(char* seq, double vt, double vbg, double addValue); //addValue = -1 for normal add int searchOligo(char* seq, int maxMisMatches, double& vt, double& vbg,int removeFlag,int cpu); void sortActiveOligos(); void sortUtilityOligos(int cpu); void initializeActiveOligoArray(); void initializeOligoPvalues(EnrichmentScorer* scorer); void normalizeOligos(int nlen, char* normFile); double normalizeOligosIteration(int nlen, char* normFile,int printFlag); void scoreOligosWithMotif(Motif* motif,double minimumScore, int removeFlag,int cpu); void calculateFoldEnrichment(double pseudoCount); void adjustOligoInstances(char* oligoSeq, double targetValue, double backgroundValue); void readOligoCountFile(char* file, int strand, int cpus); private: int addOligoTree(char* seq, double vt, double vbg, double addvalue,int rvMode, int knownID); int searchOligoTree(char* seq,int maxMisMatches, double& vt, double& vbg); }; class Oligo { public: char* seq; double numTarget; double numBackground; float normWeight; // used if -olen is used to normalize short oligos float logp; // used to figure out contribution to motifs when optimizing matrix double value; //generic value - used for different things // pvalue during findMisMatchEnrichment char flag; //flag used to make sure we don't doulbe count oligos when traversing tree char flag2; //generic flag - used to mark oligos to skip during mismatch analysis double* cpuData; Oligo(); Oligo(char* seq); ~Oligo(); void init(); void print(FILE*); static int oligoCmp(const void* a, const void* b); static int oligoCmpReverse(const void* a, const void* b); static int cpu; static int oligoCmpReverseParallel(const void* a, const void* b); static void revopp(char* seq); }; class Sequence { public: char* s; int g; // group, i.e. target (=1) or background (=0) double w; // i.e. 1.0 char* name; int length; int id; Sequence(); Sequence(char* name, char* seq, int group, double weight); ~Sequence(); void init(); void setSeq(char* name, char* seq, int group, double weight); int motifScan(Motif* m,int strand, int offset,int mode, double &bestScore, int* sitePositions, char* siteStrands, FILE* fp, pthread_mutex_t fileLock); }; class LinkedSequence { public: Sequence* s; LinkedSequence* link; LinkedSequence(Sequence* seq, LinkedSequence* link); ~LinkedSequence(); }; class SequenceArray { public: Sequence** seqs; int numSeqs; LinkedSequence* link; double numTarget; double numBackground; int bpTarget; int bpBackground; int longestSequence; EnrichmentScorer* scorer; double maxBackgroundPercent; int maxAutoNormIters; SequenceArray(); ~SequenceArray(); void deleteLinks(); void addSequence(Sequence* s); void optimizeArray(); void readSequenceFiles(char* seqFile, char* groupFile); void printGroupFile(FILE* fp); void printSequenceFile(FILE* fp); void motifScan(Motif* m,int strand, int offset,int mode,FILE* fp, pthread_mutex_t fileLock); void normalizeSequence(int nlen, int strand, char* normFile, int equalFlag); double normalizeSequenceIteration(int nlen, int strand, char* normFile, int printFlag,int neutralFlag); void maskMotifFromOligoArray(Motif* motif,OligoArray* oligoArray,char strand); void initializeScorer(int scoringMethod,int zoopsFlag,unsigned long long int maxCacheSize); void optimizeMotifThreshold(Motif* motif, char strand); void parseFasta2SeqAndGroupFiles(char* inputFASTAfile,char* bgFASTAfile,char* sFile, char* gFile); }; class Motif { public: double** m; int length; double threshold; int logFlag; char* name; char* consensus; double numTarget; double numBackground; double freqTarget; double freqBackground; double numSeqTarget; double numSeqBackground; double logp; double minimumScore; int badFlag; double similarPercent; double similarPercentOfOther; Motif* similarMotif; SequenceStats* seqStats; int numSeqs; double statAvgTarget; double statAvgBackground; double statStdTarget; double statStdBackground; double statStrandLogRatio; double siteMultiplicity; double* fwdMaxPossible; double* rvMaxPossible; static int alphaSize; static char* alphaIndex; Motif(); Motif(char* seq); Motif(Motif*); Motif(char* name, char* consensus, double threshold, double logp, double** matrix, int mlen); ~Motif(); void init(); void setName(char*); void logxform(); void expxform(); void normalize(); void justNormalize(); void print(FILE*); void getConsensus(); char getSymbol(char*); void setMinimumScore(double percent); void setBlank(int len); void addOligo(Oligo* o); void initializeSeqStats(int); void optimizeThreshold(EnrichmentScorer* scorer, double maxBackPercent); void adjustLength(int); void calculateDistributionStats(); static double scoreOverlap(Motif* m1, Motif* m2, double &percent1, double &percent2); static int motifCmp(const void* a, const void* b); static int nameID; static Motif** readMotifFile(char* file, int &numMotifs); }; class DeNovoMotifs { public: OligoArray* oligoArray; unsigned long long int totalOligos; unsigned long long int numOligoEstimate; unsigned int oligoLength; unsigned int normLength; int bufferLength; unsigned char alphaSize; unsigned char strand; double maximumExpectedPerBp; unsigned long long int maxCacheSize; int noDuplicateOligos; unsigned int numCPUs; int mutexIndex; int mutexTotal; int nextMeter; int incMeter; pthread_mutex_t mutex; double numTargets; double numBackground; Oligo** seedOligos; int numSeedOligos; Motif** motifs; int numMotifs; SequenceArray* fullSequences; char* seqFile; char* groupFile; char* inputFASTAfile; char* bgFASTAfile; int fastaFlag; char* normFile; char* tmpFile; FILE* sitefp; int randFlag; int speedFlag; double siteReduceThreshold; int optimizeGivenFlag; int scoringMethod; int finalZoopsFlag; double maxBackgroundPercent; int maxMisMatches; int misMatchTargetOnlyFlag; double misMatchMinPvalue; int minimumMisMatchToStartSkipping; double minimumCountsToStartSkipping; double minimumFractionToStartSkipping; double minimumSeedLogp; int totalMotifsToFind; int numTrialMotifs; int numConcurrentMotifs; int maxOptimizationIterations; int motifOptimizationMethod; int maskMethod; int maxAutoNormIters; int zoopsFlag; EnrichmentScorer* scorer; //double (DeNovoMotifs::*enrichmentFunction)(double pv, double nv); DeNovoMotifs(); ~DeNovoMotifs(); void init(); void readSequence2Tree(); void loadFullSequences(); void checkParameters(); void getOligoEnrichment(); void normalizeOligos(); void findMisMatchEnrichment(int maxMisMatches,FILE* fp,int verboseLevel); void findMisMatchEnrichmentThread(ThreadArgs_findMisMatchEnrichmentThread* threadArgs); void optimizeMotifMatrix(Motif* &motif); void optimizeMotifThreshold(Motif* m, int cpu); void optimizeMotifThresholdThread(ThreadArgs_optimizeMotifThresholdThread*); void initializeEnrichmentScoring(); double scoreEnrichmentHypergeometric(double pv, double nv); double scoreEnrichmentBinomial(double pv, double nv); void getSeedOligos(); void optimizeSeeds2Motifs(); void optimizeGivenMotifs(); void scoreWithKnownMotifs(int optFlag); void freeOligos(); void parseFasta(); void printMotifs(FILE* fp); // sorts motifs array }; class EnrichmentScorer { public: unsigned int numTargets; unsigned int numBackground; double** cache; int cacheLength; unsigned long long int maxCacheSize; int zoopsFlag; double* targetZoopsApprox; double* bgZoopsApprox; unsigned int maxZoopsApprox; double (EnrichmentScorer::*enrichmentFunction)(double pv, double nv); EnrichmentScorer(); ~EnrichmentScorer(); double scoreEnrichment(double pv, double nv); void initializeEnrichmentScoring(int method, double numTarget, double numBack, int zoopsFlag, unsigned long long int newMaxCacheSize); double scoreEnrichmentHypergeometric(double pv, double nv); double scoreEnrichmentBinomial(double pv, double nv); double scoreEnrichmentZscore(double pv, double nv); double getZoopsApproxTarget(double value); double getZoopsApproxBackground(double value); }; class KnownMotifs { public: SequenceArray* seqArray; unsigned long long int totalBp; unsigned int normLength; unsigned char alphaSize; unsigned char strand; unsigned long long int maxCacheSize; int offset; double numTargets; double numBackground; Motif** motifs; int numMotifs; unsigned int numCPUs; int mutexIndex; int mutexTotal; double nextMeter; double incMeter; pthread_mutex_t mutex; pthread_mutex_t fileLock; char* seqFile; char* groupFile; char* inputFASTAfile; char* bgFASTAfile; int fastaFlag; char* knownFile; char* normFile; FILE* sitefp; FILE* normfp; char* motifFilename; double maxBackgroundPercent; double siteReduceThreshold; int scoringMethod; EnrichmentScorer* scorer; int mscoreFlag; int threasholdOptimizationFlag; int zoopsFlag; int knownFlag; int findFlag; int maskFlag; int normOnlyFlag; int maxAutoNormIters; int neutralFlag; KnownMotifs(); ~KnownMotifs(); void checkParameters(); void loadMotifs(); void loadSequence(); void parseFasta(); void getMotifEnrichment(); void printMotifEnrichment(FILE* fp); void printMotifs(FILE* fp); void scanMotifThread(ThreadArgs_scanMotifThread* args); void findOverlappingMotifs(); }; class GroupInfo { public: double weight; unsigned int group; }; class SequenceStats { public: Sequence* seq; double score; int numSites; int* pos; char* strands; SequenceStats(); ~SequenceStats(); void initialize(); void setSites(int* positions, char* strands, int num); static int scoreCmp(const void* a, const void* b); static int defaultOrder(const void* a, const void* b); }; void splitMotif2(char* string, char** cols, int &numCols, char delim); void cleanSequence(char* seq); int checkSequence(char* seq); int sortDouble(const void* a, const void* b); void printCMDhomer(); void printCMDdenovo(); void programDeNovo(int argc, char** argv); void programKnown(int argc, char** argv); struct ThreadArgs_findMisMatchEnrichmentThread { public: DeNovoMotifs* denovo; int cpu; int maxMisMatches; int totalChecked; int numSkipped; int dueToTargetTags; int dueToLessThanLast; int dueToLowPvalue; int lastChecked; int lastCheckedTotal; int verboseLevel; ThreadArgs_findMisMatchEnrichmentThread(DeNovoMotifs* obj,int cpu,int mis, int total,int vlevel); }; void* DeNovoMotifs_findMisMatchEnrichmentThread(void* threadArgs); struct ThreadArgs_optimizeMotifThresholdThread { public: DeNovoMotifs* denovo; int cpu; Motif** motifs; ThreadArgs_optimizeMotifThresholdThread(DeNovoMotifs* obj,int cpu,Motif** motifs); }; void* DeNovoMotifs_optimizeMotifThresholdThread(void* threadArgs); struct ThreadArgs_scanMotifThread { public: KnownMotifs* known; int cpu; ThreadArgs_scanMotifThread(KnownMotifs* obj,int cpu); }; void* KnownMotifs_scanMotifThread(void* threadArgs); #endif