// Copyright 2009 - 2016 Christopher Benner // // This file is part of HOMER // // HOMER is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // HOMER is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. #include "homerTools.h" #define _FILE_OFFSET_BITS 64 void printCMD(); void printCMDbarcodes(); void printCMDtrim(); void printCMDfreq(); void printCMDextract(); void printCMDspecial(); void barcodesProgram(int argc, char** argv); void truSeqProgram(int argc, char** argv); void trimProgram(int argc, char** argv); void freqProgram(int argc, char** argv); void extractProgram(int argc, char** argv); void clusterProgram(int argc, char** argv); void matrix3DProgram(int argc, char** argv); void specialProgram(int argc, char** argv); void decontaminateProgram(int argc, char** argv); int main(int argc, char** argv) { if (argc < 2) { printCMD(); } char* program = argv[1]; if (strcmp(program,"barcodes")==0) { barcodesProgram(argc,argv); } else if (strcmp(program,"trim")==0) { trimProgram(argc,argv); } else if (strcmp(program,"truseq")==0) { truSeqProgram(argc,argv); } else if (strcmp(program,"freq")==0) { freqProgram(argc,argv); } else if (strcmp(program,"extract")==0) { extractProgram(argc,argv); } else if (strcmp(program,"decontaminate")==0) { decontaminateProgram(argc,argv); } else if (strcmp(program,"cluster")==0) { clusterProgram(argc,argv); } else if (strcmp(program,"matrix3D")==0) { matrix3DProgram(argc,argv); } else if (strcmp(program,"special")==0) { specialProgram(argc,argv); } else if (strcmp(program,"--help")==0) { printCMD(); } else { fprintf(stderr, "!!! Could not recognize \"%s\" as a command !!!\n", argv[1]); printCMD(); } return 0; } void printCMD() { fprintf(stderr, "\n\tUsage: homerTools [--help | options]\n"); fprintf(stderr, "\n\tCollection of tools for sequence manipulation\n"); fprintf(stderr, "\n\tCommands: [type \"homerTools \" to see individual command options]\n"); fprintf(stderr, "\t\tbarcodes - separate FASTQ file by barcodes\n"); fprintf(stderr, "\t\ttruseq - process truseq barcodes from unidentified indexes (illumina)\n"); fprintf(stderr, "\t\ttrim - trim adapter sequences or fixed sizes from FASTQ files(also splits)\n"); fprintf(stderr, "\t\tfreq - calculate position-dependent nucleotide/dinucleotide frequencies\n"); fprintf(stderr, "\t\textract - extract specific sequences from FASTA file(s)\n"); fprintf(stderr, "\t\tdecontaminate - remove bad tags from a contaminated tag directory\n"); fprintf(stderr, "\t\tcluster - hierarchical clustering of a NxN distance matrix\n"); // fprintf(stderr, "\t\tmatrix3D - convert distance matrix to 3D coordinates\n"); fprintf(stderr, "\t\tspecial - specialized routines (i.e. only really useful for chuck)\n"); fprintf(stderr, "\t\t\n"); exit(0); } //------------------------------ trim ------------------------------------------------------------------- int matchAdapter5prime(char* seq, char* adapter,int maxMisMatches,int minMatchLength,int matchStart); int matchAdapter3prime(char* seq, char* adapter,int maxMisMatches,int minMatchLength,int matchStart); void printCMDtrim() { fprintf(stderr, "\n\tUsage: homerTools trim [options] [file2] ...\n"); //fprintf(stderr, "\n\tfor paired-end: homerTools trim [options] , ...\n"); //fprintf(stderr, "\t\tno space between mate files and comma: ,\n"); fprintf(stderr, "\n\tProbably best to use only on option at a time...\n"); fprintf(stderr, "\n\tOptions for command: trim\n"); fprintf(stderr, "\t\t-3 <#|[ACGT]> (trim # bp or adapter sequence from 3' end of sequences)\n"); fprintf(stderr, "\t\t-5 <#|[ACGT]> (trim # bp or adapter sequence from 5' end of sequences)\n"); fprintf(stderr, "\t\t\t-mis <#> (Maximum allowed mismatches in adapter sequence, default: 0)\n"); fprintf(stderr, "\t\t\t-minMatchLength <#> (minimum adapter sequence at edge to match, default: half adapter length)\n"); fprintf(stderr, "\t\t\t-matchStart <#> (don't start searching for adapter until this position, default: 0)\n"); fprintf(stderr, "\t\t-q <#> (Trim sequences once quality dips below threshold, default: none [range:0-40])\n"); fprintf(stderr, "\t\t\t-qstart <#> (don't check quality until sequences are at least this long, default: 10)\n"); fprintf(stderr, "\t\t\t-qwindow <#> (size of moving average to check for quality dropoff, default: 5)\n"); fprintf(stderr, "\t\t-len <#> (Keep first # bp of sequence - i.e. make them the same length)\n"); fprintf(stderr, "\t\t-stats (Output trimming statistics to filename, default: sent to stdout)\n"); fprintf(stderr, "\t\t-min <#> (Minimum size of trimmed sequence to keep, default: 1)\n"); fprintf(stderr, "\t\t-max <#> (Maximum read length, default: %d)\n", TRIM_MAXREADLENGTH); fprintf(stderr, "\t\t-suffix (output is sent to InuptFileName.suffix, default: trimmed)\n"); fprintf(stderr, "\t\t-lenSuffix (length distribution is sent to InuptFileName.suffix, default: lengths)\n"); fprintf(stderr, "\t\t-uniq (replace read names with unique IDs, for cases when there are problems with the names)\n"); fprintf(stderr, "\t\t-pe (find look for 2nd mate pair file, matches _R1_ to _R2_ and _1.fastq to _2.fastq)\n"); //fprintf(stderr, "\t\t-split <#> (Split reads into two reads at bp #, output to trimmed1 and trimmed2)\n"); //fprintf(stderr, "\t\t-revopp <#> (Return reverse opposite of read [if used with -split, only the 2nd\n"); //fprintf(stderr, "\t\t\t\thalf of the read will be retuned as reverse opposite])\n"); fprintf(stderr, "\n\tCommon Examples:\n"); fprintf(stderr, "\t\tIllumina TruSeq:\n"); fprintf(stderr, "\t\t\thomerTools trim -3 AGATCGGAAGAGCACACGTCT -mis 2 -minMatchLength 4 -min 20\n"); //fprintf(stderr, "\t\tIllumina TruSeq (PE - 2nd read):\n"); //fprintf(stderr, "\t\t\thomerTools trim -3 AGATCGGAAGAGCGTCGTGT -mis 1 -minMatchLength 6\n"); fprintf(stderr, "\t\tIllumina small RNA adapter:\n"); fprintf(stderr, "\t\t\thomerTools trim -3 TCGTATGCCGTCTTCTGCTTG -mis 2 -minMatchLength 4 -min 20\n"); fprintf(stderr, "\t\tHiC trimming:\n"); fprintf(stderr, "\t\t\tMboI: homerTools trim -3 GATC -mis 0 -matchStart 20 -min 20\n"); fprintf(stderr, "\t\t\tHindIII: homerTools trim -3 AAGCTAGCTT -mis 0 -matchStart 20 -min 20\n"); fprintf(stderr, "\t\t\tNcoI: homerTools trim -3 CCATGCATGG -mis 0 -matchStart 20 -min 20\n"); fprintf(stderr, "\t\t\n"); exit(0); } void trimProgram(int argc, char** argv) { char* suffix = NULL; char* lenSuffix = NULL; char* statsFile = NULL; char** files = new char*[100000]; int numfiles = 0; if (argc < 3) { printCMDtrim(); } int p3len = -1; int p5len = -1; int matchStart = 0; int maxMisMatches = 0; int minMatchLength = -1; int splitPos = -1; int minSizeToKeep = 1; int qthresh = -1; int qwindow = 5; int qstart = 10; int uniqNamesFlag = 0; int findPEFlag = 0; int revoppFlag = 0; int maxReadLength = TRIM_MAXREADLENGTH; char* p3adapter = NULL; char* p5adapter = NULL; int fixedLength = -1; for (int i=2;i 47 && argv[i][0] < 58) { sscanf(argv[i],"%d",&p3len); fprintf(stderr, " by %d bp\n", p3len); } else { fprintf(stderr, " with adapter: %s\n", argv[i]); p3adapter = argv[i]; if (minMatchLength < 0) { minMatchLength = strlen(p3adapter)/2; } } } else if (strcmp(argv[i],"-5")==0) { fprintf(stderr, "\tSequences will be trimmed at 5' end"); i++; if (argv[i][0] > 47 && argv[i][0] < 58) { sscanf(argv[i],"%d",&p5len); fprintf(stderr, " by %d bp\n", p5len); } else { fprintf(stderr, " with adapter: %s\n", argv[i]); p5adapter = argv[i]; if (minMatchLength < 0) { minMatchLength = strlen(p5adapter)/2; } } } else if (argv[i][0] == '-') { printCMDtrim(); } else { fprintf(stderr, "\tWill parse file %s\n", argv[i]); files[numfiles++] = argv[i]; } } if (numfiles < 1) { fprintf(stderr, "!!! No input files... Not much to do!!!\n"); exit(0); } char* cmd = new char[BUFFER]; char* file1 = new char[BUFFER]; char* file2 = new char[BUFFER]; char* buf = new char[BUFFER]; char* buf2 = new char[BUFFER]; char* current = new char[BUFFER]; char* current2 = new char[BUFFER]; char* curSeq = new char[BUFFER]; char* curSeq2 = new char[BUFFER]; char* curQual = new char[BUFFER]; char* curQual2 = new char[BUFFER]; char* curHeader = new char[BUFFER]; char* curHeader2 = new char[BUFFER]; char* newfilename = new char[10000]; char* newfilename2 = new char[10000]; int* readLengths = new int[maxReadLength+1]; int* readLengths2 = new int[maxReadLength+1]; current[0]='\0'; for (int i=0;i 3) { if (strcmp(&(file1[strlen(file1)-3]), ".gz")==0) { gzipFlag1 = 1; } else if (strlen(file1) > 4 && strcmp(&(file1[strlen(file1)-4]), ".bz2")==0) { gzipFlag1 = 2; } } if (strlen(file2) > 3) { if (strcmp(&(file2[strlen(file2)-3]), ".gz")==0) { gzipFlag2 = 1; } else if (strlen(file2) > 4 && strcmp(&(file2[strlen(file2)-4]), ".bz2")==0) { gzipFlag2 = 2; } } if (gzipFlag1) { if (gzipFlag1==1) sprintf(cmd, "zcat %s", file1); if (gzipFlag1==2) sprintf(cmd, "bunzip2 -c %s", file1); inputfp = popen(cmd,"r"); } else { inputfp = fopen(file1,"r"); } if (inputfp == NULL) { fprintf(stderr, "!!! Could not open input file: %s - skipping !!!\n", file1); continue; } if (file2[0] != '\0') { if (gzipFlag2) { if (gzipFlag2==1) sprintf(cmd, "zcat %s", file2); if (gzipFlag2==2) sprintf(cmd, "bunzip2 -c %s", file2); inputfp2 = popen(cmd,"r"); } else { //fprintf(stderr, "cmd=%s\n", cmd); inputfp2 = fopen(file2,"r"); } if (inputfp2 == NULL) { fprintf(stderr, "!!! Could not open 2nd read input file: %s - skipping !!!\n", file2); continue; } } FILE* outfp = stdout; FILE* outfp2 = stdout; strcpy(newfilename,file1); strcat(newfilename,"."); strcpy(newfilename2,file2); strcat(newfilename2,"."); if (suffix == NULL) { strcat(newfilename,"trimmed"); strcat(newfilename2,"trimmed"); } else { strcat(newfilename,suffix); if (splitPos > 0) { strcat(newfilename2,suffix); strcat(newfilename,"1"); strcat(newfilename2,"2"); } else { strcat(newfilename2,suffix); } } outfp= fopen(newfilename, "w"); if (outfp == NULL) { fprintf(stderr, "!!! Could not open output file: %s - skipping !!!\n", newfilename); continue; } if (splitPos > 0 || inputfp2 != NULL) { //fprintf(stderr, "Outputing to %s\n", newfilename2); outfp2= fopen(newfilename2, "w"); if (outfp2 == NULL) { fprintf(stderr, "!!! Could not open output file: %s - skipping !!!\n", newfilename2); continue; } } FILE* lenfp = stdout; FILE* lenfp2 = stdout; strcpy(newfilename,file1); strcat(newfilename,"."); strcpy(newfilename2,file1); strcat(newfilename2,"."); if (lenSuffix == NULL) { strcat(newfilename,"lengths"); strcat(newfilename2,"lengths"); } else { strcat(newfilename2,lenSuffix); } lenfp= fopen(newfilename, "w"); if (lenfp == NULL) { fprintf(stderr, "!!! Could not open length distribution file: %s - skipping !!!\n", newfilename); } if (splitPos > 0 || inputfp2 != NULL) { lenfp2= fopen(newfilename2, "w"); if (lenfp2 == NULL) { fprintf(stderr, "!!! Could not open length distribution file: %s - skipping !!!\n", newfilename2); } } long long int lastStart = 0; long long int currentline = 0; int lastLinePlus=0; current[0]='\0'; current2[0]='\0'; curSeq[0]='\0'; curSeq2[0]='\0'; curHeader[0]='\0'; curHeader2[0]='\0'; curQual[0]='\0'; curQual2[0]='\0'; int start = 0; int end = 1000000000; int start2 = 0; int end2 = 1000000000; int goodReads = 0; int totalReads = 0; int curLen = 0; int curLen2 = 0; int maxObservedLength = 0; int fastqFlag = 0; for (int i=0;i 1) { //trim off newline character if (lineLength > 0) buf[lineLength-1]='\0'; if (lineLength2 > 0) buf2[lineLength2-1]='\0'; if ((buf[0] == '@' && lastLinePlus==0) || (buf[0] == '>' && fastqFlag != 1)) { if ((buf[0] == '@' && lastLinePlus==0)) fastqFlag = 1; int good1 = 0; int good2 = 1; if (curLen >= minSizeToKeep && curLen < maxReadLength) { if (maxObservedLength < curLen) maxObservedLength = curLen; readLengths[curLen]++; good1 = 1; } else { readLengths[0]++; } if (lineLength2 > 0) { good2 = 0; if (buf[0] != buf2[0]) { fprintf(stderr, "!!! Error, PE mate files are not in order:\n1: %s\n2: %s\n", buf, buf2); exit(0); } if (curLen2 >= minSizeToKeep && curLen2 < maxReadLength) { if (maxObservedLength < curLen2) maxObservedLength = curLen2; readLengths2[curLen2]++; good2 = 1; } else { readLengths2[0]++; } } if (good1 && good2) { fprintf(outfp, "%s\n%s\n+\n%s\n", curHeader,curSeq,curQual); if (lineLength2 > 0 || splitPos > 0) { fprintf(outfp2, "%s\n%s\n+\n%s\n", curHeader2,curSeq2,curQual2); } goodReads++; } totalReads++; current[0]='\0'; current2[0]='\0'; curSeq[0]='\0'; curSeq2[0]='\0'; curHeader[0]='\0'; curHeader2[0]='\0'; curQual[0]='\0'; curQual2[0]='\0'; strcpy(curHeader,buf); if (uniqNamesFlag) { sprintf(curHeader, "@Read%d", totalReads); } if (lineLength2 > 0) { strcpy(curHeader2,buf2); if (uniqNamesFlag) { sprintf(curHeader2, "@Read%d", totalReads); } } else if (splitPos > 0) { int x = strlen(curHeader); current[x-1] = '1'; current[x] = '\0'; strcpy(curHeader2,curHeader); current2[x-1] = '2'; } lastStart = currentline; lastLinePlus=0; curLen = 0; curLen2 = 0; } else if (buf[0] == '+' && currentline-2==lastStart) { //quality header - ignore lastLinePlus=1; } else { if (currentline-1==lastStart) { //sequence char* seq = buf; curLen=lineLength-1; start = 0; end = curLen; if (p5len > 0) { if (p5len >= curLen) { seq[0]='\0'; curLen=0; start=0; end=0; } else { seq = &(seq[p5len]); start=p5len; curLen-=p5len; } } if (p3len > 0) { if (p3len >= curLen) { seq[0]='\0'; curLen=0; start=0; end=0; } else { seq[curLen-p3len] = '\0'; end=start+curLen-p3len; curLen-=p3len; } } if (p5adapter != NULL && curLen > 0) { int offset = matchAdapter5prime(seq, p5adapter,maxMisMatches,minMatchLength, matchStart); if (offset >= 0) { seq = &(seq[offset+1]); curLen -= offset+1; start=offset+1; end = curLen; } } if (p3adapter != NULL && curLen > 0) { int offset = matchAdapter3prime(seq, p3adapter,maxMisMatches,minMatchLength,matchStart); if (offset >= 0) { seq[offset]='\0'; curLen = offset; end = offset; } } if (fixedLength > 0) { if (curLen > fixedLength) { seq[fixedLength] = '\0'; curLen = fixedLength; end = start + fixedLength; } } if (splitPos > 0) { if (splitPos < (int)strlen(seq)) { char tmp = seq[splitPos]; seq[splitPos] = '\0'; strcat(curSeq,seq); seq[splitPos] = tmp; if (revoppFlag) { int L = strlen(seq); seq[L-1] = '\0'; revopp(&(seq[splitPos])); seq[L-1] = '\0'; } strcat(curSeq2,&(seq[splitPos])); } else { strcat(curSeq,seq); } } else { if (revoppFlag) { seq[strlen(seq)-1] = '\0'; revopp(seq); seq[strlen(seq)-1] = '\0'; } strcat(curSeq,seq); } if (lineLength2>0) { //mate pair sequence char* seq = buf2; curLen2=lineLength2-1; start2 = 0; end2 = curLen2; if (p5len > 0) { if (p5len >= curLen) { seq[0]='\0'; curLen2=0; start2=0; end2=0; } else { seq = &(seq[p5len]); start2=p5len; curLen2-=p5len; } } if (p3len > 0) { if (p3len >= curLen2) { seq[0]='\0'; curLen2=0; start2=0; end2=0; } else { seq[curLen2-p3len] = '\0'; end2=start2+curLen2-p3len; curLen2-=p3len; } } if (p5adapter != NULL && curLen > 0) { int offset = matchAdapter5prime(seq, p5adapter,maxMisMatches,minMatchLength, matchStart); if (offset >= 0) { seq = &(seq[offset+1]); curLen2 -= offset+1; start2=offset+1; end2 = curLen2; } } if (p3adapter != NULL && curLen2 > 0) { int offset = matchAdapter3prime(seq, p3adapter,maxMisMatches,minMatchLength,matchStart); if (offset >= 0) { seq[offset]='\0'; curLen2 = offset; end2 = offset; } } if (fixedLength > 0) { if (curLen2 > fixedLength) { seq[fixedLength] = '\0'; curLen2 = fixedLength; end2 = start2 + fixedLength; } } if (revoppFlag) { seq[strlen(seq)-1] = '\0'; revopp(seq); seq[strlen(seq)-1] = '\0'; } strcat(curSeq2,seq); } } else { //quality scores char* qual = buf; if (end > lineLength-1) { end=0; } else { qual = &(buf[start]); } qual[end]='\0'; //trim read based on quality score if (qthresh > 0) { int L = strlen(qual); int total = 0; int N = 0; for (int i=qstart;i=L) break; N++; total+=qual[j]-33; } } else { total -= qual[i-1]-33; if (i+qwindow < L) { total += qual[i+qwindow]-33; } else { N--; } } if (N > 0 && (total/N < qthresh)) { qual[i] = '\0'; curSeq[i] = '\0'; curLen = i-1; break; } else if (N < 1) { } } } if (splitPos > 0) { if (splitPos < (int)strlen(qual)) { char tmp = qual[splitPos]; qual[splitPos] = '\0'; strcat(curQual,qual); qual[splitPos] = tmp; if (revoppFlag) { int L = strlen(qual); qual[L-1] = '\0'; reverse(&(qual[splitPos])); qual[L-1] = '\0'; } strcat(curQual2,&(qual[splitPos])); } else { strcat(curQual,qual); } } else { if (revoppFlag) { qual[strlen(qual)-1] = '\0'; reverse(qual); qual[strlen(qual)-1] = '\0'; } strcat(curQual,qual); } if (lineLength2 > 0) { //quality scores char* qual = buf2; if (end2 > lineLength2-1) { end2=0; } else { qual = &(buf2[start2]); } qual[end2]='\0'; //trim read based on quality score if (qthresh > 0) { int L = strlen(qual); int total = 0; int N = 0; for (int i=qstart;i=L) break; N++; total+=qual[j]-33; } } else { total -= qual[i-1]-33; if (i+qwindow < L) { total += qual[i+qwindow]-33; } else { N--; } } if (N > 0 && (total/N < qthresh)) { qual[i] = '\0'; curSeq2[i] = '\0'; curLen2 = i-1; break; } else if (N < 1) { } } } if (revoppFlag) { qual[strlen(qual)-1] = '\0'; reverse(qual); qual[strlen(qual)-1] = '\0'; } strcat(curQual2,qual); } } lastLinePlus=0; } } else { lastLinePlus=0; if (currentline-1==lastStart || (currentline-3==lastStart && lastLinePlus==1)) { //blank sequence or quality information } else { //whitespace } } } //take care of last one if (lastLinePlus==0 || fastqFlag != 1) { int good1 = 0; int good2 = 1; if (curLen >= minSizeToKeep && curLen < maxReadLength) { if (maxObservedLength < curLen) maxObservedLength = curLen; readLengths[curLen]++; good1=1; } else { readLengths[0]++; } if (inputfp2 != NULL || splitPos > 0) { good2=0; if (curLen2 >= minSizeToKeep && curLen2 < maxReadLength) { if (maxObservedLength < curLen2) maxObservedLength = curLen2; readLengths2[curLen2]++; good2=1; } else { readLengths2[0]++; } } if (good1 && good2) { goodReads++; fprintf(outfp, "%s\n%s\n+\n%s\n", curHeader,curSeq,curQual); if (inputfp2 != NULL || splitPos > 0) { fprintf(outfp2, "%s\n%s\n+\n%s\n", curHeader2,curSeq2,curQual2); } } } fprintf(lenfp, "Length\t# reads\tFraction\n"); for (int i=0;i 0) { fprintf(lenfp2, "Length\t# reads\tFraction\n"); for (int i=0;i=matchStart;i--) { int match=1; int numMis = 0; int numMatch = 0; for (int j=adapterLength-1;j>=0;j--) { int nindex = i-(adapterLength-1)+j; if (nindex<0) break; numMatch++; if (seq[nindex] == 'N') continue; if (adapter[j] == 'N') continue; if (seq[nindex] == adapter[j]) { continue; } else { numMis++; if (numMis > maxMisMatches) { match=0; break; } } } if (numMatch < minMatchLength) match = 0; if (match==1) { return i; } } return -1; } int matchAdapter3prime(char* seq, char* adapter, int maxMisMatches, int minMatchLength, int matchStart) { int seqLength = strlen(seq)-1; //seq is expected to have a \n character int adapterLength = strlen(adapter); for (int i=matchStart;i=seqLength) break; numMatch++; if (seq[i+j] == 'N') continue; if (adapter[j] == 'N') continue; if (seq[i+j] == adapter[j]) { continue; } else { numMis++; if (numMis > maxMisMatches) { match=0; break; } } } if (numMatch < minMatchLength) match = 0; if (match==1) { return i; } } return -1; } //---------------------------- truseq ------------------------------------------------------------------- #define MAX_FREQ_TO_REPORT 30 void getTruSeqBarcode(char* buf,int lineLength,char* barcode); int processBarcode(char* barcode, char* read, Hashtable* barcodeFPs, LongInttable* barcodeTotals, char* prefix,int printFlag); void printCMDtruSeq() { fprintf(stderr, "\n\tUsage: homerTools truseq [file2] ...\n"); fprintf(stderr, "\n\tOptions for command: truseq\n"); fprintf(stderr, "\t\t-min <#> (Minimum frequency of barcodes to keep: default=%.3f\n", BARCODE_MINFREQ); fprintf(stderr, "\t\t-freq (output file for barcode frequencies, default=file.freq.txt)\n"); fprintf(stderr, "\t\t-qual <#> (Minimum quality score for barcode nucleotides, default=not used)\n"); fprintf(stderr, "\t\t-qualBase (Minimum quality character in FASTQ file, default=B)\n"); fprintf(stderr, "\t\t\n"); exit(0); } void truSeqProgram(int argc, char** argv) { char* freqFileName = NULL; char** files = new char*[100000]; int numfiles = 0; int qualThreshold = -120; char qualBase = 'B'; int printFlag = 0; if (argc < 3) printCMDtruSeq(); //int barcodeSize = 0; double minPercentage = BARCODE_MINFREQ; fprintf(stderr, "\tDefault minimum barcode percentage is %.0lf%%\n", minPercentage*100); for (int i=2;i 1) { if (format == SEQFILE_FORMAT_UNKNOWN) { if (buf[0] == '@') { format = SEQFILE_FORMAT_FASTQ; fprintf(stderr, "\tFASTQ format detected\n"); } else if (buf[0] == '>') { format = SEQFILE_FORMAT_FASTA; fprintf(stderr, "\tFASTA format detected\n"); } } if ((format == SEQFILE_FORMAT_FASTQ && buf[0] == '@' && lastLinePlus==0) || (format == SEQFILE_FORMAT_FASTA && buf[0] == '>')) { if (minQualityScore >= qualThreshold) { totalReads += processBarcode(barcode,current,barcodeFPs,barcodeTotals,prefix,printFlag); } current[0]='\0'; barcode[0]='\0'; strcpy(current,buf); lastStart = currentline; lastLinePlus=0; minQualityScore=127; getTruSeqBarcode(buf,lineLength,barcode); //fprintf(stdout, "%s\n", barcode); } else if (buf[0] == '+' && currentline-2==lastStart) { //quality header lastLinePlus=1; strcat(current,buf); } else { lastLinePlus=0; strcpy(current,buf); } } else { lastLinePlus=0; if (currentline-1==lastStart || (currentline-3==lastStart && lastLinePlus==1)) { //blank sequence or quality information strcat(current,"\n"); } else { //whitespace } } } if ((format == SEQFILE_FORMAT_FASTQ && lastLinePlus==0) || (format == SEQFILE_FORMAT_FASTA)) { if (minQualityScore >= qualThreshold) { totalReads += processBarcode(barcode,current,barcodeFPs,barcodeTotals,prefix,printFlag); } } fclose(fp); char** keys = barcodeTotals->keys(); StrSort* data = new StrSort[barcodeTotals->total]; for (int j=0;jtotal;j++) { long long int barcodeTotal = barcodeTotals->search(keys[j]); if (printFlag) { FILE* bfp = (FILE*) barcodeFPs->search(keys[j]); fclose(bfp); } double ratio = ((double)barcodeTotal)/((double)totalReads); char* readInfo = new char[10000]; sprintf(readInfo, "%s\t%lld\t%lf\n", keys[j], barcodeTotal,ratio); data[j].str = readInfo; data[j].v = ratio; if (ratio < minPercentage) { strcpy(name, prefix); strcat(name,"."); strcat(name,keys[j]); remove(name); } delete [](keys[j]); } delete []keys; qsort(data,barcodeTotals->total,sizeof(StrSort),&decendStrSort); FILE* freqFP = NULL; strcpy(name,files[i]); strcat(name,".freq.txt"); freqFP = fopen(name, "w"); if (freqFP == NULL) { fprintf(stderr, "!!! Problem opening %s for writing (frequency file)!!!\n", name); exit(0); } fprintf(freqFP, "Barcode\tTotal Reads(of %lld)\tFrequency\n", totalReads); fprintf(stderr, "\tBarcode\tTotal Reads(of %lld)\tFrequency\n", totalReads); for (int i=0;itotal;i++) { fprintf(freqFP,"%s",data[i].str); if (i [options] [file2] ...\n"); fprintf(stderr, "\n\t3rd argument must be the number bp in the barcode\n"); fprintf(stderr, "\n\tOptions for command: barcode\n"); fprintf(stderr, "\t\t-min <#> (Minimum frequency of barcodes to keep: default=%.3f\n", BARCODE_MINFREQ); fprintf(stderr, "\t\t-freq (output file for barcode frequencies, default=file.freq.txt)\n"); fprintf(stderr, "\t\t-qual <#> (Minimum quality score for barcode nucleotides, default=not used)\n"); fprintf(stderr, "\t\t-qualBase (Minimum quality character in FASTQ file, default=B)\n"); fprintf(stderr, "\t\t\n"); exit(0); } void barcodesProgram(int argc, char** argv) { char* freqFileName = NULL; char** files = new char*[100000]; int numfiles = 0; int qualThreshold = -120; char qualBase = 'B'; int printFlag = 1; if (argc < 4) printCMDbarcodes(); int barcodeSize = 0; double minPercentage = BARCODE_MINFREQ; sscanf(argv[2],"%d",&barcodeSize); if (barcodeSize < 1 || barcodeSize > 100) { fprintf(stderr, "!!! Barcode of size %d seems odd - make sure you use the program correctly!!!\n", barcodeSize); } else { fprintf(stderr, "\tUsing barcode size of %d bp\n", barcodeSize); } fprintf(stderr, "\tDefault minimum barcode percentage is %.0lf%%\n", minPercentage*100); for (int i=3;i 1) { if (format == SEQFILE_FORMAT_UNKNOWN) { if (buf[0] == '@') { format = SEQFILE_FORMAT_FASTQ; fprintf(stderr, "\tFASTQ format detected\n"); } else if (buf[0] == '>') { format = SEQFILE_FORMAT_FASTA; fprintf(stderr, "\tFASTA format detected\n"); } } if ((format == SEQFILE_FORMAT_FASTQ && buf[0] == '@' && lastLinePlus==0) || (format == SEQFILE_FORMAT_FASTA && buf[0] == '>')) { if (minQualityScore >= qualThreshold) { totalReads += processBarcode(barcode,current,barcodeFPs,barcodeTotals,prefix,printFlag); } current[0]='\0'; barcode[0]='\0'; strcpy(current,buf); lastStart = currentline; lastLinePlus=0; minQualityScore=127; } else if (buf[0] == '+' && currentline-2==lastStart) { //quality header lastLinePlus=1; strcat(current,buf); } else { if (lineLength-1 >= barcodeSize) { strcat(current,&(buf[barcodeSize])); } else { strcat(current,"\n"); } if (currentline-1==lastStart) { //sequence - get barcode if (lineLength-1 >= barcodeSize) { strncpy(barcode,buf,barcodeSize); } } else { if (lineLength-1 < barcodeSize) { minQualityScore = -1; } else { minQualityScore = buf[0]-qualBase; for (int j=1;j= qualThreshold) { totalReads += processBarcode(barcode,current,barcodeFPs,barcodeTotals,prefix,printFlag); } } fclose(fp); char** keys = barcodeFPs->keys(); StrSort* data = new StrSort[barcodeFPs->total]; for (int j=0;jtotal;j++) { long long int barcodeTotal = barcodeTotals->search(keys[j]); FILE* bfp = (FILE*) barcodeFPs->search(keys[j]); fclose(bfp); double ratio = ((double)barcodeTotal)/((double)totalReads); char* readInfo = new char[10000]; sprintf(readInfo, "%s\t%lld\t%lf\n", keys[j], barcodeTotal,ratio); data[j].str = readInfo; data[j].v = ratio; if (ratio < minPercentage) { strcpy(name, prefix); strcat(name,"."); strcat(name,keys[j]); remove(name); } delete [](keys[j]); } delete []keys; qsort(data,barcodeFPs->total,sizeof(StrSort),&decendStrSort); FILE* freqFP = NULL; strcpy(name,files[i]); strcat(name,".freq.txt"); freqFP = fopen(name, "w"); if (freqFP == NULL) { fprintf(stderr, "!!! Problem opening %s for writing (frequency file)!!!\n", name); exit(0); } fprintf(freqFP, "Barcode\tTotal Reads(of %lld)\tFrequency\n", totalReads); fprintf(stderr, "\tBarcode\tTotal Reads(of %lld)\tFrequency\n", totalReads); for (int i=0;itotal;i++) { fprintf(freqFP,"%s",data[i].str); fprintf(stderr,"\t\t%s",data[i].str); delete [](data[i].str); } delete []data; if (freqFileName != NULL) { fclose(freqFP); } delete barcodeFPs; delete barcodeTotals; } delete []name; delete []files; delete []buf; } int processBarcode(char* barcode, char* read, Hashtable* barcodeFPs, LongInttable* barcodeTotals, char* prefix,int printFlag) { if (barcode[0] == '\0') return 0; if (read[0] == '\0') return 0; long long int total = barcodeTotals->search(barcode); FILE* fp = NULL; if (total == EMPTY_INT) { barcodeTotals->insert(1,barcode); } else { barcodeTotals->insert(total+1,barcode); } if (printFlag) { if (total == EMPTY_INT) { char* name= new char[10000]; strcpy(name, prefix); strcat(name, "."); strcat(name, barcode); fp = fopen(name,"w"); barcodeFPs->insert(fp,barcode); delete []name; } else { barcodeTotals->insert(total+1,barcode); fp = (FILE*) barcodeFPs->search(barcode); } fprintf(fp,"%s",read); } return 1; } int decendStrSort(const void* a, const void* b) { StrSort* aa = (StrSort*)a; StrSort* bb = (StrSort*)b; if (aa->v > bb->v) return -1; if (aa->v < bb->v) return 1; return 0; } //---------------------------- freq --------------------------- void printCMDfreq() { fprintf(stderr, "\n\tUsage: homerTools freq [options] \n"); fprintf(stderr, "\n\tOutputs nucleotide frequencies to stdout\n"); fprintf(stderr, "\n\tOptions for command: freq\n"); fprintf(stderr, "\t\t-format (sequence file format, default: auto detect)\n"); fprintf(stderr, "\t\t-offset <#> (offset of first base in output file, default: 0)\n"); fprintf(stderr, "\t\t-maxlen <#> (Maximum length of sequences to consider, default: length of 1st seq)\n"); fprintf(stderr, "\t\t-o (Output filename, default: output sent to stdout)\n"); fprintf(stderr, "\t\t-gc (calculate CpG/GC content per sequence output to \"filename\")\n"); fprintf(stderr, "\t\t\tOutputFormat: nameCpGGCAGACLength\n"); fprintf(stderr, "\t\t\n"); exit(0); } void freqProgram(int argc, char** argv) { char* outputFilename = NULL; char* gcFilename = NULL; char* seqFile = NULL; int fileFormat = SEQFILE_FORMAT_UNKNOWN; int maxLength = -1; int offset = 0; if (argc < 3) { printCMDfreq(); } fprintf(stderr, "\n"); for (int i=2;iprint(fp); if (outputFilename != NULL) { fclose(fp); } fprintf(stderr, "\n"); } //---------------------------- extract --------------------------- void printCMDextract() { fprintf(stderr, "\n\tUsage: homerTools extract [options]\n"); fprintf(stderr, "\n\tThe can be a single FASTA file instead.\n"); fprintf(stderr, "\tIf using a Directory, files should be named with chromosomes,\n"); fprintf(stderr, "\t\ti.e. chr1.fa or chr1.fa.masked or genome.fa/genome.fa.masked\n"); fprintf(stderr, "\t\tIf having trouble, place all FASTA entries in single file instead of a directory\n"); fprintf(stderr, "\t\tFASTA format: >chrname ... (anything after whitespace will be ignored)\n"); fprintf(stderr, "\tThis program will output sequences to stdout in tab-delimited format\n"); fprintf(stderr, "\n\tOptions for command: extract\n"); fprintf(stderr, "\t\t-fa (output sequences in FASTA format - default is tab-delimited format)\n"); fprintf(stderr, "\t\t-mask (mask out lower case sequence from genome)\n"); fprintf(stderr, "\n\tAlternate Usage: homerTools extract stats \n"); fprintf(stderr, "\t\tDisplays stats about the genome files (such as length)\n"); fprintf(stderr, "\t\t\n"); exit(0); } void extractProgram(int argc, char** argv) { if (argc < 4) { printCMDextract(); } char* posfile = argv[2]; char* genomeDir = argv[3]; int fastaFlag = 0; int statsFlag = 0; int maskFlag = 0; if (strcmp(posfile,"stats")==0) { statsFlag = 1; } fprintf(stderr, "\n"); for (int i=4;iextractSequence(genomeDir,fpout,fastaFlag,maskFlag); } exit(0); } //---------------------------- decontaminate --------------------------- void printCMDdecontaminate() { fprintf(stderr, "\n\tUsage: homerTools decontaminate \n"); fprintf(stderr, "\t\t\t\t [options]\n"); fprintf(stderr, "\n\tRemoves reads from an experiment that contains reads originating from another.\n"); fprintf(stderr, "\tEither specify the fraction of contaminated reads (-frac <#>), or the program\n"); fprintf(stderr, "\twill attempt to estimate (only works for quanitatively different experiments)\n"); fprintf(stderr, "\tCreates contaminationHistogram.txt and contaminationScatterPlot.txt in the\n"); fprintf(stderr, "\toutput directory to help guage the extent of contamination when estimating.\n"); fprintf(stderr, "\n\tTwo tag directories are required after \"homerTools decontaminate\":\n"); fprintf(stderr, "\t\t \n"); fprintf(stderr, "\t\tThe Contaminated Tag Directory will be modified (recommend to copy the original)\n"); fprintf(stderr, "\n\tOptions for command: decontaminate\n"); fprintf(stderr, "\t\t-frac <#> (Estimate fraction of sample that is contaminated, default: auto)\n"); fprintf(stderr, "\t\t-estimateOnly (Only estimate the contamination, do not decontaminate)\n"); fprintf(stderr, "\t\t-o (default: overrites contaminated tag directory)\n"); fprintf(stderr, "\t\t-size <#> (Peak size for estimating contamination/Max distance from contaminant\n"); fprintf(stderr, "\t\t\treads to remove contaminated reads, default: 250)\n"); fprintf(stderr, "\t\t-min <#> (Minimum tag count to consider when estimating contamination, default: 20)\n"); fprintf(stderr, "\t\t\n"); exit(0); } void decontaminateProgram(int argc, char** argv) { if (argc < 4) { printCMDdecontaminate(); } char* expDir = argv[2]; char* inputDir = argv[3]; char* outputDir = NULL; double fraction = -1.0; float minThreshold = 20.0; int estonly = 0; int size = 250; fprintf(stderr, "\n"); for (int i=4;i0;i--) { fprintf(stderr, "\t\t%d\n",i); (void)system("sleep 1"); } } outputDir = expDir; } else { fprintf(stderr, "\tOutput Directory: %s\n", outputDir); char* command = new char[10000]; sprintf(command, "mkdir -p \"%s\"",outputDir); (void)system(command); sprintf(command, "cp \"%s\"/* \"%s\"/",expDir, outputDir); (void)system(command); } TagLibrary* tags = new TagLibrary(outputDir); TagLibrary* input = new TagLibrary(inputDir); tags->readTagDirectory(); input->readTagDirectory(); if (fraction < 0.0) { fprintf(stderr, "\tEstimating fraction of reads in from contaminated experiement\n\n"); fraction = tags->estimateContamination(input,size,minThreshold); if (estonly) exit(0); } else { fprintf(stderr, "\tFraction of reads in from contaminated experiement: %lf%% [user defined]\n", fraction*100.0); } tags->decontaminate(input,fraction,size); //should probably do more quality control... fprintf(stderr, "\n"); exit(0); } //---------------------------- special --------------------------- void specialUniqMapProgram(int argc, char** argv); void printCMDspecial() { fprintf(stderr, "\n\tUsage: homerTools special <2nd command> ...\n"); fprintf(stderr, "\n\tHighly specialized programs\n"); fprintf(stderr, "\n\t2nd Commands:\n"); fprintf(stderr, "\t\tuniqmap (parse Unique mappability files)\n"); //fprintf(stderr, "\t\ttile (create peak file tiling the genome)\n"); fprintf(stderr, "\t\t\n"); exit(0); } void specialProgram(int argc, char** argv) { if (argc < 3) { printCMDspecial(); } for (int i=2;i \n"); fprintf(stderr, "\n\tOptions for command: special uniqmap\n"); fprintf(stderr, "\t\tnone\n"); fprintf(stderr, "\t\t\n"); exit(0); } void specialUniqMapProgram(int argc, char** argv) { if (argc < 5) { printCMDspecialUniqMap(); } char* directory = argv[3]; char* inputFile = argv[4]; char* command = new char[100000]; strcpy(command, "mkdir -p \""); strcat(command, directory); strcat(command, "\""); (void)system(command); char* buf = new char[BUFFER]; char** line = new char*[10000]; int numCols = 0; int initializationSize = 400000000; FILE* fp = fopen(inputFile,"r"); if (fp == NULL) { fprintf(stderr, "!! Could not open input file: %s\n", inputFile); exit(0); } Hashtable* uniqmapchr = new Hashtable(); char* currentChr = new char[10000]; currentChr[0] = '\0'; UniqMapChrs* umc = NULL; int format = -1; while (fgets(buf, BUFFER, fp) != NULL) { split(buf, line, numCols, '\t'); if (format == -1) { if (numCols < 3) continue; char* chr = line[0]; int count = 0; while (chr[count] != '\0') { if (chr[count] == '.') { chr[count] = '\0'; break; } count++; } if (strcmp(chr,currentChr) != 0) { strcpy(currentChr,chr); umc = (UniqMapChrs*) uniqmapchr->search(chr); if (umc == NULL) { umc = new UniqMapChrs(chr,directory,1); //1 to overwrite existing data umc->increaseMapSize(initializationSize); uniqmapchr->insert(umc,chr); } } int position = -1; int strand = -1; sscanf(line[1],"%d",&position); sscanf(line[2],"%d",&strand); umc->setMappable(position,strand); } } fclose(fp); sprintf(command, "%s/uniqMapStats.txt", directory); fp = fopen(command, "w"); long long int gsize=0; long long int mappability = 0; fprintf(fp, "#Chr\tMappable bp (both strands)\tTotal bp (both strand)\n"); char** keys = uniqmapchr->keys(); for (int i=0;itotal;i++) { umc = (UniqMapChrs*) uniqmapchr->search(keys[i]); umc->saveFiles(); gsize += umc->size; mappability += umc->numMappable; fprintf(fp, "%s\t%d\t%d\n",keys[i],umc->numMappable,umc->size); delete [](keys[i]); } delete []keys; fprintf(fp, "genome\t%lld\t%lld\n",mappability,gsize); fclose(fp); delete uniqmapchr; delete []buf; delete []currentChr; delete []line; delete []command; exit(0); } //---------------------------- special cluster --------------------------- void printCMDcluster() { fprintf(stderr, "\n\tUsage: homerTools cluster [options]\n"); fprintf(stderr, "\n\tOptions for command: cluster\n"); fprintf(stderr, "\t\t-d (tab delimited)\n"); fprintf(stderr, "\t\t\t-annCols <#> (number of annotation columns at beginning, def: 1)\n"); fprintf(stderr, "\t\t-i (tab delimited, i.e. gene expression instead of dist matrix)\n"); fprintf(stderr, "\t\t\t-dist (distance measure to analyze raw data matrix with, def: corr)\n"); fprintf(stderr, "\t\t\t-sub <#> (cluster using subset, then adds the rest of the data back,def=7000)\n"); fprintf(stderr, "\t\t-o (prefix for output files, def: out)\n"); fprintf(stderr, "\t\t-r (reverse distance metric = high values are \"closer\")\n"); fprintf(stderr, "\n\tExtract Clusters from output gtr or atr format file:\n"); fprintf(stderr, "\t\t-gtr (output from clustering - cdt must be present too)\n"); fprintf(stderr, "\t\t-thresh <#> (threshold to define clusters)\n"); fprintf(stderr, "\n\tScore regions against clusters (use either -gtr/-d to supply distance matrix):\n"); fprintf(stderr, "\t\t-c [cluster file 2] ... (First column is region name)\n"); fprintf(stderr, "\t\t\n"); exit(0); } void clusterProgram(int argc, char** argv) { fprintf(stderr, "\n"); if (argc < 3) { printCMDcluster(); } char* distMatrixFile = NULL; char* dataMatrixFile = NULL; char* prefix = NULL; char** clusterFiles = new char*[10000]; int numClusterFiles = 0; int reverseFlag = 0; int numAnnCols = 1; int distanceMetric = CLUSTER_DISTANCE_CORRELATION; int matrixOutputFlag = 1; int numSub = CLUSTER_SUB_INIT; char* gtrInputFile = NULL; double clusterThreshold = 0.0; for (int i=2;iloadGTR(gtrInputFile); if (numClusterFiles < 1) { cluster->printClusters(stdout,NULL,clusterThreshold); exit(0); } else { cluster->scoreClusterFiles(clusterFiles, numClusterFiles,clusterScoreFP); exit(0); } } cluster = new TreeCluster(); if (dataMatrixFile != NULL) { cluster->readDataMatrix(dataMatrixFile,numAnnCols); if (cluster->numMatrix > numSub) { cluster->sampleDataMatrix(numSub); } } if (distMatrixFile != NULL) { cluster->readDistMatrix(distMatrixFile,numAnnCols); } if (cluster->matrix == NULL) { cluster->calcDistMatrix(distanceMetric); } if (reverseFlag) { cluster->invertDistMatrix(); } //cluster = new TreeCluster(distMatrix,numMatrix); if (numClusterFiles > 0) { cluster->scoreClusterFiles(clusterFiles, numClusterFiles,clusterScoreFP); } else { cluster->cluster(); if (cluster->subFlag) { cluster->expandClusters(); } cluster->printCDT(prefix,matrixOutputFlag); } exit(0); } //----------------------- matrix3D program void printCMDmatrix3D() { fprintf(stderr, "\n\tUsage: homerTools matrix3D [options]\n"); fprintf(stderr, "\n\tOptions for command: cluster\n"); fprintf(stderr, "\t\t-r (invert the scores i.e. high scores = close together)\n"); fprintf(stderr, "\t\t-2 (input matrix has two columns of identifiers [to work well with treeview])\n"); fprintf(stderr, "\t\t-c (color clusters)\n"); fprintf(stderr, "\t\t-iter <#> (number of iterations)\n"); fprintf(stderr, "\t\t\n"); exit(0); } void matrix3DProgram(int argc, char** argv) { fprintf(stderr, "\n"); if (argc < 3) { printCMDmatrix3D(); } srand(time(NULL)); char* distMatrixFile = argv[2]; //char* prefix = NULL; //int reverseFlag = 0; int col2Flag = 0; char* clusterFile = NULL; double iters = 1e6; for (int i=3;iinit(distMatrix,numMatrix,names,fixedStep); if (clusterFile != NULL) genomeStruct->addColorsFromClusters(clusterFile); genomeStruct->optimize(maxIterations); genomeStruct->print(stdout); }