// Copyright 2009, 2010, 2011, 2012 Christopher Benner // // This file is part of HOMER // // HOMER is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // HOMER is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. #include "Clustering.h" void clusterSplit(char* string, char** cols, int &numCols, char delim); TreeCluster::TreeCluster(double** distMatrix, int matrixLength) { init(); init(distMatrix, matrixLength); } TreeCluster::TreeCluster() { init(); } void TreeCluster::init() { matrix = NULL; numMatrix = 0; clusters = NULL; mask = NULL; fixedClusterScore = 0; fixedPositionFlag = 0; reverseFlag = 0; minDistCache = NULL; minIndexCache = NULL; names = NULL; distanceMetric = CLUSTER_DISTANCE_CORRELATION; dataMatrix= NULL; expNames=NULL; numExps=0; numSub = CLUSTER_SUB_INIT; ogNumDataMatrix = 0; ogDataMatrix = NULL; ogDataMatrixNames = NULL; ogMatrix = NULL; subFlag = 0; sub2matrix = NULL; matrix2sub = NULL; dataCDTflag = 0; clusterSummary = NULL; } void TreeCluster::readDistMatrix(char* file, int numAnnCols) { numMatrix = 0; matrix = NULL; names = NULL; char* buf = new char[BUFFER_CLUSTER]; char** line = new char*[10000]; int numCols = 0; FILE* fp = fopen(file,"r"); if (fp == NULL) { fprintf(stderr, "!! Could not open input file: %s\n",file); exit(0); } int lineCount = 0; while (fgets(buf, BUFFER_CLUSTER, fp) != NULL) { split(buf, line, numCols, '\t'); lineCount++; if (lineCount == 1) { numMatrix = numCols-numAnnCols; matrix = new double*[numMatrix]; names = new char*[numMatrix]; for (int i=0;i= (int)(numMatrix*0.9)) { fprintf(stderr, "!!! warning sampling level is > 90%% of total data points (skipping...)\n"); return; } numSub = sub; subFlag = 1; ogNumDataMatrix = numMatrix; ogDataMatrix = dataMatrix; ogDataMatrixNames = names; srand(time(NULL)); dataMatrix = new double*[numMatrix]; names = new char*[numMatrix]; sub2matrix=new int[numMatrix]; matrix2sub=new int[ogNumDataMatrix]; char* randMask = new char[ogNumDataMatrix]; for (int i=0;i -1) { int leafIndex = sub2matrix[newClusters[i].left]; newClusters[i].left = leafIndex; clusterLocation[leafIndex] = i; } if (newClusters[i].right > -1) { int leafIndex = sub2matrix[newClusters[i].right]; newClusters[i].right = leafIndex; clusterLocation[leafIndex] = i; } } int curClusters = numMatrix-1; fprintf(stderr, "\tCluster expansion progress:\n"); fprintf(stderr, "\t|0%% 50%% 100%%|\n\t"); double nextMark = 0.0; double step = 1.0/80.0; for (int i=0;i= nextMark) { nextMark+=step; fprintf(stderr, "="); } if (matrix2sub[i] < 0) { double bestScore = FLT_MAX; int bestIndex = 0; for (int j=0;jinsert(n,line[0]); names[lineCount-3] = n; geneIndex->insert(lineCount-3,line[0]); for (int i=4;ikeys(); for (int i=0;itotal;i++) { char* n = (char*)nameHash->search(keys[i]); //fprintf(stderr, "%d\t%s\n", i,n); geneIndex->insert(i,keys[i]); delete [](keys[i]); } delete []keys; */ delete nameHash; clusters = new TreeNode[numMatrix-1]; mask = new int[numMatrix*2-1]; for (int i=0;iinsert(lineCount,line[0]); double w = 0.0; int left = geneIndex->search(line[1]); if (left == EMPTY_INT) { left = nodeIndex->search(line[1]); if (left == EMPTY_INT) { fprintf(stderr, "!!! Error parsing left node: %s (line:%d)\n", line[1],lineCount+1); exit(0); } clusters[lineCount].left = -1*(left+1); w += clusters[left].weight; } else { clusters[lineCount].left = left; w += 1.0; } int right = geneIndex->search(line[2]); if (right == EMPTY_INT) { right = nodeIndex->search(line[2]); if (right == EMPTY_INT) { fprintf(stderr, "!!! Error parsing right node: %s (line:%d)\n", line[2],lineCount+1); exit(0); } clusters[lineCount].right = -1*(right+1); w += clusters[right].weight; } else { clusters[lineCount].right = right; w += 1.0; } } fclose(fp); rootTree = numMatrix-2; delete []buf; delete []line; delete geneIndex; delete nodeIndex; } void TreeCluster::printClusters(FILE* fp, char** geneNames,double thresh) { //char* filename = new char[10000]; //sprintf(filename,"%s.clusters.txt",prefix); //FILE* fp = fopen(filename,"w"); if (geneNames == NULL) { geneNames = names; } numClusters = 0; int* assignment = getClusters(thresh, numClusters); if (dataCDTflag) { clusterSummary = new double*[numClusters]; for (int i=0;i= 0) { //order[curOrder++] = i; assignment[i] = currentCluster; lastIndex[curTree]++; continue; } else { lastIndex[curTree]++; int treeIndex = (-1*i)-1; lastTree[treeIndex] = curTree; curTree = treeIndex; continue; } } else if (lastIndex[curTree] == 1) { int i = clusters[curTree].right; if (i >= 0) { //order[curOrder++] = i; assignment[i] = currentCluster; lastIndex[curTree]++; continue; } else { lastIndex[curTree]++; int treeIndex = (-1*i)-1; lastTree[treeIndex] = curTree; curTree = treeIndex; continue; } } else { curTree = lastTree[curTree]; } } fprintf(stderr, "\t%d Total clusters identified\n", clusterNumber-1); delete []lastTree; delete []lastIndex; return assignment; } void TreeCluster::printCDT(char* prefix, int matrixFlag) { if (prefix == NULL) { prefix = (char*)"out"; } char* filename = new char[10000]; int* order = getOrder(); //fprintf(stderr, "Tree:\n"); //for (int i=0;irootTree;i--) { fprintf(gtr,"NODE%dX",i+1); if (clusters[i].left < 0) { fprintf(gtr,"\tNODE%dX",-1*(clusters[i].left)); } else { fprintf(gtr,"\tGENE%dX",clusters[i].left); } if (clusters[i].right < 0) { fprintf(gtr,"\tNODE%dX",-1*(clusters[i].right)); } else { fprintf(gtr,"\tGENE%dX",clusters[i].right); } //double outDist = (1-clusters[i].distance)/max; double outDist = -1*clusters[i].distance; //fprintf(stderr, "\t%d=%lf\n", i, clusters[i].distance); if (outDist > last) { outDist = last; } if (fixedClusterScore) { outDist = ((double)(numMatrix-1-i))/(double)numMatrix; } fprintf(gtr,"\t%lf\n",outDist); last = outDist; } for (int i=0;i<=rootTree;i++) { fprintf(gtr,"NODE%dX",i+1); if (clusters[i].left < 0) { fprintf(gtr,"\tNODE%dX",-1*(clusters[i].left)); } else { fprintf(gtr,"\tGENE%dX",clusters[i].left); } if (clusters[i].right < 0) { fprintf(gtr,"\tNODE%dX",-1*(clusters[i].right)); } else { fprintf(gtr,"\tGENE%dX",clusters[i].right); } //double outDist = (1-clusters[i].distance)/max; double outDist = -1*clusters[i].distance; //fprintf(stderr, "\t%d=%lf\n", i, clusters[i].distance); if (outDist > last) { outDist = last; } if (fixedClusterScore) { outDist = ((double)(numMatrix-1-i))/(double)numMatrix; } fprintf(gtr,"\t%lf\n",outDist); last = outDist; } fclose(gtr); delete []order; delete []filename; } void TreeCluster::printClusterSizes(FILE* fp,int maxSizeToReport) { double * sizes = new double[numMatrix+1]; double * expectedSizes = new double[numMatrix+1]; double * expectedSizes2 = new double[numMatrix+1]; for (int i=0;i numMatrix+1) { maxSizeToReport = numMatrix+1; } fprintf(fp , "ClusterRound\tClusterRound"); for (int i=1;i 0.0) { r = (expectedSizes[j]/((double)(numMatrix-i)))*((expectedSizes[k])/((double)(numMatrix-i))); } } odds += r; //fprintf(stderr, "\t\t%d\t%d\t%d\t%lf\n", i,j,k,r); if (m > numMatrix) { badOdds+=r; } else { expectedSizes2[m] += r; expectedSizes2[j] -= r; expectedSizes2[k] -= r; } } }*/ double makeUpFraction = (odds-badOdds)/odds; for (int j=0;j= 0) { order[curOrder++] = i; lastIndex[curTree]++; continue; } else { lastIndex[curTree]++; int treeIndex = (-1*i)-1; lastTree[treeIndex] = curTree; curTree = treeIndex; continue; } } else if (lastIndex[curTree] == 1) { int i = clusters[curTree].right; if (i >= 0) { order[curOrder++] = i; lastIndex[curTree]++; continue; } else { lastIndex[curTree]++; int treeIndex = (-1*i)-1; lastTree[treeIndex] = curTree; curTree = treeIndex; continue; } } else { curTree = lastTree[curTree]; } } delete []lastTree; delete []lastIndex; return order; } void TreeCluster::cluster() { double *newDistance = new double[numMatrix]; fprintf(stderr, "\tClustering progress:\n"); fprintf(stderr, "\t|0%% 50%% 100%%|\n\t"); double nextMark = 0.0; double step = 1.0/80.0; int left=0; int right=0; for (int i=0;i= nextMark) { nextMark+=step; fprintf(stderr, "="); } double min = FLT_MAX; //slow, safe way min = findMinimumPair(left,right); //new hotness /* if (i==0 || fixedPositionFlag) { min = initializeCache(left,right); } else { min = updateCache(left,right); }*/ clusters[i].left = left; clusters[i].right = right; clusters[i].distance = min; double maxSubDistance = min; double leftWeight = 1.0; if (mask[left] < 0) { //is node, not a leaf clusters[i].left = mask[left]; int node = -1*mask[left] - 1; leftWeight = clusters[node].weight; if (clusters[node].distance > maxSubDistance) maxSubDistance = clusters[node].distance; } double rightWeight = 1.0; if (mask[right] < 0) { //is node, not a leaf clusters[i].right = mask[right]; int node = -1*mask[right] - 1; rightWeight = clusters[node].weight; if (clusters[node].distance > maxSubDistance) maxSubDistance = clusters[node].distance; } //fprintf(stderr, "%d\t%lf\t%d\t%d\t%lf\t%lf\t%d\t%d\n", i, min,left,right,leftWeight,rightWeight,mask[left],mask[right] ); clusters[i].distance = maxSubDistance; clusters[i].weight = leftWeight + rightWeight; for (int j=0;j=0;j--) { if (mask[j] == 1) continue; if (matrix[i][j] < minDistCache[i]) { minDistCache[i] = matrix[i][j]; minIndexCache[i] = j; if (matrix[i][j] < gmin) { if (i < j) { left = i; right = j; } else { left = j; right = i; } gmin = matrix[i][j]; } } break; } for (int j=i+1;jinsert(i,names[i]); } int bufSize = 1000000; char* buf = new char[bufSize]; char** line = new char*[100000]; int numCols = 0; int** clusterIndex = new int*[numFiles]; int* clusterSize = new int[numFiles]; for (int i=0;isearch(line[0]); if (index != EMPTY_INT) { clusterIndex[i][clusterSize[i]++] = index; } //fprintf(stderr, "\t\t%s\t%d\n", line[0],index); if (clusterSize[i] >= numMatrix) { fprintf(stderr,"!!! Error - cluster size for file %s exceeds # of regions\n",files[i]); exit(1); } } fclose(fp); fprintf(stderr, "\t%s: %d regions\n", files[i], clusterSize[i]); } fprintf(fp, "Region Name"); for (int i=0;i 0.0) { score /= N; } fprintf(fp, "\t%lf",score); } fprintf(fp, "\n"); } for (int i=0;i