#include <stdio.h> 
#include <string.h> 
#include <stdlib.h> 
#include <time.h> 

#define TITEL "trimClusterer, 6.1.2008 by Nick Fankhauser" 
#define BLOCKSIZE 1024	// for reading file to count lines 
#define ABOVE_PERCENTAGE 80 // default: 80 (check this!) 
#define CUT2FACTOR 0.1	// default: 0.1 
#define TRIMSTEPS 100 // is this enough? 
#define LARGEFAM 10 // more than 10 members -> trimming 
#define TOOMUCH 0.4 // no more than 40% of proteome 
#define maxTrimLinks 10000 

typedef struct {
	int threshold;
	long size;
	unsigned char **scores;
} matrix;

typedef struct {
	off_t fileLen;
	long maxLen;
	long numLines;
} tStat;

typedef struct {
	int **fams;
	int *fam;
	int nFams;
	int *nFam;
} families;

matrix loadTextMatrix(char* filename);
tStat textStats(char* filename);
families createFam(matrix mat);
void showFamilies(families fs);
void checkFamilies(families fs,char *logFileName);
matrix trimFamilies(matrix mat, families fs);
void freeFamilies(families fs);
void printDuration(int t,char *logFileName);
void saveFamilies(families fs,char *ofn);

tStat textStats(char* filename) {
	FILE *ifp;	
	tStat ts;
	char *buf,c;
	long lineLen,lastPos=0;
	long i;
	ts.maxLen=0;
	ts.numLines=0;
	printf("Analysing text file structure...\n");
	if ((ifp=fopen(filename,"r"))==NULL) {
		printf("Not found: %s\n",filename);
		exit(1);
	}
	// determine file size
	fseek(ifp, 0L, SEEK_END);     /* Position to end of file */
	ts.fileLen = ftello(ifp);        /* Get file length */
	rewind(ifp);                  /* Back to start of file */
	// determine longest line
	buf = malloc(BLOCKSIZE);
	if (buf == NULL) {
		printf("Failed to allocate memory for block buffer!\n");	
		exit(1);
	}
	for (i=0;i<=ts.fileLen;i++) {
		c=fgetc(ifp);
		if (c==EOF) {break;}
		if (c=='\n') {
			ts.numLines++;
			lineLen=i-lastPos;
			if (lineLen>ts.maxLen) {ts.maxLen=lineLen;}
			lastPos=i;
		}
	}
	free(buf);
	fclose(ifp);
	return ts;
}

matrix loadTextMatrix(char* filename) {
	matrix mat;
	tStat mStat;
	FILE *ifp;
	char *buf;
	long totMem=0,lastLen=0,tSum=0,sum=0;
	int n1,n2,nb,selfmatch,i;
	char *result = NULL;
	int *histogram;
	unsigned char nbb;
	char *pBuf;
	const char delims[] = " \n";
	mStat=textStats(filename);
	mat.size=mStat.numLines;
	printf("Loading matrix...\n");
	buf = malloc(mStat.maxLen+1); // buffer for lines
	if (buf == NULL) {
		printf("Failed to allocate memory for line buffer!\n");
		exit(1);
	}
	mat.scores = malloc(sizeof(char *) * (mat.size));
	if (mat.scores == NULL) {
		printf("Failed to allocate memory for matrix buffer!\n");	
		exit(1);
	}
	histogram = calloc(1,sizeof(int) * 256 ); // buffer for histogram
	ifp=fopen(filename,"r");
	for (n1=0;n1<mat.size;n1++) {
		fgets (buf,mStat.maxLen+1,ifp);
		mat.scores[n1] = calloc(1,n1+1);
		if (mat.scores[n1] == NULL) {
			printf("Failed to allocate memory for matrix buffer line %d!\n",n1);	
			exit(1);
		}
		pBuf = malloc(strlen(buf)+1);
		if (pBuf == NULL) {
			printf("Failed to allocate memory for line %d parsing buffer!\n",n1);	
			exit(1);
		}
		strcpy(pBuf,buf);
		result = strtok( pBuf, delims );
		// get selfmatch (last column)
		while( result != NULL ) {
			selfmatch=atoi(result);
       			result = strtok( NULL, delims );
   		}
		n2=0;
		strcpy(pBuf,buf);
		result = strtok( pBuf, delims );
		// normalize and store scores
		while( result != NULL ) {
			nb=atoi(result);
			nbb=(float)nb/(float)selfmatch*255;
			if ((nb<0) || (nbb<1)) {nbb=1;}
			mat.scores[n1][n2]=nbb;
			histogram[nbb]++;
			n2++;
       			result = strtok( NULL, delims );
   		}
		free(pBuf);
		totMem+=n2;
		if (lastLen+1!=n2) {
			printf("%s\n",buf);
			printf("Column error on line %d: Expected %ld, got %d!\n",n1+1,lastLen+1,n2);
			exit(1);
		}
		lastLen=n2;
	}
	fclose(ifp);
	free(buf);
	printf("Size: %llu, Sequences: %ld, maxLine: %ld, Memory used: %ld bytes\n",
	mStat.fileLen,mat.size,mStat.maxLen,totMem);
	mat.threshold=0;totMem+=n1+1;
	tSum=totMem/100*ABOVE_PERCENTAGE;
	#ifdef DEBUG
	printf("tSum: %ld, totMem: %ld\n",tSum,totMem);
	#endif
	for (i=1;i<256;i++) {
		sum+=histogram[i];
		if ((sum>=tSum) && (mat.threshold==0)) {mat.threshold=i;}
	}
	return mat;
}

families createFams(matrix mat) {
	families fs;
	int f1,f2,n1,n2,i,f1in,f2in;
	fs.nFams=0;
	printf("Creating families from matrix, threshold %d.\n",mat.threshold);
	for (n1=0;n1<mat.size;n1++) {
		for (n2=0;n2<n1;n2++) {
			if ((mat.scores[n1][n2]>mat.threshold) && (n1!=n2)) {
				f1in=-1;f2in=-1;
				for (f1=0;f1<fs.nFams;f1++) {
					for (f2=0;f2<fs.nFam[f1];f2++) {
						if (fs.fams[f1][f2]==n1) {f1in=f1;}
						if (fs.fams[f1][f2]==n2) {f2in=f1;}
					}
				}
				if ((f1in==-1) && (f2in==-1)) { // add as new family
					#ifdef DEBUG
					printf("Adding [%d, %d]\n",n1,n2);
					#endif
					if (fs.nFams==0) {
						fs.fams=malloc(sizeof(int*));
						fs.nFam=malloc(sizeof(int));
					} else {
						fs.fams=realloc(fs.fams,sizeof(int*) * (fs.nFams+1));
						fs.nFam=realloc(fs.nFam,sizeof(int) * (fs.nFams+1));
					}
					fs.fams[fs.nFams]=malloc(sizeof(int)*2);
					fs.fams[fs.nFams][0]=n1;
					fs.fams[fs.nFams][1]=n2;
					fs.nFam[fs.nFams]=2;
					fs.nFams++;
				}
				if ((f1in>-1) && (f2in==-1) && (f1in!=f2in)) { // add f2 to family
					#ifdef DEBUG
					printf("%d-%d: Adding %d to family %d\n",n1,n2,n2,f1in);
					#endif
					fs.fams[f1in]=realloc(fs.fams[f1in], sizeof(int)*(fs.nFam[f1in]+1));
					fs.fams[f1in][fs.nFam[f1in]]=n2;
					fs.nFam[f1in]++;				}
				if ((f1in==-1) && (f2in>-1) && (f1in!=f2in)) { // add f1 to family
					#ifdef DEBUG
					printf("%d-%d: Adding %d to family %d\n",n1,n2,n1,f2in);
					#endif
					fs.fams[f2in]=realloc(fs.fams[f2in], sizeof(int)*(fs.nFam[f2in]+1));
					fs.fams[f2in][fs.nFam[f2in]]=n1;
					fs.nFam[f2in]++;
				}
				if ((f1in>-1) && (f2in>-1) && (f1in!=f2in)) { // join two families
					#ifdef DEBUG
					printf("%d-%d: Joining family %d and %d\n",n1,n2,f1in,f2in);
					#endif
					// add f2 to f1
					fs.fams[f1in]=realloc(fs.fams[f1in],
					sizeof(int) * (fs.nFam[f1in] + fs.nFam[f2in]) );
					for (i=0;i<fs.nFam[f2in];i++) {
						fs.fams[f1in][fs.nFam[f1in]+i]=fs.fams[f2in][i];
					}
					fs.nFam[f1in]=fs.nFam[f1in]+fs.nFam[f2in];
					// move last family to f2
					fs.fams[f2in]=realloc(fs.fams[f2in],
					sizeof(int) * fs.nFam[fs.nFams-1] );
					for (i=0;i<fs.nFam[fs.nFams-1];i++) {
						fs.fams[f2in][i]=fs.fams[fs.nFams-1][i];
					}
					fs.nFam[f2in]=fs.nFam[fs.nFams-1];
					// remove last family
					free(fs.fams[fs.nFams-1]);
					fs.fams=realloc(fs.fams,sizeof(int*) * (fs.nFams-1));
					fs.nFam=realloc(fs.nFam,sizeof(int) * (fs.nFams-1));
					fs.nFams--;
				}
			}
		}
	}
	return fs;
}

void showFamilies(families fs) {
	int f1,f2;
	printf("\nFamilies:\n");
	for (f1=0;f1<fs.nFams;f1++) {
		if (fs.nFam[f1]>2) {
			printf("%d: ",f1);
			for (f2=0;f2<fs.nFam[f1];f2++) {
				printf("%d ",fs.fams[f1][f2]);
			}
			printf("\n");
		}
	}
	printf("\n");
}

void saveFamilies(families fs,char *ofn) {
	int f1,f2;
	FILE *ofp;
	ofp=fopen(ofn,"w");
	for (f1=0;f1<fs.nFams;f1++) {
		if (fs.nFam[f1]>2) {
			for (f2=0;f2<fs.nFam[f1];f2++) {
				fprintf(ofp,"%d ",fs.fams[f1][f2]);
			}
			fprintf(ofp,"\n");
		}
	}
	fclose(ofp);
	printf("Families saved to %s\n",ofn);
}


void checkFamilies(families fs,char *logFileName) {
	int f1,famCount=0,famNum=0;
	float avgSize;
	FILE *ofp;
	for (f1=0;f1<fs.nFams;f1++) {
		if (fs.nFam[f1]>2) {
			famCount++;
			famNum+=fs.nFam[f1];
		}
	}
	if (famCount>0) {
		avgSize=(float)famNum/(float)famCount;
		printf("Families: %d, average size: %2.2f\n",famCount,avgSize);
		ofp=fopen(logFileName,"a");	
		fprintf(ofp,"Families: %d, average size: %2.2f\n",famCount,avgSize);
		fclose(ofp);
	} else {
		printf("No families possible at this theshold!\n");
		exit(1);
	}
}

void freeFamilies(families fs) {
	int f1;
	for (f1=0;f1<fs.nFams;f1++) {free(fs.fams[f1]);}
	free(fs.nFam);
	free(fs.fams);
}

matrix trimFamilies(matrix mat, families fs) {
	int f1,f2,f3,p1,p2,p3,liCnt,alloc,cut2,i,mean;
	long sum;
	unsigned char s;
//	unsigned char minScore;
	unsigned char *links;
	int *src1,*src2;
	for (f1=0;f1<fs.nFams;f1++) {
		if (fs.nFam[f1]<LARGEFAM) {continue;}
		#ifdef DEBUG
		printf("\nFamily %d:\n",f1);
		#endif
		alloc=fs.nFam[f1]*fs.nFam[f1]/2;
		if (alloc>maxTrimLinks) {alloc=maxTrimLinks;}
		links=malloc(alloc);
		if (links == NULL) {
			printf("Failed to allocate memory for family %d links buffer!\n",f1);	
			exit(1);
		}
		src1=malloc(sizeof(int) * alloc);
		if (src1 == NULL) {
			printf("Failed to allocate memory for family %d src1 buffer!\n",f1);	
			exit(1);
		}
		src2=malloc(sizeof(int) * alloc);
		if (src2 == NULL) {
			printf("Failed to allocate memory for family %d src2 buffer!\n",f1);	
			exit(1);
		}
		liCnt=0;
		for (f2=0;f2<fs.nFam[f1];f2++) {
			for (f3=0;f3<=f2;f3++) {
				p1=fs.fams[f1][f2];
				p2=fs.fams[f1][f3];
				if (p2>p1) {p3=p1;p1=p2;p2=p3;}
				if (p1!=p2) {
					s=mat.scores[p1][p2];
					if (s>mat.threshold) {
						if (liCnt<alloc) {
							#ifdef DEBUG
							printf("Append: %d-%d: %d\n",p1,p2,s);
							#endif
							links[liCnt]=s;
							src1[liCnt]=p1;
							src2[liCnt]=p2;
							liCnt++;
						}
					}
				}
			}
		}
		#ifdef DEBUG
		printf("Allocated: %d, used: %d\n",alloc,liCnt);
		#endif
		sum=0;
		for (i=0;i<liCnt;i++) {sum+=links[i];}
		mean=sum/liCnt;
		cut2=(mean-mat.threshold)*CUT2FACTOR+mat.threshold;
		#ifdef DEBUG
		printf("Mean link: %d, cut2: %d\n",mean,cut2);
		#endif
		// remove lowest link of extremely large families
		for (i=0;i<liCnt;i++) {
			if (links[i]<cut2) {
				#ifdef DEBUG
				printf("Setting %d-%d to zero.\n",src1[i],src2[i]);
				#endif
				mat.scores[src1[i]][src2[i]]=0;
			}
		}
		if (fs.nFam[f1]>mat.size*TOOMUCH) {
			printf("One family contains more than %2.0f%% of proteins!\n",TOOMUCH*100);
			mat.threshold+=1;
		}
		free(links);
		free(src1);
		free(src2);
	}
	return mat;
}

void printDuration(int t,char *logFileName) {
	int rDays,rHours,rMins,rSecs;
	FILE *ofp;
	rDays=t/(60*60*24);
	rSecs=t-60*60*24*rDays;
	rHours=rSecs/(60*60);
	rSecs-=60*60*rHours;
	rMins=rSecs/60;
	rSecs-=60*rMins;
	ofp=fopen(logFileName,"a");
	if (rDays>0) {
		printf("%d days, ",rDays);
		fprintf(ofp,"%d days, ",rDays);
	}
	if (rHours>0) {
		printf("%d hours, ",rHours);
		fprintf(ofp,"%d hours, ",rHours);
	}
	if (rMins>0) {
		printf("%d minutes, ",rMins);
		fprintf(ofp,"%d minutes, ",rMins);
	}
	printf("%d seconds elapsed.\n",rSecs);
	fprintf(ofp,"%d seconds elapsed.\n",rSecs);
	fclose(ofp);
}

int main(int argc, char *argv[]) {
	matrix mat;
	families fs;
	int step;
	int tStart;
	char *logFn,*famFn;
	FILE *lfp;
	char *logExt=".log";
	char *famExt=".families.txt";
	printf("%s\n",TITEL);
	if (argc<2) {
		printf("Syntax: protClust <matrix.txt>\n");
		exit(1);
	}
	logFn = malloc(strlen(argv[1])+strlen(logExt)+1);
	strcpy(logFn,argv[1]);
	strcat(logFn,logExt);
	famFn = malloc(strlen(argv[1])+strlen(famExt)+1);
	strcpy(famFn,argv[1]);
	strcat(famFn,famExt);
	lfp=fopen(logFn,"w");
	fprintf(lfp,"Loading matrix %s...\n",argv[1]);
	fclose(lfp);
	tStart=time(NULL);
	mat=loadTextMatrix(argv[1]);
	if (mat.size<2) {
		printf("Error: Matrix too small!\n");
		exit(1);
	}		
	printDuration(time(NULL)-tStart,logFn);
	for (step=0;step<TRIMSTEPS;step++) {
		tStart=time(NULL);
		printf("Trim step %d...\n",step+1);
		fs=createFams(mat);
		checkFamilies(fs,logFn);
		saveFamilies(fs,famFn);
		mat=trimFamilies(mat,fs);
		freeFamilies(fs);
		printDuration(time(NULL)-tStart,logFn);
	}
	fs=createFams(mat);
	checkFamilies(fs,logFn);
	showFamilies(fs);
	saveFamilies(fs,famFn);
	freeFamilies(fs);
	return 0;
}