#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#define TITEL "trimClusterer, 6.1.2008 by Nick Fankhauser"
#define BLOCKSIZE 1024
#define ABOVE_PERCENTAGE 80
#define CUT2FACTOR 0.1
#define TRIMSTEPS 100
#define LARGEFAM 10
#define TOOMUCH 0.4
#define maxTrimLinks 10000
typedef struct {
int threshold;
long size;
unsigned char **scores;
} matrix;
typedef struct {
off_t fileLen;
long maxLen;
long numLines;
} tStat;
typedef struct {
int **fams;
int *fam;
int nFams;
int *nFam;
} families;
matrix loadTextMatrix(char* filename);
tStat textStats(char* filename);
families createFam(matrix mat);
void showFamilies(families fs);
void checkFamilies(families fs,char *logFileName);
matrix trimFamilies(matrix mat, families fs);
void freeFamilies(families fs);
void printDuration(int t,char *logFileName);
void saveFamilies(families fs,char *ofn);
tStat textStats(char* filename) {
FILE *ifp;
tStat ts;
char *buf,c;
long lineLen,lastPos=0;
long i;
ts.maxLen=0;
ts.numLines=0;
printf("Analysing text file structure...\n");
if ((ifp=fopen(filename,"r"))==NULL) {
printf("Not found: %s\n",filename);
exit(1);
}
fseek(ifp, 0L, SEEK_END);
ts.fileLen = ftello(ifp);
rewind(ifp);
buf = malloc(BLOCKSIZE);
if (buf == NULL) {
printf("Failed to allocate memory for block buffer!\n");
exit(1);
}
for (i=0;i<=ts.fileLen;i++) {
c=fgetc(ifp);
if (c==EOF) {break;}
if (c=='\n') {
ts.numLines++;
lineLen=i-lastPos;
if (lineLen>ts.maxLen) {ts.maxLen=lineLen;}
lastPos=i;
}
}
free(buf);
fclose(ifp);
return ts;
}
matrix loadTextMatrix(char* filename) {
matrix mat;
tStat mStat;
FILE *ifp;
char *buf;
long totMem=0,lastLen=0,tSum=0,sum=0;
int n1,n2,nb,selfmatch,i;
char *result = NULL;
int *histogram;
unsigned char nbb;
char *pBuf;
const char delims[] = " \n";
mStat=textStats(filename);
mat.size=mStat.numLines;
printf("Loading matrix...\n");
buf = malloc(mStat.maxLen+1);
if (buf == NULL) {
printf("Failed to allocate memory for line buffer!\n");
exit(1);
}
mat.scores = malloc(sizeof(char *) * (mat.size));
if (mat.scores == NULL) {
printf("Failed to allocate memory for matrix buffer!\n");
exit(1);
}
histogram = calloc(1,sizeof(int) * 256 );
ifp=fopen(filename,"r");
for (n1=0;n1<mat.size;n1++) {
fgets (buf,mStat.maxLen+1,ifp);
mat.scores[n1] = calloc(1,n1+1);
if (mat.scores[n1] == NULL) {
printf("Failed to allocate memory for matrix buffer line %d!\n",n1);
exit(1);
}
pBuf = malloc(strlen(buf)+1);
if (pBuf == NULL) {
printf("Failed to allocate memory for line %d parsing buffer!\n",n1);
exit(1);
}
strcpy(pBuf,buf);
result = strtok( pBuf, delims );
while( result != NULL ) {
selfmatch=atoi(result);
result = strtok( NULL, delims );
}
n2=0;
strcpy(pBuf,buf);
result = strtok( pBuf, delims );
while( result != NULL ) {
nb=atoi(result);
nbb=(float)nb/(float)selfmatch*255;
if ((nb<0) || (nbb<1)) {nbb=1;}
mat.scores[n1][n2]=nbb;
histogram[nbb]++;
n2++;
result = strtok( NULL, delims );
}
free(pBuf);
totMem+=n2;
if (lastLen+1!=n2) {
printf("%s\n",buf);
printf("Column error on line %d: Expected %ld, got %d!\n",n1+1,lastLen+1,n2);
exit(1);
}
lastLen=n2;
}
fclose(ifp);
free(buf);
printf("Size: %llu, Sequences: %ld, maxLine: %ld, Memory used: %ld bytes\n",
mStat.fileLen,mat.size,mStat.maxLen,totMem);
mat.threshold=0;totMem+=n1+1;
tSum=totMem/100*ABOVE_PERCENTAGE;
#ifdef DEBUG
printf("tSum: %ld, totMem: %ld\n",tSum,totMem);
#endif
for (i=1;i<256;i++) {
sum+=histogram[i];
if ((sum>=tSum) && (mat.threshold==0)) {mat.threshold=i;}
}
return mat;
}
families createFams(matrix mat) {
families fs;
int f1,f2,n1,n2,i,f1in,f2in;
fs.nFams=0;
printf("Creating families from matrix, threshold %d.\n",mat.threshold);
for (n1=0;n1<mat.size;n1++) {
for (n2=0;n2<n1;n2++) {
if ((mat.scores[n1][n2]>mat.threshold) && (n1!=n2)) {
f1in=-1;f2in=-1;
for (f1=0;f1<fs.nFams;f1++) {
for (f2=0;f2<fs.nFam[f1];f2++) {
if (fs.fams[f1][f2]==n1) {f1in=f1;}
if (fs.fams[f1][f2]==n2) {f2in=f1;}
}
}
if ((f1in==-1) && (f2in==-1)) {
#ifdef DEBUG
printf("Adding [%d, %d]\n",n1,n2);
#endif
if (fs.nFams==0) {
fs.fams=malloc(sizeof(int*));
fs.nFam=malloc(sizeof(int));
} else {
fs.fams=realloc(fs.fams,sizeof(int*) * (fs.nFams+1));
fs.nFam=realloc(fs.nFam,sizeof(int) * (fs.nFams+1));
}
fs.fams[fs.nFams]=malloc(sizeof(int)*2);
fs.fams[fs.nFams][0]=n1;
fs.fams[fs.nFams][1]=n2;
fs.nFam[fs.nFams]=2;
fs.nFams++;
}
if ((f1in>-1) && (f2in==-1) && (f1in!=f2in)) {
#ifdef DEBUG
printf("%d-%d: Adding %d to family %d\n",n1,n2,n2,f1in);
#endif
fs.fams[f1in]=realloc(fs.fams[f1in], sizeof(int)*(fs.nFam[f1in]+1));
fs.fams[f1in][fs.nFam[f1in]]=n2;
fs.nFam[f1in]++; }
if ((f1in==-1) && (f2in>-1) && (f1in!=f2in)) {
#ifdef DEBUG
printf("%d-%d: Adding %d to family %d\n",n1,n2,n1,f2in);
#endif
fs.fams[f2in]=realloc(fs.fams[f2in], sizeof(int)*(fs.nFam[f2in]+1));
fs.fams[f2in][fs.nFam[f2in]]=n1;
fs.nFam[f2in]++;
}
if ((f1in>-1) && (f2in>-1) && (f1in!=f2in)) {
#ifdef DEBUG
printf("%d-%d: Joining family %d and %d\n",n1,n2,f1in,f2in);
#endif
fs.fams[f1in]=realloc(fs.fams[f1in],
sizeof(int) * (fs.nFam[f1in] + fs.nFam[f2in]) );
for (i=0;i<fs.nFam[f2in];i++) {
fs.fams[f1in][fs.nFam[f1in]+i]=fs.fams[f2in][i];
}
fs.nFam[f1in]=fs.nFam[f1in]+fs.nFam[f2in];
fs.fams[f2in]=realloc(fs.fams[f2in],
sizeof(int) * fs.nFam[fs.nFams-1] );
for (i=0;i<fs.nFam[fs.nFams-1];i++) {
fs.fams[f2in][i]=fs.fams[fs.nFams-1][i];
}
fs.nFam[f2in]=fs.nFam[fs.nFams-1];
free(fs.fams[fs.nFams-1]);
fs.fams=realloc(fs.fams,sizeof(int*) * (fs.nFams-1));
fs.nFam=realloc(fs.nFam,sizeof(int) * (fs.nFams-1));
fs.nFams--;
}
}
}
}
return fs;
}
void showFamilies(families fs) {
int f1,f2;
printf("\nFamilies:\n");
for (f1=0;f1<fs.nFams;f1++) {
if (fs.nFam[f1]>2) {
printf("%d: ",f1);
for (f2=0;f2<fs.nFam[f1];f2++) {
printf("%d ",fs.fams[f1][f2]);
}
printf("\n");
}
}
printf("\n");
}
void saveFamilies(families fs,char *ofn) {
int f1,f2;
FILE *ofp;
ofp=fopen(ofn,"w");
for (f1=0;f1<fs.nFams;f1++) {
if (fs.nFam[f1]>2) {
for (f2=0;f2<fs.nFam[f1];f2++) {
fprintf(ofp,"%d ",fs.fams[f1][f2]);
}
fprintf(ofp,"\n");
}
}
fclose(ofp);
printf("Families saved to %s\n",ofn);
}
void checkFamilies(families fs,char *logFileName) {
int f1,famCount=0,famNum=0;
float avgSize;
FILE *ofp;
for (f1=0;f1<fs.nFams;f1++) {
if (fs.nFam[f1]>2) {
famCount++;
famNum+=fs.nFam[f1];
}
}
if (famCount>0) {
avgSize=(float)famNum/(float)famCount;
printf("Families: %d, average size: %2.2f\n",famCount,avgSize);
ofp=fopen(logFileName,"a");
fprintf(ofp,"Families: %d, average size: %2.2f\n",famCount,avgSize);
fclose(ofp);
} else {
printf("No families possible at this theshold!\n");
exit(1);
}
}
void freeFamilies(families fs) {
int f1;
for (f1=0;f1<fs.nFams;f1++) {free(fs.fams[f1]);}
free(fs.nFam);
free(fs.fams);
}
matrix trimFamilies(matrix mat, families fs) {
int f1,f2,f3,p1,p2,p3,liCnt,alloc,cut2,i,mean;
long sum;
unsigned char s;
unsigned char *links;
int *src1,*src2;
for (f1=0;f1<fs.nFams;f1++) {
if (fs.nFam[f1]<LARGEFAM) {continue;}
#ifdef DEBUG
printf("\nFamily %d:\n",f1);
#endif
alloc=fs.nFam[f1]*fs.nFam[f1]/2;
if (alloc>maxTrimLinks) {alloc=maxTrimLinks;}
links=malloc(alloc);
if (links == NULL) {
printf("Failed to allocate memory for family %d links buffer!\n",f1);
exit(1);
}
src1=malloc(sizeof(int) * alloc);
if (src1 == NULL) {
printf("Failed to allocate memory for family %d src1 buffer!\n",f1);
exit(1);
}
src2=malloc(sizeof(int) * alloc);
if (src2 == NULL) {
printf("Failed to allocate memory for family %d src2 buffer!\n",f1);
exit(1);
}
liCnt=0;
for (f2=0;f2<fs.nFam[f1];f2++) {
for (f3=0;f3<=f2;f3++) {
p1=fs.fams[f1][f2];
p2=fs.fams[f1][f3];
if (p2>p1) {p3=p1;p1=p2;p2=p3;}
if (p1!=p2) {
s=mat.scores[p1][p2];
if (s>mat.threshold) {
if (liCnt<alloc) {
#ifdef DEBUG
printf("Append: %d-%d: %d\n",p1,p2,s);
#endif
links[liCnt]=s;
src1[liCnt]=p1;
src2[liCnt]=p2;
liCnt++;
}
}
}
}
}
#ifdef DEBUG
printf("Allocated: %d, used: %d\n",alloc,liCnt);
#endif
sum=0;
for (i=0;i<liCnt;i++) {sum+=links[i];}
mean=sum/liCnt;
cut2=(mean-mat.threshold)*CUT2FACTOR+mat.threshold;
#ifdef DEBUG
printf("Mean link: %d, cut2: %d\n",mean,cut2);
#endif
for (i=0;i<liCnt;i++) {
if (links[i]<cut2) {
#ifdef DEBUG
printf("Setting %d-%d to zero.\n",src1[i],src2[i]);
#endif
mat.scores[src1[i]][src2[i]]=0;
}
}
if (fs.nFam[f1]>mat.size*TOOMUCH) {
printf("One family contains more than %2.0f%% of proteins!\n",TOOMUCH*100);
mat.threshold+=1;
}
free(links);
free(src1);
free(src2);
}
return mat;
}
void printDuration(int t,char *logFileName) {
int rDays,rHours,rMins,rSecs;
FILE *ofp;
rDays=t/(60*60*24);
rSecs=t-60*60*24*rDays;
rHours=rSecs/(60*60);
rSecs-=60*60*rHours;
rMins=rSecs/60;
rSecs-=60*rMins;
ofp=fopen(logFileName,"a");
if (rDays>0) {
printf("%d days, ",rDays);
fprintf(ofp,"%d days, ",rDays);
}
if (rHours>0) {
printf("%d hours, ",rHours);
fprintf(ofp,"%d hours, ",rHours);
}
if (rMins>0) {
printf("%d minutes, ",rMins);
fprintf(ofp,"%d minutes, ",rMins);
}
printf("%d seconds elapsed.\n",rSecs);
fprintf(ofp,"%d seconds elapsed.\n",rSecs);
fclose(ofp);
}
int main(int argc, char *argv[]) {
matrix mat;
families fs;
int step;
int tStart;
char *logFn,*famFn;
FILE *lfp;
char *logExt=".log";
char *famExt=".families.txt";
printf("%s\n",TITEL);
if (argc<2) {
printf("Syntax: protClust <matrix.txt>\n");
exit(1);
}
logFn = malloc(strlen(argv[1])+strlen(logExt)+1);
strcpy(logFn,argv[1]);
strcat(logFn,logExt);
famFn = malloc(strlen(argv[1])+strlen(famExt)+1);
strcpy(famFn,argv[1]);
strcat(famFn,famExt);
lfp=fopen(logFn,"w");
fprintf(lfp,"Loading matrix %s...\n",argv[1]);
fclose(lfp);
tStart=time(NULL);
mat=loadTextMatrix(argv[1]);
if (mat.size<2) {
printf("Error: Matrix too small!\n");
exit(1);
}
printDuration(time(NULL)-tStart,logFn);
for (step=0;step<TRIMSTEPS;step++) {
tStart=time(NULL);
printf("Trim step %d...\n",step+1);
fs=createFams(mat);
checkFamilies(fs,logFn);
saveFamilies(fs,famFn);
mat=trimFamilies(mat,fs);
freeFamilies(fs);
printDuration(time(NULL)-tStart,logFn);
}
fs=createFams(mat);
checkFamilies(fs,logFn);
showFamilies(fs);
saveFamilies(fs,famFn);
freeFamilies(fs);
return 0;
}