// Authors: Korbinian Schneeberger, Stephan Ossowski and Joerg Hagmann
// Copyright (C) 2008 by Max-Planck Institute for Developmental Biology, Tuebingen, Germany

#include "genomemapper.h"

// Forward declarations. The definitions are not visible in this part of the
// file, and none of these functions is called by load_genome() below.
// NOTE(review): purposes are inferred from the names only -- confirm against
// the definitions before relying on them.
int alloc_chr_seq_buffer();
int load_chr_sequence();
int desc_parsing(char *c);

int load_genome()
{	
	int i,j;
		
	if ((CHR_SEQ = (char**) malloc (NUM_CHROMOSOMES * sizeof(char**))) == NULL) {
		fprintf(stderr, "ERROR : not enough memory for genome\n");
		exit(1);
	}

	char line[513];
	unsigned int fp = ftell(GENOME_FP);
	unsigned int linelen;
	
	for (i=0; i!=NUM_CHROMOSOMES; ++i) {

		if ((*(CHR_SEQ+i) = (char*) malloc ((CHR_LENGTH[i] + 1) * sizeof(char))) == NULL) {
			fprintf(stderr, "ERROR : not enough memory for genome\n");
			exit(1);
		}
		
		// load from file
		if (DEBUG) printf("Load chromosome %d with length %d\n",i+1, CHR_LENGTH[i]);
		
		unsigned int pos = 0;
		
		line[0] = '\0';
		fseek(GENOME_FP, fp, SEEK_SET);
		
		while (line[0] != '>') fgets(line, 512, GENOME_FP);
	
		if (fgets(line, 512, GENOME_FP) == NULL || line[0] == '>') {
			fprintf(stderr, "ERROR: cannot find sequence \"%s\"!\n",CHR_DESC[i]);
			exit(1);
		}
		while (line[0] != '>') {
			linelen = strcspn(line, " \n\t");
			if (linelen > 0 && (line[linelen] == '\t' || line[linelen] == ' ')) {
				fprintf(stderr, "ERROR: white space character unequal to newline found in genome input file '%s' in chromosome '%s'!\n", GENOME_FILE_NAME, CHR_DESC[i]);
				exit(0);
			}
			for (j=0; j!=linelen; j++){
				if (line[j]=='A' || line[j]=='a' || line[j]=='C' || line[j]=='c' ||
				    line[j]=='G' || line[j]=='g' || line[j]=='T' || line[j]=='t' || 
				    line[j]=='N' || line[j]=='n' || line[j]=='R' || line[j]=='r' ||
				    line[j]=='Y' || line[j]=='y' || line[j]=='M' || line[j]=='m' || 
				    line[j]=='K' || line[j]=='k' || line[j]=='W' || line[j]=='w' || 
				    line[j]=='S' || line[j]=='s' || line[j]=='B' || line[j]=='b' ||
				    line[j]=='D' || line[j]=='d' || line[j]=='H' || line[j]=='h' ||
				    line[j]=='V' || line[j]=='v') {
					CHR_SEQ[i][pos++] = toupper(line[j]);
				}
				else {
					fprintf(stderr,"ERROR: Character '%c' encountered in chromosome '%s'! Only IUPAC-code is accepted!\n", line[j], CHR_DESC[i]);
					exit(0);
				} 
			}
			// if you can assume that each base is already upper case, use the next 2 statements:
			//strncpy(CHR_SEQ[i] + pos, line, strlen(line) - (line[strlen(line)-1] == '\n'));
			//pos += strlen(line) - (line[strlen(line)-1] == '\n');
			
			fp = ftell(GENOME_FP);
			if (fgets(line, 512, GENOME_FP) == NULL) break;
		}
		
		CHR_SEQ[i][CHR_LENGTH[i]] = '\0';
		
		if (CHR_LENGTH[i] != strlen(CHR_SEQ[i])) {
			fprintf(stderr, "ERROR: Idx file seems to be corrupted. Chromosome %d has %d characters at the end! (%d %d)\n",i+1, (int)strlen(CHR_SEQ[i])-CHR_LENGTH[i], (int)strlen(CHR_SEQ[i]), CHR_LENGTH[i]);
			exit(1);
		}
		
	}
	
	if (DEBUG)
		for (i = 0; i != NUM_CHROMOSOMES; ++i) {
			printf("len %d strlen %d:", CHR_LENGTH[i], (int)strlen(CHR_SEQ[i]));
		}

	fclose(GENOME_FP);
	
	return 0;	
}
