// Authors: Korbinian Schneeberger and Joerg Hagmann
// Copyright (C) 2008 by Max-Planck Institute for Developmental Biology, Tuebingen, Germany

#include "genomemapper.h"

// Forward declarations.
// NOTE(review): none of these three functions is defined or called in the
// code visible in this file -- presumably they live in another translation
// unit; confirm that the prototypes are still needed here.
int alloc_chr_seq_buffer();
int load_chr_sequence();
int desc_parsing(char *c);

// Reads every chromosome sequence from GENOME_FP into heap memory.
//
// Allocates CHR_SEQ (NUM_CHROMOSOMES pointers) and, for chromosome i, a buffer
// of CHR_LENGTH[i] + 1 bytes. Starting at the current position of GENOME_FP,
// the function looks for the next line beginning with '>', then copies all
// following lines up to the next '>' line (or end of file) into the buffer,
// converting each base to upper case and NUL-terminating the result.
//
// Only IUPAC nucleotide codes (either case) are accepted. A space or tab
// following sequence characters on a line is rejected.
//
// All errors are fatal:
//   exit(1) - out of memory, file positioning failure, missing sequence, or
//             a sequence whose length differs from CHR_LENGTH[i]
//   exit(2) - illegal character or whitespace inside a sequence
//
// Returns 0 on success. GENOME_FP is closed before returning.
int load_genome()
{
	// fgets() delivers at most LINE_BUF_SIZE - 1 characters per call; longer
	// sequence lines simply arrive in several pieces.
	enum { LINE_BUF_SIZE = 512 };
	static const char valid_bases[] = "ACGTNRYMKWSBDHVacgtnrymkwsbdhv";

	char line[LINE_BUF_SIZE];
	long fp;	// offset of the line most recently read (ftell() yields long)
	int i;

	if ((CHR_SEQ = (char**) malloc (NUM_CHROMOSOMES * sizeof *CHR_SEQ)) == NULL) {
		perror("ERROR : not enough memory for genome\n");
		exit(1);
	}

	fp = ftell(GENOME_FP);
	if (fp < 0) {
		perror("ERROR : cannot determine position in genome file");
		exit(1);
	}

	for (i = 0; i < NUM_CHROMOSOMES; ++i) {

		const size_t expected = CHR_LENGTH[i];
		size_t pos = 0;	// number of bases seen so far for this chromosome

		if ((CHR_SEQ[i] = (char*) malloc (expected + 1)) == NULL) {
			perror("ERROR : not enough memory for genome\n");
			exit(1);
		}

		// Go back to the start of the last line read for the previous
		// chromosome: if that line was a header, it is the header of this one.
		if (fseek(GENOME_FP, fp, SEEK_SET) != 0) {
			perror("ERROR : cannot seek in genome file");
			exit(1);
		}

		// Skip forward to the header line. Hitting end of file here means the
		// file holds fewer sequences than expected; without this check the
		// loop would never terminate.
		line[0] = '\0';
		while (line[0] != '>') {
			if (fgets(line, sizeof line, GENOME_FP) == NULL) {
				fprintf(stderr, "ERROR: cannot find sequence \"%s\"!\n",CHR_DESC[i]);
				exit(1);
			}
		}

		// The header must be followed by at least one non-header line.
		if (fgets(line, sizeof line, GENOME_FP) == NULL || line[0] == '>') {
			fprintf(stderr, "ERROR: cannot find sequence \"%s\"!\n",CHR_DESC[i]);
			exit(1);
		}

		while (line[0] != '>') {
			// Length of the leading run without blank, tab or newline.
			const size_t linelen = strcspn(line, " \n\t");
			size_t j;

			if (linelen > 0 && (line[linelen] == '\t' || line[linelen] == ' ')) {
				fprintf(stderr, "ERROR: white space character unequal to newline found in genome input file '%s' in chromosome '%s'!\n", GENOME_FILE_NAME, CHR_DESC[i]);
				exit(2);
			}

			for (j = 0; j != linelen; ++j) {
				// line[j] cannot be '\0' here (strcspn stops at the
				// terminator), so strchr() never matches the terminator of
				// valid_bases.
				if (strchr(valid_bases, line[j]) == NULL) {
					fprintf(stderr,"ERROR: Character '%c' encountered in chromosome '%s'! Only IUPAC-code is accepted!\n", line[j], CHR_DESC[i]);
					exit(2);
				}
				// Never write past the allocated buffer; keep counting so the
				// length mismatch can be reported below.
				if (pos < expected) {
					CHR_SEQ[i][pos] = (char) toupper((unsigned char) line[j]);
				}
				++pos;
			}

			// Remember where the next line starts before reading it, so the
			// next chromosome can re-read it if it turns out to be a header.
			fp = ftell(GENOME_FP);
			if (fp < 0) {
				perror("ERROR : cannot determine position in genome file");
				exit(1);
			}
			if (fgets(line, sizeof line, GENOME_FP) == NULL) break;
		}

		CHR_SEQ[i][expected] = '\0';

		// Compare the counted length directly instead of calling strlen() on
		// a buffer that may be only partly initialised.
		if (pos != expected) {
			fprintf(stderr, "ERROR: Idx file seems to be corrupted. Chromosome %d has %d characters at the end! (%d %d)\n",i+1, (int)pos-(int)expected, (int)pos, (int)expected);
			exit(1);
		}

	}

	fclose(GENOME_FP);

	return 0;
}

#ifdef MAP_LOAD_GENOME

#include <sys/stat.h>
#include <sys/mman.h>

// Tries to use the genome FASTA file in place by memory-mapping it read-only.
//
// On success every CHR_SEQ[i] points directly into the mapping, at the first
// byte after the i-th header line. The mapped sequences are NOT NUL-terminated
// and must not be written to (the mapping is PROT_READ). GENOME_FP is left
// open on this path.
//
// The mapping is only usable if, for every chromosome, the CHR_LENGTH[i] bytes
// following the header line are all upper-case IUPAC codes (so no line breaks
// and no lower-case bases inside a sequence). If that is not the case, or if
// mmap() fails, the mapping is dropped and load_genome() is used instead.
//
// Fatal errors (exit(1)): out of memory, fstat() failure, empty genome file,
// or fewer bytes left in the file than CHR_LENGTH[i] requires.
void map_load_genome()
{
    static const char valid_bases[] = "ACGTNRYMKWSBDHV";

    // alloc CHR_SEQ pointers
    if((CHR_SEQ=(char**)malloc(NUM_CHROMOSOMES*sizeof *CHR_SEQ))==NULL)
    {
        perror("ERROR: map_load_genome - malloc");
        exit(1);
    }

    // mmap the genome file
    const int fd=fileno(GENOME_FP);
    struct stat fstats;
    if(fstat(fd,&fstats)!=0)
    {
        perror("ERROR: map_load_genome - stat");
        exit(1);
    }

    const size_t total_space=fstats.st_size;

    if(total_space==0)
    {
        fprintf(stderr,"ERROR: empty genome file\n");
        exit(1);
    }

    void*const raw_mem=mmap(NULL,total_space,
        PROT_READ,MAP_SHARED|MAP_NORESERVE,fd,0);

    if(raw_mem==MAP_FAILED)
    {
        // mapping failed, try load_genome instead
        perror("map_load_genome - cannot map genome file, trying normal load");
        free(CHR_SEQ);
        load_genome();
        return;
    }

    char* cursor=(char*)(raw_mem);
    char*const data_end=cursor+total_space;

    // lookup table of valid base symbols, indexed by byte value (0..255)
    unsigned char is_base[256]={0};
    size_t k;
    for(k=0;valid_bases[k]!='\0';++k)
    {
        is_base[(unsigned char)valid_bases[k]]=1;
    }

    // file is mapped... now check if it's in a sane format
    // or if we have to load it the regular way
    int i;
    for(i=0;i<NUM_CHROMOSOMES;++i)
    {
        // find the header
        while((cursor!=data_end)&&(*cursor!='>'))
        {
            ++cursor;
        }
        // skip the header line, including its terminating newline
        while((cursor!=data_end)&&(*cursor!='\n'))
        {
            ++cursor;
        }
        if(cursor!=data_end) ++cursor;

        const size_t seq_len=CHR_LENGTH[i];
        const size_t s_remain=(size_t)(data_end-cursor);
        if(seq_len>s_remain)
        {
            fprintf(stderr,"ERROR: genome is too short\n");
            exit(1);
        }

        // set sequence pointer
        CHR_SEQ[i]=cursor;

        // check if sequence is sane
        for(k=0;k<seq_len;++k)
        {
            // index with the unsigned byte value: a plain (possibly signed)
            // char >= 0x80 would otherwise read before the start of the table
            if(is_base[(unsigned char)(*cursor)]==0)
            {
                // not in simple format, use load_genome
                if(VERBOSE) fprintf(stderr,"genome fasta is not sane, using normal load\n");
                munmap(raw_mem,total_space);
                free(CHR_SEQ);
                load_genome();
                return;
            }
            ++cursor;
        }
    }
    if(VERBOSE) fprintf(stderr,"genome fasta is sane, file mmapped\n");
}

#endif

