/****************************************************************\
*                                                                *
*  Library for manipulation of exonerate dataset files           *
*                                                                *
*  Guy St.C. Slater..   mailto:guy@ebi.ac.uk                     *
*  Copyright (C) 2000-2006.  All Rights Reserved.                *
*                                                                *
*  This source code is distributed under the terms of the        *
*  GNU Lesser General Public License. See the file COPYING       *
*  or http://www.fsf.org/copyleft/lesser.html for details        *
*                                                                *
*  If you use this code, please keep this notice intact.         *
*                                                                *
\****************************************************************/

#include <stdlib.h> /* For qsort() */
#include <string.h> /* For strcmp() */

#include "dataset.h"
#include "fastadb.h"
#include "bitarray.h"

/* Write <num> to <fp> as a 64-bit integer in big-endian
 * (network) byte order.
 *
 * Aborts via g_error() when the value cannot be written in full,
 * so a failed write never leaves a silently truncated file behind.
 */
static void Dataset_write_int(guint64 num, FILE *fp){
    guint64 nbo_num = GUINT64_TO_BE(num); /* BigEndian == NBO */
    if(fwrite(&nbo_num, sizeof(guint64), 1, fp) != 1)
        g_error("Could not write integer to dataset file");
    return;
    }

/* Read one 64-bit big-endian (network byte order) integer from <fp>
 * and return it in host byte order.
 *
 * Aborts via g_error() when fewer than eight bytes are available,
 * instead of returning whatever happened to be in the local buffer.
 */
static guint64 Dataset_read_int(FILE *fp){
    guint64 num = 0;
    if(fread(&num, sizeof(guint64), 1, fp) != 1)
        g_error("Could not read integer from dataset file");
    return GUINT64_FROM_BE(num);
    }

/**/

/* Allocate a zero-filled header and set the fields known up front.
 *
 * The type field is a bit set: bit 0 is set for DNA input,
 * bit 1 is set when the input is softmasked.
 * All remaining fields stay zero here; they are filled in
 * during sequence parsing or Dataset_finalise.
 */
static Dataset_Header *Dataset_Header_create(gboolean is_dna,
                                             gboolean softmask_input){
    register Dataset_Header *hdr = g_new0(Dataset_Header, 1);
    register gint dna_bit = 0, softmask_bit = 0;
    if(is_dna)
        dna_bit = 1;
    if(softmask_input)
        softmask_bit = 1;
    hdr->magic = 0; /* FIXME: pick magic number */
    hdr->version = 1;
    hdr->type = dna_bit | (softmask_bit << 1);
    g_message("made sm [%d] type [%d]", softmask_input, (gint)hdr->type);
    return hdr;
    }

/* Release the memory held by <header>. */
static void Dataset_Header_destroy(Dataset_Header *header){
    g_free(header);
    }

/* Serialise every field of <header> to <fp>, one integer per field
 * via Dataset_write_int(), in the same order that
 * Dataset_Header_read() consumes them.
 */
static void Dataset_Header_write(Dataset_Header *header, FILE *fp){
    register Dataset_Header *hdr = header;

    /* Identification fields */
    Dataset_write_int(hdr->magic, fp);
    Dataset_write_int(hdr->version, fp);
    Dataset_write_int(hdr->type, fp);
    Dataset_write_int(hdr->line_length, fp);
    g_message("Write: magic [%d] version [%d] type [%d] line_len [%d]",
              (gint)hdr->magic, (gint)hdr->version,
              (gint)hdr->type, (gint)hdr->line_length);

    /* Database statistics */
    Dataset_write_int(hdr->number_of_dbs, fp);
    Dataset_write_int(hdr->max_db_len, fp);
    Dataset_write_int(hdr->total_db_len, fp);

    /* Sequence statistics */
    Dataset_write_int(hdr->number_of_seqs, fp);
    Dataset_write_int(hdr->max_seq_len, fp);
    Dataset_write_int(hdr->total_seq_len, fp);

    /* File layout */
    Dataset_write_int(hdr->path_data_offset, fp);
    Dataset_write_int(hdr->seq_info_offset, fp);
    Dataset_write_int(hdr->seq_data_offset, fp);
    Dataset_write_int(hdr->total_file_length, fp);
    }

static Dataset_Header *Dataset_Header_read(FILE *fp){
    register Dataset_Header *header = g_new(Dataset_Header, 1);
    /**/
    header->magic = Dataset_read_int(fp);
    header->version = Dataset_read_int(fp);
    header->type = Dataset_read_int(fp);
    header->line_length = Dataset_read_int(fp);
    g_message("Read: magic [%d] version [%d] type [%d] line_len [%d]",
            (gint)header->magic, (gint)header->version,
            (gint)header->type, (gint)header->line_length);
    /**/
    header->number_of_dbs = Dataset_read_int(fp);
    header->max_db_len = Dataset_read_int(fp);
    header->total_db_len = Dataset_read_int(fp);
    g_message("Have num dbs [%d], max db [%d], total [%d]",
            (gint)header->number_of_dbs,
            (gint)header->max_db_len,
            (gint)header->total_db_len);
    /**/
    header->number_of_seqs = Dataset_read_int(fp);
    header->max_seq_len = Dataset_read_int(fp);
    header->total_seq_len = Dataset_read_int(fp);
    /**/
    header->path_data_offset = Dataset_read_int(fp);
    header->seq_info_offset = Dataset_read_int(fp);
    header->seq_data_offset = Dataset_read_int(fp);
    header->total_file_length = Dataset_read_int(fp);
    /**/
    return header;
    }

/**/

/* Build the dataset record for one parsed FASTA entry.
 *
 * The record takes the key returned by FastaDB_Seq_get_key(),
 * the checksum from Sequence_checksum(), and private copies
 * of the id and (optional) definition strings, so it remains
 * valid after <fdbs> is destroyed.
 * Release with Dataset_Sequence_destroy().
 */
static Dataset_Sequence *Dataset_Sequence_create(FastaDB_Seq *fdbs){
    register Dataset_Sequence *seq = g_new0(Dataset_Sequence, 1);
    seq->key = FastaDB_Seq_get_key(fdbs);
    seq->gcg_checksum = Sequence_checksum(fdbs->seq);
    seq->id = g_strdup(fdbs->seq->id);
    /* g_strdup() maps NULL to NULL, so a missing def stays NULL */
    seq->def = g_strdup(fdbs->seq->def);
    /* Never hand a NULL pointer to a %s conversion */
    g_message("set id [%s] def [%s]", seq->id,
              seq->def?seq->def:"(null)");
    return seq;
    }

/* Release a sequence record together with its key and the
 * id and def strings it owns.
 */
static void Dataset_Sequence_destroy(Dataset_Sequence *seq){
    FastaDB_Key_destroy(seq->key);
    g_free(seq->id);
    g_free(seq->def); /* g_free() ignores NULL */
    g_free(seq);
    }

static int Dataset_Sequence_compare_by_id_uniq(const void *a,
                                          const void *b){
    register Dataset_Sequence **seq_a = (Dataset_Sequence**)a,
                              **seq_b = (Dataset_Sequence**)b;
    register gint retval = strcmp((*seq_a)->id, (*seq_b)->id);
    g_message("compare [%s][%s]", (*seq_a)->id, (*seq_b)->id);
    if(!retval)
        g_error("Dataset has duplicate sequence id: [%s]",
                (*seq_a)->id);
    return retval;
    }

static int Dataset_Sequence_compare_by_id(const void *a,
                                          const void *b){
    register Dataset_Sequence **seq_a = (Dataset_Sequence**)a,
                              **seq_b = (Dataset_Sequence**)b;
    register gint retval = strcmp((*seq_a)->id, (*seq_b)->id);
    g_message("compare [%s][%s]", (*seq_a)->id, (*seq_b)->id);
    return retval;
    }

/**/

/* Work out how many bits are needed to store each packed field
 * of a per-sequence record, from the maxima held in <header>.
 *
 * Each width is the number of right shifts needed to reduce the
 * header value to zero. seq_data_item_size is the total of the
 * three widths plus 14 bits for the GCG checksum, rounded up
 * to whole bytes.
 */
static Dataset_Width *Dataset_Width_create(Dataset_Header *header){
    register Dataset_Width *width = g_new0(Dataset_Width, 1);
    /* The counter must be unsigned and at least 64 bits wide:
     * a 32-bit signed counter truncates large values (e.g. a
     * database longer than 2Gb) and, once negative, may never
     * shift down to zero.
     */
    register unsigned long long n;
    register gint total_bits;
    g_message("A");
    for(n = header->number_of_dbs; n; n >>= 1){
        g_message("loop on n [%d]", (gint)n);
        width->num_db_width++;
        }
    g_message("B");
    for(n = header->max_db_len; n; n >>= 1)
        width->max_db_len_width++;
    g_message("C");
    for(n = header->max_seq_len; n; n >>= 1)
        width->max_seq_len_width++;
    g_message("D");
    total_bits = width->num_db_width
               + width->max_db_len_width
               + width->max_seq_len_width
               + 14; /* for gcg checksum */
    /* Round up to a whole number of bytes */
    width->seq_data_item_size = (total_bits >> 3)
                              + ((total_bits % 8)?1:0);
    return width;
    }

/* Release the memory held by <width>. */
static void Dataset_Width_destroy(Dataset_Width *width){
    g_free(width);
    }

/**/

/* Build a Dataset from the FASTA files named in <path_list>.
 *
 * Every sequence is parsed once to collect its key, checksum,
 * id and definition; the records are then sorted by id
 * (a duplicate id is a fatal error) and the header statistics,
 * field widths and section offsets are computed.
 *
 * The returned dataset has a reference count of one;
 * release it with Dataset_destroy().
 */
Dataset *Dataset_create(GPtrArray *path_list,
                        gboolean is_dna, gboolean softmask_input){
    register Dataset *dataset = g_new(Dataset, 1);
    register FastaDB_Seq *fdbs;
    register Dataset_Sequence *ds;
    register gint i;
    register gsize path_data_size, seq_info_size, seq_data_size;
    register gchar *path;
    dataset->ref_count = 1;
    dataset->alphabet = Alphabet_create((is_dna?Alphabet_Type_DNA
                                               :Alphabet_Type_PROTEIN),
                                        softmask_input);
    dataset->header = Dataset_Header_create(is_dna, softmask_input);
    dataset->fdb = FastaDB_open_list(path_list, dataset->alphabet);
    /* Collect one record per sequence, updating the running totals */
    dataset->seq_list = g_ptr_array_new();
    while((fdbs = FastaDB_next(dataset->fdb, FastaDB_Mask_ALL))){
        g_message("read seq [%s]", fdbs->seq->id);
        ds = Dataset_Sequence_create(fdbs);
        g_ptr_array_add(dataset->seq_list, ds);
        FastaDB_Seq_destroy(fdbs);
        dataset->header->number_of_seqs++;
        if(dataset->header->max_seq_len < ds->key->length)
            dataset->header->max_seq_len = ds->key->length;
        dataset->header->total_seq_len += ds->key->length;
        }
    dataset->header->line_length = dataset->fdb->line_length;
    if(dataset->header->line_length < 1)
        g_error("Input is not a regular FASTA file, use fastareformat");
    dataset->header->number_of_dbs = path_list->len;
    dataset->header->max_db_len
        = CompoundFile_get_max_element_length(dataset->fdb->cf);
    dataset->header->total_db_len = CompoundFile_get_length(dataset->fdb->cf);
    qsort(dataset->seq_list->pdata, dataset->seq_list->len,
          sizeof(gpointer), Dataset_Sequence_compare_by_id_uniq);
    dataset->width = Dataset_Width_create(dataset->header);
    /* Path data: one newline-terminated line per path */
    path_data_size = 0;
    for(i = 0; i < path_list->len; i++){
        path = path_list->pdata[i];
        path_data_size += (strlen(path) + 1);
        }
    /* Seq info: one fixed-size packed record per sequence */
    seq_info_size = dataset->seq_list->len
                  * dataset->width->seq_data_item_size;
    /* Seq data: "id\n" or "id def\n" per sequence,
     * matching what Dataset_write_seq_data() emits
     */
    seq_data_size = 0;
    for(i = 0; i < dataset->seq_list->len; i++){
        ds = dataset->seq_list->pdata[i];
        ds->pos = i;
        seq_data_size += (strlen(ds->id) + 1);
        if(ds->def)
            seq_data_size += (strlen(ds->def) + 1); /* " def" */
        }
    /* Offsets follow the order in which Dataset_write() emits the
     * sections: header, path data, seq data, seq info.
     * NOTE(review): sizeof(Dataset_Header) equals the serialised
     * header size only if the struct holds exactly the fields
     * written by Dataset_Header_write() with no padding - confirm.
     */
    dataset->header->path_data_offset = sizeof(Dataset_Header);
    dataset->header->seq_data_offset = dataset->header->path_data_offset
                                     + path_data_size;
    dataset->header->seq_info_offset = dataset->header->seq_data_offset
                                     + seq_data_size;
    dataset->header->total_file_length = dataset->header->seq_info_offset
                                       + seq_info_size;

    return dataset;
    }

/* Take an additional reference on <dataset> and return it.
 * Each reference is dropped with Dataset_destroy().
 */
Dataset *Dataset_share(Dataset *dataset){
    dataset->ref_count += 1;
    return dataset;
    }

/* Drop one reference to <dataset>.
 * When the last reference goes, every sequence record, the
 * FastaDB handle, the sequence list, the header, the width
 * table (when present) and the alphabet are released.
 */
void Dataset_destroy(Dataset *dataset){
    register gint idx;
    dataset->ref_count--;
    if(dataset->ref_count != 0)
        return; /* still shared elsewhere */
    for(idx = 0; idx < dataset->seq_list->len; idx++)
        Dataset_Sequence_destroy(dataset->seq_list->pdata[idx]);
    FastaDB_close(dataset->fdb);
    g_ptr_array_free(dataset->seq_list, TRUE);
    Dataset_Header_destroy(dataset->header);
    if(dataset->width)
        Dataset_Width_destroy(dataset->width);
    Alphabet_destroy(dataset->alphabet);
    g_free(dataset);
    }

/* Write the path of every element of the dataset's compound file
 * to <fp>, one path per line.
 */
static void Dataset_write_path_data(Dataset *dataset, FILE *fp){
    register gint idx = 0;
    register CompoundFile_Element *element;
    while(idx < dataset->fdb->cf->element_list->len){
        element = dataset->fdb->cf->element_list->pdata[idx];
        fprintf(fp, "%s\n", element->path);
        idx++;
        }
    }

/* Read header->number_of_dbs newline-terminated paths from <fp>,
 * open them as the dataset's FastaDB, and copy the stored line
 * length onto it.
 *
 * A line that is missing, empty, or not newline-terminated within
 * the buffer (i.e. too long) is a fatal error; previously the last
 * character was dropped unconditionally, which corrupted such
 * paths and could underflow the length on an empty string.
 */
static void Dataset_read_path_data(Dataset *dataset, FILE *fp){
    register gint i;
    register size_t len;
    gchar buf[1024];
    register GPtrArray *path_list = g_ptr_array_new();
    register gchar *path;
    for(i = 0; i < dataset->header->number_of_dbs; i++){
        if(!fgets(buf, sizeof(buf), fp))
            g_error("Problem parsing file data");
        len = strlen(buf);
        if((!len) || (buf[len-1] != '\n'))
            g_error("Path entry in dataset file is malformed or too long");
        path = g_strndup(buf, len-1); /* strip the newline */
        g_ptr_array_add(path_list, path);
        }
    dataset->fdb = FastaDB_open_list(path_list, dataset->alphabet);
    dataset->fdb->line_length = dataset->header->line_length;
    /* The temporary path strings are owned here, so free them */
    for(i = 0; i < path_list->len; i++){
        path = path_list->pdata[i];
        g_free(path);
        }
    g_ptr_array_free(path_list, TRUE);
    return;
    }

/* Write one line per sequence to <fp>: the id, followed by a
 * single space and the definition when a definition is present.
 */
static void Dataset_write_seq_data(Dataset *dataset, FILE *fp){
    register gint idx;
    register Dataset_Sequence *entry;
    register const char *separator, *definition;
    for(idx = 0; idx < dataset->seq_list->len; idx++){
        entry = dataset->seq_list->pdata[idx];
        if(entry->def){
            separator = " ";
            definition = entry->def;
        } else {
            separator = "";
            definition = "";
            }
        fprintf(fp, "%s%s%s\n", entry->id, separator, definition);
        }
    }

/* Read header->number_of_seqs lines of the form "id\n" or
 * "id def\n" from <fp>, appending one record per line to
 * dataset->seq_list with its id and (when present) def set.
 *
 * The id ends at the first space, which is the separator written
 * by Dataset_write_seq_data(); storing the whole line as the id
 * would make lookups by id fail for any sequence with a definition.
 *
 * Each line is scanned twice (measure, then read) so that lines
 * of any length are handled without a fixed-size buffer.
 * Any malformed or truncated line is a fatal error.
 */
static void Dataset_read_seq_data(Dataset *dataset, FILE *fp){
    register gint i, ch, seen_sep;
    register size_t id_len, def_len;
    register Dataset_Sequence *ds;
    fpos_t line_start;
    for(i = 0; i < dataset->header->number_of_seqs; i++){
        ds = g_new0(Dataset_Sequence, 1);
        g_ptr_array_add(dataset->seq_list, ds);
        /* First pass: measure the id and def parts of this line */
        if(fgetpos(fp, &line_start))
            g_error("Problem parsing seq data");
        id_len = def_len = 0;
        seen_sep = 0;
        while((ch = getc(fp)) != '\n'){
            if(ch == EOF)
                g_error("Problem parsing seq data");
            if(seen_sep)
                def_len++;
            else if(ch == ' ')
                seen_sep = 1;
            else
                id_len++;
            }
        /* Second pass: read both parts into exactly-sized strings
         * (g_new0 zero-fills, so they are already NUL-terminated)
         */
        if(fsetpos(fp, &line_start))
            g_error("Problem parsing seq data");
        ds->id = g_new0(gchar, id_len+1);
        if(fread(ds->id, 1, id_len, fp) != id_len)
            g_error("Problem parsing seq data");
        if(seen_sep){
            ds->def = g_new0(gchar, def_len+1);
            if((getc(fp) != ' ')
            || (fread(ds->def, 1, def_len, fp) != def_len))
                g_error("Problem parsing seq data");
            }
        if(getc(fp) != '\n') /* consume the line terminator */
            g_error("Problem parsing seq data");
        g_message("read seqid [%s]", ds->id);
        }
    return;
    }

/* Write one bit-packed record per sequence to <fp>.
 * Each record holds, in order: the element id, the position
 * within that element, the sequence length and the 14-bit GCG
 * checksum, using the field widths stored in dataset->width.
 * A single BitArray is reused for every record.
 */
static void Dataset_write_seq_info(Dataset *dataset, FILE *fp){
    register gint idx = 0;
    register Dataset_Sequence *entry;
    register BitArray *record = BitArray_create();
    while(idx < dataset->seq_list->len){
        entry = dataset->seq_list->pdata[idx];
        /* Location of the sequence within the compound file */
        BitArray_append(record, entry->key->location->element_id,
                        dataset->width->num_db_width);
        BitArray_append(record, entry->key->location->pos,
                        dataset->width->max_db_len_width);
        /* Length and checksum */
        BitArray_append(record, entry->key->length,
                        dataset->width->max_seq_len_width);
        BitArray_append(record, entry->gcg_checksum, 14);
        /* Flush, then reset for the next sequence */
        BitArray_write(record, fp);
        BitArray_empty(record);
        idx++;
        }
    BitArray_destroy(record);
    }

/* Read one bit-packed record per entry of dataset->seq_list from
 * <fp> and use it to set that entry's key, checksum and position.
 *
 * Fields are unpacked in the order written by
 * Dataset_write_seq_info(): element id, position within the
 * element, sequence length, then the 14-bit GCG checksum.
 * The offset passed to FastaDB_Key_create() is computed from the
 * lengths of the id and (when set) def strings.
 */
static void Dataset_read_seq_info(Dataset *dataset, FILE *fp){
    register gint idx, bit_pos, db_id, seq_len, header_len;
    register Dataset_Sequence *entry;
    register BitArray *record;
    register CompoundFile_Pos db_pos;
    register Sequence_Strand strand;
    register CompoundFile_Location *where;
    if(dataset->alphabet->type == Alphabet_Type_DNA)
        strand = Sequence_Strand_FORWARD;
    else
        strand = Sequence_Strand_UNKNOWN;
    for(idx = 0; idx < dataset->seq_list->len; idx++){
        g_message("reading on seq [%d]", idx);
        entry = dataset->seq_list->pdata[idx];
        record = BitArray_read(fp, dataset->width->seq_data_item_size);
        /* Unpack the three variable-width fields in turn */
        bit_pos = 0;
        db_id = BitArray_get(record, bit_pos,
                             dataset->width->num_db_width);
        bit_pos += dataset->width->num_db_width;
        db_pos = BitArray_get(record, bit_pos,
                              dataset->width->max_db_len_width);
        bit_pos += dataset->width->max_db_len_width;
        seq_len = BitArray_get(record, bit_pos,
                               dataset->width->max_seq_len_width);
        bit_pos += dataset->width->max_seq_len_width;
        /* Rebuild the key from the unpacked location */
        where = CompoundFile_Location_create(dataset->fdb->cf,
                                             db_pos, db_id);
        header_len = 1 + strlen(entry->id);
        if(entry->def)
            header_len += strlen(entry->def) + 1;
        entry->key = FastaDB_Key_create(dataset->fdb,
                                        where, strand,
                                        header_len, seq_len);
        CompoundFile_Location_destroy(where);
        /* The checksum occupies the final 14 bits */
        entry->gcg_checksum = BitArray_get(record, bit_pos, 14);
        entry->pos = idx;
        BitArray_destroy(record);
        }
    }

/* Write <dataset> to a new file at <path>.
 *
 * The file holds, in order: the header, the path data, the
 * sequence data (ids and definitions) and the packed sequence
 * info records.
 *
 * Fatal errors (g_error) are raised when <path> already exists,
 * when it cannot be opened for writing, or when closing the file
 * reports a failure (e.g. buffered data could not be flushed).
 */
void Dataset_write(Dataset *dataset, gchar *path){
    register FILE *fp = fopen(path, "r");
    if(fp){ /* refuse to overwrite an existing file */
        fclose(fp);
        g_error("Output file [%s] already exists", path);
        }
    /* The file contains raw binary integers, so open it in binary mode */
    fp = fopen(path, "wb");
    if(!fp)
        g_error("Could not open output file [%s] for writing", path);
    Dataset_Header_write(dataset->header, fp);
    Dataset_write_path_data(dataset, fp);
    Dataset_write_seq_data(dataset, fp);
    Dataset_write_seq_info(dataset, fp);
    if(fclose(fp))
        g_error("Could not finish writing output file [%s]", path);
    return;
    }

/* Load a dataset from the file at <path>.
 *
 * Reads the header, rebuilds the alphabet and field widths from
 * it, then reads the path data, sequence data and sequence info
 * sections in the order they are stored.
 *
 * The header type field is a bit set: bit 0 selects DNA
 * (otherwise protein) and bit 1 is the softmask flag, matching
 * Dataset_Header_create().
 *
 * A fatal error (g_error) is raised when the file cannot be opened.
 * The returned dataset has a reference count of one;
 * release it with Dataset_destroy().
 */
Dataset *Dataset_read(gchar *path){
    register Dataset *dataset = g_new(Dataset, 1);
    /* The file contains raw binary integers, so open it in binary mode */
    register FILE *fp = fopen(path, "rb");
    if(!fp)
        g_error("Could not open esd file [%s]", path);
    dataset->ref_count = 1;
    dataset->header = Dataset_Header_read(fp);
    /* Only bit 1 carries the softmask flag; masking with 3 would
     * also pick up the DNA bit and mark every DNA dataset
     * as softmasked.
     */
    dataset->alphabet = Alphabet_create(
                           ((dataset->header->type&1)
                            ?Alphabet_Type_DNA
                            :Alphabet_Type_PROTEIN),
                            ((dataset->header->type >> 1)&1));
    g_message("get width");
    dataset->width = Dataset_Width_create(dataset->header);
    g_message("got width");
    dataset->seq_list = g_ptr_array_new();
    Dataset_read_path_data(dataset, fp);
    Dataset_read_seq_data(dataset, fp);
    Dataset_read_seq_info(dataset, fp);
    fclose(fp);
    return dataset;
    }

/* Binary-search the sequence list for a record whose id equals
 * <id>, comparing with Dataset_Sequence_compare_by_id().
 * Returns the pos field of the matching record,
 * or -1 when no record matches.
 */
gint Dataset_lookup_id(Dataset *dataset, gchar *id){
    Dataset_Sequence probe, *probe_ptr = &probe;
    register Dataset_Sequence **hit;
    /* Only the id of the probe is inspected by the comparator */
    probe.id = id;
    hit = bsearch(&probe_ptr, dataset->seq_list->pdata,
                  dataset->seq_list->len,
                  sizeof(gpointer), Dataset_Sequence_compare_by_id);
    if(hit)
        return (*hit)->pos;
    return -1;
    }

/* Create a Sequence for the record at index <dataset_pos>
 * of the dataset's sequence list.
 *
 * The definition and the SparseCache are fetched through the
 * record's key and handed to Sequence_create_extmem(); the local
 * definition string and cache handle are released again before
 * returning. The strand is forward for a DNA alphabet and
 * unknown otherwise.
 * <dataset_pos> must lie within the sequence list (asserted).
 */
Sequence *Dataset_get_sequence(Dataset *dataset, gint dataset_pos){
    register Sequence *result;
    register Dataset_Sequence *entry;
    register gchar *definition;
    register SparseCache *residues;
    register Sequence_Strand strand;
    if(dataset->alphabet->type == Alphabet_Type_DNA)
        strand = Sequence_Strand_FORWARD;
    else
        strand = Sequence_Strand_UNKNOWN;
    g_assert(dataset_pos >= 0);
    g_assert(dataset_pos < dataset->seq_list->len);
    g_message("do db get seq");
    entry = dataset->seq_list->pdata[dataset_pos];
    definition = FastaDB_Key_get_def(entry->key);
    residues = FastaDB_Key_get_SparseCache(entry->key);
    result = Sequence_create_extmem(entry->id, definition,
                                    entry->key->length,
                                    strand, dataset->alphabet, residues);
    g_free(definition);
    SparseCache_destroy(residues);
    g_message("done db get seq");
    return result;
    }
/* FIXME: should keep a single compound file for whole dataset,
 *        and use that for making keys, to prevent more than
 *        one file being opened simultaneously.
 *        ?? will not work with cf sorting ... cf needs refactoring
 */

/**/

