/*
Copyright 2013 Cameron Palmer

This file is a part of Genezip.

Genezip is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Genezip is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Genezip.  If not, see <http://www.gnu.org/licenses/>
*/

/*!
  \file compression_handler.h
  \brief wrapper for binning of input data before compression,
  and compression itself
 */

#ifndef __GENEZIP__COMPRESSION_HANDLER_H__
#define __GENEZIP__COMPRESSION_HANDLER_H__

#include <vector>
#include <stdexcept>

#include "genezip/gzfile_compress.h"
#include "genezip/gzfile_decompress.h"
#include "genezip/huffman_code.h"
#include "genezip/prob_vector.h"
#include "genezip/smart_pointers.h"

//! namespace for all backend code for this library;
//! generally, this code should not be accessed directly by
//! users
namespace genezip_utils {
  //! a class to handle binning, compression and decompression
  //! for data from a series of vectors
  class compression_handler {
  public:
    //! constructor
    compression_handler()
      : _compressor(NULL),
      _max_bin_size(GENEZIP_DEFAULT_BIN_SIZE_LIMIT),
      _vectors_in_current_bin(0),
      _extracted_bin_index(0),
      _finalized(false) {}
    //! constructor: set bin size
    //! \param bin_size_limit max number of vectors per bin
    //! \param precompressed_bits_required how many bits can the unsigneds
    //! be minimally packed into?
    compression_handler(unsigned bin_size_limit,
			unsigned precompressed_bits_required)
      : _compressor(NULL),
      _max_bin_size(bin_size_limit),
      _current_data(0, precompressed_bits_required),
      _vectors_in_current_bin(0),
      _extracted_bin(0, precompressed_bits_required),
      _extracted_bin_index(0),
      _finalized(false) {}
    //! destructor: all pointers selfdeleting
    ~compression_handler() throw() {if (_compressor) delete _compressor;}
    //! add some data to the dataset, handling binning
    //! \param preproc_data data to be added to the dataset
    void enqueue_data(const std::vector<unsigned> &preproc_data);
    //! if any stored data have not been compressed, compress them
    //! \warning in the current implementation, this may generate
    //! a small bin for which lack of compression might be a more efficient
    //! storage method
    void finalize();
    //! extract a vector of data based on the order in which it was entered
    //! \param index order in which the vector was entered, on [0, nvecs]
    //! \param size length of vector expected
    //! \param target where the data should be stored
    void get_vector_of_data(unsigned index,
			    unsigned size,
			    std::vector<unsigned> &target);
    //! get the max bin size permitted in this object
    //! \return the max bin size permitted in this object
    inline unsigned max_bin_size() const {return _max_bin_size;}
    //! determine how many bins are currently stored
    //! \return the number of currently stored bins
    inline unsigned number_stored_bins() const {
      return _compressed_types.size();
    }
    //! reset object to factory state
    inline void clear() {
      _compressed_types.clear();
      _literal_length_codes.clear();
      _distance_codes.clear();
      _current_data.clear();
      _vectors_in_current_bin = 0;
      _extracted_bin.clear();
      _extracted_bin_index = 0;
      _max_bin_size = GENEZIP_DEFAULT_BIN_SIZE_LIMIT;
      _finalized = false;
    }
    //! get approximate number of bytes used by object
    //! \return approximate number of bytes used by object
    inline unsigned bytes_of_compressed_memory() const {
      unsigned res = 3 * sizeof(unsigned);
      unsigned res_bits = 0;
      res_bits = _current_data.size()*_current_data.bits_per_element()
	+ _extracted_bin.size()*_extracted_bin.bits_per_element() + 1;
      for (const_iterator iter = _compressed_types.begin();
	   iter != _compressed_types.end(); ++iter)
	res_bits += (*iter)->size();
      for (const_huffman_iterator iter = _literal_length_codes.begin();
	   iter != _literal_length_codes.end(); ++iter) {
	res += (*iter)->bytes_of_compressed_memory();
      }
      for (const_huffman_iterator iter = _distance_codes.begin();
	   iter != _distance_codes.end(); ++iter) {
	res += (*iter)->bytes_of_compressed_memory();
      }
      return res + res_bits/8;
    }
  private:
    //! compress any stored, uncompressed data in their own bin
    void compress_current_data();
    /*! convenience typedef of storage structure for compressed data subset */
    typedef std::vector<bool> gtype_vector;
    /*! convenience typedef of pointer to compressed data subset */
    typedef smart_pointer<gtype_vector> data_pointer;
    /*! convenience typedef of compressed dataset */
    typedef std::vector<data_pointer> data_vector;
    /*! convenience typedef of const iterator across compressed dataset */
    typedef data_vector::const_iterator const_iterator;
    /*! convenience typedef of iterator across compressed dataset */
    typedef data_vector::iterator iterator;
    /*! convenience typedef of vector of huffman codes */
    typedef std::vector<smart_pointer<huffman_code> > huffman_vector;
    /*! convenience typedef of const iterator across huffman code vector */
    typedef huffman_vector::const_iterator const_huffman_iterator;
    /*! convenience typedef of iterator across huffman code vector */
    typedef huffman_vector::iterator huffman_iterator;
    /*! access first stored subset (variant) in compressed dataset (const) */
    //! \return const_iterator to the first compressed variant
    const_iterator first_variant() const {
      return _compressed_types.begin();
    }
    //! access first stored subset (variant) in compressed dataset (non-const) 
    //! \return iterator to the first compressed variant
    iterator first_variant() {return _compressed_types.begin();}
    //! get end bound of compressed dataset (const)
    //! \return const_iterator to the end bound of the dataset
    const_iterator last_variant() const {
      return _compressed_types.end();
    }
    //! get end bound of compressed dataset (non-const)
    //! \return iterator to the end bound of the dataset
    iterator last_variant() {return _compressed_types.end();}
    gzfile_compress *_compressor;
    //! stored compressed imputed genotype data, in bit format
    data_vector _compressed_types;
    //! stored huffman codes for gzip literals and match lengths
    huffman_vector _literal_length_codes;
    //! stored huffman codes for gzip match distances
    huffman_vector _distance_codes;
    //! vector of not-yet-compressed types
    prob_vector _current_data;
    //! number of vectors contained within
    unsigned _vectors_in_current_bin;
    //! currently extracted bin
    prob_vector _extracted_bin;
    //! index of currently extracted bin
    unsigned _extracted_bin_index;
    //! for order-based binning: max number of vectors per bin
    unsigned _max_bin_size;
    //! whether finalize() has been called, locking the compressed data
    //! \sa finalize()
    bool _finalized;
  };
}

#endif //__GENEZIP__COMPRESSION_HANDLER_H__
