/********************************************************************************************************
 * QRNA - Comparative analysis of biological sequences 
 *         with pair hidden Markov models, pair stochastic context-free
 *        grammars, and probabilistic evolutionary  models.
 *       
 * Version 2.0.0 (JUN 2003)
 *
 * Copyright (C) 2000-2003 Howard Hughes Medical Institute/Washington University School of Medicine
 * All Rights Reserved
 * 
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ***********************************************************************************************************/

/* tying.c
 *
 * Tying SCFG parameters to each other to reduce # of free param's.
 *
 * ER, Tue Jun 22 13:47:15 CDT 1999 [STL]
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "funcs.h"
#include "globals.h"
#include "squid.h"
#include "structs.h"
#include "version.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif

/* Function: EnslaveTrans()
 * 
 * Purpose:  Set up an array [0..NTRANS-1] that specifies which
 *           transitions are masters, which are slaves, and who the
 *           slaves' masters are -- for purposes of tying parameters.
 *           Values are either -1 (if this state is a master) or
 *           a state index (if this state a slave of the indicated
 *           state).
 *           
 *           The parameter tying of the SCFG is hard coded by 
 *           this function. 
 *           
 * Args:     void
 * 
 * Return:   [0..NTRANS-1] slave array. Alloc'ed here. Free'd by caller.
 */
int **
EnslaveTrans(void)
{
  int **enslave;
  int x,y;
  int i, j, k, l;

  enslave = AllocIntSCFG();      /* cheating: using SCFG "object" inappropriately */

		       /* unless told otherwise, everyone is a master */
  for (x = 0; x < NDPS; x++)
    for (y = 0; y < Ntrans[x]; y++) enslave[x][y] = -1;

				/* dpcL and dpcR are tied together    */
  for (i = 1; i < 4; i++) {
    enslave[W][idxL(i)]   = idxL(0);
    enslave[WB][idxL(i)]  = idxL(0);
  }

  for (i = 1; i < 4; i++) {
    enslave[W][idxR(i)]   = idxR(0);
    enslave[WB][idxR(i)]  = idxR(0);
  }

  for (i = 0; i < 4; i++)
    for (j = 0; j < 4; j++)
      for (k = 0; k < 4; k++)
	for (l = 0; l < 4; l++)
	  if (k != 0 || l != 0) {
	    enslave[V][idxS1(i,j,k,l)]  = idxS1(i,j,0,0);
	    enslave[V][idxS2S(i,j,k,l)] = idxS2S(i,j,0,0);
	    enslave[V][idxS2B(i,j,k,l)] = idxS2B(i,j,0,0);
 	    enslave[V][idxS2I(i,j,k,l)] = idxS2I(i,j,0,0);
	    enslave[V][idxMV(k,l)]      = idxMV(0,0);
	  }
  
  return enslave;
}

/* Function: TieCounts()
 * 
 * Purpose:  Provide for tying parameters together, to reduce
 *           number of effective free parameters. "enslave" is
 *           a [0..NTRANS-1] vector specifying masters and
 *           slaves -- slaves send their counts to their masters,
 *           masters accumulate the counts and redistribute them
 *           justly to themselves and their slaves, and the
 *           whole SCFG is then reestimated into probability form.
 *           
 * Args:     cfg     -- SCFG in counts form.
 *           enslave -- array specifying masters and slaves. [0..NDPS-1]
 *           
 * Return:   (void)
 *           cfg counts are modified.
 */          
void
TieCounts(double **cfg, int **enslave)
{
  int      i,j;
  double **tmp;                  /* holds counts */
  double **num;                  /* holds total number of transitions summed into masters */
  int      usej;                
 
  tmp = AllocSCFG();
  num = AllocSCFG();		/* cheating: using SCFG "object" inappropriately */

				/* slaves send their counts to master.
				 * keep track of number of transitions being summed */
  for (i = 0; i < NDPS; i++)
    for (j = 0; j < Ntrans[i]; j++)
      if (Connects(i,Ntype(i,j)))
	{
	  usej = (enslave[i][j] == -1) ? j : enslave[i][j]; 
	  tmp[i][usej] += cfg[i][j];  /* collect counts from slave  */
	  num[i][usej] += 1.0;        /* increment master's counter */
	}

  				/* masters redistribute the wealth */
  for (i = 0; i < NDPS; i++)
    for (j = 0; j < Ntrans[i]; j++)
      if (Connects(i,Ntype(i,j)))
	{
	  usej = (enslave[i][j] == -1) ? j : enslave[i][j]; 
	  if (num[i][usej] != 0.0) 
	    cfg[i][j] = tmp[i][usej] / num[i][usej];
	}
  
    FreeSCFG(num);
    FreeSCFG(tmp);
}

/* Function: CountFreeParameters()
 * 
 * Purpose:  Determine the number of free and nonzero parameters in a model.
 */
void
CountFreeParameters(int **enslave, int *ret_free, int *ret_nonzero, int allow_pseudoknots)
{
  int i, idx, j;
  int nfree = 0;
  int nonzero = 0;
  int tmp;

  for (i = 0; i < NDPS; i++)
    {
      /*if (!allow_pseudoknots && (i == VH || i == WH)) continue;*/
      
      for (idx = 0; idx < Idx[i]; idx++) {
	
	tmp = 0;

	for (j = idx*TransPerDp[i]; j < (idx+1)*TransPerDp[i]; j++) 
	  if (Connects(i,Ntype(i,j)) &&
	      (enslave == NULL || enslave[i][j] != -1)) 
	    tmp++;
	nonzero += tmp;
	nfree   += tmp-1;		/* n-1 free parameters: constraint to sum to 1.0 */
      }
    }
  
  *ret_free = nfree;
  *ret_nonzero = nonzero;
}

/* Function: CountsPerState()
 * 
 * Purpose:  Given an (untied) counts-based SCFG, 
 *           figure out how many counts are going into determining
 *           each state's transition probability vector.
 *           Return a [0..NDPS-1] vector of these numbers.
 *           
 * Args:     cfg -- counts-based SCFG, before tying, before prob'ifying
 *         
 * Return:   [0..NDPS-1] array of count sums per transition vector.
 *           alloc'ed here. Caller must free.
 *           
 */                   
double *
CountsPerState(double **cfg)
{
  int i, j;
  double *tot;

  tot = (double *) MallocOrDie (sizeof(double) * NDPS);
  for (i = 0; i < NDPS; i++)
    {
      tot[i] = 0.0;
      for (j = 0; j < Ntrans[i]; j++)
	tot[i] += cfg[i][j];
    }
  for (i = 0; i < NDPS; i++)
    printf("CountsPerState %s = %f\n", stNAME[i], tot[i]);

  return tot;
}



/* Function: BootstrapConfidence()
 * 
 * Purpose:  Determine bootstrap confidence intervals on a
 *           probability vector. This may be a "parametric
 *           bootstrap" since I use the parameters of an estimated vector
 *           to resample, rather than sampling from count data
 *           with replacement.
 *           
 *           Because of tying across distributions, we must
 *           bootstrap on the whole SCFG instead of just the
 *           individual vectors (which would seem to be a more
 *           generalized and desirable implementation).
 *           
 * Args:     cfg      - prob-form SCFG used for resampling
 *           counts   - number of counts to resample per state [0..NDPS-1]        
 *           enslave  - array for tying parameters, or NULL to leave untied
 *           nboot    - number of bootstrap reestimates (~1000?)
 *           conf     - confidence interval reported (e.g. 0.95)
 *           ret_high - "scfg" containing the upper confidence bound
 *           ret_low  - "scfg" containing the lower confidence bound
 *           
 * Return:   void.
 *           ret_low and ret_high give the confidence interval on
 *           each parameter.
 */
void
BootstrapConfidence(double **cfg, double *counts, int **enslave, int nboot, double conf,
		    double ***ret_high, double ***ret_low)
{
  double ***hi;    /* sorted arrays of high parameter values [0..NDPS-1][0..NDPS-1][0..d-1] */
  double ***lo;    /* sorted arrays of low parameter values [0..NDPS-1][0..NDPS-1][0..d-1] */
  int   d;         /* if we have nboot=1000 and conf=0.95, discard=25 top and bottom values */
  int   booti;     /* counter for bootstrap */
  int   i,j;	   /* counter for states  */
  int   v;         /* counter for values in a sorted array of parameter values */ 
  double   swap;   /* used for swapping values in insertion sort */
  double   x;      /* counter for counts (!) */
  double **samp;   /* SCFG into which we sample counts */
  double **high;   /* RETURN: high bounds on confidence intervals */
  double **low;    /* RETURN: low bounds on confidence intervals  */

  
  d = nboot * ((1. - conf) / 2.);
  
  /* Allocation, initialization.
   * We will keep the highest and lowest scores by insertion sort,
   * in arrays of [0..d] (d+1) numbers with guard values at position d+1.
   * At the end, hi[i][j][0] will contain the high end of the confidence
   * interval and hi[i][j][d] contains the highest value seen.
   * Similarly, low[i][j][0] will contain the low end of the confidence
   * interval, and lo[i][j][d] contains the lowest value seen.
   */
  hi = (double ***) MallocOrDie (sizeof(double **) * NDPS);
  lo = (double ***) MallocOrDie (sizeof(double **) * NDPS);
  for (i = 0; i < NDPS; i++)
    {
      hi[i] = (double **) MallocOrDie (sizeof(double *) * Ntrans[i]);
      lo[i] = (double **) MallocOrDie (sizeof(double *) * Ntrans[i]);
      for (j = 0; j < Ntrans[i]; j++)
	if (Connects(i,Ntype(i,j)))
	  {
	    hi[i][j] = (double *) MallocOrDie (sizeof(double) * (d+2));
	    lo[i][j] = (double *) MallocOrDie (sizeof(double) * (d+2));
	    for (v = 0; v < d+1; v++)
	      {			/* starting dummies in array, must always lose */
		hi[i][j][v] = -1.0;
		lo[i][j][v] = 2.0; 
	      }
				/* guard values must always win */
	    hi[i][j][d+1] = 2.0;
	    lo[i][j][d+1] = -1.0;
	  }
    }


  /* Bootstrap samples:
   *    - Count into the SCFG by random sampling.
   *    - Reestimate the SCFG.
   *    - Insertion sort the parameters into the high and low value arrays.
   */
  samp = AllocSCFG();
  for (booti = 0; booti < nboot; booti++)
    {
      printf("sample %d\n", booti);
      /* Clear the old SCFG       */
      for (i = 0; i < NDPS; i++)
	for (j = 0; j < Ntrans[i]; j++)
	  samp[i][j] = 0.0;

      /* Sample into the SCFG
       * We add one extra count artificially to avoid the
       * paradox of no data -> no error.
       */
      for (i = 0; i < NDPS; i++)
	for (x = 0.; x < counts[i] + 1.0; x += 1.0)
	  {
	    j = DChoose(cfg[i], Ntrans[i]);
	    if (Connects(i,Ntype(i,j))) samp[i][j] += 1.0;
	  }

      /* Reestimate the SCFG
       */
      if (enslave != NULL) TieCounts(samp, enslave);
      Log2ProbSCFG(samp); 

      /* Insertion-sort the parameters into our high/low lists
       */
      for (i = 0; i < NDPS; i++)
	for (j = 0; j < Ntrans[i]; j++)
	  if (Connects(i,Ntype(i,j)))
	    {
				/* high score sort */
	      if (samp[i][j] > hi[i][j][0])
		{
		  hi[i][j][0] = samp[i][j];
				/* note reliance on guard value */
		  for (v = 0; hi[i][j][v] > hi[i][j][v+1]; v++)
		    {
		      swap = hi[i][j][v+1];
		      hi[i][j][v+1] = hi[i][j][v];
		      hi[i][j][v] = swap;
		    }
		}
				/* low score sort */
	      if (samp[i][j] < lo[i][j][0])
      		{
		  lo[i][j][0] = samp[i][j];
		  for (v = 0; lo[i][j][v] < lo[i][j][v+1]; v++)
		    {
		       swap = lo[i][j][v+1];
		       lo[i][j][v+1] = lo[i][j][v];
		       lo[i][j][v] = swap;
		    }
		}
	    }
    }
  
  /* Save the bounds in two returned SCFGs
   */
  high = AllocSCFG();
  low  = AllocSCFG();
  for (i = 0; i < NDPS; i++)
    for (j = 0; j < Ntrans[i]; j++)
      if (Connects(i,Ntype(i,j)))
	{
	  high[i][j] = hi[i][j][0];
	  low[i][j]  = lo[i][j][0];
	}

  /* Free the sorted arrays of parameter reestimates
   */
  for (i = 0; i < NDPS; i++)
    {
      for (j = 0; j < Ntrans[i]; j++)
	if (Connects(i,Ntype(i,j)))
	  {
	    free(hi[i][j]);
	    free(lo[i][j]);
	  }
      free(hi[i]);
      free(lo[i]);
    }
  free(hi);
  free(lo);
  FreeSCFG(samp);

  *ret_high = high;
  *ret_low  = low;
} 
