/*
    Theseus - maximum likelihood superpositioning of macromolecular structures

    Copyright (C) 2004-2010 Douglas L. Theobald

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the:

    Free Software Foundation, Inc.,
    59 Temple Place, Suite 330,
    Boston, MA  02111-1307  USA

    -/_|:|_|_\-
*/

#include <pthread.h>
#include "Threads.h"
#include "ProcGSLSVD.h"
#include "ProcGSLSVDOcc.h"
#include "MultiPose_local.h"
#include "MultiPose.h"


/* Re-express the transformations held in scratchA relative to one "anchor"
   structure.

   scratchA      array whose per-structure center/translation/matrix (and
                 whose avecds center/translation/matrix) are updated in place
   baseA         array searched for the anchor; also supplies the anchor's
                 second coordinate set for SuperPose()
   anchorf_name  filename identifying the anchor structure

   The anchor is the first structure in baseA whose filename compares equal
   to anchorf_name (at most FILENAME_MAX - 1 characters are compared).  If
   no filename matches, index 0 is silently used. */
void
SuperPose2Anchor(CdsArray *scratchA, CdsArray *baseA, char *anchorf_name)
{
    double        **anchormat = MatAlloc(3, 3);
    double          anchortrans[3];      /* filled by SuperPose() */
    double          tmpanchortrans[3];   /* filled by InvRotVec() per structure */
    double          norm1, norm2, innprod;
    int             i, j, anchor = 0;

    for (i = 0; i < baseA->cnum; ++i)
    {
        if (strncmp(anchorf_name, baseA->cds[i]->filename, FILENAME_MAX - 1) == 0)
        {
            anchor = i;
            break;
        }
    }

    /* transformation between the anchor's scratch and base coordinates */
    SuperPose(scratchA->cds[anchor], baseA->cds[anchor], anchormat, anchortrans,
              &norm1, &norm2, &innprod);

    for (i = 0; i < baseA->cnum; ++i)
    {
        /* anchor translation expressed through this structure's own matrix */
        InvRotVec(tmpanchortrans, anchortrans, scratchA->cds[i]->matrix);

        for (j = 0; j < 3; ++j)
            scratchA->cds[i]->center[j] = scratchA->cds[i]->translation[j] =
                scratchA->cds[i]->center[j] - tmpanchortrans[j];

        /* combine this structure's matrix with the anchor matrix, in place */
        Mat3MultIp(scratchA->cds[i]->matrix, (const double **) anchormat);
    }

    for (j = 0; j < 3; ++j)
        scratchA->avecds->center[j] = scratchA->avecds->translation[j] =
            anchortrans[j];

    Mat3Cpy(scratchA->avecds->matrix, (const double **) anchormat);

    MatDestroy(&anchormat);
}


/* static void */
/* CenMassWtHVarIp_old3D(Cds *cds, const double *wts, const double wtnorm, */
/*                 const double *mean, const double *var, const double precision) */
/* { */
/*     int             i; */
/*     double          tempx, tempy, tempz; */
/*     double          wti, wtsum; */
/*     const double   *x = (const double *) cds->x, */
/*                    *y = (const double *) cds->y, */
/*                    *z = (const double *) cds->z; */
/*  */
/*     tempx = tempy = tempz = wtsum = 0.0; */
/*     for (i = 0; i < cds->vlen; ++i) */
/*     { */
/*         wti = wts[i]; */
/*         wtsum += wti; */
/*         tempx += (wti * x[i]); */
/*         tempy += (wti * y[i]); */
/*         tempz += (wti * z[i]); */
/*     } */
/*  */
/*     if (wtsum < precision * wtnorm / var[0]) */
/*     { */
/*  */
/*         cds->center[0] = - wtnorm * mean[0]; */
/*         cds->center[1] = - wtnorm * mean[1]; */
/*         cds->center[2] = - wtnorm * mean[2]; */
/*     } */
/*     else */
/*     { */
/*         cds->center[0] = (tempx - wtnorm*mean[0] / var[0]) / (wtsum + wtnorm / var[0]); */
/*         cds->center[1] = (tempy - wtnorm*mean[1] / var[1]) / (wtsum + wtnorm / var[1]); */
/*         cds->center[2] = (tempz - wtnorm*mean[2] / var[2]) / (wtsum + wtnorm / var[2]); */
/*     } */
/* } */


/* For superimposing to an alignment, we don't need to weight by occupancy
   since we are using pseudo-coordinates here from the E-M expectation step */
/* static void */
/* CalcTranslations_old3D(CdsArray *scratchA, Algorithm *algo) */
/* { */
/*     Cds        **cds = scratchA->cds; */
/*     int             i, j, cnt; */
/*     double          chi2; */
/*  */
/*     if (algo->notrans == 0) */
/*     { */
/*         if (algo->htrans == 1 && algo->rounds > 1) */
/*         { */
/*             double      logL, lvar, varsum; */
/*             double     *mean = malloc(3 * sizeof(double)); */
/*             double     *var = malloc(3 * sizeof(double)); */
/*             double     *trans = malloc(scratchA->cnum * sizeof(double)); */
/*  */
/*             varsum = FLT_MAX; */
/*             cnt = 0; */
/*             do */
/*             { */
/*                 lvar = varsum; */
/*                 ++cnt; */
/*                 varsum = 0.0; */
/*                 for (j = 0; j < 3; ++j) */
/*                 { */
/*                     for (i = 0; i < scratchA->cnum; ++i) */
/*                         trans[i] = -cds[i]->center[j]; */
/*  */
/*                     chi2 = normal_fit((const double *) trans, scratchA->cnum, &mean[j], &var[j], &logL); */
/*                     varsum += var[j]; */
/*  */
/*                     fflush(NULL); */
/*                 } */
/*  */
/*                 for (i = 0; i < scratchA->cnum; ++i) */
/*                     CenMassWtHVarIp(cds[i], scratchA->w, scratchA->stats->wtnorm, mean, var, algo->precision); */
/*                 break; */
/*             } */
/*             while(fabs(lvar - varsum)/varsum > algo->precision); */
/*  */
/*             scratchA->stats->htrans_ave = mean[0]; */
/*             scratchA->stats->htrans_var = var[0]; */
/*             scratchA->stats->htrans_chi2 = chi2; */
/*  */
/*             free(trans); */
/*             free(var); */
/*             free(mean); */
/*         } */
/*         else */
/*         { */
/*             for (i = 0; i < scratchA->cnum; ++i) */
/*             { */
/*                 if (algo->covweight == 0) */
/*                 { */
/*                     if (algo->alignment == 1 && algo->rounds < 3) */
/*                         CenMassWtIpOcc(cds[i], scratchA->w); */
/*                     else */
/*                         CenMassWtIp(cds[i], scratchA->w); */
/*                 } */
/*                 else */
/*                     CenMassCov(cds[i], (const double **) scratchA->WtMat); */
/*             } */
/*         } */
/*     } */
/* } */


/* Set cds->center from the weighted coordinate sums of cds combined with a
   prior mean and variance.

   wts        per-atom weights (cds->vlen entries are read)
   wtnorm     weight given to the prior term
   mean       3-vector; its negation is the prior center
   var        scalar variance shared by all three axes
   precision  threshold below which only the prior is used

   The center before and after the update is printed to stdout. */
static void
CenMassWtHVarIp(Cds *cds, const double *wts, const double wtnorm,
                const double *mean, const double var, const double precision)
{
    const double   *xs = cds->x, *ys = cds->y, *zs = cds->z;
    double          wsum = 0.0;
    double          acc[3] = { 0.0, 0.0, 0.0 };
    int             k, d;

    for (k = 0; k < cds->vlen; ++k)
    {
        const double    w = wts[k];

        wsum += w;
        acc[0] += (w * xs[k]);
        acc[1] += (w * ys[k]);
        acc[2] += (w * zs[k]);
    }

    printf("\nbefore: % f % f % f", cds->center[0], cds->center[1], cds->center[2]);

    if (var * wsum < precision * wtnorm)
    {
        /* data term is negligible next to the prior: use the prior alone */
        for (d = 0; d < 3; ++d)
            cds->center[d] = -mean[d];
    }
    else
    {
        for (d = 0; d < 3; ++d)
            cds->center[d] = (acc[d]*var - wtnorm*mean[d]) / (wsum*var + wtnorm);
    }

    printf("\nafter:  % f % f % f", cds->center[0], cds->center[1], cds->center[2]);
    fflush(NULL);
}


/* Update the centers of the structures in baseA, using weights and
   statistics held in scratchA.

   Nothing is done when algo->notrans is non-zero.

   When algo->htrans == 1 and algo->rounds > 1, a normal distribution is
   fitted (normal_fit) to the negated centers along each axis and every
   center is re-estimated with CenMassWtHVarIp(); this is repeated until the
   change in the summed variance is no more than algo->precision * varsum.
   The fitted values are stored in scratchA->stats (htrans_ave, htrans_var,
   htrans_chi2).

   Otherwise each center is computed by CenMassWtIpOcc, CenMassWtIpEM,
   CenMassWtIp or CenMassCov, selected by the algo flags.

   NOTE(review): CalcTranslationsIp() performs only a single pass of the
   same fit (its source notes that iterating converges to singularities);
   this function iterates.  Confirm the difference is intended.
   NOTE(review): the three malloc() results are not checked. */
void
CalcTranslationsOp(CdsArray *scratchA, CdsArray *baseA, Algorithm *algo)
{
    int             i, j, cnt;
    double          chi2;

    if (algo->notrans == 0)
    {
        if (algo->htrans == 1 && algo->rounds > 1)
        {
            double      logL, lvar, varsum/* , chi2 */;
            double     *mean = malloc(3 * sizeof(double));
            double     *var = malloc(3 * sizeof(double));
            double     *trans = malloc(scratchA->cnum * sizeof(double));

            /* start far from any real value so the first pass never looks converged */
            varsum = FLT_MAX;
            cnt = 0;
            do
            {
                lvar = varsum; /* summed variance of the previous pass */
                ++cnt;
                varsum = 0.0;
                for (j = 0; j < 3; ++j)
                {
                    /* gather the negated j-th center component of every structure */
                    for (i = 0; i < scratchA->cnum; ++i)
                        trans[i] = -baseA->cds[i]->center[j];

                    chi2 = normal_fit((const double *) trans, scratchA->cnum, &mean[j], &var[j], &logL);
                    varsum += var[j];
                    printf("\n %3d:%d chi2: %f mean: %f  var: %f logL: %f wtnorm: %f",
                           cnt, j, chi2, mean[j], var[j], logL, scratchA->stats->wtnorm);
                    fflush(NULL);
                }

                printf("\n %3d: varsum:%f", cnt, varsum);

                /* the summed variance (not the per-axis one) is passed on;
                   this rewrites the centers the next pass will fit */
                for (i = 0; i < scratchA->cnum; ++i)
                    CenMassWtHVarIp(baseA->cds[i], scratchA->w, scratchA->stats->wtnorm, mean, varsum, algo->precision);
            }
            while(fabs(lvar - varsum) > algo->precision * varsum);

            /* only the first axis' mean is recorded; chi2 is that of the last axis fitted */
            scratchA->stats->htrans_ave = mean[0];
            scratchA->stats->htrans_var = varsum;
            scratchA->stats->htrans_chi2 = chi2;

            free(trans);
            free(var);
            free(mean);
        }
        else
        {
            for (i = 0; i < scratchA->cnum; ++i)
            {
                if (algo->covweight == 0)
                {
                    if (algo->alignment == 1 && algo->rounds < 3)
                    {
                        CenMassWtIpOcc(baseA->cds[i], scratchA->w);
                    }
                    else
                    {
                        if (algo->commandeur == 1)
                            CenMassWtIpEM(baseA->cds[i], scratchA->avecds, scratchA->w);
                        else
                            CenMassWtIp(baseA->cds[i], scratchA->w);
                    }
                }
                else
                {
                    CenMassCov(baseA->cds[i], (const double **) scratchA->WtMat);
                }

                //printf("\n********** cen[%d]: %f %f %f", i+1, baseA->cds[i]->center[0], baseA->cds[i]->center[1], baseA->cds[i]->center[2]);
            }
        }
    }

//  fflush(NULL);

/*  for (i = 0; i < scratchA->cnum; ++i) */
/*      memcpy(scratchA->cds[i]->center, baseA->cds[i]->center, 3 * sizeof(double)); */
}


/* Update the centers of the structures in scratchA, in place.

   Nothing is done when algo->notrans is non-zero.

   When algo->htrans == 1 and algo->rounds > 1, a normal distribution is
   fitted (normal_fit) to the negated centers along each axis and every
   center is re-estimated once with CenMassWtHVarIp(), using the summed
   variance of the three axes.  Only a single pass is made: iterating the
   fit converges to singularities.  The fitted values are stored in
   scratchA->stats (htrans_ave, htrans_var, htrans_chi2).

   Otherwise each center is computed by CenMassWtIpEM, CenMassWtIp or
   CenMassCov, selected by the algo flags.

   Exits the program if the scratch buffer cannot be allocated. */
void
CalcTranslationsIp(CdsArray *scratchA, Algorithm *algo)
{
    Cds           **cds = scratchA->cds;
    const int       cnum = scratchA->cnum;
    int             i, j;

    if (algo->notrans != 0)
        return;

    if (algo->htrans == 1 && algo->rounds > 1)
    {
        const int       cnt = 1; /* pass number, kept for the diagnostic output */
        double          logL, chi2 = 0.0, varsum = 0.0;
        double          mean[3], var[3];
        double         *trans = malloc(cnum * sizeof(double));

        if (trans == NULL && cnum > 0)
        {
            fprintf(stderr, "\n ERROR: could not allocate memory in CalcTranslationsIp()\n");
            exit(EXIT_FAILURE);
        }

        for (j = 0; j < 3; ++j)
        {
            /* gather the negated j-th center component of every structure */
            for (i = 0; i < cnum; ++i)
                trans[i] = -cds[i]->center[j];

            chi2 = normal_fit((const double *) trans, cnum, &mean[j], &var[j], &logL);
            varsum += var[j];
            printf("\n %3d:%d chi2: %f mean: %f  var: %f logL: %f wtnorm: %f",
                   cnt, j, chi2, mean[j], var[j], logL, scratchA->stats->wtnorm);
            fflush(NULL);
        }

        printf("\n %3d: varsum:%f", cnt, varsum);

        for (i = 0; i < cnum; ++i)
            CenMassWtHVarIp(cds[i], scratchA->w, scratchA->stats->wtnorm, mean, varsum, algo->precision);

        /* only the first axis' mean is recorded; chi2 is that of the last axis fitted */
        scratchA->stats->htrans_ave = mean[0];
        scratchA->stats->htrans_var = varsum;
        scratchA->stats->htrans_chi2 = chi2;

        free(trans);
        return;
    }

    for (i = 0; i < cnum; ++i)
    {
        if (algo->covweight == 0)
        {
            if (algo->commandeur == 1)
                CenMassWtIpEM(cds[i], scratchA->avecds, scratchA->w);
            else
                CenMassWtIp(cds[i], scratchA->w);
        }
        else
        {
            CenMassCov(cds[i], (const double **) scratchA->WtMat);
        }
    }
}


/* outcds = matK * cds, treating the coordinates as a (vlen x 3) matrix.

   outcds  receives the product; its x/y/z arrays must hold cds->vlen entries
   matK    matrix indexed as matK[i][k] for i, k in [0, cds->vlen)
   cds     input coordinates

   The product is built in a temporary buffer and copied out afterwards, so
   outcds may be the same object as cds.  Each row is summed in local
   variables and then stored, so the result does not depend on MatAlloc()
   returning zero-filled memory. */
void
MatMultCdsMultMatDiag(Cds *outcds, const double **matK, const Cds *cds)
{
    int             i, k;
    const int       vlen = cds->vlen;
    double        **TmpMat = MatAlloc(vlen, 3);
    double          matKik;
    double          sumx, sumy, sumz;

    for (i = 0; i < vlen; ++i)
    {
        sumx = sumy = sumz = 0.0;

        for (k = 0; k < vlen; ++k)
        {
            matKik = matK[i][k];
            sumx += matKik * cds->x[k];
            sumy += matKik * cds->y[k];
            sumz += matKik * cds->z[k];
        }

        TmpMat[i][0] = sumx;
        TmpMat[i][1] = sumy;
        TmpMat[i][2] = sumz;
    }

    for (i = 0; i < vlen; ++i)
    {
        outcds->x[i] = TmpMat[i][0];
        outcds->y[i] = TmpMat[i][1];
        outcds->z[i] = TmpMat[i][2];
    }

    MatDestroy(&TmpMat);
}


/* outcds = matK * cds * matD, treating the coordinates as a (vlen x 3)
   matrix.

   outcds  receives the product; its x/y/z arrays must hold cds->vlen entries
   matK    matrix indexed as matK[i][k] for i, k in [0, cds->vlen)
   cds     input coordinates
   matD    3 x 3 matrix applied on the right (row vector times matD)

   matK * cds is built in a temporary buffer first, so outcds may be the
   same object as cds.  Each row is summed in local variables and then
   stored, so the result does not depend on MatAlloc() returning zero-filled
   memory. */
void
MatMultCdsMultMat(Cds *outcds, const double **matK, const Cds *cds, const double **matD)
{
    int             i, k;
    const int       vlen = cds->vlen;
    double        **TmpMat = MatAlloc(vlen, 3);
    double          matKik;
    double          xi, yi, zi;

    for (i = 0; i < vlen; ++i)
    {
        xi = yi = zi = 0.0;

        for (k = 0; k < vlen; ++k)
        {
            matKik = matK[i][k];
            xi += matKik * cds->x[k];
            yi += matKik * cds->y[k];
            zi += matKik * cds->z[k];
        }

        TmpMat[i][0] = xi;
        TmpMat[i][1] = yi;
        TmpMat[i][2] = zi;
    }

    for (i = 0; i < vlen; ++i)
    {
        xi = TmpMat[i][0];
        yi = TmpMat[i][1];
        zi = TmpMat[i][2];

        outcds->x[i] = xi * matD[0][0] + yi * matD[1][0] + zi * matD[2][0];
        outcds->y[i] = xi * matD[0][1] + yi * matD[1][1] + zi * matD[2][1];
        outcds->z[i] = xi * matD[0][2] + yi * matD[1][2] + zi * matD[2][2];
    }

    MatDestroy(&TmpMat);
}


/* outcds = diag(diag) * cds * matD: every point of cds is scaled by its own
   entry of diag and the scaled row vector is then multiplied by the 3 x 3
   matrix matD.  Each point is read completely before it is written. */
void
MatDiagMultCdsMultMat(Cds *outcds, const double *diag, const Cds *cds, const double **matD)
{
    const int       npts = cds->vlen;
    const double   *row0 = matD[0], *row1 = matD[1], *row2 = matD[2];
    int             p;

    for (p = 0; p < npts; ++p)
    {
        const double    scale = diag[p];
        const double    sx = scale * cds->x[p];
        const double    sy = scale * cds->y[p];
        const double    sz = scale * cds->z[p];

        outcds->x[p] = sx * row0[0] + sy * row1[0] + sz * row2[0];
        outcds->y[p] = sx * row0[1] + sy * row1[1] + sz * row2[1];
        outcds->z[p] = sx * row0[2] + sy * row1[2] + sz * row2[2];
    }
}


/* outcds = diag(wtK) * cds: every point of cds is scaled by its own entry
   of wtK and stored in outcds. */
void
MatDiagMultCdsMultMatDiag(Cds *outcds, const double *wtK, const Cds *cds)
{
    const double   *srcx = (const double *) cds->x;
    const double   *srcy = (const double *) cds->y;
    const double   *srcz = (const double *) cds->z;
    int             p;

    for (p = 0; p < cds->vlen; ++p)
    {
        const double    w = wtK[p];

        outcds->x[p] = w * srcx[p];
        outcds->y[p] = w * srcy[p];
        outcds->z[p] = w * srcz[p];
    }
}


/* Accumulate the 3 x 3 inner-product matrix of two coordinate sets.

   A       output, 9 doubles, row-major: A[3*r + c] is the sum over points of
           (component r of cds1) * (component c of cds2), with the cds1
           component multiplied by weight[i] when weights are given
   len     number of points read from each structure
   weight  per-point weights, or NULL for an unweighted sum

   Returns half the sum of the two (weighted) squared norms. */
static double
InnerProduct(double *A, Cds *cds1, Cds *cds2, const int len, const double *weight)
{
    const double   *ax = cds1->x, *ay = cds1->y, *az = cds1->z;
    const double   *bx = cds2->x, *by = cds2->y, *bz = cds2->z;
    double          px, py, pz, qx, qy, qz;
    double          normA = 0.0, normB = 0.0;
    int             n;

    for (n = 0; n < 9; ++n)
        A[n] = 0.0;

    if (weight == NULL)
    {
        for (n = 0; n < len; ++n)
        {
            px = ax[n];
            py = ay[n];
            pz = az[n];

            normA += px * px + py * py + pz * pz;

            qx = bx[n];
            qy = by[n];
            qz = bz[n];

            normB += (qx * qx + qy * qy + qz * qz);

            A[0] += (px * qx);
            A[1] += (px * qy);
            A[2] += (px * qz);

            A[3] += (py * qx);
            A[4] += (py * qy);
            A[5] += (py * qz);

            A[6] += (pz * qx);
            A[7] += (pz * qy);
            A[8] += (pz * qz);
        }
    }
    else
    {
        for (n = 0; n < len; ++n)
        {
            /* the weight is folded into the first structure's coordinates */
            px = weight[n] * ax[n];
            py = weight[n] * ay[n];
            pz = weight[n] * az[n];

            normA += px * ax[n] + py * ay[n] + pz * az[n];

            qx = bx[n];
            qy = by[n];
            qz = bz[n];

            normB += weight[n] * (qx * qx + qy * qy + qz * qz);

            A[0] += (px * qx);
            A[1] += (px * qy);
            A[2] += (px * qz);

            A[3] += (py * qx);
            A[4] += (py * qy);
            A[5] += (py * qz);

            A[6] += (pz * qx);
            A[7] += (pz * qy);
            A[8] += (pz * qz);
        }
    }

    return (normA + normB) * 0.5;
}


/* Compute a mean squared deviation and a rotation matrix from a 3 x 3
   inner-product matrix.

   rot       output, 9 doubles, row-major rotation matrix; left untouched
             when -1 is returned
   A         input, 9 doubles, row-major (Sxx Sxy Sxz / Syx ... / ... Szz)
   msd       output, 2 * (E0 - root) / len -- note this is a mean *squared*
             deviation, no square root is taken here
   E0        starting value for the Newton-Raphson root search
   len       number of points, used to normalise *msd
   minScore  if > 0 and the computed value is below it, the rotation is
             skipped and -1 is returned

   Returns 0 when rot has been filled in, -1 when the rotation was skipped.

   The root sought is that of the quartic
       x^4 + C[2] x^2 + C[1] x + C[0]
   which the loop below evaluates, together with its derivative, in Horner
   form.

   NOTE(review): normq is used as a divisor without a zero check; a
   degenerate input would yield non-finite rot entries. */
static
int FastCalcRMSDAndRotation(double *rot, double *A, double *msd, double E0, int len, double minScore)
{
   double          Sxx, Sxy, Sxz, Syx, Syy, Syz, Szx, Szy, Szz;
   double          Szz2, Syy2, Sxx2, Sxy2, Syz2, Sxz2, Syx2, Szy2, Szx2,
                   SyzSzymSyySzz2, Sxx2Syy2Szz2Syz2Szy2, Sxy2Sxz2Syx2Szx2,
                   SxzpSzx, SyzpSzy, SxypSyx, SyzmSzy,
                   SxzmSzx, SxymSyx, SxxpSyy, SxxmSyy;
   double          C[4];
   int i;
   double mxEigenV; 
   double oldg = 0.0;
   double b, a, delta, ms;
   double q1, q2, q3, q4, normq;
   double d11, d12, d13, d14, d21, d22, d23, d24;
   double d31, d32, d33, d34, d41, d42, d43, d44;
   double a2, x2, y2, z2; 
   double xy, az, zx, ay, yz, ax; 
   double d3344_4334, d3244_4234, d3243_4233, d3143_4133,d3144_4134, d3142_4132; 

   /* unpack the row-major inner-product matrix */
   Sxx = A[0]; Sxy = A[1]; Sxz = A[2];
   Syx = A[3]; Syy = A[4]; Syz = A[5];
   Szx = A[6]; Szy = A[7]; Szz = A[8];

   Sxx2 = Sxx * Sxx;
   Syy2 = Syy * Syy;
   Szz2 = Szz * Szz;

   Sxy2 = Sxy * Sxy;
   Syz2 = Syz * Syz;
   Sxz2 = Sxz * Sxz;

   Syx2 = Syx * Syx;
   Szy2 = Szy * Szy;
   Szx2 = Szx * Szx;

   SyzSzymSyySzz2 = 2.0*(Syz*Szy - Syy*Szz);
   Sxx2Syy2Szz2Syz2Szy2 = Syy2 + Szz2 - Sxx2 + Syz2 + Szy2;

   /* polynomial coefficients; the cubic term is absent and C[3] is unused */
   C[2] = -2.0 * (Sxx2 + Syy2 + Szz2 + Sxy2 + Syx2 + Sxz2 + Szx2 + Syz2 + Szy2);
   C[1] = 8.0 * (Sxx*Syz*Szy + Syy*Szx*Sxz + Szz*Sxy*Syx - Sxx*Syy*Szz - Syz*Szx*Sxy - Szy*Syx*Sxz);

   /* sums ("p") and differences ("m") of matrix elements, reused below */
   SxzpSzx = Sxz + Szx;
   SyzpSzy = Syz + Szy;
   SxypSyx = Sxy + Syx;
   SyzmSzy = Syz - Szy;
   SxzmSzx = Sxz - Szx;
   SxymSyx = Sxy - Syx;
   SxxpSyy = Sxx + Syy;
   SxxmSyy = Sxx - Syy;
   Sxy2Sxz2Syx2Szx2 = Sxy2 + Sxz2 - Syx2 - Szx2;

   C[0] = Sxy2Sxz2Syx2Szx2 * Sxy2Sxz2Syx2Szx2
          + (Sxx2Syy2Szz2Syz2Szy2 + SyzSzymSyySzz2) * (Sxx2Syy2Szz2Syz2Szy2 - SyzSzymSyySzz2)
          + (-(SxzpSzx)*(SyzmSzy)+(SxymSyx)*(SxxmSyy-Szz)) * (-(SxzmSzx)*(SyzpSzy)+(SxymSyx)*(SxxmSyy+Szz))
          + (-(SxzpSzx)*(SyzpSzy)-(SxypSyx)*(SxxpSyy-Szz)) * (-(SxzmSzx)*(SyzmSzy)-(SxypSyx)*(SxxpSyy+Szz))
          + (+(SxypSyx)*(SyzpSzy)+(SxzpSzx)*(SxxmSyy+Szz)) * (-(SxymSyx)*(SyzmSzy)+(SxzpSzx)*(SxxpSyy+Szz))
          + (+(SxypSyx)*(SyzmSzy)+(SxzmSzx)*(SxxmSyy-Szz)) * (-(SxymSyx)*(SyzpSzy)+(SxzmSzx)*(SxxpSyy-Szz));


   /* Newton-Raphson from E0: at most 100 steps, stop once the step is
      below 1e-6 of the current value */
   mxEigenV = E0;
   for (i = 0; i < 100; ++i)
   {
     oldg = mxEigenV;
     x2 = mxEigenV*mxEigenV;
     b = (x2 + C[2])*mxEigenV;
     a = b + C[1];
     /* numerator is the quartic, denominator its derivative */
     delta = ((a*mxEigenV + C[0])/(2.0*x2*mxEigenV + b + a));
     mxEigenV -= delta;
     if (fabs(mxEigenV - oldg) < fabs((1e-6)*mxEigenV)) {
       break;
     }
   }
   if (i == 100) 
   {
      fprintf(stderr,"\n WARNING40: More than %d iterations needed in FastCalcRMSDAndRotation()\n", i);
   }

   ms = 2.0 * (E0 - mxEigenV) / len;
   (*msd) = ms;

   if (minScore > 0) 
   {
        if (ms < minScore)
        {
            /* Don't bother with rotation. */
            return -1;
        }   
   }

   /* symmetric 4 x 4 matrix built from A, with the root subtracted on the
      diagonal; d12 and d11 are set but only rows 2-4 enter the cofactors */
   d11 = SxxpSyy + Szz-mxEigenV; d12 = SyzmSzy; d13 = - SxzmSzx; d14 = SxymSyx;
   d21 = SyzmSzy; d22 = SxxmSyy - Szz-mxEigenV; d23 = SxypSyx; d24= SxzpSzx;
   d31 = d13; d32 = d23; d33 = Syy-Sxx-Szz - mxEigenV; d34 = SyzpSzy;
   d41 = d14; d42 = d24; d43 = d34; d44 = Szz - SxxpSyy - mxEigenV;
   /* 2 x 2 minors of rows 3 and 4 */
   d3344_4334 = d33 * d44 - d43 * d34; d3244_4234 = d32 * d44-d42*d34;
   d3243_4233 = d32 * d43 - d42 * d33; d3143_4133 = d31 * d43-d41*d33;
   d3144_4134 = d31 * d44 - d41 * d34; d3142_4132 = d31 * d42-d41*d32;
   /* cofactors along the first row give the (unnormalised) quaternion */
   q1 =  d22*d3344_4334-d23*d3244_4234+d24*d3243_4233;
   q2 = -d21*d3344_4334+d23*d3144_4134-d24*d3143_4133;
   q3 =  d21*d3244_4234-d22*d3144_4134+d24*d3142_4132;
   q4 = -d21*d3243_4233+d22*d3143_4133-d23*d3142_4132;

   normq = sqrt(q1 * q1 + q2 * q2 + q3 * q3 + q4 * q4);
   q1 /= normq; q2 /= normq; q3 /= normq; q4 /= normq;

   /* quaternion (q1; q2, q3, q4) to rotation matrix */
   a2 = q1 * q1;
   x2 = q2 * q2;
   y2 = q3 * q3;
   z2 = q4 * q4;

   xy = q2 * q3;
   az = q1 * q4;
   zx = q4 * q2;
   ay = q1 * q3;
   yz = q3 * q4;
   ax = q1 * q2;

   rot[0] = a2 + x2 - y2 - z2;
   rot[1] = 2 * (xy + az);
   rot[2] = 2 * (zx - ay);
   rot[3] = 2 * (xy - az);
   rot[4] = a2 - x2 + y2 - z2;
   rot[5] = 2 * (yz + ax);
   rot[6] = 2 * (zx + ay);
   rot[7] = 2 * (yz - ax);
   rot[8] = a2 - x2 - y2 + z2;

   return 0;
}


/* static void  */
/* CenterCds(Cds *cds, const int len) */
/* { */
/*     int             i; */
/*     double          xsum, ysum, zsum; */
/*     double         *x = cds->x, *y = cds->y, *z = cds->z; */
/*  */
/*     xsum = ysum = zsum = 0.0; */
/*     for (i = 0; i < len; ++i) */
/*     { */
/*         xsum += x[i]; */
/*         ysum += y[i]; */
/*         zsum += z[i]; */
/*     } */
/*  */
/*     xsum /= len; */
/*     ysum /= len; */
/*     zsum /= len; */
/*  */
/*     for (i = 0; i < len; ++i) */
/*     { */
/*         x[i] -= xsum; */
/*         y[i] -= ysum; */
/*         z[i] -= zsum; */
/*     } */
/* } */


/* Compute the rotation between two coordinate sets and return the
   deviation reported by FastCalcRMSDAndRotation().

   The structures are NOT centered here.  rot receives 9 doubles
   (row-major).  weight may be NULL for an unweighted fit.

   Note that, despite the name, the value returned is what
   FastCalcRMSDAndRotation() stores in its msd argument; no square root is
   applied. */
static double
CalcRMSDRotationalMatrix(Cds *cds1, Cds *cds2, const int len, double *rot, const double *weight)
{
    double          innermat[9];
    double          msd;
    double          e0;

    /* (weighted) inner product of the two structures */
    e0 = InnerProduct(innermat, cds1, cds2, len, weight);

    /* a negative minScore means the rotation is always computed */
    FastCalcRMSDAndRotation(rot, innermat, &msd, e0, len, -1);

    return msd;
}


/* Compute a rotation matrix for every structure in cdsA against the
   (weighted) average structure and return the sum of the per-structure
   deviations.

   Nothing is computed, and 0.0 is returned, unless cdsA->algo->norot == 0
   and cdsA->algo->method == 3.  Each structure's matrix and
   wRMSD_from_mean fields are updated. */
double
CalcRotations(CdsArray *cdsA)
{
    Cds           **structs = cdsA->cds;
    const Cds      *meancds = cdsA->avecds;
    Cds            *target = cdsA->tcds;
    const double   *weights = (const double *) cdsA->w;
    double          dev = 0.0, total = 0.0;
    int             s;

    if (cdsA->algo->norot != 0 || cdsA->algo->method != 3) /* 3 is the default */
        return(total);

    /* pre-multiply the average structure by the weights */
    if (cdsA->algo->covweight == 1)
        MatMultCdsMultMatDiag(target, (const double **) cdsA->WtMat, meancds);
    else if (cdsA->algo->varweight == 1 || cdsA->algo->leastsquares == 1)
        MatDiagMultCdsMultMatDiag(target, weights, meancds);

    for (s = 0; s < cdsA->cnum; ++s)
    {
        if (cdsA->algo->tenberge == 1)
        {
            AveCdsTB(cdsA, s);
            MatDiagMultCdsMultMatDiag(target, weights, meancds);
        }

        /* the target already carries the weights, hence NULL here */
        dev = CalcRMSDRotationalMatrix(structs[s], target, structs[s]->vlen,
                                       &structs[s]->matrix[0][0], NULL);

        /* NOTE(review): this second call overwrites the deviation (and is
           handed the same matrix) computed just above */
        dev = ProcGSLSVDvan(structs[s], target, structs[s]->matrix,
                            cdsA->tmpmat3a, cdsA->tmpmat3b, cdsA->tmpmat3c,
                            cdsA->tmpvec3a);

        structs[s]->wRMSD_from_mean = sqrt(dev / (3 * cdsA->vlen));
        total += dev;
    }

    return(total);
}


/* This is the classic iterative (not eigendecomp) solution given by Gower 1975 and in
   Gower and Dijksterhuis 2004, Ch 9, page 113, Eqn 9.21

   Computes a new scale factor for every structure in cdsA, rescales the
   structure in place by (new scale / old scale) via ScaleCds(), and prints
   the factors to stdout.

   Returns the geometric mean of the new scale factors.

   Side effect: cdsA->w is overwritten with 1.0 / cdsA->var[i].

   NOTE(review): several statements below look experimental -- see the
   inline notes before relying on this routine. */
double
CalcScaleFactors(CdsArray *cdsA)
{
    Cds         *cdsi = NULL;
    Cds        **cds = cdsA->cds;
    Cds         *avecds = cdsA->avecds;
    double         *wts = cdsA->w;
    int             i;
    const int       cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, selfprod, innprod, norm, avecdstr, oldscale, factor;


    /* norm: summed self inner product of all structures;
       avecdstr: self inner product of the average structure */
    if (cdsA->algo->leastsquares == 1)
    {
        avecdstr = TrCdsInnerProd(avecds, vlen);

        norm = 0.0;
        for (i = 0; i < cnum; ++i)
            norm += TrCdsInnerProd(cds[i], vlen);
    }
    else if (cdsA->algo->varweight == 1)
    {
        avecdstr = TrCdsInnerProdWt(avecds, vlen, wts);

        norm = 0.0;
        for (i = 0; i < cnum; ++i)
            norm += TrCdsInnerProdWt(cds[i], vlen, wts);
    }
    else
    {
        norm = avecdstr = 1.0;
    }

        /* NOTE(review): runs unconditionally (the indentation is misleading)
           and replaces the weights that were used for norm/avecdstr above */
        for (i = 0; i < vlen; ++i)
            wts[i] = 1.0 / cdsA->var[i];

    scaleprod = 0.0; /* accumulates the sum of log(scale) */
    for (i = 0; i < cnum; ++i)
    {
        cdsi = cdsA->cds[i];
        oldscale = cdsi->scale;

        /* dividing by oldscale removes the scale already applied to cdsi */
        if (cdsA->algo->leastsquares == 1)
        {
            selfprod = TrCdsInnerProd(cdsi, vlen) / (oldscale * oldscale);
            innprod = TrCdsInnerProd2(cdsi, avecds, vlen) / oldscale;

        }
        else if (cdsA->algo->varweight == 1)
        {
            selfprod = TrCdsInnerProdWt(cdsi, vlen, wts) / (oldscale * oldscale);
            /* NOTE(review): parses as (... / oldscale) - 1.0 */
            innprod = TrCdsInnerProdWt2(cdsi, avecds, vlen, wts) / oldscale-1.0;
        }
        else
        {
            innprod = selfprod = 1.0;
        }

        /* NOTE(review): this value is overwritten by the next statement, so
           norm and avecdstr have no effect on the result */
        cdsi->scale = norm * innprod / (cnum * avecdstr * selfprod);
        /* positive root of  selfprod * s^2 - innprod * s - 3 * vlen = 0 */
        cdsi->scale = (sqrt(innprod*innprod + 12.0 * (double) vlen * selfprod) + innprod) / (2.0 * selfprod);
        //cds[i]->scale = innprod / selfprod;
        scaleprod += log(cds[i]->scale);
        factor = cdsi->scale / oldscale;
        ScaleCds(cdsi, factor);
        printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
    }

    /* geometric mean of the scale factors */
    scaleprod = exp(scaleprod / (double) cnum);

    double bsum = 0.0;
    for (i = 0; i < cnum; ++i)
        bsum += cdsA->cds[i]->scale;

    /* diagnostic only; NOTE(review): the constants 15.5 and 30.0 are
       unexplained in this file */
    for (i = 0; i < cnum; ++i)
        printf("\nscale[%3d]: %12.6f", i+1, 15.5 * 30.0 * cdsA->cds[i]->scale / bsum);

//    for (i = 0; i < cnum; ++i)
//        cds[i]->scale /= scaleprod;

    return(scaleprod);
}


/* Adjust the per-atom variances in cdsA->var, in place.

   A multiplier is computed as the average, over all structures, atoms and
   the three axes, of the squared deviation from the average structure
   divided by the atom's variance, minus one; negative values are clamped
   to zero.  Each variance is then reduced by the multiplier times the
   squared norm of the corresponding average coordinate over (3 * cnum).
   The multiplier is printed to stdout. */
void
ConstrainCovMat(CdsArray *cdsA)
{
    const int       nstruct = cdsA->cnum, natoms = cdsA->vlen;
    double         *variance = cdsA->var;
    const double   *mx = (const double *) cdsA->avecds->x;
    const double   *my = (const double *) cdsA->avecds->y;
    const double   *mz = (const double *) cdsA->avecds->z;
    double          mult = 0.0;
    double          dx, dy, dz, sqdev;
    int             s, a;

    for (s = 0; s < nstruct; ++s)
    {
        const Cds      *cur = cdsA->cds[s];

        for (a = 0; a < natoms; ++a)
        {
            dx = cur->x[a] - mx[a];
            sqdev = dx * dx;
            dy = cur->y[a] - my[a];
            sqdev += dy * dy;
            dz = cur->z[a] - mz[a];
            sqdev += dz * dz;

            mult += sqdev / variance[a];
        }
    }

    mult = mult / (3.0 * nstruct * natoms) - 1.0;

    if (mult < 0.0)
        mult = 0.0;

    printf("\nlagrange = % 12.6e", mult);

    for (a = 0; a < natoms; ++a)
        variance[a] -= mult * (mx[a]*mx[a] + my[a]*my[a] + mz[a]*mz[a]) / (3.0 * nstruct);
}


/* This is the unconstrained ML solution

   Computes a new scale factor for every structure in cdsA and rescales the
   structure in place by (new scale / old scale) via ScaleCds().

   CalcCovariances() and CalcWts() are called first.  Then, depending on
   cdsA->algo:
     leastsquares == 1  a single variance sigma2 (the mean of cdsA->var) is
                        used for all atoms
     varweight == 1     cdsA->w is overwritten with 1 / cdsA->var[i] and used
                        as per-atom weights; covariances and weights are
                        recomputed afterwards
     otherwise          every scale is set to 1.0

   Returns the geometric mean of the new scale factors (1.0 in the last
   case).  Diagnostics are printed to stdout. */
double
CalcScaleFactorsML(CdsArray *cdsA)
{
    Cds        **cds = cdsA->cds;
    Cds         *avecds = cdsA->avecds;
    Cds         *cdsi = NULL;
    double         *wts = cdsA->w;
    int             i;
    const int       cnum = cdsA->cnum, vlen = cdsA->vlen;
    /* nkd: total number of coordinates, 3 * structures * atoms */
    double          bsum, scaleprod, phi, gamma, sigma2, oldscale, factor, nkd = 3.0 * cnum * vlen;
//    double          var = cdsA->stats->var;

    CalcCovariances(cdsA);
    CalcWts(cdsA);

    scaleprod = 0.0; /* accumulates the sum of log(scale) */
    if (cdsA->algo->leastsquares == 1)
    {
        /* one isotropic variance: the mean of the per-atom variances */
        sigma2 = 0.0;
        for (i = 0; i < vlen; ++i)
            sigma2 += cdsA->var[i];
        sigma2 /= (double) vlen;

        bsum = 1.0; /* used as a plain multiplier of phi below */
    
        printf("\nsigma2 = %12.6e \n", sigma2);

        for (i = 0; i < cnum; ++i)
        {
            cdsi = cdsA->cds[i];
            oldscale = cdsi->scale;
            /* dividing by oldscale removes the scale already applied to cdsi */
            phi = bsum * TrCdsInnerProd(cdsi, vlen) / (oldscale * oldscale);
            gamma = TrCdsInnerProd2(cdsi, avecds, vlen) / oldscale;
            /* positive root of  phi * s^2 - gamma * s - 3 * vlen * sigma2 = 0 */
            cdsi->scale = (sqrt(gamma*gamma + 12.0 * vlen * sigma2 * phi) + gamma) / (2.0 * phi);
            scaleprod += log(cdsi->scale);
            factor = cdsi->scale / oldscale;
            ScaleCds(cdsi, factor);
            printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
        }

        /* This is to verify that our implicit constraint is actually in effect. */
        bsum = 0.0;
        for (i = 0; i < cnum; ++i)
            bsum += log(cdsA->cds[i]->scale);

        printf("\nblogsum = %12.6e", bsum);

        bsum = 0.0;
        for (i = 0; i < cnum; ++i)
            bsum += TrCdsInnerProd(cds[i], vlen) - TrCdsInnerProd2(cds[i], avecds, vlen);

        /* diagnostic: bsum / sigma2 is printed next to nkd and their difference */
        printf("\nbsum = %12.6e %12.6e % 12.6e", bsum/sigma2, nkd, bsum/sigma2 - nkd);
        //bsum = (bsum / (3.0 * cnum * vlen)) + 1.0;

        /* geometric mean of the scale factors */
        scaleprod = exp(scaleprod / (double) cnum);
    }
    else if (cdsA->algo->varweight == 1)
    {
        /* replaces the weights just produced by CalcWts() above */
        for (i = 0; i < vlen; ++i)
            wts[i] = 1.0 / cdsA->var[i];

        double constraint = 0.0;

        /* weighted squared norm of the average structure */
        for (i = 0; i < vlen; ++i)
            constraint += wts[i] * (avecds->x[i] * avecds->x[i] + 
                                    avecds->y[i] * avecds->y[i] + 
                                    avecds->z[i] * avecds->z[i]);

        constraint = constraint / (3.0 * vlen) + 1.0;
        printf("\nconstraint = % 12.6e", constraint);

        for (i = 0; i < cnum; ++i)
        {
            cdsi = cdsA->cds[i];
            oldscale = cdsi->scale;
            phi = constraint * TrCdsInnerProdWt(cdsi, vlen, wts) / (oldscale * oldscale);
            gamma = TrCdsInnerProdWt2(cdsi, avecds, vlen, wts) / oldscale;
            /* positive root of  phi * s^2 - gamma * s - 3 * vlen = 0 */
            cdsi->scale = (sqrt(gamma*gamma + 12.0 * vlen * phi) + gamma) / (2.0 * phi);
            scaleprod += log(cdsi->scale);
            factor = cdsi->scale / oldscale;
            ScaleCds(cdsi, factor);
            printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
        }

        /* This is to verify that our implicit constraint is actually in effect. */
        bsum = 0.0;
        for (i = 0; i < cnum; ++i)
            bsum += TrCdsInnerProdWt(cds[i], vlen, wts) - TrCdsInnerProdWt2(cds[i], avecds, vlen, wts);

        printf("\nbsum = %12.6e %12.6e % 12.6e", bsum, nkd, bsum - nkd);

        /* the two sums below are diagnostics only */
        double phisum = 0.0;
        for (i = 0; i < cnum; ++i)
            phisum += TrCdsInnerProdWt(cds[i], vlen, wts);

        double gammasum = 0.0;
        for (i = 0; i < cnum; ++i)
            gammasum +=TrCdsInnerProdWt2(cds[i], avecds, vlen, wts);

        printf("\nphisum, gammasum: % 12.6e % 12.6e % 12.6e", phisum, gammasum, 3.0*vlen);

        /* geometric mean of the scale factors */
        scaleprod = exp(scaleprod / (double) cnum);

/*         CalcRotations(cdsA); */
/*         for (i = 0; i < cnum; ++i) */
/*             RotateCdsIp(cds[i], (const double **) cds[i]->matrix); */
/*         AveCds(cdsA); */
        /* refresh covariances and weights for the rescaled structures */
        CalcCovariances(cdsA);
        //ConstrainCovMat(cdsA);
        CalcWts(cdsA);
    }
    else
    {
        /* NOTE(review): gamma and phi are assigned here but never read */
        gamma = phi = 1.0;
        for (i = 0; i < cnum; ++i)
            cds[i]->scale = 1.0;
        scaleprod = 1.0;
    }

/*     bsum = 0.0; */
/*     for (i = 0; i < cnum; ++i) */
/*         bsum += cdsA->cds[i]->scale; */
/*  */
/*     for (i = 0; i < cnum; ++i) */
/*         printf("\nscale[%3d]: %12.6f", i+1, 15.5 * 30.0 * cdsA->cds[i]->scale / bsum); */

/*     for (i = 0; i < cnum; ++i) */
/*         cds[i]->scale /= scaleprod; */

    return(scaleprod);
}


static void
evallognormal(const double beta, const double phi, const double gamma, const double mu, const int vlen, const double lambda, double *fx, double *dfx)
{
    *fx = phi * beta * beta - gamma * beta + log(beta)/mu - 3.0 * vlen - lambda;
    *dfx = 2.0 * beta * phi - gamma + 1.0 / (mu * beta);
}


/* Newton-Raphson search for a scale factor beta, starting from init.

   The root sought is that of the function evaluated by evallognormal():

       F1  = phi*beta^2 - gamma*beta + log(beta)/mu - 3*vlen - lambda = 0

   whose first derivative with respect to beta is:

       F1' = 2*beta*phi - gamma + 1/(mu*beta)

   At most 200 iterations are performed.  The search stops as soon as
   |F1| < tol.  If the iteration budget is exhausted without meeting the
   tolerance, the starting value init is returned instead of the last
   iterate.  The start value and the returned value are printed to stdout. */
static double
NewtRaphScaleLogNorm(const double init, const double phi, const double gamma, const double mu, const int vlen, const double lambda,  const double tol)
{
    double          beta = init;
    double          fx, dfx;
    int             iter = 0;

    while (iter < 200)
    {
        evallognormal(beta, phi, gamma, mu, vlen, lambda, &fx, &dfx);

        if (fabs(fx) < tol)
            break; /* converged */

        beta -= (fx / dfx); /* Newton-Raphson step */
        ++iter;
    }

    /* ran out of iterations: fall back on the initial guess */
    if (iter == 200)
        beta = init;

    printf("\n init, beta: %10.5f %10.5f", init, beta);
    return(beta);
}


/* Returns 0.5 * (sqrt(1 + 4*S/cnum) - 1), where S is the sum over all
   structures of log(scale)^2.  S is also printed to stdout. */
static double
CalcMu(CdsArray *cdsA)
{
    const int       nstruct = cdsA->cnum;
    double          sumsq = 0.0;
    int             k;

    for (k = 0; k < nstruct; ++k)
    {
        const double    lnscale = log(cdsA->cds[k]->scale);

        sumsq += lnscale*lnscale;
    }

    printf("\n logbsum: %10.5f", sumsq);

    return(0.5 * (sqrt(1.0 + 4.0 * sumsq / nstruct) - 1.0));
}


/* Returns the mean over all structures of log(scale)^2. */
static double
CalcTheta(CdsArray *cdsA)
{
    const int       nstruct = cdsA->cnum;
    double          sumsq = 0.0;
    int             k;

    for (k = 0; k < nstruct; ++k)
    {
        const double    lnscale = log(cdsA->cds[k]->scale);

        sumsq += lnscale*lnscale;
    }

    return(sumsq / nstruct);
}


/* Update the per-structure scale factors by solving, for each structure,
   the equation evaluated by evallognormal() with NewtRaphScaleLogNorm(),
   and rescale each structure's coordinates in place by (new / old) scale.

   Three modes, selected by cdsA->algo:
     - leastsquares == 1: unweighted inner products, divided by the mean
       of cdsA->var;
     - varweight == 1: inner products weighted by 1/var[i] (this OVERWRITES
       cdsA->w with those weights);
     - otherwise: every scale is reset to 1.0.

   Returns the geometric mean of the new scale factors,
   exp(sum_i log(scale_i) / cnum), or 1.0 in the fallback mode.

   Progress and diagnostic values are printed to stdout. */
double
CalcScaleFactorsMLLogNorm(CdsArray *cdsA)
{
    Cds        **cds = cdsA->cds;
    Cds         *cdsi = NULL;
    Cds         *avecds = cdsA->avecds;
    double         *wts = cdsA->w;
    const int       cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, init, mu, theta, lambda, phi, gamma, sigma2, oldscale, factor;
    int             i;
    double          tol = cdsA->algo->precision;

    scaleprod = 0.0; /* accumulates sum of log(scale) until exponentiated below */
    if (cdsA->algo->leastsquares == 1)
    {
        /* NOTE(review): the value of lambda computed here is never used --
           lambda is reset to 0.0 at the top of every iteration of the
           per-structure loop below.  Kept as-is pending confirmation that
           the inner-product helpers have no side effects. */
        lambda = 0.0;
        for (i = 0; i < cnum; ++i)
            lambda += TrCdsInnerProd(cds[i], vlen) - TrCdsInnerProd2(cds[i], avecds, vlen);

        lambda = (lambda - 3.0 * vlen * cnum) / cnum;

        /* mean variance over all positions */
        sigma2 = 0.0;
        for (i = 0; i < vlen; ++i)
            sigma2 += cdsA->var[i];
        sigma2 /= (double) vlen;

        printf("\nsigma2 = %12.6e \n", sigma2);

        for (i = 0; i < cnum; ++i)
        {
            lambda = 0.0;

            cdsi = cdsA->cds[i];
            oldscale = cdsi->scale;
            /* the coordinates currently carry oldscale, so divide it out */
            phi = TrCdsInnerProd(cdsi, vlen) / (sigma2 * oldscale * oldscale);
            gamma = TrCdsInnerProd2(cdsi, avecds, vlen) / (sigma2 * oldscale);
            init = gamma / phi;
            /* mu and theta depend on all current scales, which change as
               this loop proceeds, so they are recomputed every iteration */
            mu = CalcMu(cdsA);
            theta = CalcTheta(cdsA);
            printf("\nmu = %12.6e \n", mu);
            printf("\ntheta = %12.6e \n", theta);
            cdsi->scale = NewtRaphScaleLogNorm(init, phi, gamma, mu, vlen, lambda, tol);
            scaleprod += log(cdsi->scale);
            factor = cdsi->scale / oldscale;
            ScaleCds(cdsi, factor);
            printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
        }

        scaleprod = exp(scaleprod / (double) cnum);
    }
    else if (cdsA->algo->varweight == 1)
    {
        lambda = 0.0;

        for (i = 0; i < vlen; ++i)
            wts[i] = 1.0 / cdsA->var[i];

        for (i = 0; i < cnum; ++i)
        {
            cdsi = cdsA->cds[i];
            oldscale = cdsi->scale;
            phi = TrCdsInnerProdWt(cdsi, vlen, wts) / (oldscale * oldscale);
            gamma = TrCdsInnerProdWt2(cdsi, avecds, vlen, wts) / oldscale;

            /* after the first rounds, start from the current scale */
            if (cdsA->algo->rounds > 8)
                init = cdsi->scale;
            else
                init = gamma / phi;

            mu = CalcMu(cdsA);
            theta = CalcTheta(cdsA);
            printf("\nmu = %12.6e ", mu);
            printf("\ntheta = %12.6e ", theta);
            cdsi->scale = NewtRaphScaleLogNorm(init, phi, gamma, mu, vlen, lambda, tol);
            scaleprod += log(cdsi->scale);
            factor = cdsi->scale / oldscale;
            ScaleCds(cdsi, factor);
            printf("\nfactor[%3d] = %12.6e -- scale = %12.6e", i+1, factor, cdsi->scale);
        }

        /* Convert the accumulated log-sum into the geometric mean, exactly
           as the least-squares branch does.  Without this the function
           returned a sum of logarithms in this mode only. */
        scaleprod = exp(scaleprod / (double) cnum);
    }
    else
    {
        for (i = 0; i < cnum; ++i)
            cds[i]->scale = 1.0;
        scaleprod = 1.0;
    }

    /* diagnostic printout of the scales relative to their sum */
    double bsum = 0.0;
    for (i = 0; i < cnum; ++i)
        bsum += cdsA->cds[i]->scale;

    for (i = 0; i < cnum; ++i)
        printf("\nscale[%3d]: %12.6f", i+1, 15.5 * 30.0 * cdsA->cds[i]->scale / bsum);

    return(scaleprod);
}


/* Closed-form scale-factor update in which the constant under the square
   root is (3*vlen - 1 - log(scale_i)/theta), with theta the mean of the
   squared current scales.  Covariances and weights are recomputed first.

   Returns the geometric mean of the new scales, or 1.0 (with every scale
   reset to 1.0) when neither leastsquares nor varweight is selected. */
double
CalcScaleFactorsMLLog(CdsArray *cdsA)
{
    Cds           **cds = cdsA->cds;
    Cds            *avecds = cdsA->avecds;
    const double   *wts = (const double *) cdsA->w;
    int             k, cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          logsum, self, cross, sigma2, theta, shrink;
    int             ls;

    /* theta: mean of scale^2 over the structures */
    theta = 0.0;
    for (k = 0; k < cnum; ++k)
        theta += cds[k]->scale * cds[k]->scale;
    theta /= (double) cnum;

    CalcCovariances(cdsA);
    CalcWts(cdsA);

    /* sigma2: mean of the per-position variances */
    sigma2 = 0.0;
    for (k = 0; k < vlen; ++k)
        sigma2 += cdsA->var[k];
    sigma2 /= (double) vlen;

    printf("\nsigma2 = %12.6e \n", sigma2);

    ls = (cdsA->algo->leastsquares == 1);

    if (!ls && cdsA->algo->varweight != 1)
    {
        /* no scaling model selected */
        for (k = 0; k < cnum; ++k)
            cds[k]->scale = 1.0;

        return(1.0);
    }

    logsum = 0.0;
    for (k = 0; k < cnum; ++k)
    {
        if (ls)
        {
            cross = TrCdsInnerProd2(cds[k], avecds, vlen);
            self = TrCdsInnerProd(cds[k], vlen);
            shrink = 3.0 * (double) vlen  - 1.0 - log(cds[k]->scale)/theta;
            cds[k]->scale =
                (sqrt(cross*cross + 4.0 * shrink * sigma2 * self) + cross) / (2.0 * self);
        }
        else
        {
            cross = TrCdsInnerProdWt2(cds[k], avecds, vlen, wts);
            self = TrCdsInnerProdWt(cds[k], vlen, wts);
            shrink = 3.0 * (double) vlen  - 1.0 - log(cds[k]->scale)/theta;
            cds[k]->scale =
                (sqrt(cross*cross + 4.0 * shrink * self) + cross) / (2.0 * self);
        }

        logsum += log(cds[k]->scale);
    }

    return(exp(logsum / (double) cnum));
}


/* This is the constrained ML solution, without the scale factor Jacobian in the PDF.

   Recomputes covariances, keeps a copy of the variances as they are right
   after CalcCovariances() (before CalcWts() runs), then recomputes the
   weights and updates every structure's scale factor in closed form:

     - leastsquares == 1: unweighted inner products, constant 12*vlen*sigma2
       where sigma2 is the mean of cdsA->var;
     - varweight == 1: weighted inner products, constant 12*trsig where
       trsig = sum_i varu[i] / var[i];
     - otherwise: every scale is set to 1.0.

   Returns the geometric mean of the resulting scale factors.
   Exits the program if the scratch buffer cannot be allocated. */
double
CalcScaleFactorsMLConstr(CdsArray *cdsA)
{
    Cds        **cds = cdsA->cds;
    Cds         *avecds = cdsA->avecds;
    const double   *wts = (const double *) cdsA->w;
    int             i, cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, selfprod, innprod, sigma2, trsig;
    double         *varu = malloc((size_t) vlen * sizeof(*varu));

    if (varu == NULL && vlen > 0)
    {
        printf("\n  ERROR:  could not allocate memory in CalcScaleFactorsMLConstr() \n");
        exit(EXIT_FAILURE);
    }

    CalcCovariances(cdsA);

    /* memcpy counts BYTES: copy all vlen doubles, not just vlen bytes */
    if (vlen > 0)
        memcpy(varu, cdsA->var, (size_t) vlen * sizeof(*varu));

    CalcWts(cdsA);

    sigma2 = 0.0;
    for (i = 0; i < vlen; ++i)
        sigma2 += cdsA->var[i];
    sigma2 /= (double) vlen;

    printf("\nsigma2 = %12.6e \n", sigma2);

    scaleprod = 0.0; /* sum of log(scale); exponentiated on return */

    if (cdsA->algo->leastsquares == 1)
    {
        for (i = 0; i < cnum; ++i)
        {
            innprod = TrCdsInnerProd2(cds[i], avecds, vlen);
            selfprod = TrCdsInnerProd(cds[i], vlen);
            cds[i]->scale = (sqrt(innprod*innprod + 12.0 * (double) vlen * sigma2 * selfprod) + innprod) / (2.0 * selfprod);
            scaleprod += log(cds[i]->scale);
        }
    }
    else if (cdsA->algo->varweight == 1)
    {
        /* ratio of the saved variances to the current ones, summed */
        trsig = 0.0;
        for (i = 0; i < vlen; ++i)
            trsig += varu[i]/cdsA->var[i];

        printf("\ntrsig = %12.6e \n", trsig);

        for (i = 0; i < cnum; ++i)
        {
            innprod = TrCdsInnerProdWt2(cds[i], avecds, vlen, wts);
            selfprod = TrCdsInnerProdWt(cds[i], vlen, wts);
            cds[i]->scale = (sqrt(innprod*innprod + 12.0 * trsig * selfprod) + innprod) / (2.0 * selfprod);
            scaleprod += log(cds[i]->scale);
        }
    }
    else
    {
        /* Reset every structure.  The previous code indexed cds[] with the
           leftover counter from the sigma2 loop (i == vlen), touching a
           single, possibly out-of-range, element.  log(1.0) == 0, so
           scaleprod stays 0 and the returned geometric mean is 1. */
        for (i = 0; i < cnum; ++i)
            cds[i]->scale = 1.0;
    }

    free(varu);

    return(exp(scaleprod / (double) cnum));
}


/* This is the constrained ML solution, with (or without) the scale factor Jacobian in the PDF.

   After recomputing covariances and weights, each structure's scale is set to

       (sqrt(g^2 + 12*vlen*sigma2*p) + g) / (2*p)

   where g is the inner product with the average structure, p the self
   inner product (both weighted when varweight is selected, both 1.0 when
   no mode is selected) and sigma2 the mean of cdsA->var.

   Returns the geometric mean of the new scale factors. */
double
CalcScaleFactorsMLGoodall(CdsArray *cdsA)
{
    Cds           **cds = cdsA->cds;
    Cds            *avecds = cdsA->avecds;
    const double   *wts = (const double *) cdsA->w;
    int             k, cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          logsum, self, cross, sigma2, kconst;

    CalcCovariances(cdsA);
    CalcWts(cdsA);

    /* mean of the per-position variances */
    sigma2 = 0.0;
    for (k = 0; k < vlen; ++k)
        sigma2 += cdsA->var[k];
    sigma2 /= (double) vlen;

    printf("\nsigma2 = %12.6e \n", sigma2);

    /* factor multiplying the self inner product under the square root */
    kconst = 12.0 * (double) vlen * sigma2;

    logsum = 0.0;
    for (k = 0; k < cnum; ++k)
    {
        Cds            *member = cds[k];

        if (cdsA->algo->leastsquares == 1)
        {
            cross = TrCdsInnerProd2(member, avecds, vlen);
            self = TrCdsInnerProd(member, vlen);
        }
        else if (cdsA->algo->varweight == 1)
        {
            cross = TrCdsInnerProdWt2(member, avecds, vlen, wts);
            self = TrCdsInnerProdWt(member, vlen, wts);
        }
        else
        {
            cross = 1.0;
            self = 1.0;
        }

        member->scale = (sqrt(cross*cross + kconst * self) + cross) / (2.0 * self);
        logsum += log(member->scale);
    }

    return(exp(logsum / (double) cnum));
}


/* constrained LS, so that \Prod_i^N scale_i = 1

   Each structure's scale is first set to innprod / selfprod (unweighted
   inner products for leastsquares, weighted ones for varweight, 1/1 when
   neither is selected).  The scales are then divided by their geometric
   mean so that their product is one.

   Returns that geometric mean (the value the scales were divided by). */
double
CalcScaleFactorsML2(CdsArray *cdsA)
{
    Cds        **cds = cdsA->cds;
    Cds         *avecds = cdsA->avecds;
    const double   *wts = (const double *) cdsA->w;
    int             i, cnum = cdsA->cnum, vlen = cdsA->vlen;
    double          scaleprod, selfprod, innprod;

    scaleprod = 1.0;
    for (i = 0; i < cnum; ++i)
    {
        if (cdsA->algo->leastsquares == 1)
        {
            innprod = TrCdsInnerProd2(cds[i], avecds, vlen);
            selfprod = TrCdsInnerProd(cds[i], vlen);
        }
        else if (cdsA->algo->varweight == 1)
        {
            innprod = TrCdsInnerProdWt2(cds[i], avecds, vlen, wts);
            selfprod = TrCdsInnerProdWt(cds[i], vlen, wts);
        }
        else
        {
            innprod = selfprod = 1.0;
        }

        cds[i]->scale = innprod / selfprod;
        scaleprod *= cds[i]->scale;
    }

    /* Use the double-precision pow(): powf() would round both operands and
       the result to float, so the normalised scales would only multiply to
       one within single precision. */
    scaleprod = pow(scaleprod, 1.0 / (double) cnum);

    for (i = 0; i < cnum; ++i)
        cds[i]->scale /= scaleprod;

    return(scaleprod);
}


/* Apply ScaleCds() to every structure in the array using the structure's
   own stored scale factor, printing each factor (0-based index) to stdout. */
void
ScaleCdsArray(CdsArray *cdsA)
{
    Cds           **members = cdsA->cds;
    int             k = 0;

    while (k < cdsA->cnum)
    {
        Cds            *member = members[k];

        ScaleCds(member, member->scale);
        printf("Scale[%3d]: %12.6f\n", k, member->scale);
        ++k;
    }
}


static void
*CalcRot_pth(void *rotdata_ptr)
{
    int             i;
    double          deviation;
    RotData        *rotdata = (RotData *) rotdata_ptr;
    Cds         *cds;

    for (i = rotdata->start; i < rotdata->end; ++i)
    {
        cds = rotdata->cds[i];
        /* note that the avecds are already multiplied by the weight matrices */
        deviation = CalcRMSDRotationalMatrix(cds, rotdata->tcds, cds->vlen, &cds->matrix[0][0], NULL);

        /* rotate the scratch cds with new rotation matrix */
        RotateCdsIp(cds, (const double **) cds->matrix);

        /* find global rmsd and average cds (both held in structure) */
        cds->wRMSD_from_mean = sqrt(deviation / (3 * rotdata->vlen));
    }

    pthread_exit((void *) 0);
}


/* Multi-threaded rotation step.

   Fills cdsA->tcds from the average structure (using the weight matrix
   when covweight is set, or the weight vector when varweight or
   leastsquares is set), then splits the cnum structures into thrdnum
   contiguous slices of cnum / thrdnum structures each -- the last slice
   also takes the remainder -- and runs CalcRot_pth() on every slice in its
   own thread.  All threads are joined before returning.

   Any pthread_create()/pthread_join() failure prints an error and exits.

   Returns the sum over all structures of 3 * vlen * wRMSD_from_mean^2. */
static double
CalcRotations_pth(CdsArray *cdsA, RotData **rotdata, pthread_t *callThd,
                  pthread_attr_t *attr, const int thrdnum)
{
    Cds           **cds = cdsA->cds;
    const Cds      *avecds = cdsA->avecds;
    const double   *wts = (const double *) cdsA->w;
    Cds            *tcds = cdsA->tcds;
    double          total = 0.0;
    int             t, rc = 0, slice;

    if (cdsA->algo->covweight == 1)
    {
        MatMultCdsMultMatDiag(tcds,
                                 (const double **) cdsA->WtMat,
                                 avecds);
    }
    else if (cdsA->algo->varweight == 1 || cdsA->algo->leastsquares == 1)
    {
        MatDiagMultCdsMultMatDiag(tcds,
                                     wts,
                                     avecds);
    }

    slice = cdsA->cnum / thrdnum;

    /* launch one worker per slice */
    for (t = 0; t < thrdnum; ++t)
    {
        RotData        *job = rotdata[t];

        job->cds = cds;
        job->tcds = tcds;
        job->start = t * slice;

        /* the final worker runs to the end to pick up the remainder */
        if (t == thrdnum - 1)
            job->end = cdsA->cnum;
        else
            job->end = t*slice + slice;

        job->vlen = cdsA->vlen;

        rc = pthread_create(&callThd[t], attr, CalcRot_pth, (void *) job);

        if (rc)
        {
            printf("ERROR811: return code from pthread_create() %d is %d\n", t, rc);
            exit(EXIT_FAILURE);
        }
    }

    /* wait for every worker */
    for (t = 0; t < thrdnum; ++t)
    {
        rc = pthread_join(callThd[t], (void **) NULL);

        if (rc)
        {
            printf("ERROR812: return code from pthread_join() %d is %d\n", t, rc);
            exit(EXIT_FAILURE);
        }
    }

    /* undo the sqrt/normalisation done per structure by the workers */
    for (t = 0; t < cdsA->cnum; ++t)
        total += 3 * cdsA->vlen * cds[t]->wRMSD_from_mean * cds[t]->wRMSD_from_mean;

    return(total);
}


/* Dispatch on cdsA->algo->hierarch (the "-g" option, per the error message
   below) to the selected hierarchical-model routine for the variances.

   Many of the fitting routines take a trailing 0/1 flag; where the
   original code switched that flag on the round counter, it is 1 once
   algo->rounds > 4 and 0 before that.

   An unknown option prints an error and the usage text and exits.

   Afterwards, when both algo->lele5 and covariance weighting are enabled,
   the leading 5x5 block of CovMat is cleaned up so that [1][3] and [3][1]
   are the only off-diagonal entries left untouched. */
void
HierarchVars(CdsArray *cdsA)
{
    int             i, j;
    double          mean, mu, lambda, zeta, sigma;
    /* 1 after the first few rounds, 0 before */
    const int       iterate = (cdsA->algo->rounds > 4) ? 1 : 0;

    switch(cdsA->algo->hierarch)
    {
        case 0:
            break;

        /* Assuming a known shape param c, real ML-EM fit */
        case 1:
            InvGammaEMFixedCFitEvals(cdsA, 0.5, iterate);
            break;

        /* real ML-EM fit, fitting unknown b and c inverse gamma params (scale and shape, resp.) */
        case 2:
            InvGammaMLFitEvals(cdsA, iterate);
            break;

        case 3:
            InvGammaFitEvalsBfact(cdsA, 1);
            break;

        case 4:
            /* This is the old approximate method, used in versions 1.0-1.1  */
            /* inverse gamma fit of variances, excluding the smallest 3 */
            /* This accounts for the fact that the smallest three eigenvalues of the covariance
               matrix are always zero, i.e. the covariance matrix is necessarily of rank
               vlen - 3 (or usually less, with inadequate amounts of data 3N-6). */
            InvGammaFitEvals(cdsA, iterate);
            break;

        case 5: /* inverse gamma fit of variances, excluding the smallest 3 */
            /* This accounts for the fact that the smallest three eigenvalues of the covariance
               matrix are always zero, i.e. the covariance matrix is necessarily of rank
               vlen - 3 (or usually less, with inadequate amounts of data 3N-6).
               __Bayesian Bernardo reference prior on the scale and shape params__. */
            InvGammaBayesFitEvals(cdsA, iterate);
            break;

        case 6:
            InvGammaEMFixedC(cdsA, 0.5, iterate);
            break;

        case 7:
            InvGammaMLFixedCFitEvals(cdsA, cdsA->algo->minc, iterate);
            break;

        case 8:
            InvGammaMLFixedCFitEvals(cdsA, 0.5, iterate);
            break;

        case 9:
            InvGammaFitVars_fixed_c(cdsA, cdsA->algo->minc, iterate);
            break;

        case 10:
            InvGammaFitVars_minc(cdsA, cdsA->algo->minc, 0);
            break;

        case 11:
            InvGammaFitModeEvals(cdsA, iterate);
            break;

        case 12: /* inverse gamma fit of variances, excluding the smallest 3 */
            /* This accounts for the fact that the smallest three eigenvalues of the covariance
               matrix are always zero, i.e. the covariance matrix is necessarily of rank
               vlen - 3 (or usually less, with inadequate amounts of data 3N-6).
               No iterations */
            InvGammaFitEvals(cdsA, 0);
            break;

        case 13: /* Bayesian diagonal Wishart prior (proportional to the identity mat) on the
                    variances/covmat, assuming improper reference prior on the precision
                    hyperparameter */
            WishartFitVar(cdsA, 1);
            break;

        case 14:
            WishartFitVar2(cdsA, 1);
            break;

        case 15:
            if (cdsA->algo->rounds >= 10)
                WishartAdjustVar(cdsA->var, cdsA->var, cdsA->vlen, cdsA->cnum, cdsA->stats->lsvar);
            break;

        case 16:
            if (cdsA->algo->rounds >= 10)
                ConjBayesAdjustVar(cdsA->var, cdsA->var, cdsA->vlen, cdsA->cnum, cdsA->stats->lsvar);
            break;

        case 17: /* inverse gamma fit of variances, excluding the smallest 3 */
            /* This accounts for the fact that the smallest three eigenvalues of the covariance
               matrix are always zero, i.e. the covariance matrix is necessarily of rank
               vlen - 3 (or usually less, with inadequate amounts of data 3N-6). */
            InvGammaFitEvalsNoN(cdsA, iterate);
            break;

        case 18:
            WishartAdjustVar(cdsA->var, cdsA->var, cdsA->vlen, cdsA->cnum, cdsA->algo->param[0]);
            break;

        case 19:
            WishartFitVar2(cdsA, 1);
            break;

        case 20: /* ML fit of variances to a reciprocal inverse gaussian dist */
            RecipInvGaussFitVars(cdsA, &mu, &lambda);
            RecipInvGaussAdjustVars(cdsA, mu, lambda);
            break;

        case 21: /* ML fit of variances to a lognormal distribution */
            LognormalFitVars(cdsA, &zeta, &sigma);
            LognormalAdjustVars(cdsA, zeta, sigma);
            break;

        case 22:
            /* Pass on the parameters that were just fitted.  The previous
               code handed the never-assigned zeta/sigma to the adjustment
               step, i.e. it read uninitialised variables. */
            InvgaussFitVars(cdsA, &mean, &lambda);
            InvgaussAdjustVars(cdsA, mean, lambda);
            break;

        case 30: /* inv gamma fit to eigenvalues of covariance mat, but only weighting by variances */
            /* temporarily switch to covariance weighting for the fit */
            cdsA->algo->covweight = 1;
            cdsA->algo->varweight = 0;
            if (cdsA->algo->alignment == 1)
                CalcCovMatOcc(cdsA);
            else
                CalcCovMat(cdsA);
            InvGammaFitEvals(cdsA, 1);
            cdsA->algo->covweight = 0;
            cdsA->algo->varweight = 1;
            /* keep only the diagonal as the variances */
            for (i = 0; i < cdsA->vlen; ++i)
                cdsA->var[i] = cdsA->CovMat[i][i];
            break;

        case 31: /* inv gamma fit to eigenvalues of covariance mat, but only weighting by variances */
            cdsA->algo->covweight = 1;
            cdsA->algo->varweight = 0;
            if (cdsA->algo->alignment == 1)
                CalcCovMatOcc(cdsA);
            else
                CalcCovMat(cdsA);
            InvGammaFitVars(cdsA, 0); /* no iterations */
            cdsA->algo->covweight = 0;
            cdsA->algo->varweight = 1;
            for (i = 0; i < cdsA->vlen; ++i)
                cdsA->var[i] = cdsA->CovMat[i][i];
            break;

        default:
            printf("\n  ERROR:  Bad -g option \"%d\" \n", cdsA->algo->hierarch);
            Usage(0);
            exit(EXIT_FAILURE);
            break;

        /* NOTE(review): this statement sits inside the switch after the
           last break, so no case label ever reaches it.  It was presumably
           meant to run after the switch; left in place so that verbose
           output does not change until that is confirmed. */
        if (cdsA->algo->verbose != 0)
            printf("    HierarchVars() chi2:%f\n", cdsA->stats->hierarch_chi2);
    }

    if (cdsA->algo->lele5 == 1 && cdsA->algo->covweight != 0)
    {
        /* Correct Lele's 5-landmark testset covariance matrix (only two off-diags are non-zero):
           zero every off-diagonal entry of the leading 5x5 block except
           [1][3] and [3][1]. */
        for (i = 0; i < 5; ++i)
        {
            for (j = 0; j < 5; ++j)
            {
                if (i == j)
                    continue;

                if ((i == 1 && j == 3) || (i == 3 && j == 1))
                    continue;

                cdsA->CovMat[i][j] = 0.0;
            }
        }
    }
}


/* Inner-loop convergence test.

   Returns 1 when the abort flag is set, or when TestIdentMat() reports a
   non-zero result for every structure's current rotation matrix at the
   given precision.  Returns 0 as soon as one structure fails that test. */
int
CheckConvergenceInner(CdsArray *cdsA, const double precision)
{
    int             converged = 1;
    int             k;

    if (cdsA->algo->abort != 1)
    {
        for (k = 0; converged && k < cdsA->cnum; ++k)
            converged = (TestIdentMat((const double **) cdsA->cds[k]->matrix, 3, precision) != 0);
    }

    return(converged);
}


/* Outer-loop convergence test.

   Returns 1 (stop) when the iteration limit has been reached or the abort
   flag is set.  Rounds 0..6 never count as converged and return 0.  From
   round 7 on, the mean over all structures of FrobDiffNormIdentMat() of
   the rotation matrix is stored in stats->precision, and the function
   returns 0 only while that mean is still greater than precision. */
int
CheckConvergenceOuter(CdsArray *cdsA, int round, const double precision)
{
    Algorithm      *algo = cdsA->algo;
    int             k;

    /* hard stops */
    if (round >= algo->iterations || algo->abort == 1)
        return(1);

    /* always run at least the first few rounds */
    if (round <= 6)
        return(0);

    cdsA->stats->precision = 0.0;
    for (k = 0; k < cdsA->cnum; ++k)
        cdsA->stats->precision += FrobDiffNormIdentMat((const double **) cdsA->cds[k]->matrix, 3);
    cdsA->stats->precision /= cdsA->cnum;

    return((cdsA->stats->precision > precision) ? 0 : 1);
}


/* Superimpose originals on ML superimposed family, just to clean up
   any floating point problems due to extensive iteration,
   and to get the proper rotations & translations to apply to the original
   PDB cds.

   For every structure i, targetA->cds[i] is superposed with cdsA->cds[i];
   the rotation is written to cdsA->cds[i]->matrix and the negated
   translation is stored in the center and translation vectors of both
   cdsA->cds[i] and targetA->cds[i].

   When algo->verbose == 1 the current coordinates are also written to
   "<rootname>_MultiPose_CA.pdb".

   *sumdev receives sqrt(sum_i |SuperPose result| / (cnum * vlen)).
   Returns the mean over structures of |norm1 - innprod| / innprod. */
double
SuperPoseArray2Orig(CdsArray *cdsA, CdsArray *targetA, double *sumdev)
{
    int             i, j;
    const int       vlen = cdsA->vlen;
    const int       cnum = cdsA->cnum;
    /* Scratch translation vector.  An automatic array replaces the former
       unchecked malloc(), which would have been dereferenced as NULL on
       allocation failure. */
    double          trans[3];
    double          norm1, norm2, innprod, fpe;
    Cds        **cds = cdsA->cds;
    PDBCdsArray *pdbA;

    /* NOTE(review): pdbA is only consumed in the verbose branch below, but
       it is built unconditionally, as in the original code. */
    pdbA = PDBCdsArrayInit();
    PDBCdsArrayAlloc(pdbA, cnum, vlen);

    for (i = 0; i < cnum; ++i)
        CopyCds2PDB(pdbA->cds[i], cds[i]);

    if (cdsA->algo->verbose == 1)
    {
        char *ca_name = mystrcat(cdsA->algo->rootname, "_MultiPose_CA.pdb");
        WriteModelFile(pdbA, ca_name);
        free(ca_name);
    }

    *sumdev = fpe = 0.0;
    for (i = 0; i < cnum; ++i)
    {
        *sumdev += fabs(SuperPose(targetA->cds[i], cds[i], cds[i]->matrix, trans,
                                 &norm1, &norm2, &innprod));

        fpe += fabs(norm1 - innprod)/innprod;

        /* both arrays record the negated translation */
        for (j = 0; j < 3; ++j)
            cds[i]->center[j] = cds[i]->translation[j] =
            targetA->cds[i]->center[j] = targetA->cds[i]->translation[j] =
            -trans[j];
    }

    PDBCdsArrayDestroy(&pdbA);

    *sumdev = sqrt(*sumdev / (cnum * vlen));

    return(fpe / cnum);
}


/* Returns, averaged over the cds1->vlen positions, the sum of the SQUARED
   coordinate-wise products (x1*x2)^2 + (y1*y2)^2 + (z1*z2)^2.

   NOTE(review): despite the name this is not a plain inner product, since
   each product is squared before summing -- confirm this is intended. */
double
CalcInnProd(const Cds *cds1, const Cds *cds2)
{
    double          acc = 0.0;
    int             k;

    for (k = 0; k < cds1->vlen; ++k)
    {
        const double    px = cds1->x[k] * cds2->x[k];
        const double    py = cds1->y[k] * cds2->y[k];
        const double    pz = cds1->z[k] * cds2->z[k];

        acc += (px*px + py*py + pz*pz);
    }

    return(acc / cds1->vlen);
}


/* Write the current coordinates of every structure in cdsA to a model
   file (without statistics) named by appending fext to algo->rootname.
   A temporary PDB array is built for the purpose and destroyed again. */
static void
WriteInstModelFile(char *fext, CdsArray *cdsA)
{
    PDBCdsArray    *snapshot = PDBCdsArrayInit();
    char           *outname = NULL;
    int             k;

    PDBCdsArrayAlloc(snapshot, cdsA->cnum, cdsA->vlen);

    /* copy each working structure into its PDB counterpart */
    for (k = 0; k < snapshot->cnum; ++k)
        CopyCds2PDB(snapshot->cds[k], cdsA->cds[k]);

    outname = mystrcat(cdsA->algo->rootname, fext);
    WriteTheseusModelFileNoStats(snapshot, cdsA->algo, outname);
    free(outname);

    PDBCdsArrayDestroy(&snapshot);
}


/*
 * MultiPose -- the real thing: single-threaded iterative superposition driver.
 *
 * Works on a scratch copy of baseA (allocated here, destroyed before return):
 *   1. picks an initial mean structure (embedded average or a randomly
 *      selected member of baseA),
 *   2. runs the outer/inner loops described below until
 *      CheckConvergenceOuter() returns 1 (the loop is left immediately when
 *      algo->nullrun == 1),
 *   3. computes statistics, writes the optional output files, and copies the
 *      statistics back into baseA with CopyStats().
 *
 * Visible effects on baseA: baseA->scratchA is pointed at the scratch array,
 * baseA->w[0..vlen-1] is reset to 1.0, baseA->algo->rounds is updated every
 * outer round, and SetupCovWeighting(baseA) is called when covweight == 1.
 *
 * Returns the outer round counter (with 10 subtracted when algo->seed == 1).
 *
 * NOTE(review): baseA->scratchA is not cleared here after the scratch array is
 * destroyed; whether CdsArrayDestroy() leaves it dangling should be confirmed.
 */
int
MultiPose(CdsArray *baseA)
{
    int             i, round, innerround;
    int             slxn; /* index of random coord to select as first */
    /* NOTE(review): percent, lastpercent and lastscale are assigned below but only
       read by code that is currently commented out. */
    double          frobnorm, sumdev, percent, lastpercent, logL, lastlogL, lastscale;
    double          deviation_sum = 0.0; /* last value returned by CalcRotations*() */
    const int       cnum = baseA->cnum;
    const int       vlen = baseA->vlen;
    Algorithm      *algo = NULL;
    Statistics     *stats = NULL;
    Cds        **cds = NULL;
    Cds         *avecds = NULL;
    CdsArray    *scratchA = NULL; /* working scratch array, also holds average cds, */
                                     /* rotation matrices, translation and center vectors */

#if defined(__APPLE__)
    /* coarse phase timers, reported at the end when algo->verbose == 1 */
    double          starttime, endtime;
    double          init, setup = 0.0, innerloop, exitloop;

    starttime = seconds();
#endif

    /* GSL generator used only to pick the initial mean structure below;
       it is not explicitly seeded in this function */
    gsl_rng               *r2 = NULL;
    const gsl_rng_type    *T = NULL;
    T = gsl_rng_ranlxs2;
    r2 = gsl_rng_alloc(T);

    /* setup scratchA */
    scratchA = CdsArrayInit();
    CdsArrayAlloc(scratchA, cnum, vlen);
    CdsArraySetup(scratchA);

    baseA->scratchA = scratchA;

    /* duplicate baseA -- copy to scratchA */
    CdsArrayCopy(scratchA, baseA);

    /* setup local aliases based on scratchA; these point into scratchA and
       must not be used once scratchA has been destroyed */
    algo = scratchA->algo;
    stats = scratchA->stats;
    cds = scratchA->cds;
    avecds = scratchA->avecds;

    if (algo->covweight == 1)
    {
        SetupCovWeighting(scratchA); /* DLT debug */
        SetupCovWeighting(baseA); /* DLT debug */
    }

    /* start from unit weights in both arrays */
    memsetd(scratchA->w, 1.0, vlen);
    memsetd(baseA->w, 1.0, vlen);

    stats->hierarch_p1 = 1.0;
    stats->hierarch_p2 = 1.0;

    //algo->constant = 0.001;

#if defined(__APPLE__)
    endtime = seconds();
    init = (double) (endtime - starttime) / 0.001;
    starttime = seconds();
#endif

    /* Initialize the algorithm -- we need a centered mean structure as first guess */
    /* determine a structure to use as the initial mean structure */
    if (algo->embedave != 0 || algo->alignment == 1)
    {
        /* build the initial mean by embedding a distance matrix */
        printf("    Calculating distance matrix for embedding average ... \n");
        fflush(NULL);

        CdsCopyAll(avecds, cds[0]);
        DistMatsAlloc(scratchA);

        if (algo->alignment == 1)
            CalcMLDistMatOcc(scratchA);
        else
            CalcMLDistMat(scratchA);

        printf("    Embedding average structure (ML) ... \n");
        fflush(NULL);

        EmbedAveCds(scratchA);

        /* renumber the average structure's residues 1..vlen */
        for (i = 0; i < vlen; ++i)
            avecds->resSeq[i] = i+1;

        // DistMatsDestroy(scratchA); // DLT debug FIX

        printf("    Finished embedding \n");
        fflush(NULL);

        if (algo->write_file == 1)
        {
            char *embed_ave_name = mystrcat(algo->rootname, "_embed_ave.pdb");
            WriteAveCdsFile(scratchA, embed_ave_name);
            free(embed_ave_name);
        }
    }
    else
    {
        /* otherwise use a randomly chosen member of baseA as the initial mean */
        //slxn = (int) (genrand_real2() * cnum);
        slxn = gsl_rng_uniform_int(r2, cnum);
        CdsCopyAll(avecds, baseA->cds[slxn]);
    }

    /* center the initial mean unless translations are disabled */
    if (algo->notrans == 0)
    {
        CenMass(avecds);
        ApplyCenterIp(avecds);
    }

    if (algo->seed == 1)
    {
        CalcStats(scratchA);
        /* NOTE(review): this value is overwritten by `round = 0` just before the
           outer loop, while `round -= 10` after the loop is still applied when
           seed == 1 -- confirm which of the two is intended. */
        round = 10;
    }

    if (algo->bfact > 0)
    {
        for (i = 0; i < cnum; ++i)
            Bfacts2PrVars(scratchA, i);
    }

    /* called unconditionally; the alignment-only guard is disabled */
    //if (algo->alignment == 1)
        CalcDf(scratchA);

    if (algo->scale > 0)
    {
		/* initial per-structure scale factors: 1, 1/2, 1/3, ... */
		for (i = 0; i < cnum; ++i)
		{
			scratchA->cds[i]->scale = 1.0 / (i+1.0);
		}
    }

    /* The EM algorithm */
    /* The outer loop:
       (1) First calculates the translations
       (2) Does inner loop -- calc rotations and average till convergence
       (3) Holding the superposition constant, calculates the covariance
           matrices and corresponding weight matrices, looping till 
           convergence when using a dimensional/axial covariance matrix */
    round = 0;
    percent = lastpercent = 0.0;
    logL = lastlogL = lastscale = -DBL_MAX;
    while(1)
    {
        /* a null run skips the iterations entirely and goes straight to the statistics */
        if (algo->nullrun == 1)
            break;

        lastlogL = logL;
        ++round;
        baseA->algo->rounds = algo->rounds = round;

        if (algo->verbose == 1)
        {
            printf("\n\n\nNew Outer Round:%3d ////////////////////////////////////////////////////////////",
                   round);
            fflush(NULL);
        }

        /* Calculate the minimum variance empirically -- this is really just inherent floating point error */
        /* only done once (round 2), and only when algo->constant is still negative */
        if (round == 2 && algo->constant < 0.0)
        {
            SuperPoseArray2Orig(scratchA, baseA, &sumdev);
            algo->constant = sumdev * sumdev;
        }

        /* Find weighted center and translate all cds */
        CalcTranslationsIp(scratchA, algo);
        //CalcTranslationsOp(scratchA, baseA, algo);

        for (i = 0; i < cnum; ++i)
            ApplyCenterIp(cds[i]);
            //ApplyCenterOp(cds[i], (const Cds *) baseA->cds[i]);

        /* save the translation vector for each coord in the array */
        for (i = 0; i < cnum; ++i)
            memcpy(cds[i]->translation, cds[i]->center, 3 * sizeof(double));

        /* when superimposing to an alignment, initially iterate unweighted LS for a few rounds
           (currently disabled) */
        //if (algo->alignment == 1 && round < 10) /* DLT debug -- I changed this just to find the LS answer first */
        //    memsetd(scratchA->w, 1.0, vlen);

        /* Inner loop:
           (1) Calc rotations given weights/weight matrices
           (2) Rotate cds with new rotations
           (3) Recalculate average

           Loops till convergence, holding constant the weights, variances, and covariances
           (and thus the translations too) */
        innerround = 0;
        do
        {
            ++innerround;
            /* NOTE(review): adds the running inner index (1, 2, 3, ...), not 1, so
               innerrounds is not a simple iteration count -- confirm intent */
            algo->innerrounds += innerround;

            if (algo->verbose == 1)
            {
                printf("\n    New Inner Round:%d \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", innerround);
                fflush(NULL);
            }

            /* save the old rotation matrices to test convergence at bottom of loop */
            for (i = 0; i < cnum; ++i)
                MatCpySym(cds[i]->last_matrix, (const double **) cds[i]->matrix, 3);

            /* find the optimal rotation matrices */
            if (algo->alignment == 1 /* && (round == 1 || cnum == 2) */)
                deviation_sum = CalcRotationsOcc(scratchA);
            else
                deviation_sum = CalcRotations(scratchA);

            /* verbose diagnostic: mean of FrobDiffNormIdentMat() over the new rotation matrices */
            if (algo->verbose == 1 && innerround == 1)
            {
                frobnorm = 0.0;
                for (i = 0; i < cnum; ++i)
                    /* frobnorm += MatFrobNorm((const double **) cds[i]->last_matrix, (const double **) cds[i]->matrix, 3, 3); */
                    frobnorm += FrobDiffNormIdentMat((const double **) cds[i]->matrix, 3);
                frobnorm /= cnum;

                printf("-----<<<<< %3d Frobenius Norm (Outer): % 8.3e //////////////////////////////\n",
                       round, frobnorm);
                fflush(NULL);
            }

            /* outer convergence is tested only on the first inner pass, before the
               new rotations are applied; on success both loops are left at once */
            if (innerround == 1 &&
                CheckConvergenceOuter(scratchA, round, algo->precision) == 1)
                   goto outsidetheloops;

            /* progress estimate on a log scale; the code that printed it is disabled
               in this function */
            if (stats->precision > 0.0)
                percent = 100.0 * log(fabs(stats->precision))/log(algo->precision);
            else
                percent = 0.0;

            /* rotate the scratch cds with new rotation matrix */
            for (i = 0; i < cnum; ++i)
            {
                RotateCdsIp(cds[i], (const double **) cds[i]->matrix);
            }

            if (algo->scale > 0)
            {
                lastscale = cds[0]->scale;

                double scaleprod;
    
                /* algo->scale selects the scale-factor estimator; any other positive
                   value leaves the scales alone and reports 1.0 */
                if (algo->scale == 1)
                    scaleprod = CalcScaleFactorsML(scratchA);
                else if (algo->scale == 2)
                    scaleprod = CalcScaleFactors(scratchA);
                else if (algo->scale == 3)
                    scaleprod = CalcScaleFactorsMLLogNorm(scratchA);
                else
                    scaleprod = 1.0;
    
                printf("\n%5d scaleprod = %12.6f\n", round, scaleprod);
            }

            /* find global rmsd and average cds (both held in structure) */
            if (algo->noave == 0)
            {
                if (algo->alignment == 1)
                {
                    AveCdsOcc(scratchA);
                    EM_MissingCds(scratchA);
                }
                else
                {
                    AveCds(scratchA);
                    /* A hierarchical-mean alternative (HierAveCds, see pdbUtils.c)
                       used to be iterated here; it is disabled. */
                }
            }

            if (algo->mbias == 1)
                UnbiasMean(scratchA);

            /* per-iteration value; recomputed with a different denominator after the loops */
            stats->wRMSD_from_mean = sqrt(deviation_sum / (3 * vlen * cnum));

            if (algo->verbose == 1)
            {
                frobnorm = 0.0;
                for (i = 0; i < cnum; ++i)
                    frobnorm += FrobDiffNormIdentMat((const double **) cds[i]->matrix, 3);
                frobnorm /= cnum;
                printf("    ----->>>>> %3d Frobenius Norm (Inner %d): % e\n", round, innerround, frobnorm);
                printf("    End Inner Round:%d \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", innerround);
                fflush(NULL);
            }

            /* leave after one pass when the inner loop is disabled, or bail out
               (printing a ',') once more than 160 inner passes have run */
            if (algo->noinnerloop == 1)
                break;
            else if (innerround > 160)
            {
                putchar(',');
                fflush(NULL);
                break;
            }
        }
        //while(CheckConvergenceInner(scratchA, algo->precision) == 0 &&
          //    fabs(cds[0]->scale - lastscale) > cds[0]->scale * algo->precision);
        while(CheckConvergenceInner(scratchA, algo->precision) == 0);

        /* during the first nine outer rounds, refresh stats->lsvar from the
           current stats->stddev after recomputing the variance */
        if (round < 10)
        {
            if (algo->alignment == 1)
                VarianceCdsOcc(scratchA);
            else
                VarianceCds(scratchA);

            stats->lsvar = stats->stddev * stats->stddev;
        }

        if (algo->instfile == 1)
            WriteInstModelFile("_inst.pdb", scratchA);

        /* Weighting by dimensional, axial Xi covariance matrix, here diagonal. */
        /* Holding the superposition constant, calculates the covariance
           matrices and corresponding weight matrices, looping till 
           convergence. */
        CalcCovariances(scratchA);
        
        //if (algo->scale > 0)
          //  ConstrainCovMat(scratchA);

        if (scratchA->algo->lele5 == 1)
        {
            /* Correct Lele's 5-landmark testset covariance matrix (only two off-diags are non-zero) */
            /* every off-diagonal entry of the 5x5 block is zeroed except
               [1][3] and [3][1] */
            scratchA->CovMat[0][1] = 0.0;
            scratchA->CovMat[0][2] = 0.0;
            scratchA->CovMat[0][3] = 0.0;
            scratchA->CovMat[0][4] = 0.0;

            scratchA->CovMat[1][0] = 0.0;
            scratchA->CovMat[1][2] = 0.0;
            scratchA->CovMat[1][4] = 0.0;

            scratchA->CovMat[2][0] = 0.0;
            scratchA->CovMat[2][1] = 0.0;
            scratchA->CovMat[2][3] = 0.0;
            scratchA->CovMat[2][4] = 0.0;

            scratchA->CovMat[3][0] = 0.0;
            scratchA->CovMat[3][2] = 0.0;
            scratchA->CovMat[3][4] = 0.0;

            scratchA->CovMat[4][0] = 0.0;
            scratchA->CovMat[4][1] = 0.0;
            scratchA->CovMat[4][2] = 0.0;
            scratchA->CovMat[4][3] = 0.0;
        }

        /* fall back to plain least squares for the rest of the run when
           CheckZeroVariances() returns 1 */
        if (CheckZeroVariances(scratchA) == 1)
        {
            algo->varweight = 0;
            algo->covweight = 0;
            algo->leastsquares = 1;
        }

        /* calculate the weights/weight matrices */
        /* and first the hierarchical adjustment */
        CalcWts(scratchA);

        /* optional per-round log-likelihood trace (value and change since last round) */
        if (algo->printlogL == 1)
        {
            if (algo->leastsquares == 1)
                CalcNormResidualsLS(scratchA);
            else
                CalcNormResiduals(scratchA);
            logL = CalcLogL(scratchA);
            printf("----> %4d logL: % e  % e <----\n", round, logL, logL - lastlogL);
        }

        if (algo->verbose == 1)
        {
            printf("END Outer Round:%3d ////////////////////////////////////////////////////////////\n\n",
                   round);
            fflush(NULL);
        }
    }

    /* reached via the goto on outer convergence, or by the nullrun break */
    outsidetheloops:

    if (algo->seed == 1)
        round -= 10;

    if (algo->bayes > 0)
    {
        /* header is included at block scope, only where it is needed */
        #include "GibbsMet.h"
        printf("    Calculating Gibbs-Metropolis Bayesian superposition ... \n");
        fflush(NULL);
        GibbsMet(scratchA);
    }

#if defined(__APPLE__)
    endtime = seconds();
    innerloop = (double) (endtime - starttime) / 0.001;
    starttime = seconds();
#endif

    printf("    Calculating statistics ... \n");
    fflush(NULL);

    if (algo->instfile == 1)
        WriteInstModelFile("_inst_final.pdb", scratchA);

    /* with covariance weighting, dump the covariance matrix and an instance
       model file when file output or info is requested */
    if (algo->covweight == 1 && (algo->write_file > 0 || algo->info != 0))
    {
        /* evals receives the eigenvalues but is not read afterwards in this block */
        double         *evals = malloc(vlen * sizeof(double));
        double        **evecs = scratchA->tmpmatKK2;
        char           *mp_cov_name = NULL;

        eigenvalsym((const double **) scratchA->CovMat, evals, evecs, vlen);
        /* VecPrint(evals, vlen); */
        mp_cov_name = mystrcat(algo->rootname, "_mp_cov.mat");
        PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, mp_cov_name);
        free(mp_cov_name);
        CalcPRMSD(scratchA);
        WriteInstModelFile("_mp.pdb", scratchA);
        free(evals);
    }

    { /* Write out a taxa distance matrix in NEXUS format */
        #include "DistMat.h"

        DISTMAT *distmat;
        double sum;
        int j,k;
        char num[32];
        char *ptr = NULL;
        char *tree_name = NULL;

        distmat = DISTMATalloc(cnum);

        /* taxon label = file name with its last extension removed, plus "_<index>" */
        /* NOTE(review): strcpy/strcat are unbounded and the capacity of taxa[i]
           is not visible here -- confirm it can hold filename plus suffix */
        for (i = 0; i < cnum; ++i)
        {
            strcpy(distmat->taxa[i], cds[i]->filename);
            ptr = strrchr(distmat->taxa[i], '.');
            if (ptr != NULL)
                *ptr = '\0';
            sprintf(num, "_%d", i);
            strcat(distmat->taxa[i], num);
        }

        /* pairwise distance: square root of the summed per-position
           SqrCdsDistMahal2() values, each given the position weight w[k] */
        for (i = 0; i < cnum; ++i)
        {
            for (j = 0; j < cnum; ++j)
            {
                sum = 0.0;
                for (k = 0; k < vlen; ++k)
                    sum += SqrCdsDistMahal2((const Cds *) cds[i], k,
                                               (const Cds *) cds[j], k,
                                               (const double) scratchA->w[k]);

                distmat->dist[i][j] = sqrt(sum);
            }
        }

        tree_name = mystrcat(algo->rootname, "_tree.nxs");
        print_NX_distmat(distmat, tree_name);

        if (tree_name != NULL)
            free(tree_name);

        DISTMATdestroy(&distmat);
    }

    CalcStats(scratchA);
    stats->fperr = SuperPoseArray2Orig(scratchA, baseA, &stats->minvar);

    if (algo->ssm == 1)
    {
        printf("    Calculating SSM ... \n");
        fflush(NULL);

        #include "pdbSSM.h"
        SSM *ssm = SSMInit();
        SSMAlloc(ssm, scratchA);
        //for (i=0; i < 2; ++i)
        SSMCalc(ssm, scratchA);

        printf("    Writing SSM ... \n");
        fflush(NULL);

        WriteSSM(ssm);
        SSMDestroy(&ssm);
    }

    if (baseA->anchorf_name != NULL) /* orient entire family to a user-specified structure */
        SuperPose2Anchor(scratchA, baseA, baseA->anchorf_name);
    else if (algo->princaxes == 1) /* orient the family perpendicular to principal axes of the average cds -- */
        RotPrincAxes(scratchA);    /* makes for nice viewing */

    if (algo->write_file == 1)
    {
        char *transf_name = mystrcat(algo->rootname, "_transf2.txt");
        WriteTransformations(scratchA, transf_name);
        free(transf_name);
    }

    if (algo->olve == 1 && algo->write_file == 1)
    {
        PDBCdsArray *olveA;
        printf("    Writing Olve's file ... \n");
        fflush(NULL);

        olveA = PDBCdsArrayInit();
        PDBCdsArrayAlloc(olveA, cnum, vlen);

        for (i = 0; i < cnum; ++i)
            CopyCds2PDB(olveA->cds[i], cds[i]);

        char *olve_name = mystrcat(algo->rootname, "_olve.pdb");
        WriteOlveModelFile(olveA, algo, stats, olve_name);
        free(olve_name);
        PDBCdsArrayDestroy(&olveA);
    }

    /* hand the results back to the caller's array */
    CopyStats(baseA, scratchA);

    /* wRMSD_from_mean does not need 2 in denominator, since it is already from the average */
    /* NOTE(review): this store goes to the scratch stats after CopyStats() and
       shortly before scratchA is destroyed -- confirm whether baseA was meant
       to receive this value. */
    stats->wRMSD_from_mean = sqrt(deviation_sum / (double) (vlen * cnum));

#if defined(__APPLE__)
    /* reported before scratchA is destroyed because algo points into it */
    endtime = seconds();
    exitloop = (double) (endtime - starttime) / 0.001;
    if (algo->verbose == 1)
    {
        printf("    init    setup  inner loop  exit loop \n");
        printf(" %7.2f  %7.2f     %7.2f    %7.2f (ms) \n", init, setup, innerloop, exitloop);
        fflush(NULL);
    }
#endif

    CdsArrayDestroy(&scratchA);

    gsl_rng_free(r2);
    r2 = NULL;

    return(round);
}


/*
 * MultiPose_pth -- pthread-assisted variant of MultiPose().
 *
 * Runs the same outer/inner iteration on a scratch copy of baseA, but in the
 * non-alignment case the rotations and the average are computed through
 * CalcRotations_pth() and AveCds_pth(), which are handed thrdnum
 * (= baseA->algo->threads) per-thread work slots and a shared thread
 * attribute object.
 *
 * Visible effects on baseA: baseA->scratchA is pointed at the scratch array,
 * baseA->algo->rounds is updated every outer round, SetupCovWeighting(baseA)
 * is called, and the final statistics are copied in with CopyStats().
 *
 * Returns the outer round counter (with 10 subtracted when algo->seed == 1).
 *
 * NOTE(review): unlike MultiPose(), this variant never calls RotateCdsIp()
 * itself; presumably CalcRotations_pth() applies the rotations, but the
 * alignment path (CalcRotationsOcc) is shared with MultiPose(), which does
 * rotate afterwards -- confirm.
 * NOTE(review): baseA->scratchA is not cleared after the scratch array is
 * destroyed; whether it is left dangling should be confirmed.
 */
int
MultiPose_pth(CdsArray *baseA)
{
    int             i, round, innerround;
    int             slxn; /* index of random coord to select as first */
    double          frobnorm, sumdev, percent, lastpercent;
    double          deviation_sum = 0.0; /* last value returned by CalcRotations*() */
    const int       cnum = baseA->cnum;
    const int       vlen = baseA->vlen;
    Algorithm      *algo = NULL;
    Statistics     *stats = NULL;
    Cds        **cds = NULL;
    Cds         *avecds = NULL;
    CdsArray    *scratchA = NULL;

    /* per-thread work slots and thread handles, one of each per worker */
    const int       thrdnum = baseA->algo->threads;
    RotData       **rotdata = malloc(thrdnum * sizeof(RotData *));
    AveData       **avedata = malloc(thrdnum * sizeof(AveData *));
    pthread_t      *callThd = malloc(thrdnum * sizeof(pthread_t));
    pthread_attr_t  attr;

#if defined(__APPLE__)
    /* coarse phase timers, reported at the end when algo->verbose == 1 */
    double          starttime, endtime;
    double          init, setup = 0.0, innerloop, exitloop;

    starttime = seconds();
#endif

    /* GSL generator used only to pick the initial mean structure below */
    gsl_rng               *r2 = NULL;
    const gsl_rng_type    *T = NULL;

    T = gsl_rng_ranlxs2;
    r2 = gsl_rng_alloc(T);

    for (i = 0; i < thrdnum; ++i)
    {
        rotdata[i] = malloc(sizeof(RotData));
        avedata[i] = malloc(sizeof(AveData));
    }

    /* worker threads are created joinable and with system contention scope */
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);

    /* setup scratchA */
    scratchA = CdsArrayInit();
    CdsArrayAlloc(scratchA, cnum, vlen);
    CdsArraySetup(scratchA);

    baseA->scratchA = scratchA;

    /* duplicate baseA -- copy to scratchA */
    CdsArrayCopy(scratchA, baseA);

    /* setup local aliases based on scratchA; these point into scratchA and
       must not be used once scratchA has been destroyed */
    algo = scratchA->algo;
    stats = scratchA->stats;
    cds = scratchA->cds;
    avecds = scratchA->avecds;

    SetupCovWeighting(scratchA); /* DLT debug */
    SetupCovWeighting(baseA); /* DLT debug */

    stats->hierarch_p1 = 1.0;
    stats->hierarch_p2 = 1.0;

#if defined(__APPLE__)
    endtime = seconds();
    init = (double) (endtime - starttime) / 0.001;
    starttime = seconds();
#endif

    /* Initialize the algorithm -- determine a structure to use as the
       initial mean structure */
    if (algo->embedave != 0 || algo->alignment == 1) // DLTIP
    {
        /* build the initial mean by embedding a distance matrix */
        printf("    Calculating distance matrix for embedding average ... \n");
        fflush(NULL);

        CdsCopyAll(avecds, cds[0]);
        DistMatsAlloc(scratchA);

        if (algo->alignment == 1)
            CalcMLDistMatOcc(scratchA);
        else
            CalcMLDistMat(scratchA);

        printf("    Embedding average structure (ML) ... \n");
        fflush(NULL);

        EmbedAveCds(scratchA);

        /* renumber the average structure's residues 1..vlen */
        for (i = 0; i < vlen; ++i)
            avecds->resSeq[i] = i+1;

        printf("    Finished embedding \n");
        fflush(NULL);

        if (algo->write_file == 1)
        {
            char *embed_ave_name = mystrcat(algo->rootname, "_embed_ave.pdb");
            WriteAveCdsFile(scratchA, embed_ave_name);
            free(embed_ave_name);
        }
    }
    else
    {
        /* otherwise use a randomly chosen member of baseA as the initial mean */
        slxn = gsl_rng_uniform_int(r2, cnum);
        CdsCopyAll(avecds, baseA->cds[slxn]);
    }

    /* center the initial mean unless translations are disabled */
    if (algo->notrans == 0)
    {
        CenMass(avecds);
        ApplyCenterIp(avecds);
    }

    if (algo->seed == 1)
    {
        CalcStats(scratchA);
        /* NOTE(review): this value is overwritten by `round = 0` just before the
           outer loop, while `round -= 10` after the loop is still applied when
           seed == 1 -- confirm which of the two is intended. */
        round = 10;
    }

    /* The outer loop:
       (1) First calculates the translations
       (2) Does inner loop -- calc rotations and average till convergence
       (3) Holding the superposition constant, calculates the covariance
           matrices and corresponding weight matrices, looping till
           convergence when using a dimensional/axial covariance matrix
    */
    percent = lastpercent = 0.0;
    round = 0;
    while(1)
    {
        /* a null run skips the iterations entirely and goes straight to the statistics */
        if (algo->nullrun == 1)
            break;

        ++round;
        baseA->algo->rounds = algo->rounds = round;

        if (algo->verbose == 1)
        {
            printf("\n\n\nNew Outer Round:%3d ////////////////////////////////////////////////////////////",
                   round);
            fflush(NULL);
        }

        /* Calculate the minimum variance empirically; only done once (round 2),
           and only when algo->constant is still negative */
        if (round == 2 && algo->constant < 0.0)
        {
            SuperPoseArray2Orig(scratchA, baseA, &sumdev);
            algo->constant = sumdev * sumdev;
        }

        /* Find weighted center and translate all cds */
        CalcTranslationsIp(scratchA, algo);
        for (i = 0; i < cnum; ++i)
            ApplyCenterIp(cds[i]);

        /* save the translation vector for each coord in the array */
        for (i = 0; i < cnum; ++i)
            memcpy(cds[i]->translation, cds[i]->center, 3 * sizeof(double));

        /* Inner loop:
           (1) Calc rotations given weights/weight matrices
           (2) Rotate cds with new rotations
           (3) Recalculate average

           Loops till convergence, holding constant the weights, variances, and covariances
           (and thus the translations too) */
        innerround = 0;
        do
        {
            ++innerround;
            /* NOTE(review): adds the running inner index (1, 2, 3, ...), not 1, so
               innerrounds is not a simple iteration count -- confirm intent */
            algo->innerrounds += innerround;

            if (algo->verbose == 1)
            {
                printf("\n    New Inner Round:%d \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", innerround);
                fflush(NULL);
            }

            /* save the old rotation matrices to test convergence at bottom of loop */
            for (i = 0; i < cnum; ++i)
                MatCpySym(cds[i]->last_matrix, (const double **) cds[i]->matrix, 3);

            /* find the optimal rotation matrices; only the non-alignment case is threaded */
            if (algo->alignment == 1 /* && (round == 1 || cnum == 2) */)
                deviation_sum = CalcRotationsOcc(scratchA);
            else
                deviation_sum = CalcRotations_pth(scratchA, rotdata, callThd, &attr, thrdnum);

            /* verbose diagnostic: mean of FrobDiffNormIdentMat() over the new rotation matrices */
            if (algo->verbose == 1 && innerround == 1)
            {
                frobnorm = 0.0;
                for (i = 0; i < cnum; ++i)
                    frobnorm += FrobDiffNormIdentMat((const double **) cds[i]->matrix, 3);
                frobnorm /= cnum;

                printf("-----<<<<< %3d Frobenius Norm (Outer): % 8.3e //////////////////////////////\n",
                       round, frobnorm);
                fflush(NULL);
            }

            /* outer convergence is tested only on the first inner pass; on
               success both loops are left at once */
            if (innerround == 1 &&
                CheckConvergenceOuter(scratchA, round, algo->precision) == 1)
                   goto outsidetheloops;

            /* progress estimate on a log scale */
            if (stats->precision > 0.0)
                percent = 100.0 * log(fabs(stats->precision))/log(algo->precision);
            else
                percent = 0.0;

            /* print the estimate only when it has increased */
            if (percent > lastpercent)
            {
                lastpercent = percent;
                printf("    %5.1f%%\n", percent);
                printf("\033[<1>A"); /* moves the cursor up one line */
                fflush(NULL);
            }

            /* find global rmsd and average cds (both held in structure) */
            if (algo->noave == 0)
            {
                if (algo->alignment == 1)
                {
                    AveCdsOcc(scratchA);
                    EM_MissingCds(scratchA);
                }
                else
                {
                    AveCds_pth(scratchA, avedata, callThd, &attr, thrdnum);
                }
            }

            if (algo->mbias == 1)
                UnbiasMean(scratchA);

            /* per-iteration value; recomputed with a different denominator after the loops */
            stats->wRMSD_from_mean = sqrt(deviation_sum / (3 * vlen * cnum));

            if (algo->verbose == 1)
            {
                frobnorm = 0.0;
                for (i = 0; i < cnum; ++i)
                    frobnorm += FrobDiffNormIdentMat((const double **) cds[i]->matrix, 3);
                frobnorm /= cnum;
                printf("    ----->>>>> %3d Frobenius Norm (Inner %d): % e\n", round, innerround, frobnorm);
                printf("    End Inner Round:%d \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\n", innerround);
                fflush(NULL);
            }

            /* leave after one pass when the inner loop is disabled, or bail out
               (printing a ',') once more than 160 inner passes have run */
            if (algo->noinnerloop == 1)
                break;
            else if (innerround > 160)
            {
                putchar(',');
                fflush(NULL);
                break;
            }
        }
        while(CheckConvergenceInner(scratchA, algo->precision) == 0);

        /* Weighting by dimensional, axial Xi covariance matrix, here diagonal. */
        /* Holding the superposition constant, calculates the covariance
           matrices and corresponding weight matrices, looping till
           convergence. */
        CalcCovariances(scratchA);

        /* calculate the weights/weight matrices */
        CalcWts(scratchA);
    }

    /* reached via the goto on outer convergence, or by the nullrun break */
    outsidetheloops:

    if (algo->seed == 1)
        round -= 10;

#if defined(__APPLE__)
    endtime = seconds();
    innerloop = (double) (endtime - starttime) / 0.001;
    starttime = seconds();
#endif

    printf("    Calculating statistics ... \n");
    fflush(NULL);

    CalcStats(scratchA);
    stats->fperr = SuperPoseArray2Orig(scratchA, baseA, &stats->minvar);

    /* orient the family to a user-specified anchor structure if one was given;
       otherwise, optionally, perpendicular to the principal axes of the
       average cds -- makes for nice viewing */
    if (baseA->anchorf_name != NULL)
        SuperPose2Anchor(scratchA, baseA, baseA->anchorf_name);
    else if (algo->princaxes == 1)
        RotPrincAxes(scratchA);

    if (algo->write_file == 1)
    {
        char *transf_name = mystrcat(algo->rootname, "_transf.txt");
        WriteTransformations(scratchA, transf_name);
        free(transf_name);
    }

    if (algo->olve == 1 && algo->write_file == 1)
    {
        PDBCdsArray *olveA;
        printf("    Writing Olve's file ... \n");
        fflush(NULL);

        olveA = PDBCdsArrayInit();
        PDBCdsArrayAlloc(olveA, cnum, vlen);

        for (i = 0; i < cnum; ++i)
            CopyCds2PDB(olveA->cds[i], cds[i]);

        char *olve_name = mystrcat(algo->rootname, "_olve.pdb");
        WriteOlveModelFile(olveA, algo, stats, olve_name);
        free(olve_name);
        PDBCdsArrayDestroy(&olveA);
    }

    /* hand the results back to the caller's array */
    CopyStats(baseA, scratchA);

    /* wRMSD_from_mean does not need 2 in denominator, since it is already from the average */
    /* NOTE(review): this store goes to the scratch stats after CopyStats() and
       shortly before scratchA is destroyed -- confirm whether baseA was meant
       to receive this value. */
    stats->wRMSD_from_mean = sqrt(deviation_sum / (double) (vlen * cnum));

    /* write the covariance matrix, then convert it in place and write the
       correlation matrix */
    if (algo->write_file == 1)
    {
        char *cov_name = mystrcat(algo->rootname, "_cov.mat");
        char *cor_name = mystrcat(algo->rootname, "_cor.mat");
        CalcCovMat(scratchA);
        PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, cov_name);
        CovMat2CorMat(scratchA->CovMat, vlen);
        PrintCovMatGnuPlot((const double **) scratchA->CovMat, vlen, cor_name);
        free(cov_name);
        free(cor_name);
    }

#if defined(__APPLE__)
    /* The timing report must run before scratchA is destroyed: algo aliases
       scratchA->algo, so reading algo->verbose afterwards would touch memory
       owned by the destroyed array. The exit-loop time therefore excludes
       the teardown below, matching MultiPose(). */
    endtime = seconds();
    exitloop = (double) (endtime - starttime) / 0.001;
    if (algo->verbose == 1)
    {
        printf("    init    setup  inner loop  exit loop \n");
        printf(" %7.2f  %7.2f     %7.2f    %7.2f (ms) \n", init, setup, innerloop, exitloop);
        fflush(NULL);
    }
#endif

    /* algo, stats, cds and avecds are invalid from here on */
    CdsArrayDestroy(&scratchA);

    /* release the thread attribute object and the per-thread work slots */
    pthread_attr_destroy(&attr);
    for (i = 0; i < thrdnum; ++i)
        free(rotdata[i]);
    for (i = 0; i < thrdnum; ++i)
        free(avedata[i]);
    free(rotdata);
    free(avedata);
    free(callThd);

    gsl_rng_free(r2);
    r2 = NULL;

    return(round);
}


/* Re-orients the whole family using the principal axes of the average
   coordinates: the average's matrix is filled in by CalcCdsPrincAxes(),
   multiplied in place by a fixed axis permutation, and the result is then
   multiplied in place into every member's matrix. */
void
RotPrincAxes(CdsArray *cdsA)
{
    double        **axisperm = MatAlloc(3, 3);
    double        **avemat = NULL;
    int             k;

    /* this orients the least -> most variable axes along x, y, z respectively */
    CalcCdsPrincAxes(cdsA->avecds, cdsA->avecds->matrix);
    avemat = cdsA->avecds->matrix;

    /* Build the combined "90deg about x, then 90deg about z" matrix:
           { 0, 1, 0 }
           { 0, 0, 1 }
           { 1, 0, 0 }
       The memset relies on the 3x3 block being one contiguous run of
       doubles starting at axisperm[0]. */
    memset(&axisperm[0][0], 0, 9 * sizeof(double));
    axisperm[0][1] = 1.0;
    axisperm[1][2] = 1.0;
    axisperm[2][0] = 1.0;

    /* After this the most variable axis is horizontal, the second most
       variable axis is vertical, and the least variable one points
       in/out of the screen. */
    Mat3MultIp(avemat, (const double **) axisperm);

    /* Apply the re-oriented average frame to each structure in the family. */
    for (k = 0; k < cdsA->cnum; ++k)
        Mat3MultIp(cdsA->cds[k]->matrix, (const double **) avemat);

    MatDestroy(&axisperm);
}


/* Calculates weights corresponding to the atomic, row-wise covariance matrix only.

   Three modes, selected by flags in cdsA->algo and tested in this order:
     - leastsquares: every weight is set to 1.0 and nothing else is touched;
     - varweight:    per-atom weights are the reciprocals of the variances;
     - covweight:    cdsA->WtMat is computed from cdsA->CovMat by InvSymEigenOp().
   If none of the flags is set the function does nothing. */
void
CalcWts(CdsArray *cdsA)
{
    Algorithm      *algo = cdsA->algo;
    double         *variance = cdsA->var;
    double         *weight = cdsA->w;
    const int       vlen = cdsA->vlen;
    int             row, col;

    /* Unweighted least squares: uniform weights, no normalization. */
    if (algo->leastsquares != 0)
    {
        for (row = 0; row < vlen; ++row)
            weight[row] = 1.0;

        return;
    }

    if (algo->varweight != 0)
    {
        /* minimum variance boundary condition */
        for (row = 0; row < vlen; ++row)
        {
            if (variance[row] < algo->constant)
                variance[row] = algo->constant;
        }

        HierarchVars(cdsA);

        /* Atoms whose variance is zero or at/above DBL_MAX get no weight;
           all others are weighted by the reciprocal of their variance. */
        for (row = 0; row < vlen; ++row)
        {
            const double    v = variance[row];

            weight[row] = (v >= DBL_MAX || v == 0.0) ? 0.0 : 1.0 / v;
        }

        cdsA->stats->wtnorm = NormalizeWeights(weight, vlen);

        return;
    }

    if (algo->covweight == 0)
        return;

    /* During the first rounds (rounds < 5) only the diagonal of the
       covariance matrix is kept: clear every off-diagonal pair. */
    if (algo->rounds < 5)
    {
        for (row = 0; row < vlen; ++row)
        {
            for (col = 0; col < row; ++col)
            {
                cdsA->CovMat[col][row] = 0.0;
                cdsA->CovMat[row][col] = 0.0;
            }
        }
    }

    if (algo->hierarch != 0 && algo->rounds > 2)
        HierarchVars(cdsA);

    /* minimum variance boundary condition */
    for (row = 0; row < vlen; ++row)
    {
        if (cdsA->CovMat[row][row] < algo->constant)
            cdsA->CovMat[row][row] = algo->constant;
    }

    InvSymEigenOp(cdsA->WtMat, (const double **) cdsA->CovMat, vlen,
                  cdsA->tmpvecK, cdsA->tmpmatKK1, DBL_MIN);

    cdsA->stats->wtnorm = NormalizeCovMat(cdsA->WtMat, vlen);

    /* Note: despite the field name, this accumulates every element of
       WtMat, not only its diagonal. */
    cdsA->stats->trace_inv_sigma = 0.0;
    for (row = 0; row < vlen; ++row)
    {
        for (col = 0; col < vlen; ++col)
            cdsA->stats->trace_inv_sigma += cdsA->WtMat[row][col];
    }
}


/* Recomputes the per-atom weights in cdsA->w as the reciprocals of the
   variances in cdsA->var, then passes them to NormalizeWeights().

   A variance at or above DBL_MAX yields a weight of 0.0.  A variance of
   exactly 0.0 also yields a weight of 0.0, matching the rule used in
   CalcWts(); dividing by it would instead give an infinite weight and feed
   a non-finite value into NormalizeWeights(). */
void
CalcWtsFinal(CdsArray *cdsA)
{
    int             i;
    double         *weight = cdsA->w;
    const double   *variance = cdsA->var;
    const int       vlen = cdsA->vlen;

    for (i = 0; i < vlen; ++i)
    {
        if (variance[i] >= DBL_MAX || variance[i] == 0.0)
            weight[i] = 0.0;
        else
            weight[i] = 1.0 / variance[i];
    }

    NormalizeWeights(weight, vlen);
}


/* Superimposes one coordinate set on another and reports the transformation.

   Both coordinate sets are temporarily shifted by the vectors computed by
   CenMassOccVec() (presumably their occupancy-weighted centers -- confirm
   against that function), handed to ProcGSLSVDvanOcc() to obtain the
   rotation, and then shifted back, so cds1 and cds2 are restored on return
   (up to floating-point round-off).

   Parameters:
     cds1, cds2            the two coordinate sets; modified and restored
     rotmat                3x3 matrix, filled in by ProcGSLSVDvanOcc()
     trans                 3-vector output: InvRotVec(cen2, rotmat) - cen1
     norm1, norm2, innprod outputs forwarded to ProcGSLSVDvanOcc()

   Returns the value produced by ProcGSLSVDvanOcc() (sumdev).  A value
   greater than 1 triggers a diagnostic on stdout but is not fatal.

   The fixed-size 3-vectors live on the stack: they never outlive this call,
   and this avoids dereferencing unchecked malloc()/calloc() results. */
double
SuperPose(Cds *cds1, Cds *cds2, double **rotmat, double *trans,
          double *norm1, double *norm2, double *innprod)
{
    const int       vlen = cds1->vlen;
    double        **tmpmat1 = MatAlloc(3, 3);
    double        **tmpmat2 = MatAlloc(3, 3);
    double        **tmpmat3 = MatAlloc(3, 3);
    double          tmpvec[3];
    double          newtrans[3];
    double          cen1[3] = {0.0, 0.0, 0.0};
    double          cen2[3] = {0.0, 0.0, 0.0};
    double          sumdev;
    int             i;

    CenMassOccVec(cds1, cen1);
    CenMassOccVec(cds2, cen2);

    /* Move both sets so that their centers sit at the origin. */
    NegTransCdsIp(cds1, cen1);
    NegTransCdsIp(cds2, cen2);

    sumdev = ProcGSLSVDvanOcc(cds1, cds2, rotmat,
                              tmpmat1, tmpmat2, tmpmat3, tmpvec,
                              norm1, norm2, innprod);

    /* Diagnostic only: execution continues and sumdev is still returned. */
    if (sumdev > 1)
    {
        printf("  ERROR1111: -> sumdev: % 12.7e % 12.7e \n",
                0.5 * sumdev / vlen, sqrt(fabs(0.5 * sumdev / vlen)) );
        printf("  ERROR1111: Please report to dtheobald@brandeis.edu \n");
    }

    /* Undo the centering so the caller's coordinates are left as they were. */
    TransCdsIp(cds1, cen1);
    TransCdsIp(cds2, cen2);

    InvRotVec(newtrans, cen2, rotmat);

    for (i = 0; i < 3; ++i)
        trans[i] = newtrans[i] - cen1[i];

    MatDestroy(&tmpmat1);
    MatDestroy(&tmpmat2);
    MatDestroy(&tmpmat3);

    return(sumdev);
}
