/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Copyright 2010.  Los Alamos National Security, LLC. This material was    !
! produced under U.S. Government contract DE-AC52-06NA25396 for Los Alamos !
! National Laboratory (LANL), which is operated by Los Alamos National     !
! Security, LLC for the U.S. Department of Energy. The U.S. Government has !
! rights to use, reproduce, and distribute this software.  NEITHER THE     !
! GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,     !
! EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS         !
! SOFTWARE.  If software is modified to produce derivative works, such     !
! modified software should be clearly marked, so as not to confuse it      !
! with the version available from LANL.                                    !
!                                                                          !
! Additionally, this program is free software; you can redistribute it     !
! and/or modify it under the terms of the GNU General Public License as    !
! published by the Free Software Foundation; version 2.0 of the License.   !
! Accordingly, this program is distributed in the hope that it will be     !
! useful, but WITHOUT ANY WARRANTY; without even the implied warranty of   !
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General !
! Public License for more details.                                         !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/

#include "Matrix.h"

#ifdef BLAS

/*
 * Give the parameter struct and the worker routine a suffix that matches
 * REALSIZE (4 or 8), so the identifiers used below expand to
 * CGIterateParameterList4/CGIterateThreaded4 or
 * CGIterateParameterList8/CGIterateThreaded8.
 * NOTE(review): presumably this lets the single- and double-precision builds
 * of this file be linked into one binary without symbol clashes -- confirm
 * against the build system.  If REALSIZE is neither 4 nor 8 the names are
 * left unchanged.
 */
#if REALSIZE==4
  #undef CGIterateParameterList
  #define CGIterateParameterList CGIterateParameterList4
  #undef CGIterateThreaded
  #define CGIterateThreaded CGIterateThreaded4
#elif REALSIZE==8
  #undef CGIterateParameterList
  #define CGIterateParameterList CGIterateParameterList8
  #undef CGIterateThreaded
  #define CGIterateThreaded CGIterateThreaded8
#endif

/*
 * Argument/result record handed to one CGIterateThreaded worker.
 * The caller fills in every field except error2, which the worker writes.
 */
typedef struct {
  int index;        /* first vector this worker handles (its worker id)     */
  int M;            /* number of elements in each contiguous vector         */
  int N;            /* total number of vectors                              */
  int num_threads;  /* stride between the vectors handled by one worker     */
  REAL *r0;         /* updated in place: r0 += xalpha*tmpmat                */
  REAL *p0;         /* updated in place: p0 = r0 - xbeta*p0                 */
  REAL *tmpmat;     /* read only by the worker                              */
  REAL *bo;         /* updated in place: bo += xalpha*p0                    */
  REAL error2;      /* output: sum over this worker's vectors of the        */
                    /* squared entries of the updated r0                    */
} CGIterateParameterList;

/*
 * pthread entry point: applies the CG update to every num_threads-th vector,
 * starting at vector `index`.
 *
 * arg points to a CGIterateParameterList.  Vector j occupies elements
 * [j*M, j*M+M) of each array.  For every vector handled:
 *
 *   p0vec  = sum(p0 * tmpmat)          r0vec = sum(r0 * r0)
 *   xalpha = r0vec / p0vec             (0 when p0vec <= 0)
 *   bo    += xalpha * p0
 *   r0    += xalpha * tmpmat
 *   r1vec  = sum(r0 * r0)              (using the updated r0)
 *   xbeta  = r1vec / r0vec             (0 when r0vec <= 0)
 *   p0     = r0 - xbeta * p0
 *
 * On return data->error2 holds the sum of r1vec over the handled vectors.
 * Always returns NULL.
 */
void *CGIterateThreaded(void *arg) {
  REAL r0vec, p0vec, r1vec, xalpha, xbeta;
  REAL *p0, *tmpmat, *r0, *bo;
  int i, j, M, N, num_threads, index;
  /* 64-bit element offsets: j*M no longer wraps when the matrix holds more
     than INT_MAX elements. */
  long long base, offset;

  CGIterateParameterList *data=(CGIterateParameterList *)arg;

  index=data->index;
  num_threads=data->num_threads;
  M=data->M;
  N=data->N;
  p0=data->p0;
  tmpmat=data->tmpmat;
  r0=data->r0;
  bo=data->bo;

  data->error2=0.0;
  for (j=index; j<N; j+=num_threads) {
    r0vec = 0.0;
    p0vec = 0.0;
    r1vec = 0.0;

    base=(long long)j*M;

    /* Dot products needed for the step length. */
    offset=base;
    for (i=0; i<M; i++) {
      p0vec+=(p0[offset]*tmpmat[offset]);
      r0vec+=(r0[offset]*r0[offset]);
      offset++;
    }
    /* Guard against division by a zero or negative denominator. */
    if (p0vec>0.0) xalpha = r0vec/p0vec;
    else xalpha=0.0;

    /* Update bo and r0, accumulating the squared norm of the new r0 in the
       same pass (same element order as a separate pass would use). */
    offset=base;
    for (i=0; i<M; i++) {
      bo[offset]+=(xalpha*p0[offset]);
      r0[offset]+=(xalpha*tmpmat[offset]);
      r1vec+=(r0[offset]*r0[offset]);
      offset++;
    }

    (data->error2) += r1vec;
    if (r0vec>0.0) xbeta = r1vec/r0vec;
    else xbeta=0.0;

    /* New search direction for this vector. */
    offset=base;
    for (i=0; i<M; i++) {
      p0[offset]=r0[offset]-xbeta*p0[offset];
      offset++;
    }
  }
  return NULL;
}
#endif

/*
 * Runs one CG update step on bo, p0 and r0 (using tmpmat) and returns the
 * accumulated squared error reported by the backend.
 *
 * CUDA build: launches CGIterateKernel with a single block of NUM_THREADS
 *             threads on the .Device buffers and copies the scalar result
 *             back to the host.
 * BLAS build: splits the work over 8 pthreads running CGIterateThreaded on
 *             the .Local buffers and sums their per-thread error2 values.
 *
 * Returns 0.0 when neither CUDA nor BLAS is defined.
 */
REAL M_CGIterate(Matrix bo, Matrix p0, Matrix tmpmat, Matrix r0) {
  REAL error2=0.0;
  #ifdef CUDA
    /* NOTE(review): the CUDA runtime return codes below are not checked and
       the scratch scalar is allocated/freed on every call. */
    REAL *device_error2;
    cudaMalloc(&device_error2, sizeof(REAL));
    CGIterateKernel<<<1,NUM_THREADS>>>(bo.DM, bo.DN, p0.Device, tmpmat.Device, r0.Device, bo.Device, device_error2, NUM_THREADS);
    // Copy to local variable
    cudaThreadSynchronize();
    cudaMemcpy(&error2, device_error2, sizeof(REAL), cudaMemcpyDeviceToHost);
    cudaFree(device_error2);
  #endif
  #ifdef BLAS
    const int num_threads=8;
    pthread_t threads[num_threads];
    int started[num_threads];   /* non-zero when threads[i] must be joined */
    CGIterateParameterList cgiterate_parameter_list[num_threads];

    for (int i=0; i<num_threads; i++) {
      cgiterate_parameter_list[i].index=i;
      cgiterate_parameter_list[i].num_threads=num_threads;
      cgiterate_parameter_list[i].M=bo.M;
      cgiterate_parameter_list[i].N=bo.N;
      cgiterate_parameter_list[i].p0=(REAL *)p0.Local;
      cgiterate_parameter_list[i].tmpmat=(REAL *)tmpmat.Local;
      cgiterate_parameter_list[i].r0=(REAL *)r0.Local;
      cgiterate_parameter_list[i].bo=(REAL *)bo.Local;
      cgiterate_parameter_list[i].error2=0.0;
      started[i]=(pthread_create(&threads[i], NULL, CGIterateThreaded, &cgiterate_parameter_list[i])==0);
      if (!started[i]) {
        /* Thread creation failed: process this slice on the calling thread
           so no vectors are skipped.  Slices are disjoint, so this is safe
           while the other workers are running. */
        CGIterateThreaded(&cgiterate_parameter_list[i]);
      }
    }

    error2=0.0;
    for (int i=0; i<num_threads; i++) {
      /* Wait for the worker BEFORE reading its result; reading error2 first
         races with the worker still updating it. */
      if (started[i]) pthread_join(threads[i], NULL);
      error2+=cgiterate_parameter_list[i].error2;
    }
  #endif
  return error2;
}
