/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Copyright 2010.  Los Alamos National Security, LLC. This material was    !
! produced under U.S. Government contract DE-AC52-06NA25396 for Los Alamos !
! National Laboratory (LANL), which is operated by Los Alamos National     !
! Security, LLC for the U.S. Department of Energy. The U.S. Government has !
! rights to use, reproduce, and distribute this software.  NEITHER THE     !
! GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,     !
! EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS         !
! SOFTWARE.  If software is modified to produce derivative works, such     !
! modified software should be clearly marked, so as not to confuse it      !
! with the version available from LANL.                                    !
!                                                                          !
! Additionally, this program is free software; you can redistribute it     !
! and/or modify it under the terms of the GNU General Public License as    !
! published by the Free Software Foundation; version 2.0 of the License.   !
! Accordingly, this program is distributed in the hope that it will be     !
! useful, but WITHOUT ANY WARRANTY; without even the implied warranty of   !
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General !
! Public License for more details.                                         !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/

#include "Kernels.h"	

// CGIterateKernel
//
// Presumably one conjugate-gradient update step (going by the kernel name),
// applied independently to N segments of M consecutive elements each;
// segment j occupies indices [j*M, j*M+M) of every array argument.
//
// Thread t (t = threadIdx.x) handles segments t, t+num_threads, ... < N.
// For each segment it:
//   1. accumulates p0vec = sum(p0*tmpmat) and r0vec = sum(r0*r0), then sets
//      xalpha = r0vec/p0vec (ZERO unless p0vec > ZERO);
//   2. updates bo += xalpha*p0 and r0 += xalpha*tmpmat, accumulating
//      r1vec = sum(r0*r0) over the updated r0;
//   3. sets xbeta = r1vec/r0vec (ZERO unless r0vec > ZERO) and overwrites
//      p0 = r0 - xbeta*p0.
// The r1vec values of all segments are summed across threads in shared
// memory and thread 0 stores the total in *error2_ptr.
//
// Parameters:
//   M, N         segment length and number of segments
//   p0, r0, bo   updated in place (see steps above)
//   tmpmat       read only
//   error2_ptr   receives the summed r1vec
//   num_threads  stride of the per-thread segment loop and number of
//                partial sums reduced
//
// NOTE(review): only threadIdx.x is used, so this looks intended for a
// single-block launch with blockDim.x == num_threads -- confirm at the call
// site. The partial-sum buffer has 512 entries, so threadIdx.x must stay
// below 512.
__global__ void CGIterateKernel(int M, int N, REAL *p0, REAL *tmpmat, REAL *r0, REAL *bo, REAL *error2_ptr, int num_threads) {
  REAL r0vec, p0vec, r1vec, xalpha, xbeta, error2;
  int offset, j, index;
  __shared__ REAL intermediate_sums[512];

  index=threadIdx.x;

  // per-thread running total of the updated squared residuals
  error2=ZERO;
  for (j=index; j<N; j+=num_threads) {
    r0vec = ZERO;
    p0vec = ZERO;
    r1vec = ZERO;

    // dot products over segment j
    for (offset=j*M; offset<j*M+M; offset++) {
      p0vec+=(p0[offset]*tmpmat[offset]);
      r0vec+=(r0[offset]*r0[offset]);
    }
    // avoid dividing by a non-positive denominator
    if (p0vec>ZERO) xalpha = r0vec/p0vec;
    else xalpha=ZERO;

    // step bo and r0 by xalpha; r1vec uses the updated r0
    for (offset=j*M; offset<j*M+M; offset++) {
      bo[offset]+=(xalpha*p0[offset]);
      r0[offset]+=(xalpha*tmpmat[offset]);
      r1vec+=(r0[offset]*r0[offset]);
    }

    error2 += r1vec;
    if (r0vec>ZERO) xbeta = r1vec/r0vec;
    else xbeta=ZERO;

    // new p0 for this segment
    for (offset=j*M; offset<j*M+M; offset++) {
      p0[offset]=r0[offset]-xbeta*p0[offset];
    }
  }
  intermediate_sums[index]=error2;

  // make sure all intermediate sums have been calculated
  __syncthreads();

  // calculate total sum using binary tree method
  int modulus=2;
  while(modulus<num_threads*2) {
    int partner=index+modulus/2;
    // Only fold in a partner slot that some thread actually wrote. Without
    // this bound a num_threads that is not a power of two would add
    // uninitialized shared memory into the total.
    if (index%modulus==0 && partner<num_threads) {
      intermediate_sums[index]+=intermediate_sums[partner];
    }
    modulus*=2;
    // the loop condition is the same for every thread, so all of them
    // reach this barrier
    __syncthreads();
  }
  if (index==0) *error2_ptr=intermediate_sums[0];
}

