// -*- C++ -*-
//
// Copyright (C) 1998, 1999, 2000, 2002  Los Alamos National Laboratory,
// Copyright (C) 1998, 1999, 2000, 2002  CodeSourcery, LLC
//
// This file is part of FreePOOMA.
//
// FreePOOMA is free software; you can redistribute it and/or modify it
// under the terms of the Expat license.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Expat
// license for more details.
//
// You should have received a copy of the Expat license along with
// FreePOOMA; see the file LICENSE.
//
// ----------------------------------------------------------------------

#ifdef POOMA_BENCHMARKS_MESSAGING_TULIP

#include "TulipCoreOneRound.h"
#include <stdio.h>

// File-local helpers.  Each one is defined further down in this file;
// the ones returning int* hand back an array allocated with new[].
static int *makeCopy(int *p, int n);            // copy of the n ints at p
static void increment(int *offset, int dim);    // step a counter with digits -1, 0, 1
static int *makeMyLoc(int *tiling, int dim);    // tile coordinates of this context
static int *makeStrides(int *tiling, int dim);  // strides used to turn coordinates into a processor number

//----------------------------------------------------------------------
//
// Constructor.
//
// Initialize all the data structures except the bulk data.  That will
// be set up in initialize.
//
//----------------------------------------------------------------------

//
// Parameters:
//   dim    - number of dimensions; also the length of the tiling array.
//   tiling - array of dim ints giving the number of tiles in each
//            dimension.  It is copied, so the caller keeps ownership.
//   guards - guard cell count; initialize() adds 2*guards to the size
//            in every dimension.
//
// The bulk data pointer and both element counts start out as zero
// until initialize() is called.
//
TulipCoreOneRound::
TulipCoreOneRound(int dim, int *tiling, int guards)
  : dim_m(dim),
    guards_m(guards),
    data_m(0),
    elems_m(0),
    totalElems_m(0)
{
  //
  // Just copy the input tiling.
  //
  tiling_m = makeCopy(tiling,dim);

  //
  // Given the tiling and myproc, find which tile we are.
  //
  myLoc_m = makeMyLoc(tiling,dim);

  //
  // Calculate the strides in the neighbor array.
  //
  strides_m = makeStrides(tiling,dim);

  //
  // Calculate processors with the neighboring patches.  This has to
  // come last: makeNeighbors reads dim_m, tiling_m, myLoc_m and
  // strides_m, and it also sets numNeighbors_m.
  //
  neighbors_m = makeNeighbors();

  //
  // Disabled debugging output: dumps the values computed above.
  //
  /*
  std::cout << "myproc = " << tulip_core_mycontext() << "\n";

  std::cout << "tiling = ";
  for (int i=0; i<dim; ++i)
    std::cout << tiling_m[i] << " ";
  std::cout << "\n";

  std::cout << "strides= ";
  for (int i=0; i<dim; ++i)
    std::cout << strides_m[i] << " ";
  std::cout << "\n";

  std::cout << "myLoc  = ";
  for (int i=0; i<dim; ++i)
    std::cout << myLoc_m[i] << " ";
  std::cout << "\n";

  std::cout << "neighbors = ";
  for (int i=0; i<numNeighbors_m; ++i)
    std::cout << neighbors_m[i] << " ";
  std::cout << "\n";
  */
}

//----------------------------------------------------------------------
//
// Destructor.
// Just clear up the allocated data.
//
//----------------------------------------------------------------------

TulipCoreOneRound::~TulipCoreOneRound()
{
  //
  // Release the bulk data.  It is still a null pointer if initialize
  // was never called, in which case this does nothing.
  //
  delete [] data_m;

  //
  // Release the bookkeeping arrays built by the constructor, newest
  // first.
  //
  delete [] neighbors_m;
  delete [] strides_m;
  delete [] myLoc_m;
  delete [] tiling_m;
}

//----------------------------------------------------------------------
//
// increment
//
// Utility function that takes in an array of dim integers, and
// increments them as if they were digits in a single number.  Each
// digit takes on the values -1, 0, 1.
//
//----------------------------------------------------------------------

static void
increment(int *offset, int dim)
{
  //
  // Walk up from the lowest digit.  A digit that has not yet reached
  // 1 is simply bumped and we are done; a digit already at the top
  // wraps back to -1 and the carry moves on to the next digit.  If
  // every digit wraps, the whole counter rolls over to all -1.
  //
  int digit = 0;
  while (digit < dim)
    {
      if (offset[digit] < 1)
        {
          ++offset[digit];
          return;
        }
      offset[digit] = -1;
      ++digit;
    }
}

//----------------------------------------------------------------------
//
// makeMyLoc 
//
// Given a tiling and myproc, work out where this patch is in the
// tiles.  Return a new array of size dim with the location of this
// patch.
//
//----------------------------------------------------------------------

static int *
makeMyLoc(int *tiling, int dim)
{
  //
  // The caller owns the returned array of dim coordinates.  The
  // context number of this process is what gets decomposed.
  //
  int *coords = new int[dim];
  int remaining = tulip_core_mycontext();

  //
  // Start with the total tile count, the product of all the tiling
  // entries.
  //
  int span = 1;
  for (int d = 0; d < dim; ++d)
    span *= tiling[d];

  //
  // Peel off one coordinate at a time, highest dimension first.
  // After the division, span is the number of contexts covered by one
  // step in dimension d, so the quotient is the coordinate and the
  // remainder is what is left for the lower dimensions.
  //
  int d = dim;
  while (d-- > 0)
    {
      span /= tiling[d];
      coords[d] = remaining / span;
      remaining %= span;
    }

  return coords;
}

//----------------------------------------------------------------------
//
// makeCopy
//
// Just make a copy of a list of integers of length n.  Allocate a new
// array of size n for it and return that.
//
//----------------------------------------------------------------------

static int *
makeCopy(int *p, int n)
{
  //
  // Allocate the destination, then walk source and destination in
  // step until n values have been transferred.  The caller owns the
  // returned array.
  //
  int *const duplicate = new int[n];

  const int *src = p;
  int *dst = duplicate;
  int left = n;
  while (left-- > 0)
    *dst++ = *src++;

  return duplicate;
}

//----------------------------------------------------------------------
//
// makeStrides
//
// Calculate the strides in the array of neighbors.  Return the result
// in a new array of size dim.
//
//----------------------------------------------------------------------

static int *
makeStrides(int *tiling, int dim)
{
  //
  // Allocate the space for the strides.  The caller owns the result.
  //
  int *strides = new int[dim];

  //
  // With no dimensions there is nothing to fill in; writing
  // strides[0] would run off the end of the zero-length array.
  //
  if (dim > 0)
    {
      //
      // The stride in the first dimension is 1.
      //
      strides[0] = 1;

      //
      // Each later stride is the previous stride times the number of
      // tiles in the previous dimension.
      //
      for (int i=1; i<dim; ++i)
        strides[i] = strides[i-1]*tiling[i-1];
    }

  //
  // Return the resulting strides.
  //
  return strides;
}

//----------------------------------------------------------------------
//
// makeNeighbors
//
// Calculate who my neighbors are in each direction, with periodic
// boundary conditions.
//
//----------------------------------------------------------------------

int *
TulipCoreOneRound::makeNeighbors()
{
  //
  // There is one neighbor for every combination of -1, 0, +1 across
  // the dimensions: start from 3 and multiply in another 3 for each
  // dimension beyond the first.  The count is kept in numNeighbors_m.
  //
  int count = 3;
  for (int d = dim_m; d > 1; --d)
    count *= 3;
  numNeighbors_m = count;

  //
  // The table we hand back; the caller owns it.
  //
  int *table = new int[numNeighbors_m];

  //
  // Relative position of the neighbor currently being worked on.
  // The enumeration begins with -1 in every dimension.
  //
  int *shift = new int[dim_m];
  for (int d = 0; d < dim_m; ++d)
    shift[d] = -1;

  //
  // Visit every relative position (the all-zero one, this tile
  // itself, is included) and turn it into a processor number.
  //
  int slot = 0;
  while (slot < numNeighbors_m)
    {
      int proc = 0;
      for (int d = 0; d < dim_m; ++d)
        {
          //
          // Coordinate of the neighbor in this dimension, wrapped
          // around the edge of the tiling (periodic boundaries).
          //
          int coord = myLoc_m[d] + shift[d];
          if ( coord < 0 )
            coord += tiling_m[d];
          else if ( coord >= tiling_m[d] )
            coord -= tiling_m[d];

          //
          // Weight the coordinate by the stride for this dimension.
          //
          proc += coord * strides_m[d];
        }
      table[slot] = proc;

      //
      // Step to the next relative position.
      //
      increment(shift, dim_m);
      ++slot;
    }

  //
  // The scratch array is no longer needed.
  //
  delete [] shift;
  return table;
}

//----------------------------------------------------------------------
//
// TulipCoreOneRound::initialize
//
// Set up to do a run of size n.  Allocate the bulk data and record
// the size.
//
//----------------------------------------------------------------------

void 
TulipCoreOneRound::initialize(int n)
{
  //
  // Find the number of elements for this size including the guard
  // cells: every one of the dim_m dimensions holds n elements plus
  // 2*guards_m guard cells.
  //
  int s = 1;
  for (int i=0; i<dim_m; ++i)
    s *= (n+guards_m*2);

  //
  // Clear up any old data, and put the object back into the empty
  // state the constructor sets up.  If the allocation below throws,
  // the destructor must not delete the old block a second time and
  // run() must not walk over freed memory.
  //
  delete [] data_m;
  data_m = 0;
  elems_m = 0;
  totalElems_m = 0;

  //
  // Allocate the new data.  Only once that has succeeded do we record
  // the size and the total number of elements for later.
  //
  data_m = new double[s];
  elems_m = n;
  totalElems_m = s;

  //
  // Fill it with zeros to keep it clean.
  //
  for (int i=0; i<s; ++i)
    data_m[i] = 0;
}

//----------------------------------------------------------------------
// 
// TulipCoreOneRound::run
//
// Perform the computation we will be timing.  For the moment this is
// an in-place operation with no communication.
//
//----------------------------------------------------------------------

void 
TulipCoreOneRound::run()
{
  //
  // Double every element of the bulk data in place, guard cells
  // included, walking a pointer across all totalElems_m entries.
  //
  double *cell = data_m;
  for (int left = totalElems_m; left > 0; --left, ++cell)
    *cell *= 2.0;
}

#endif // POOMA_BENCHMARKS_MESSAGING_TULIP


