/*
 * Copyright 1998-2001, University of Notre Dame.
 * Authors: Jeffrey M. Squyres and Arun Rodrigues with Brian Barrett,
 *          Kinis L. Meyer, M. D. McNally, and Andrew Lumsdaine
 * 
 * This file is part of the Notre Dame LAM implementation of MPI.
 * 
 * You should have received a copy of the License Agreement for the Notre
 * Dame LAM implementation of MPI along with the software; see the file
 * LICENSE.  If not, contact Office of Research, University of Notre
 * Dame, Notre Dame, IN 46556.
 * 
 * Permission to modify the code and to distribute modified code is
 * granted, provided the text of this NOTICE is retained, a notice that
 * the code was modified is included with the above COPYRIGHT NOTICE and
 * with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE
 * file is distributed with the modified code.
 * 
 * LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.
 * By way of example, but not limitation, Licensor MAKES NO
 * REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS
 * OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS
 * OR OTHER RIGHTS.
 * 
 * Additional copyrights may follow.
 * 
 *	Ohio Trollius
 *	Copyright 1996 The Ohio State University
 *	NJN
 *
 *	$Id: mpil_spawn.c,v 6.15 2001/01/29 20:34:45 jsquyres Exp $
 *
 *	Function:	- spawn MPI processes
 */

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <app_mgmt.h>
#include <app_schema.h>
#include <args.h>
#include <blktype.h>
#include <mpi.h>
#include <mpisys.h>
#include <net.h>
#include <portable.h>
#include <rpisys.h>
#include <sfh.h>
#include <terror.h>
#include <typical.h>

/*
 * private functions
 */
static int spawn(char *schema, MPI_Comm comm, int cid, int *numkids, 
		 struct _gps **kids);
static char *locate_aschema(char *filename);

/*
 *	MPIL_Spawn
 *
 *	Function:	- spawn MPI processes
 *	Accepts:	- application schema
 *			- root process rank
 *			- parent communicator
 *			- intercomm between parents and children (returned)
 *	Returns:	- MPI_SUCCESS or error code
 */
/*
-----This is the original man page.  Just so that the text isn't lost.
The real man page is listed below this one.

The original 'MPIL_Spawn' man page is still included below (some
information may be inaccurate, since this function isn't actively
maintained anymore), but as stated above, users are `highly`
encouraged to migrate to the official MPI-2 dynamic functions instead
of this LAM-specific function.  If users need to utilize an
application schema, see information about the "file" MPI_Info key in
'MPI_Comm_spawn'(2).

Description:

A group of processes can create another group of processes with
'MPIL_Spawn' The parent group is indicated by the first communicator
argument.  'MPIL_Spawn' is a collective operation over the parent
processes.  The child group starts up like any MPI application.  The
processes must begin by calling 'MPI_Init', after which the
pre-defined communicator, 'MPI_COMM_WORLD', may be used.  This world
communicator contains only the child processes.  It is distinct from
the 'MPI_COMM_WORLD' of the parent processes.

Communication With Spawned Processes:

The natural communication mechanism between two groups is the
inter-communicator.  The second communicator argument to 'MPIL_Spawn'
returns an inter-communicator whose local group contains the parent
processes (same as the first communicator argument) and whose remote
group contains child processes.  The child processes get the same
inter-communicator by calling 'MPIL_Comm_parent' which returns
'MPI_COMM_NULL' if the process was created by 'mpirun'(1) instead of
'MPIL_Spawn'.  Both groups can decide to merge the inter-communicator
into an intra-communicator and take advantage of other MPI collective
operations.  They can then use the merged intra-communicator to create
new inter-communicators and reach other processes in the MPI
application.

Choosing Programs and Resources:

The program names, process counts and computing resources for the new
child processes are specified in an application schema identical to
that used by 'mpirun'(1).  The application schema filename is given in
the 'app' argument to 'MPIL_Spawn' As with 'mpirun'(1), a simple SPMD
application can be specified in the 'app' argument itself (equivalent
to the mpirun(1) command line).  The simplest way to run one copy of
an executable file on any node chosen by LAM is to specify "a.out -c
1" for the 'app' argument.  A single filename by itself is taken to be
an application schema, not an excutable filename.  See 'mpirun'(1) for
complete details.

Locating an Application Schema:

If an application schema is specified, it is located on the node that
is running the process indicated by the `root` argument.  On that
node, the schema file is located by searching a few directories.  The
directory defined by the LAMAPPLDIR environment variable, if set, is
searched first.  Then the local directory is searched.

Locating an Executable Program:

If an executable file is specified, it is located on the node(s) where
the process(es) will execute, unless the -s option is specified as
with 'mpirun'(1).  On any node, the directories specified by the
user's PATH environment variable are searched to find an executable
file.

All MPI runtime options selected by mpirun(1) in the initial
application launch remain in effect for all child processes created by
'MPIL_Spawn()'.

The Universe:

The number of processes that an application spawns is often
constrained by the number of processors available.  That information
(in nodes, not necessarily processors) is obtained from LAM with
'MPIL_Universe_size()' Since LAM can dynamically add and subtract
nodes, the universe size can change at any time.

.seealso MPI_Comm_spawn, MPI_Comm_spawn_multiple, mpirun
*/
/*@

MPIL_Spawn - LAM/MPI-specific function to spawn MPI processes

Input Parameters:
+ comm - Source communicator (handle)
. schema - character string LAM app schema
- root - root node (integer)

Output Parameter:
. intercomm - new intercommunicator spanning 'comm' and the
'MPI_COMM_WORLD' of the new processes

.N Wrapper MPI_Comm_spawn
@*/
int MPIL_Spawn(MPI_Comm comm, char *schema, int root, 
	       MPI_Comm *intercomm)
{
	MPI_Group	kgrp;			/* child group */
	struct _proc	**p;
	struct _gps	*g;
	struct _gps	*kids;			/* array of child GPS */
	int		rank;			/* caller rank */
	int		size;			/* group size */
	int		err;			/* error code */
	int		numkids;		/* num. of children spawned */
	int		mycid;			/* local max context ID */
	int		cid;			/* context ID for intercomm */
	int		msg[2];			/* two int message buffer */
	int		i;

	lam_initerr_m();
	lam_setfunc_m(BLKMPILSPAWN);
/*
 * Check the arguments.
 */
	if ((comm == MPI_COMM_NULL) || LAM_IS_INTER(comm)) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_COMM, 0)));
	}

	MPI_Comm_size(comm, &size);

	if ((root >= size) || (root < 0)) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_ROOT, 0)));
	}

	if ((intercomm == 0) || (schema == 0)) {
		return(lam_errfunc(MPI_COMM_WORLD,
			BLKMPILSPAWN, lam_mkerr(MPI_ERR_ARG, 0)));
	}

	LAM_TRACE(lam_tr_cffstart(BLKMPILSPAWN));
/*
 * Set debugging parameters.
 */
	g = &(comm->c_group->g_procs[root]->p_gps);

	lam_setparam(BLKMPILSPAWN, root | (g->gps_grank << 16),
				(g->gps_node << 16) | g->gps_idx);
/*
 * Synchronize all members of the parent group and get the context ID
 * for the parent-child intercommunicator.
 */
	MPI_Comm_rank(comm, &rank);

	mycid = lam_getcid();

	if (mycid < 0) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_INTERN, EFULL)));
	}

	err = MPI_Reduce(&mycid, &cid, 1, MPI_INT, MPI_MAX, root, comm);
	if (err != MPI_SUCCESS) {
		return(lam_errfunc(comm, BLKMPILSPAWN, err));
	}

	if (rank == root) {
/*
 * The root does the process spawning.
 */
		if (spawn(schema, comm, cid, &numkids, &kids)) {
/*
 * Inform parent group of spawn error.
 */
			err = lam_mkerr(MPI_ERR_OTHER, errno);
			
			msg[0] = -1; msg[1] = err;
			
			MPI_Bcast(msg, 2, MPI_INT, root, comm);
			
			return(lam_errfunc(comm, BLKMPILSPAWN, err));
		}
		
		msg[0] = cid; msg[1] = numkids;
	}
/*
 * Broadcast the context ID for the parent-child intercommunicator and the
 * number of children spawned to the parents. In the case of an error
 * in spawning we are actually broadcasting the error code.
 */
	err = MPI_Bcast(msg, 2, MPI_INT, root, comm);
	if (err != MPI_SUCCESS) {
		return(lam_errfunc(comm, BLKMPILSPAWN, err));
	}

	if (rank != root) {

		cid = msg[0];
/*
 * A context ID of -1 means an error occurred in spawning so we
 * return with the error.
 */
		if (cid == -1) {
			err = msg[1];
			return(lam_errfunc(comm, BLKMPILSPAWN, err));
		}
/*
 * Allocate buffer to receive array of child GPS.
 */
		numkids = msg[1];
		kids = (struct _gps *)
			malloc((unsigned) (numkids * sizeof(struct _gps)));
		if (kids == 0) {
			return(lam_errfunc(comm, BLKMPILSPAWN,
					lam_mkerr(MPI_ERR_OTHER, errno)));
		}
	}
/*
 * Broadcast the array of child GPS to parent group.
 */
	err = MPI_Bcast(kids, numkids * sizeof(struct _gps) / sizeof(int),
			MPI_INT, root, comm);
	if (err != MPI_SUCCESS) {
		free((char *) kids);
		return(lam_errfunc(comm, BLKMPILSPAWN, err));
	}
/*
 * Create the child group.
 */
	kgrp = (MPI_Group) malloc((unsigned) (sizeof(struct _group) +
					(numkids * sizeof(struct _proc *))));
	if (kgrp == 0) {
		free((char *) kids);
		return(lam_errfunc(comm, BLKMPILSPAWN,
					lam_mkerr(MPI_ERR_OTHER, errno)));
	}
	kgrp->g_nprocs = numkids;
	kgrp->g_myrank = MPI_UNDEFINED;
	kgrp->g_refcount = 1;
	kgrp->g_f77handle = -1;
	kgrp->g_procs = (struct _proc **) 
	  ((char *) kgrp + sizeof(struct _group));

	g = kids;
	p = kgrp->g_procs;

	for (i = 0; i < numkids; ++i, ++p, ++g) {

		*p = lam_procadd(g);
		if (*p == 0) {
			free((char *) kids);
			free((char *) kgrp);
			return(lam_errfunc(comm, BLKMPILSPAWN,
					lam_mkerr(MPI_ERR_OTHER, errno)));
		}
		(*p)->p_mode |= LAM_PCLIENT;
		(*p)->p_refcount++;
	}
/*
 * Create the parent-child intercommunicator.
 */
	*intercomm = 0;
	if (lam_comm_new(cid, comm->c_group, kgrp, LAM_CINTER, intercomm)) {
		free((char *) kids);
		free((char *) kgrp);
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_OTHER, errno)));
	}

	comm->c_group->g_refcount++;
	(*intercomm)->c_errhdl = comm->c_errhdl;
	comm->c_errhdl->eh_refcount++;

	if (!al_insert(lam_comms, intercomm)) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_INTERN, errno)));
	}

	if (lam_tr_comm(*intercomm)) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_INTERN, errno)));
	}

	lam_setcid(cid);
/*
 * setup new processes
 */
	if (RPI_SPLIT(_rpi_lamd_addprocs, _rpi_c2c_addprocs, ())) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_OTHER, errno)));
	}
/*
 * Wait until all the children have initialized.
 * The root waits for rank 0 in the child world to communicate this fact and
 * then broadcasts it to the other parents.
 */
	if (rank == root) {
		err = MPI_Recv((void *) 0, 0,
				MPI_BYTE, 0, 0, *intercomm, MPI_STATUS_IGNORE);
		if (err != MPI_SUCCESS) {
			return(lam_errfunc(comm, BLKMPILSPAWN, err));
		}
		err = MPI_Send((void *)0, 0, MPI_BYTE, 0, 0, *intercomm);
		if (err != MPI_SUCCESS) {
			return(lam_errfunc(comm, BLKMPICOMMSPAWN, err));
		}
	}

	err = MPI_Bcast((void *) 0, 0, MPI_BYTE, root, comm);
	
	free((char*) kids);
	LAM_TRACE(lam_tr_cffend(BLKMPILSPAWN, root, comm, 0, 0));
	
	lam_resetfunc_m(BLKMPILSPAWN);
	return(MPI_SUCCESS);
}

/*
 *	MPIL_Comm_parent
 *
 *	Function:	- returns the parent inter-communicator
 *	Accepts:	- communicator (out)
 */
/*@

MPIL_Comm_parent - LAM/MPI-specific function to obtain the
communicator containing the parent process

Output Parameter:
. comm - Parent communicator (handle)

.N Wrapper MPI_Comm_parent
@*/
int MPIL_Comm_parent(MPI_Comm *comm)
{
	lam_initerr_m();
	lam_setfunc_m(BLKMPILCOMMPARENT);
/*
 * Check the arguments.
 */
	if (comm == 0) {
		return(lam_errfunc(MPI_COMM_WORLD, BLKMPILCOMMPARENT,
				lam_mkerr(MPI_ERR_ARG, 0)));
	}
/*
 * Set the intercomm.
 */
	*comm = lam_comm_parent;

	lam_resetfunc_m(BLKMPILCOMMPARENT);
	return(MPI_SUCCESS);
}

/*
 *	MPIL_Universe_size
 *
 *	Function:	- returns the number of nodes
 *	Accepts:	- number of nodes (out)
 */
/*@

MPIL_Universe_size - LAM/MPI-specific function to obtain the
number of nodes in the MPI environment

Output Parameter:
. size - number of nodes in the MPI environment (from 'lamboot')

Notes:

This function is deprecated.  It has been replaced with an MPI-2
attribute on 'MPI_COMM_WORLD' - 'MPI_UNIVERSE_SIZE', which provides
the same functionality.  This function is (or effectively is) a
wrapper to the replacement attribute, anyway.  User programs should
use the MPI-2 replacement attribute instead of this function.

Some functions are deprecated because their names did not conform to
the stricter naming conventions of MPI-2; others are deprecated
because they have been replaced with more flexible functionality.

.seealso: MPI_Comm_attr_get
@*/
int MPIL_Universe_size(int *size)
{
	lam_initerr_m();
	lam_setfunc_m(BLKMPILUNIVERSESIZE);
/*
 * Check the arguments.
 */
	if (size == 0) {
		return(lam_errfunc(MPI_COMM_WORLD, BLKMPILUNIVERSESIZE,
				lam_mkerr(MPI_ERR_ARG, 0)));
	}
/*
 * Set the intercomm.
 */
	*size = getncomp();

	if (*size < 0) {
		return(lam_errfunc(MPI_COMM_WORLD, BLKMPILUNIVERSESIZE,
				lam_mkerr(MPI_ERR_ARG, errno)));
	}
	
	lam_resetfunc_m(BLKMPILUNIVERSESIZE);
	return(MPI_SUCCESS);
}

/*
 *	spawn
 *
 *	Function:	- spawn MPI processes according to schema
 *	Accepts:	- schema
 *			- parent communicator
 *			- context ID for parent-child intercommunicator
 *			- ptr number of children (returned)
 *			- ptr array of child GPS (returned)
 *	Returns:	- 0 or LAMERROR
 */
static int
spawn(char *schema, MPI_Comm comm, int cid, int *numkids, 
      struct _gps **kids)
{
	struct nmsg	nhead;			/* network msg header */
	struct _proc	**g;			/* ptr process in group */
	struct _gps	*procgps;		/* procs. GPS */
	struct _gps	*p;			/* favourite pointer */
	LIST		*app;			/* application */
	LIST		*app_sched;		/* scheduled application */
	char		*aschema;		/* application schema */
	int		nparent;		/* size of parent world */
	int		nworld;			/* size of child world */
	int4		rtf;			/* child runtime flags */
	int		rank;			/* my (spawner's) rank */
	int		ignore;			/* ignored argument */
	int		i;
	struct _gps	*worldout;		/* real child GPS array */
	struct jobid_t	jobid;
	char            **env = 0;
/*
 * If schema is a single argument then it is an application schema file,
 * otherwise it is an explicit specification of what to execute like that
 * given to mpirun.
 */
	asc_environment(1, NULL, &env);
	if (strchr(schema, ' ')) {
		app = asc_bufparse(schema, strlen(schema), &ignore, env);
	} else {
		aschema = locate_aschema(schema);

		if (aschema == 0) {
			errno = EINVAL;
			sfh_argv_free(env);
			return(LAMERROR);
		}

		app = asc_parse(aschema, &ignore, env);
		free(aschema);
	}
	sfh_argv_free(env);

	if (app == 0) {
		errno = EUSAGE;
		return(LAMERROR);
	}

	app_sched = asc_schedule(app);
	asc_free(app);
	if (app_sched == 0) {
		return(LAMERROR);
	}
/*
 * Allocate child and parent GPS array.
 */
	MPI_Comm_size(comm, &nparent);
	nworld = al_count(app_sched);
	procgps = (struct _gps *)
		malloc((unsigned) (nworld + nparent) * sizeof(struct _gps));
	if (procgps == 0) {
		asc_free(app_sched);
		return(LAMERROR);
	}
	worldout = (struct _gps *)
		malloc((unsigned) (nworld + nparent) * sizeof(struct _gps));
	if (worldout == 0) {
		asc_free(app_sched);
		free((char*) procgps);
		return(LAMERROR);
	}
/*
 * Set environment inherited by children.  The world spawning them consists
 * solely of the parent group.
 */
	rtf = _kio.ki_rtf | RTF_MPIRUN;
#ifdef RTF_IMPI
	rtf &= ~(RTF_TRON | RTF_FLAT | RTF_WAIT | RTF_IMPI);
#else
	rtf &= ~(RTF_TRON | RTF_FLAT | RTF_WAIT);
#endif
/*
 * Set job identifier to be inherited by the application.
 */
	jobid.jid_node = _kio.ki_jobid.jid_node;
	jobid.jid_pid = _kio.ki_jobid.jid_pid;
	_kio.ki_jobid.jid_node = getnodeid();
	_kio.ki_jobid.jid_pid = getpid();
/*
 * Run the application.
 */
	if (asc_run(app_sched, nparent, rtf, 0, 0, procgps)) {
		asc_free(app_sched);
		free((char *) procgps);
		return(LAMERROR);
	}
	asc_free(app_sched);
	_kio.ki_jobid.jid_node = jobid.jid_node;
	_kio.ki_jobid.jid_pid = jobid.jid_pid;
/*
 * Stole this code almost verbatim out of mpirun.c so that we can
 * MPI_Comm_spawn non-MPI jobs (of course, they must eventually run
 * LAM/MPI programs that call MPI_Init).
 */
	if (lam_get_mpi_world(nworld, procgps, worldout, "MPIL_Spawn")) {
		free((char *) procgps);
		return(LAMERROR);
	}
	free((char *) procgps);
/*
 * Fill in child ranks in their MPI_COMM_WORLD.
 */
	for (i = 0, p = worldout; i < nworld; ++i, ++p) {
		p->gps_grank = i;
	}
/*
 * Fill in the parent world GPS.
 */
	g = comm->c_group->g_procs;

	for (i = 0; i < nparent; ++i, ++p, ++g) {
		*p = (*g)->p_gps;
	}
/*
 * Set up the message.
 */
	MPI_Comm_rank(comm, &rank);
	nhead.nh_type = 0;
	nhead.nh_flags = DINT4MSG;
	nhead.nh_msg = (char *) worldout;
	nhead.nh_length = (nworld + nparent) * sizeof(struct _gps);
	nhead.nh_data[1] = (int4) cid;
	nhead.nh_data[2] = (int4) rank;
	nhead.nh_data[3] = (int4) lam_universe_size;
	nhead.nh_data[4] = (int4) 0;		/* application number */
/*
 * Loop sending to each process.
 */
	for (i = 0, p = worldout; i < nworld; ++i, ++p) {
		nhead.nh_node = p->gps_node;
		nhead.nh_event = (-p->gps_pid) & 0xBFFFFFFF;
		if (nsend(&nhead)) {
			free((char *) worldout);
			return(LAMERROR);
		}
	}

	*numkids = nworld;
	*kids = worldout;

	return(0);
}

/*
 *	locate_aschema
 *
 *	Function:	- locate an application schema
 *	Accepts:	- filename
 *	Returns:	- full pathname or NULL
 */
static char *
locate_aschema(char *filename)
{
	int		pathc = 0;		/* paths argc */
	char		**pathv = 0;		/* paths argv */
	char		*appdir;		/* application dir */
	char		*fullpath;		/* full pathname */

	if ((appdir = getenv("LAMAPPLDIR"))) {
		argvadd(&pathc, &pathv, appdir);
	}
	argvadd(&pathc, &pathv, "");
	argvadd(&pathc, &pathv, "$LAMHOME/etc");
	argvadd(&pathc, &pathv, "$TROLLIUSHOME/etc");

	fullpath = sfh_path_find(filename, pathv, R_OK);
	argvfree(pathv);
	return(fullpath);
}
