/*
 * Copyright (c) 2001-2002 The Trustees of Indiana University.  
 *                         All rights reserved.
 * Copyright (c) 1998-2001 University of Notre Dame. 
 *                         All rights reserved.
 * Copyright (c) 1994-1998 The Ohio State University.  
 *                         All rights reserved.
 * 
 * This file is part of the LAM/MPI software package.  For license
 * information, see the LICENSE file in the top level directory of the
 * LAM/MPI source distribution.
 * 
 *	Ohio Trollius
 *	Copyright 1997 The Ohio State University
 *	NJN
 *
 *	$Id: shm_low.c,v 1.2.2.1 2002/10/09 19:49:51 brbarret Exp $
 *
 *	Function:	- universal shared memory low-level routines
 */

#include <lam_config.h>

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/uio.h>

#if LAM_NEED_SYS_SELECT_H
#include <sys/select.h>
#endif

#include <blktype.h>
#include <dl_inet.h>
#include <mpi.h>
#include <mpisys.h>
#include <net.h>
#include <rpisys.h>
#include <terror.h>
#include <typical.h>
#include <t_types.h>

#include <../tcp/tcp_low.h>

/*
 * external functions
 *
 * Both are declared without a parameter list, so the arguments passed
 * at the call sites in this file are not checked by the compiler.
 */
extern void		_c2c_fill_mpi_status();
extern void		_c2c_fill_wildcards();

/*
 * external variables
 *
 * _c2c_flblock selects, throughout this file, between waiting for a
 * postbox lock (non-zero) and making a single attempt (zero).
 * _c2c_haveadv is set to 1 here whenever a request has been advanced.
 */
extern int		_c2c_flblock;		/* blocking flag */
extern int		_c2c_haveadv;		/* have advanced */

/*
 * private functions
 *
 * waitforack() is used by _shm_fastsend() and defined at the end of the file.
 */
static int		waitforack(struct c2c_proc *, int, int, MPI_Comm, int*);


/*
 *	_shm_proc_read_env
 *
 *	Function:	- obtain the read lock on the process's incoming
 *			  postbox and pass the process on to
 *			  _shm_match_adv() to deal with the envelope
 *			- in blocking mode wait for the lock, otherwise
 *			  make a single attempt
 *	Accepts:	- process
 *	Returns:	- LAMERROR if locking fails
 *			- 0 if the single attempt did not get the lock
 *			- otherwise whatever _shm_match_adv() returns
 */
int
_shm_proc_read_env(struct c2c_proc *ps)
{
	double		starttime;
	int		status;

	if (!_c2c_flblock) {
/*
 * Non-blocking case.  One attempt only; a result of 1 sends us back to
 * the caller empty handed so that it can try again later.
 */
		status = _shm_readtrylock(ps);
		if (status < 0) {
			return(LAMERROR);
		}
		if (status == 1) {
			return(0);
		}
	}
	else {
/*
 * Blocking case.  The time spent waiting is added to the trace block time.
 */
		LAM_TRACE(starttime = ttime());
		if (_shm_readlock(ps)) {
			return(LAMERROR);
		}
		LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));
	}
/*
 * The lock is held.  Record that so the code handling the envelope
 * knows the postbox still has to be released.
 */
	ps->cp_locked = 1;

	return(_shm_match_adv(ps));
}

/*
 *	_shm_proc_read_body_box
 *
 *	Function:	- read the body of an MPI message from process
 *			  via the postbox, at most LAM_SHMSHORTMSGLEN
 *			  bytes per lock/unlock cycle
 *			- this is only called when there is a receiving request
 *			- when the whole body has been read the receiving
 *			  request is advanced and the process is set up to
 *			  read envelopes again
 *	Accepts:	- process
 *	Returns:	- 0 or LAMERROR; 0 is also returned when a
 *			  non-blocking attempt could not get the lock and
 *			  part of the body is still outstanding
 */
int
_shm_proc_read_body_box(struct c2c_proc *ps)
{
	int		blocking;
	int		nbytes;
	int		status;

	if (ps->cp_locked) {
/*
 * The postbox is already locked.  Copy the data found just past the
 * postbox header to the receiver and release the postbox.
 */
		nbytes = LAM_min(ps->cp_nmsgin, LAM_SHMSHORTMSGLEN);
		memcpy(ps->cp_msgbuf, ps->cp_inbox + 1, nbytes);

		if (_shm_readunlock(ps)) {
			return(LAMERROR);
		}

		ps->cp_locked = 0;
		ps->cp_nmsgin -= nbytes;
		ps->cp_msgbuf += nbytes;
	}
/*
 * Read the remaining packets.  In blocking mode we wait for each one.
 * In non-blocking mode we leave as soon as a packet is not available
 * and pick up from the saved position on the next call.
 */
	blocking = _c2c_flblock;

	while (ps->cp_nmsgin) {
		if (blocking) {
			if (_shm_readlock(ps)) {
				return(LAMERROR);
			}
		}
		else {
			status = _shm_readtrylock(ps);
			if (status < 0) {
				return(LAMERROR);
			}
			if (status == 1) {
				return(0);
			}
		}

		nbytes = LAM_min(ps->cp_nmsgin, LAM_SHMSHORTMSGLEN);
		memcpy(ps->cp_msgbuf, ps->cp_inbox + 1, nbytes);

		if (_shm_readunlock(ps)) {
			return(LAMERROR);
		}

		ps->cp_nmsgin -= nbytes;
		ps->cp_msgbuf += nbytes;
	}
/*
 * Nothing left to read.  Advance the request receiving the message.
 */
	if (ps->cp_rreq->rq_rpi.c2c.cq_adv(ps, ps->cp_rreq)) {
		return(LAMERROR);
	}
/*
 * The next thing to arrive from this process is an envelope.
 */
	ps->cp_readfn = _shm_proc_read_env;
	ps->cp_rreq = 0;
	return(0);
}

/*
 *	_shm_proc_read_body_pool
 *
 *	Function:	- read the body of an MPI message from a process
 *			  via the shared pool, at most cp_insize bytes per
 *			  lock/unlock cycle
 *			- this is only called when there is a receiving request
 *			- when the whole body has been read the pool buffer
 *			  used by this call is freed, the receiving request
 *			  is advanced and the process is set up to read
 *			  envelopes again
 *	Accepts:	- process
 *	Returns:	- 0 or LAMERROR; 0 is also returned when a
 *			  non-blocking attempt could not get the lock and
 *			  part of the body is still outstanding
 */
int
_shm_proc_read_body_pool(struct c2c_proc *ps)
{
	int		blocking;
	int		nbytes;
	int		status;
	char		*pool = 0;		/* last pool buffer read from */

	if (ps->cp_locked) {
/*
 * The postbox is already locked.  Its header says where in the shared
 * pool the data is; copy it to the receiver and release the postbox.
 */
		pool = _shm_membase + ps->cp_inbox->pb_header.bh_bufoff;
		nbytes = LAM_min(ps->cp_nmsgin, ps->cp_insize);
		memcpy(ps->cp_msgbuf, pool, nbytes);

		if (_shm_readunlock(ps)) {
			return(LAMERROR);
		}

		ps->cp_locked = 0;
		ps->cp_nmsgin -= nbytes;
		ps->cp_msgbuf += nbytes;
	}
/*
 * Read the remaining packets.  In blocking mode we wait for each one.
 * In non-blocking mode we leave as soon as a packet is not available
 * and pick up from the saved position on the next call.
 */
	blocking = _c2c_flblock;

	while (ps->cp_nmsgin) {
		if (blocking) {
			if (_shm_readlock(ps)) {
				return(LAMERROR);
			}
		}
		else {
			status = _shm_readtrylock(ps);
			if (status < 0) {
				return(LAMERROR);
			}
			if (status == 1) {
				return(0);
			}
		}

		pool = _shm_membase + ps->cp_inbox->pb_header.bh_bufoff;
		nbytes = LAM_min(ps->cp_nmsgin, ps->cp_insize);
		memcpy(ps->cp_msgbuf, pool, nbytes);

		if (_shm_readunlock(ps)) {
			return(LAMERROR);
		}

		ps->cp_nmsgin -= nbytes;
		ps->cp_msgbuf += nbytes;
	}
/*
 * Nothing left to read.  Give the shared area back (only if this call
 * actually read from one) and advance the request receiving the message.
 */
	if (pool) {
		lam_shfree(pool);
	}

	if (ps->cp_rreq->rq_rpi.c2c.cq_adv(ps, ps->cp_rreq)) {
		return(LAMERROR);
	}
/*
 * The next thing to arrive from this process is an envelope.
 */
	ps->cp_readfn = _shm_proc_read_env;
	ps->cp_rreq = 0;
	return(0);
}

/*
 *	_shm_req_done_synch
 *
 *	Function:	- release the source process's postbox and mark the
 *			  synchronous send request as done
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_req_done_synch(struct c2c_proc *ps, MPI_Request req)
{
/*
 * Nothing is changed if the postbox cannot be released.
 */
	if (_shm_readunlock(ps)) {
		return(LAMERROR);
	}
	ps->cp_locked = 0;
/*
 * The request is finished: one active request fewer, and note that
 * progress has been made.
 */
	req->rq_state = LAM_RQSDONE;
	lam_rq_nactv -= 1;
	_c2c_haveadv = 1;

	return(0);
}

/*
 *	_shm_req_rcvd_long_ack
 *
 *	Function:	- long protocol transition from reading ack to
 *			  done (if the receiver wants no more than the
 *			  first packet already sent) or to sending the
 *			  remaining requested bytes
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_req_rcvd_long_ack(struct c2c_proc *ps, MPI_Request req)
{
	int		remaining;

/*
 * Take the length out of the ack while the postbox is still ours, then
 * release the postbox.
 */
	remaining = ps->cp_inbox->pb_header.bh_env.ce_len;

	if (_shm_readunlock(ps)) {
		return(LAMERROR);
	}
	ps->cp_locked = 0;

	_c2c_haveadv = 1;
/*
 * The length field of the incoming ack packet specifies the amount of
 * data the receiver expects in total.  We have already sent a short
 * packet's worth, so if the receiver truncated the message we may
 * already be done.
 */
	remaining -= LAM_SHMSHORTMSGLEN;

	if (remaining <= 0) {
		req->rq_state = LAM_RQSDONE;
		lam_rq_nactv--;
		return(0);
	}
/*
 * There is more to send.  Turn the request's envelope into a second
 * (body) envelope and set the request up for writing it and the body.
 */
	req->rq_rpi.c2c.cq_env.ce_flags &= ~C2CACK;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2C2ND;
	req->rq_rpi.c2c.cq_env.ce_len = remaining;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_comm->c_group->g_myrank;

	req->rq_rpi.c2c.cq_state = C2CWRITE;
	req->rq_rpi.c2c.cq_adv = _shm_req_send_body_first;
	req->rq_rpi.c2c.cq_nenvout = ENVSIZE;
	req->rq_rpi.c2c.cq_nmsgout = remaining;

	return(0);
}

/*
 *	_shmtcp_req_recv
 *
 *	Function:	- determine protocol for receive request on
 *			  matched incoming envelope and act upon it
 *			- this is called for the first packet only
 *			- processes that have a socket (cp_sock >= 0) are
 *			  handed to _tcp_req_recv()
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shmtcp_req_recv(struct c2c_proc *ps, MPI_Request req)
{
    envp_t		env;			/* envelope in the postbox */
    int			nbytes;
    int			synch;			/* synchronous send? */

/*
 * The TCP code looks after processes reached through a socket.
 */
    if (ps->cp_sock >= 0) {
        return(_tcp_req_recv(ps, req));
    }

    env = (envp_t) ps->cp_inbox;
    _c2c_fill_wildcards(req, env);
    _c2c_haveadv = 1;
/*
 * A message longer than the receive buffer is truncated: flag the
 * request and cut the envelope length down to what fits.  Then set the
 * status.
 */
    if (env->ce_len > req->rq_packsize) {
        req->rq_flags |= LAM_RQFTRUNC;
        env->ce_len = req->rq_packsize;
    }
    _c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag, env->ce_len);

    if (!(env->ce_flags & C2CLONG)) {
/*
 * Short protocol envelope.  Copy the message from the postbox and note
 * whether it is a synchronous send before giving the postbox back.
 */
        if (env->ce_len > 0) {
            memcpy(req->rq_packbuf, ps->cp_inbox + 1, env->ce_len);
        }
        synch = env->ce_flags & C2CSSEND;

        if (_shm_readunlock(ps)) {
            return(LAMERROR);
        }
        ps->cp_locked = 0;
/*
 * An ordinary send completes the request here.  A synchronous send
 * still needs an ack to be sent back.
 */
        if (!synch) {
            req->rq_state = LAM_RQSDONE;
            lam_rq_nactv--;
            return(0);
        }

        req->rq_state = LAM_RQSACTIVE;
        if (_shm_req_rcvd_body_synch(0, req)) {
            return(LAMERROR);
        }
        return(0);
    }
/*
 * Long protocol envelope.  Copy the data part of this first packet into
 * the receive buffer, then unlock the postbox.
 */
    nbytes = env->ce_len;
    memcpy(req->rq_packbuf, ps->cp_inbox + 1,
           LAM_min(nbytes, LAM_SHMSHORTMSGLEN));

    if (_shm_readunlock(ps)) {
        return(LAMERROR);
    }
    ps->cp_locked = 0;
/*
 * Reply with an ack.  If more than the first packet is wanted the ack
 * carries the wanted length, otherwise it carries zero.
 */
    req->rq_state = LAM_RQSACTIVE;
    req->rq_rpi.c2c.cq_state = C2CWRITE;
    req->rq_rpi.c2c.cq_nenvout = ENVSIZE;
    req->rq_rpi.c2c.cq_env.ce_flags |= (C2CACK | C2CLONG);
    req->rq_rpi.c2c.cq_env.ce_rank = req->rq_comm->c_group->g_myrank;

    if (nbytes > LAM_SHMSHORTMSGLEN) {
        req->rq_rpi.c2c.cq_env.ce_len = nbytes;
        req->rq_rpi.c2c.cq_adv = _shm_req_send_ack_long;
    }
    else {
        req->rq_rpi.c2c.cq_env.ce_len = 0;
        req->rq_rpi.c2c.cq_adv = _shm_req_send_ack_only;
    }

    return(0);
}

/*
 *	_shm_buffer
 *
 *	Function:	- buffer incoming envelope/message
 *			- copies the envelope and up to LAM_SHMSHORTMSGLEN
 *			  bytes of data out of the (locked) postbox,
 *			  releases the postbox and appends the copy to
 *			  the buffered message list
 *			- there is never any data to be read to the
 *			  data sink when buffering
 *	Accepts:	- process envelope came in from
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_buffer(struct c2c_proc *ps)
{
	struct cbuf_msg	msg;			/* buffer list entry */
	int		len;

	msg.cm_env = ps->cp_inbox->pb_header.bh_env;
	len = LAM_min(msg.cm_env.ce_len, LAM_SHMSHORTMSGLEN);

	if (len > 0) {
/*
 * Set up the buffer for the message body and copy into it.
 */
		msg.cm_buf = (char *) malloc(len);
		if (msg.cm_buf == 0) return(LAMERROR);

		memcpy(msg.cm_buf, ps->cp_inbox + 1, len);
	}
	else {
		msg.cm_buf = 0;
	}
/*
 * The private copy is still ours at this point, so it must not be
 * leaked if the postbox cannot be released.  free() accepts a null
 * pointer, which covers the zero-length case.
 */
	if (_shm_readunlock(ps)) {
		free(msg.cm_buf);
		return(LAMERROR);
	}
	ps->cp_locked = 0;

	msg.cm_proc = 0;
	msg.cm_req = 0;
/*
 * Likewise release the copy if the entry could not be appended.
 * NOTE(review): this assumes a failed _cbuf_append() keeps no reference
 * to msg.cm_buf -- confirm against its implementation.
 */
	if (!_cbuf_append(&msg)) {
		free(msg.cm_buf);
		return(LAMERROR);
	}

	return(0);
}


/*
 *	_shm_push_body_box
 *
 *	Function:	- push request envelope and message body down the pike
 *			  via the postbox, at most LAM_SHMSHORTMSGLEN
 *			  bytes of body per lock/unlock cycle
 *			- progress is kept in the request so that a
 *			  non-blocking push can be continued by a later call
 *	Accepts:	- process
 *			- request
 *	Returns:	- 1 when envelope and body have been pushed completely
 *			- 0 when a non-blocking attempt could not get the lock
 *			- LAMERROR on a locking error
 */
int
_shm_push_body_box(struct c2c_proc *ps, MPI_Request req)
{
    const int		blocking = _c2c_flblock;
    double		starttime;
    int			chunk;
    int			status;

/*
 * First packet: the envelope, together with the first piece of the body
 * if there is any body.
 */
    if (req->rq_rpi.c2c.cq_nenvout > 0) {
        if (blocking) {
            LAM_TRACE(starttime = ttime());
            if (_shm_writelock(ps)) {
                return(LAMERROR);
            }
            LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));
        }
        else {
            status = _shm_writetrylock(ps);
            if (status < 0) {
                return(LAMERROR);
            }
            if (status == 1) {
                return(0);
            }
        }

        ps->cp_outbox->pb_header.bh_env = req->rq_rpi.c2c.cq_env;

        if (req->rq_rpi.c2c.cq_nmsgout > 0) {
            chunk = LAM_min(req->rq_rpi.c2c.cq_nmsgout, LAM_SHMSHORTMSGLEN);
            memcpy(ps->cp_outbox + 1, req->rq_rpi.c2c.cq_msgbuf, chunk);
            req->rq_rpi.c2c.cq_msgbuf += chunk;
            req->rq_rpi.c2c.cq_nmsgout -= chunk;
        }

        if (_shm_writeunlock(ps)) {
            return(LAMERROR);
        }

        req->rq_rpi.c2c.cq_nenvout = 0;
    }
/*
 * Remaining packets carry body data only.  In non-blocking mode exit to
 * try again another day as soon as we fail to obtain the lock.
 */
    while (req->rq_rpi.c2c.cq_nmsgout > 0) {
        if (blocking) {
            if (_shm_writelock(ps)) {
                return(LAMERROR);
            }
        }
        else {
            status = _shm_writetrylock(ps);
            if (status < 0) {
                return(LAMERROR);
            }
            if (status == 1) {
                return(0);
            }
        }

        chunk = LAM_min(req->rq_rpi.c2c.cq_nmsgout, LAM_SHMSHORTMSGLEN);
        memcpy(ps->cp_outbox + 1, req->rq_rpi.c2c.cq_msgbuf, chunk);

        if (_shm_writeunlock(ps)) {
            return(LAMERROR);
        }

        req->rq_rpi.c2c.cq_nmsgout -= chunk;
        req->rq_rpi.c2c.cq_msgbuf += chunk;
    }

    return(1);
}

/*
 *	_shm_push_body_pool
 *
 *	Function:	- push request envelope and message body down the pike
 *			  via the shared pool buffer recorded in the
 *			  request, at most cq_bufsize bytes of body per
 *			  lock/unlock cycle
 *			- progress is kept in the request so that a
 *			  non-blocking push can be continued by a later call
 *	Accepts:	- process
 *			- request
 *	Returns:	- 1 when envelope and body have been pushed completely
 *			- 0 when a non-blocking attempt could not get the lock
 *			- LAMERROR on a locking error
 */
int
_shm_push_body_pool(struct c2c_proc *ps, MPI_Request req)
{
    const int		blocking = _c2c_flblock;
    double		starttime;
    int			chunk;
    int			status;
    int			poolsize;		/* size of shared pool buffer */
    char		*pool;			/* shared pool buffer */

    poolsize = req->rq_rpi.c2c.cq_bufsize;
    pool = _shm_membase + req->rq_rpi.c2c.cq_bufoff;
/*
 * First packet: the envelope in the postbox, together with the first
 * piece of the body in the pool buffer if there is any body.
 */
    if (req->rq_rpi.c2c.cq_nenvout > 0) {
        if (blocking) {
            LAM_TRACE(starttime = ttime());
            if (_shm_writelock(ps)) {
                return(LAMERROR);
            }
            LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));
        }
        else {
            status = _shm_writetrylock(ps);
            if (status < 0) {
                return(LAMERROR);
            }
            if (status == 1) {
                return(0);
            }
        }

        ps->cp_outbox->pb_header.bh_env = req->rq_rpi.c2c.cq_env;
/*
 * NOTE(review): the non-blocking path has always marked the envelope as
 * sent before releasing the postbox, the blocking path only afterwards.
 * The two differ only when the unlock fails; kept as found.
 */
        if (!blocking) {
            req->rq_rpi.c2c.cq_nenvout = 0;
        }

        if (req->rq_rpi.c2c.cq_nmsgout > 0) {
            chunk = LAM_min(req->rq_rpi.c2c.cq_nmsgout, poolsize);
            memcpy(pool, req->rq_rpi.c2c.cq_msgbuf, chunk);
            ps->cp_outbox->pb_header.bh_size = poolsize;
            ps->cp_outbox->pb_header.bh_bufoff = req->rq_rpi.c2c.cq_bufoff;
            req->rq_rpi.c2c.cq_msgbuf += chunk;
            req->rq_rpi.c2c.cq_nmsgout -= chunk;
        }

        if (_shm_writeunlock(ps)) {
            return(LAMERROR);
        }

        if (blocking) {
            req->rq_rpi.c2c.cq_nenvout = 0;
        }
    }
/*
 * Remaining packets carry body data only.  The postbox header is told
 * where the pool buffer is each time.  In non-blocking mode exit to try
 * again another day as soon as we fail to obtain the lock.
 */
    while (req->rq_rpi.c2c.cq_nmsgout > 0) {
        chunk = LAM_min(req->rq_rpi.c2c.cq_nmsgout, poolsize);

        if (blocking) {
            if (_shm_writelock(ps)) {
                return(LAMERROR);
            }
        }
        else {
            status = _shm_writetrylock(ps);
            if (status < 0) {
                return(LAMERROR);
            }
            if (status == 1) {
                return(0);
            }
        }

        memcpy(pool, req->rq_rpi.c2c.cq_msgbuf, chunk);
        ps->cp_outbox->pb_header.bh_size = poolsize;
        ps->cp_outbox->pb_header.bh_bufoff = req->rq_rpi.c2c.cq_bufoff;

        if (_shm_writeunlock(ps)) {
            return(LAMERROR);
        }

        req->rq_rpi.c2c.cq_nmsgout -= chunk;
        req->rq_rpi.c2c.cq_msgbuf += chunk;
    }

    return(1);
}

/*
 *	_shm_push_env
 *
 *	Function:	- push request envelope down the pike
 *			- in blocking mode wait for the outgoing postbox,
 *			  otherwise make a single attempt
 *	Accepts:	- process
 *			- request
 *	Returns:	- 1 when the envelope has been pushed
 *			- 0 when a non-blocking attempt could not get the lock
 *			- LAMERROR on a locking error
 */
int
_shm_push_env(struct c2c_proc *ps, MPI_Request req)
{
	double		starttime;
	int		status;

	if (!_c2c_flblock) {
/*
 * Non-blocking case.  One attempt only.
 */
		status = _shm_writetrylock(ps);
		if (status < 0) {
			return(LAMERROR);
		}
		if (status == 1) {
			return(0);
		}
	}
	else {
/*
 * Blocking case.  The time spent waiting is added to the trace block time.
 */
		LAM_TRACE(starttime = ttime());
		if (_shm_writelock(ps)) {
			return(LAMERROR);
		}
		LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));
	}
/*
 * The postbox is ours.  Drop the envelope in and hand it over.
 */
	ps->cp_outbox->pb_header.bh_env = req->rq_rpi.c2c.cq_env;

	if (_shm_writeunlock(ps)) {
		return(LAMERROR);
	}
	return(1);
}


/*
 * The routines below implement the progress engine short circuit for
 * blocking send/receive requests when there are NO outstanding
 * requests.  Yes they are long, verbose and duplicate code found in
 * other functions but they are written this way for speed.
 */

/*
 *	_shm_fastsend
 *
 *	Function:	- fast message send
 *			- does not create requests and does not use
 *			  the normal advance mechanism
 *			- messages of up to LAM_SHMSHORTMSGLEN bytes go
 *			  out in a single postbox packet; longer ones send
 *			  a first packet, wait for the receiver's ack and
 *			  then send what the receiver asked for
 *	Accepts:	- packed buffer
 *			- size of packed data
 *			- destination process
 *			- destination rank
 *			- message tag
 *			- message communicator
 *	Returns:	- MPI_SUCCESS, or LAMERROR on a locking error or
 *			  if waiting for the ack fails
 */
int
_shm_fastsend(char *packbuf, int packsize, struct c2c_proc *ps,
	      int dest, int tag, MPI_Comm comm)
{
    char		*dst;			/* where body packets go */
    int			chunk;
    int			pooloff;		/* offset into shared pool */
    unsigned int	cap;			/* capacity of dst */
    double		starttime;

/*
 * Wait until outgoing postbox is free.
 */
    LAM_TRACE(starttime = ttime());
    if (_shm_writelock(ps)) {
        return(LAMERROR);
    }
    LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));
/*
 * Fill in the envelope.  The flags depend on the protocol.
 */
    ps->cp_outbox->pb_header.bh_env.ce_len = packsize;
    ps->cp_outbox->pb_header.bh_env.ce_tag = tag;
    ps->cp_outbox->pb_header.bh_env.ce_rank = comm->c_group->g_myrank;
    ps->cp_outbox->pb_header.bh_env.ce_cid = comm->c_contextid;
    ps->cp_outbox->pb_header.bh_env.ce_seq = lam_seqnum++;

    if (packsize <= LAM_SHMSHORTMSGLEN) {
/*
 * Short protocol.  Everything fits in this one packet.
 */
        ps->cp_outbox->pb_header.bh_env.ce_flags = 0;
        if (packsize > 0) {
            memcpy(ps->cp_outbox + 1, packbuf, packsize);
        }

        if (_shm_writeunlock(ps)) {
            return(LAMERROR);
        }
        return(MPI_SUCCESS);
    }
/*
 * Long protocol.  Send first packet.
 */
    ps->cp_outbox->pb_header.bh_env.ce_flags = C2CLONG;
    memcpy(ps->cp_outbox + 1, packbuf, LAM_SHMSHORTMSGLEN);

    if (_shm_writeunlock(ps)) {
        return(LAMERROR);
    }

    packbuf += LAM_SHMSHORTMSGLEN;
/*
 * Wait for the acknowledgment.  It replaces packsize with the total
 * length the receiver will take, of which one short packet has gone.
 */
    if (waitforack(ps, dest, tag, comm, &packsize)) {
        return(LAMERROR);
    }

    packsize -= LAM_SHMSHORTMSGLEN;
    if (packsize <= 0) {
        return(MPI_SUCCESS);
    }
/*
 * Send the message body first packet.  If the rest is more than a short
 * packet try to get a shared pool buffer for it, otherwise (or if none
 * can be had) keep using the postbox.
 */
    pooloff = 0;
    cap = packsize;

    if (_shm_writelock(ps)) {
        return(LAMERROR);
    }

    if (packsize > LAM_SHMSHORTMSGLEN && lam_shmalloc(&cap, &pooloff)) {
        ps->cp_outbox->pb_header.bh_env.ce_flags = C2C2ND;
        ps->cp_outbox->pb_header.bh_bufoff = pooloff;
        dst = _shm_membase + pooloff;
    }
    else {
        ps->cp_outbox->pb_header.bh_env.ce_flags = C2CBOX | C2C2ND;
        dst = (char *) (ps->cp_outbox + 1);
        cap = LAM_SHMSHORTMSGLEN;
    }

    chunk = LAM_min(packsize, cap);
    memcpy(dst, packbuf, chunk);
    ps->cp_outbox->pb_header.bh_size = cap;
    ps->cp_outbox->pb_header.bh_env.ce_len = packsize;

    if (_shm_writeunlock(ps)) {
        return(LAMERROR);
    }

    packsize -= chunk;
    packbuf += chunk;
/*
 * Continue sending packets until the whole message has been sent.
 */
    while (packsize > 0) {
        chunk = LAM_min(packsize, cap);

        if (_shm_writelock(ps)) {
            return(LAMERROR);
        }

        memcpy(dst, packbuf, chunk);

        if (_shm_writeunlock(ps)) {
            return(LAMERROR);
        }

        packsize -= chunk;
        packbuf += chunk;
    }

    return(MPI_SUCCESS);
}

/*
 *	_shm_fastrecv
 *
 *	Function:	- fast message receive
 *			- does not create requests and does not use
 *			  the normal advance mechanism
 *			- a matching buffered message is used if there is
 *			  one, otherwise incoming envelopes are read (and
 *			  non-matching ones buffered) until a match arrives
 *			- long protocol messages and synchronous sends are
 *			  acknowledged to the sender
 *	Accepts:	- buffer to receive packed data
 *			- size of buffer (inout)
 *			- source process
 *			- source rank
 *			- message tag (inout)
 *			- message communicator
 *			- message sequence number (out)
 *	Returns:	- MPI_SUCCESS, the MPI_ERR_TRUNCATE error code made
 *			  by lam_mkerr() if the message is longer than the
 *			  buffer, or LAMERROR if locking or buffering fails
 */
int
_shm_fastrecv(char *packbuf, int *packsize, struct c2c_proc *ps, int src,
	      int *tag, MPI_Comm comm, int *seqnum)
{
    double		starttime;
    int			len;
    int			msglen;
    struct c2c_envl	env;
    struct c2c_envl	*inenv;			/* proc's incoming envelope */
    struct cbuf_msg	*bmsg;			/* buffered message */
    char		*buf;
    int			size;
    int			synch;			/* synchronous send? */
    int			err = MPI_SUCCESS;

    env.ce_flags = 0;
    env.ce_rank = src;
    env.ce_tag = *tag;
    env.ce_cid = comm->c_contextid;
/*
 * Check for match with buffered message.
 */
    if ((bmsg = _cbuf_find(&env))) {
        if (bmsg->cm_env.ce_len > *packsize) {
            err = lam_mkerr(MPI_ERR_TRUNCATE, 0);
        } else {
            *packsize = bmsg->cm_env.ce_len;
        }

        *tag = bmsg->cm_env.ce_tag;
        *seqnum = bmsg->cm_env.ce_seq;
        msglen = *packsize;
/*
 * Only the first packet's worth of data is ever held in the buffered entry.
 */
        len = LAM_min(msglen, LAM_SHMSHORTMSGLEN);
        if (len) {
            memcpy(packbuf, bmsg->cm_buf, len);
        }
/*
 * The entry is finished with.  Long messages and synchronous sends
 * continue in the code below that talks to the sender.
 */
        if (bmsg->cm_env.ce_flags & C2CLONG) {
            _cbuf_delete(bmsg);
            msglen -= len;
            packbuf += len;
            goto longmsg;
        } else if (bmsg->cm_env.ce_flags & C2CSSEND) {
            _cbuf_delete(bmsg);
            goto synchmsg;
        } else {
            _cbuf_delete(bmsg);
        }

        return(err);
    }

    inenv = &ps->cp_inbox->pb_header.bh_env;
/*
 * Read envelopes until the one we want turns up.  Others are buffered,
 * which also releases the postbox for the next one.  A buffering
 * failure must stop us: carrying on would mean reading a postbox that
 * may not have been released.
 */
    while (1) {
        LAM_TRACE(starttime = ttime());
        if (_shm_readlock(ps)) return(LAMERROR);
        LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));

        if (!_c2c_envl_cmp(inenv, &env)) {
            break;
        }
        if (_shm_buffer(ps)) {
            return(LAMERROR);
        }
    }
/*
 * Check for message length mismatch.
 */
    if (inenv->ce_len > *packsize) {
        err = lam_mkerr(MPI_ERR_TRUNCATE, 0);
    } else {
        *packsize = inenv->ce_len;
    }

    *tag = inenv->ce_tag;
    *seqnum = inenv->ce_seq;
    msglen = *packsize;

    if (inenv->ce_flags & C2CLONG) {
/*
 * Got a long protocol envelope. Copy data part into the receive buffer
 * and reply with an ack.
 */
        len = LAM_min(msglen, LAM_SHMSHORTMSGLEN);
        memcpy(packbuf, ps->cp_inbox + 1, len);
/*
 * Done with data transfer, unlock the postbox.
 */
        if (_shm_readunlock(ps)) return(LAMERROR);

        msglen -= len;
        packbuf += len;
/*
 * Send an acknowledgment.  Its length field tells the sender how much
 * we will take in total.  A buffered long message joins here.
 */
longmsg:
        LAM_TRACE(starttime = ttime());
        if (_shm_writelock(ps)) return(LAMERROR);
        LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));

        ps->cp_outbox->pb_header.bh_env.ce_tag = *tag;
        ps->cp_outbox->pb_header.bh_env.ce_rank = comm->c_group->g_myrank;
        ps->cp_outbox->pb_header.bh_env.ce_cid = comm->c_contextid;
        ps->cp_outbox->pb_header.bh_env.ce_flags = C2CACK | C2CLONG;
        ps->cp_outbox->pb_header.bh_env.ce_len = *packsize;

        if (_shm_writeunlock(ps)) return(LAMERROR);

        if (msglen) {
/*
 * Receive the first packet of the message body.  As above, anything
 * else that arrives first is buffered and a buffering failure is fatal.
 */
            env.ce_tag = *tag;
            env.ce_flags = C2C2ND;
            inenv = &ps->cp_inbox->pb_header.bh_env;

            while (1) {
                LAM_TRACE(starttime = ttime());
                if (_shm_readlock(ps)) return(LAMERROR);
                LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));

                if (!_c2c_envl_cmp(inenv, &env)) {
                    break;
                }
                if (_shm_buffer(ps)) {
                    return(LAMERROR);
                }
            }
/*
 * Determine the transfer method: the postbox itself or a shared pool
 * buffer whose offset and size are in the postbox header.
 */
            if (inenv->ce_flags & C2CBOX) {
                buf = (char *) (ps->cp_inbox + 1);
                size = LAM_SHMSHORTMSGLEN;
            } else {
                buf = _shm_membase + ps->cp_inbox->pb_header.bh_bufoff;
                size = ps->cp_inbox->pb_header.bh_size;
            }

            len = LAM_min(msglen, size);
            memcpy(packbuf, buf, len);

            if (_shm_readunlock(ps)) return(LAMERROR);

            msglen -= len;
            packbuf += len;
/*
 * Receive the rest of the message body.
 */
            while (msglen > 0) {
                len = LAM_min(msglen, size);

                LAM_TRACE(starttime = ttime());
                if (_shm_readlock(ps)) return(LAMERROR);
                LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));

                memcpy(packbuf, buf, len);

                if (_shm_readunlock(ps)) return(LAMERROR);

                msglen -= len;
                packbuf += len;
            }
/*
 * A shared pool buffer is freed here once the body has been read.
 */
            if (buf != (char *) (ps->cp_inbox + 1)) {
                lam_shfree(buf);
            }
        }
    }
    else {
/*
 * Got a short protocol envelope.  Check for a synchronous send and copy
 * the message from the postbox.
 */
        synch = inenv->ce_flags & C2CSSEND;

        if (*packsize > 0) {
            memcpy(packbuf, ps->cp_inbox + 1, *packsize);
        }

        if (_shm_readunlock(ps)) return(LAMERROR);

        if (synch) {
/*
 * Send a synchronous send acknowledgment.  A buffered synchronous
 * message joins here.
 */
synchmsg:
            LAM_TRACE(starttime = ttime());
            if (_shm_writelock(ps)) return(LAMERROR);
            LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));

            ps->cp_outbox->pb_header.bh_env.ce_tag = *tag;
            ps->cp_outbox->pb_header.bh_env.ce_rank = comm->c_group->g_myrank;
            ps->cp_outbox->pb_header.bh_env.ce_cid = comm->c_contextid;
            ps->cp_outbox->pb_header.bh_env.ce_flags = C2CACK;

            if (_shm_writeunlock(ps)) return(LAMERROR);
        }
    }

    return(err);
}


/*
 *	waitforack
 *
 *	Function:	- wait for a long message acknowledgment
 *			- envelopes that arrive first and are not the ack
 *			  are buffered
 *	Accepts:	- process sending the ack
 *			- rank of process sending the ack
 *			- ack tag
 *			- ack communicator
 *			- length ack'er will receive (out)
 * 	Returns:	- 0 or LAMERROR (locking or buffering failed)
 */
static int
waitforack(struct c2c_proc *ps, int from, int tag, MPI_Comm comm, int *len)
{
    struct c2c_envl	env;
    double		starttime;

    env.ce_flags = C2CACK;
    env.ce_rank = from;
    env.ce_tag = tag;
    env.ce_cid = comm->c_contextid;

    while (1) {
        LAM_TRACE(starttime = ttime());
        if (_shm_readlock(ps)) return(LAMERROR);
        LAM_TRACE(_kio.ki_blktime += (ttime() - starttime));

        if (!_c2c_envl_cmp(&ps->cp_inbox->pb_header.bh_env, &env)) {
/*
 * This is the ack.  Take the length out of it before letting go of
 * the postbox.
 */
            *len = ps->cp_inbox->pb_header.bh_env.ce_len;
            if (_shm_readunlock(ps)) return(LAMERROR);
            break;
        }
/*
 * Something else.  Put it aside, which also releases the postbox.  If
 * that fails we must not go round again: the postbox may not have been
 * released.
 */
        if (_shm_buffer(ps)) {
            return(LAMERROR);
        }
    }

    return(0);
}
