/*
 * Copyright 1998-2001, University of Notre Dame.
 * Authors: Jeffrey M. Squyres and Arun Rodrigues with Brian Barrett,
 *          Kinis L. Meyer, M. D. McNally, and Andrew Lumsdaine
 * 
 * This file is part of the Notre Dame LAM implementation of MPI.
 * 
 * You should have received a copy of the License Agreement for the Notre
 * Dame LAM implementation of MPI along with the software; see the file
 * LICENSE.  If not, contact Office of Research, University of Notre
 * Dame, Notre Dame, IN 46556.
 * 
 * Permission to modify the code and to distribute modified code is
 * granted, provided the text of this NOTICE is retained, a notice that
 * the code was modified is included with the above COPYRIGHT NOTICE and
 * with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE
 * file is distributed with the modified code.
 * 
 * LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.
 * By way of example, but not limitation, Licensor MAKES NO
 * REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS
 * OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS
 * OR OTHER RIGHTS.
 * 
 * Additional copyrights may follow.
 * 
 *	Ohio Trollius
 *	Copyright 1997 The Ohio State University
 *	NJN
 *
 *	$Id: shm_common.c,v 1.3 2001/02/15 20:48:59 jsquyres Exp $
 *
 *	Function:	- common shared memory transport low-level routines
 */

#include <lam_config.h>

#define SKIP_SHMEM_GLOBAL_INIT 1
#include <rpi_shm.h>

#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/uio.h>

#ifdef PTHREAD_FREELOCK
#include <pthread.h>
#endif

#include <blktype.h>
#include <mpi.h>
#include <mpisys.h>
#include <rpisys.h>
#include <terror.h>
#include <typical.h>
#include <t_types.h>

/*
 * external functions
 */
extern void		lam_register_objects();

/*
 * private defines
 *
 * size taken up by postboxes and global lock 
 */
#define BOXTOTSIZE	(2*CACHELINESIZE + nshm*nshm*SHMBOXSIZE)

/*
 * external variables
 */
extern int		_shm_poolsize;		/* size of long message pool */

/*
 * global semaphores
 */
static struct sembuf	shm_lock = { 2, -1, 0 };
static struct sembuf	shm_unlock = { 2, 1, 0 };
static int		semaphores;

#ifdef PTHREAD_FREELOCK
static pthread_mutex_t	*freelock;
#endif

#if LAM_HAVE_UNION_SEMUN
static union semun	semctl_arg;
#else
static union {
	int		val;
	struct semid_ds	*buf;
	unsigned short	*array;
} semctl_arg;
#endif

/*
 * free list data structures
 */
union header {
    struct {
	unsigned int    ptr;		/* next block offset if on free list */
	unsigned int	size;		/* size of this block */
    } s;
    char align[ALIGNMENT];		/* alignment */
};

typedef union header 	shm_header_t;

/*
 * private functions
 */
static int		shm_req_send_body_box(struct c2c_proc *, MPI_Request);
static int		shm_req_send_body_pool(struct c2c_proc *, MPI_Request);
static int		shm_req_rcvd_2nd(struct c2c_proc *, MPI_Request);
static int		shm_req_done(struct c2c_proc *, MPI_Request);

static void		shminit(void *, unsigned int, int);
static void		shmlock(void);
static void		shmunlock(void);

/*
 * public variables
 */
char			*_shm_membase = 0;	/* base of shared memory */
struct c2c_proc		**_shm_read = 0;	/* reading shm processes */
struct c2c_proc		**_shm_write = 0;	/* writing shm processes */
int			_shm_nread;		/* # reading shm processes */
int			_shm_nwrite;		/* # writing shm processes */
int			_shm_nprocs = 0;	/* number of shm processes */
int			_shm_narray = 0;	/* shm read/write array sizes */
int			_shm_poll_yield = 1;	/* yield in poll loop */
int			_shm_poolsize = LAM_MPI_SHMPOOLSIZE; /* size
of long message pool */
int			_shm_maxalloc = LAM_MPI_SHMMAXALLOC; /* max
allocation of shmem */

/*
 * external variables
 */
extern int		_c2c_flblock;		/* blocking flag */
extern int		_c2c_haveadv;		/* have advanced */
extern int		_tcp_nio;		/* # processes doing tcp io */
extern MPI_Request	_tcp_lastreq;		/* last tcp request */

/*
 * private variables
 */
static unsigned int	*freelist;
static shm_header_t	*poolbase;

/*
 *	_shm_advance
 *
 *	Function:	- advance tcp and shm requests where possible
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_advance(void)
{
    MPI_Request		req;
    double		blkstart = 0.0;
    double		loopstart = 0.0;
    int			blksave;		/* save blocking state */
    int			i;
/*
 * Save and set blocking state.
 */
    blksave = _c2c_flblock;
    _c2c_flblock &=
	    ((_shm_nread + _shm_nwrite == 0)
		    || (_shm_nread + _shm_nwrite + _tcp_nio == 1));

    if ((_kio.ki_rtf & RTF_TRON) == RTF_TRON && blksave && !_c2c_flblock) {
	loopstart = blkstart = ttime();
    }
/*
 * Advance reading shared memory processes.
 */
    do {
	for (i = 0; i < _shm_nread; i++) {
	    _shm_read[i]->cp_read = 0;
	    if (_shm_read[i]->cp_readfn(_shm_read[i])) {
		return(LAMERROR);
	    }
	}
/*
 * Advance writing shared memory processes.
 */
	for (i = 0; i < _shm_nwrite; i++) {
	    _shm_write[i]->cp_write = 0;
	    req = _shm_write[i]->cp_wreq;
	    if (req->rq_rpi.c2c.cq_adv(_shm_write[i], req)) {
		return(LAMERROR);
	    }
	}
/*
 * Advance the TCP requests (if any).  In the case of a single TCP
 * request make sure that the request has not been completed and is
 * still A TCP request.  This can not be true when a receive from
 * MPI_ANY_SOURCE is completed by a send from a shared memory source.
 */
	if (_tcp_nio == 1 && _tcp_lastreq->rq_state != LAM_RQSDONE
		&& (_tcp_lastreq->rq_proc == 0 
		|| _tcp_lastreq->rq_proc->p_rpi.c2c.cp_sock >= 0)) {
	    if (_tcp_adv1()) return(LAMERROR);
	}
	else if (_tcp_nio > 1) {
	    if (_tcp_advmultiple()) return(LAMERROR);
	}

	if (blksave && !_c2c_haveadv) {
	    if (_shm_poll_yield) {
		lam_yield();
	    }
	    LAM_TRACE(loopstart = ttime());
	}

    } while (blksave && !_c2c_haveadv);

    if ((_kio.ki_rtf & RTF_TRON) == RTF_TRON && blksave && !_c2c_flblock) {
	_kio.ki_blktime += (loopstart - blkstart);
    }

    _c2c_flblock = blksave;
    return(_c2c_haveadv);
}

/*
 *	_shmtcp_req_probe
 *
 *	Function:	- mark probe request as done
 *			- the matched incoming envelope/message is buffered
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shmtcp_req_probe(struct c2c_proc *ps, MPI_Request req)
{
	envp_t		env;			/* matched envelope */

	if (ps->cp_sock >= 0) {
/*
 * TCP protocol takes care of it.
 */
		return(_tcp_req_probe(ps, req));
	}

	_c2c_haveadv = 1;
	req->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;

	env = (envp_t) ps->cp_inbox;
	_c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag, env->ce_len);

	return(_shm_buffer(ps));
}

/*
 *	_shm_buffered_adv
 *
 *	Function:	- determine protocol for receive request matching
 *			  a buffered envelope/message and act upon it
 *	Accepts:	- request
 *			- buffered envelope/message
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_buffered_adv(MPI_Request req, struct cbuf_msg *msg)
{
    envp_t		env;			/* matching incoming env. */

    env = &msg->cm_env;

    if (req->rq_type == LAM_RQIPROBE) {
/*
 * The request is a probe.  Set the status and leave the envelope buffered.
 */
	_c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag, env->ce_len);
	req->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;

	return(0);
    }

    if (env->ce_len > req->rq_packsize) {
	req->rq_flags |= LAM_RQFTRUNC;
	env->ce_len = req->rq_packsize;
    }
    _c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag, env->ce_len);

    if (env->ce_flags & C2CLONG) {
/*
 * Matched a long protocol envelope.  Copy the partially buffered body and
 * reply with an ack.
 */
	req->rq_state = LAM_RQSACTIVE;
	req->rq_rpi.c2c.cq_state = C2CWRITE;
	req->rq_rpi.c2c.cq_env.ce_flags |= (C2CACK | C2CLONG);

	if (env->ce_len > LAM_SHMSHORTMSGLEN) {

	    memcpy(req->rq_packbuf, msg->cm_buf, LAM_SHMSHORTMSGLEN);
	    req->rq_rpi.c2c.cq_env.ce_len = env->ce_len;
	    req->rq_rpi.c2c.cq_adv = _shm_req_send_ack_long;
	}
	else {
	    if (env->ce_len > 0) {
		memcpy(req->rq_packbuf, msg->cm_buf, env->ce_len);
	    }

	    req->rq_rpi.c2c.cq_adv = _shm_req_send_ack_only;
	    req->rq_rpi.c2c.cq_env.ce_len = 0;
	}

	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_comm->c_group->g_myrank;
	req->rq_rpi.c2c.cq_nenvout = ENVSIZE;
    }
    else {
/*
 * Matched a short protocol envelope.  Copy the message (if any) from
 * the buffer and advance the request.  
 */
	if (env->ce_len) {
	    memcpy(req->rq_packbuf, msg->cm_buf, env->ce_len);
	}
	if (env->ce_flags & C2CSSEND) {
	    req->rq_state = LAM_RQSACTIVE;
	    if (_shm_req_rcvd_body_synch(0, req)) {
		return(LAMERROR);
	    }
	} else {
	    req->rq_state = LAM_RQSDONE;
	    lam_rq_nactv--;
	}
    }
/*
 * Discard the buffered message.
 */
    _cbuf_delete(msg);

    return(0);
}

/*
 *	shm_match_adv
 *
 *	Function:	- match env read from process with a read request
 *			  and advance the matched request
 *			- if no match is found then the env/msg is buffered
 *	Accepts:	- envelope's source process
 */
int
_shm_match_adv(struct c2c_proc *ps)
{
	MPI_Request	req;			/* request */
	envp_t		env;			/* envelope */
/*
 * There cannot be any matching recvs after a matching probe because
 * probes are blocking.	 Thus we may return upon the first match
 * (buffering the envelope in the case of a probe) and maintain the
 * invariant "no requests in the list match buffered envelopes".  This
 * means once a request is in the list after being checked against
 * buffered envelopes it need never again be checked against any
 * buffered envelopes.
 */
	env = (envp_t) ps->cp_inbox;

	for (req = ps->cp_mreq; req; req = req->rq_next) {

		if ((req->rq_state != LAM_RQSDONE)
			&& (req->rq_rpi.c2c.cq_state == C2CREAD)
			&& (!_c2c_envl_cmp(env, &req->rq_rpi.c2c.cq_env))) {

			return(req->rq_rpi.c2c.cq_adv(ps, req));
		}
	}

	return(_shm_buffer(ps));
}

/*
 *	_shm_req_send_ack_long
 *
 *	Function:	- send ack and prepare to receive message body
 *			- long protocol
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_req_send_ack_long(struct c2c_proc *ps, MPI_Request req)
{
	int		done;

	if ((done = _shm_push_env(ps, req)) <= 0) {
		return(done);
	}
/*
 * The acknowledgment has been sent.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
/*
 * Prepare to receive the message body.
 */
	req->rq_rpi.c2c.cq_state = C2CREAD;
	req->rq_rpi.c2c.cq_env.ce_flags &= ~C2CACK;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2C2ND;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_rpi.c2c.cq_peer;
	req->rq_rpi.c2c.cq_adv = shm_req_rcvd_2nd;

	return(0);
}

/*
 *	_shm_req_send_ack_only
 *
 *	Function:	- send an acknowledgment
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_req_send_ack_only(struct c2c_proc *ps, MPI_Request req)
{
	int		done;

	if ((done = _shm_push_env(ps, req)) <= 0) {
		return(done);
	}
/*
 * The acknowledgment has been sent.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;

	return(0);
}

/*
 *	_shm_req_rcvd_body_synch
 *
 *	Function:	- synchronous protocol transition from reading
 *			  message body to sending ack
 *	Accepts:	- source process (ignored)
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_req_rcvd_body_synch(struct c2c_proc *ps, MPI_Request req)
{
	_c2c_haveadv = 1;
	req->rq_rpi.c2c.cq_state = C2CWRITE;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2CACK;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_comm->c_group->g_myrank;
	req->rq_rpi.c2c.cq_adv = _shm_req_send_ack_only;

	return(0);
}

/*
 *	_shm_req_send_body_first
 *
 *	Function:	- first send of long message body
 *			- we decide here whether to use the postbox or
 *			  shared pool
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_req_send_body_first(struct c2c_proc *ps, MPI_Request req)
{
	int		done;
	unsigned int	nbytes;
	int		bufoffset;
/*
 * We use the postbox if the number of message bytes left to send is
 * smaller than or equal to the postbox size.  We may also have to use
 * it if we cannot get a piece of the shared pool.  
 */
	nbytes = req->rq_rpi.c2c.cq_env.ce_len;
	if (nbytes > LAM_SHMSHORTMSGLEN && lam_shmalloc(&nbytes, &bufoffset)) {
		req->rq_rpi.c2c.cq_bufoff = bufoffset;
		req->rq_rpi.c2c.cq_bufsize = nbytes;
		req->rq_rpi.c2c.cq_adv = shm_req_send_body_pool;

		if ((done = _shm_push_body_pool(ps, req)) <= 0) {
			return(done);
		}
	} else {
		req->rq_rpi.c2c.cq_env.ce_flags |= C2CBOX;
		req->rq_rpi.c2c.cq_adv = shm_req_send_body_box;

		if ((done = _shm_push_body_box(ps, req)) <= 0) {
			return(done);
		}
	}
/*
 * All of message has been written.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;

	return(0);
}


/*
 *	shm_req_send_body_box
 *
 *	Function:	- send message body via the postbox
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
static int
shm_req_send_body_box(struct c2c_proc *ps, MPI_Request req)
{
	int		done;

	if ((done = _shm_push_body_box(ps, req)) <= 0) {
		return(done);
	}
/*
 * All of the message has been written.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;
	return(0);
}

/*
 *	shm_req_send_body_pool
 *
 *	Function:	- send message body via the shared pool
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
static int
shm_req_send_body_pool(struct c2c_proc *ps, MPI_Request req)
{
	int		done;

	if ((done = _shm_push_body_pool(ps, req)) <= 0) {
		return(done);
	}
/*
 * All of the message has been written.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;

	return(0);
}

/*
 *	_shm_req_send_long
 *
 *	Function:	- send the first envelope of a long message and
 *			  prepare to receive the ack
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_req_send_long(struct c2c_proc *ps, MPI_Request req)
{
	int		done;

	if ((done = _shm_push_body_box(ps, req)) <= 0) {
		return(done);
	}
/*
 * Prepare to read long protocol ack.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_rpi.c2c.cq_state = C2CREAD;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2CACK;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_rpi.c2c.cq_peer;
	req->rq_rpi.c2c.cq_adv = _shm_req_rcvd_long_ack;

	return(0);
}

/*
 *	_shm_req_send_short
 *
 *	Function:	- send a short protocol message
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_req_send_short(struct c2c_proc *ps, MPI_Request req)
{
	int		done;

	if ((done = _shm_push_body_box(ps, req)) <= 0) {
		return(done);
	}

	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;

	return(0);
}


/*
 *	_shm_req_send_synch
 *
 *	Function:	- send short synchronous protocol message body
 *			  and prepare to read the ack
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_req_send_synch(struct c2c_proc *ps, MPI_Request req)
{
	int		done;

	if ((done = _shm_push_body_box(ps, req)) <= 0) {
		return(done);
	}

	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_rpi.c2c.cq_state = C2CREAD;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2CACK;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_rpi.c2c.cq_peer;
	req->rq_rpi.c2c.cq_adv = _shm_req_done_synch;

	return(0);
}

/*
 *	shm_req_rcvd_2nd
 *
 *	Function:	- read the body of a long protocol message
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
static int
shm_req_rcvd_2nd(struct c2c_proc *ps, MPI_Request req)
{
	ps->cp_rreq = req;
	ps->cp_msgbuf = req->rq_packbuf + LAM_SHMSHORTMSGLEN;
	ps->cp_nmsgin = ps->cp_inbox->pb_header.bh_env.ce_len;
	req->rq_rpi.c2c.cq_adv = shm_req_done;
/*
 * We may be being sent to via the postbox or the global pool.
 */
	if (ps->cp_inbox->pb_header.bh_env.ce_flags & C2CBOX) {
		ps->cp_readfn = _shm_proc_read_body_box;
		return(_shm_proc_read_body_box(ps));
	} else {
		ps->cp_readfn = _shm_proc_read_body_pool;
		ps->cp_insize = ps->cp_inbox->pb_header.bh_size;
		return(_shm_proc_read_body_pool(ps));
	}
}

/*
 *	shm_req_done
 *
 *	Function:	- mark request as done
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
static int
shm_req_done(struct c2c_proc *ps, MPI_Request req)
{
	_c2c_haveadv = 1;
	req->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;

	return(0);
}

/*
 *	shminit
 *
 *	Function:	- initialize shared pool
 *	Accepts:	- base of pool
 *			- size of pool (bytes)
 *			- first to initialize?
 */
static void
shminit(void *memory, unsigned int nbytes, int first)
{
	int		nunits;
	
	nunits = nbytes >> LOG_ALIGN;

	freelist = memory;
	poolbase = memory;

	*freelist = 1;

	if (first) {
	    (poolbase+1)->s.ptr = 1;
	    (poolbase+1)->s.size = nunits - 1;
	}
}


/*
 *	lam_shmalloc
 *
 *	Function:	- allocate memory from shared area
 *	Accepts:	- number of bytes required/obtained (inout)
 *			- offset of allocated memory (out)
 *	Returns:	- 1 => allocated some or all required
 *			  0 => nothing allocated
 */
int
lam_shmalloc(unsigned int *nbytes, int *offset)
{
    shm_header_t	*p;
    shm_header_t	*prev;
    unsigned int	nunits;

    shmlock();

    if (*freelist == 0) {
	shmunlock();
	return(0);
    }
/*
 * Round allocation request down to maximum allowed.
 */
    nunits = (*nbytes + sizeof(shm_header_t) - 1) >> LOG_ALIGN;

    if (nunits > _shm_maxalloc) {
        nunits = _shm_maxalloc;
        *nbytes = _shm_maxalloc << LOG_ALIGN;
    }
/*
 * One extra unit for the header.
 */
    nunits++;
/*
 * Look for large enough free block.
 */
    prev = poolbase + *freelist;
    for (p = poolbase + prev->s.ptr; ; prev = p, p = poolbase + p->s.ptr) {
	if (p->s.size >= nunits) {
	    if (p->s.size == nunits) {
		if (p == poolbase + p->s.ptr) {
		    prev = poolbase;
		} else {
		    prev->s.ptr = p->s.ptr;
		}
	    } else {
		p->s.size -= nunits;
		p += p->s.size;
		p->s.size = nunits;
	    }

	    *freelist = prev - poolbase;
	    *offset = (char *) (p + 1) - _shm_membase;
	    break;
	}
/*
 * Wrapped around the free list.  No fit found.
 */
	if (p == poolbase + *freelist) {
	    shmunlock();
	    return(0);
	}
    }

    shmunlock();
    return(1);
}


/*
 *	lam_shfree
 *
 *	Function:	- free up a block of the shared area
 *	Accepts:	- address of data part of block
 */
void
lam_shfree(void *block)
{
    shm_header_t	*head;			/* block's header */
    shm_header_t	*p;

    if (block == 0) return;

    shmlock();

    head = (shm_header_t *) block - 1;

    if (*freelist) {
/*
 * There are already free region(s) in the shared memory region.
 */
	for (p = poolbase + *freelist;
	         !(head > p && head < poolbase + p->s.ptr);
	         p = poolbase + p->s.ptr) {
	    if (p >= poolbase + p->s.ptr
		    && (head > p || head < poolbase + p->s.ptr))
/*
 * Freed block at start of end of arena.
 */
		break;
	}
/*
 * Integrate block in list.
 */
	*freelist = p - poolbase;

	if (head + head->s.size == poolbase + p->s.ptr) {
/*
 * Join to upper neighbour.
 */
	    if (p->s.ptr == *freelist) {
		*freelist = head - poolbase;
	    }

	    if (poolbase + p->s.ptr == p) {
		head->s.ptr = head - poolbase;
	    } else {
		head->s.ptr = (poolbase + p->s.ptr)->s.ptr;
	    }

	    head->s.size += (poolbase + p->s.ptr)->s.size;
	}
	else {
	    head->s.ptr = p->s.ptr;
	}

	if (p + p->s.size == head) {
/*
 * Join to lower neighbour.
 */
	    p->s.size += head->s.size;
	    p->s.ptr = head->s.ptr;
	}
	else
	    p->s.ptr = head - poolbase;
    }
    else {
/*
 * There wasn't a free shared memory region before.
 */
	head->s.ptr = head - poolbase;
	*freelist = head - poolbase;
    }

    shmunlock();
}

/*
 *	_shm_create_area
 *
 *	Function:	- create shared memory area
 *	Accepts:	- number of processes sharing area
 *			- process (in-out)
 *			- LAM msg to fill with shared info (out)
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_create_area(int nshm, struct c2c_proc *ps, struct nmsg *msg)
{
	char		objs[2][32];
	int		shmid;
	int		semid;
	int		shmsize;
#ifdef PTHREAD_FREELOCK
	pthread_mutexattr_t mattr;
#endif
/*
 * Create the shared memory segment.
 */
	shmsize = BOXTOTSIZE + _shm_poolsize;

	shmid = shmget(IPC_PRIVATE, shmsize, 0600 | IPC_CREAT);
	if (shmid < 0) {
		errno = ESHMCREATE;
		return(LAMERROR);
	}

	_shm_membase = (char *) shmat(shmid, (char *) 0, 0);
	if (_shm_membase == (char *) -1) {
		shmctl(shmid, IPC_RMID, (struct shmid_ds *) 0);
		return(LAMERROR);
	}

	ps->cp_shm = shmid;

	memset(_shm_membase, 0, BOXTOTSIZE);
#ifdef PTHREAD_FREELOCK
	assert(2*CACHELINESIZE >= sizeof(pthread_mutex_t));
	freelock = (pthread_mutex_t *) _shm_membase;
	pthread_mutexattr_init(&mattr);
	pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
	pthread_mutex_init(freelock, &mattr);
#endif
	shminit(_shm_membase + BOXTOTSIZE, _shm_poolsize, TRUE);
/*
 * Create the semaphores.
 */
	if ((semid = semget(IPC_PRIVATE, 3, 0600 | IPC_CREAT)) < 0) {
		shmctl(shmid, IPC_RMID, (struct shmid_ds *) 0);
		errno = ESEMCREATE;
		return(LAMERROR);
	}
/*
 * Register id's for cleanup.
 */
	sprintf(objs[0], "%d", shmid);
	sprintf(objs[1], "%d", semid);
	lam_register_objects(2, 'm', objs[0], 's', objs[1]);
/*
 * Initilize semaphore.
 */
	ps->cp_sem = semid;
	semaphores = semid;

	semctl_arg.val = 1;
	if (semctl(semid, 0, SETVAL, semctl_arg) < 0) return(LAMERROR);
	if (semctl(semid, 2, SETVAL, semctl_arg) < 0) return(LAMERROR);
	semctl_arg.val = nshm;
	if (semctl(semid, 1, SETVAL, semctl_arg) < 0) return(LAMERROR);
/*
 * Set information to pass to other processes.
 */
	msg->nh_data[1] = (int4) shmid;
	msg->nh_data[2] = (int4) semid;

	return(0);
}


/*
 *	_shm_attach_area
 *
 *	Function:	- attach to shared memory area
 *	Accepts:	- number of processes sharing area
 *			- process (in-out)
 *			- LAM msg containing shared info
 *	Returns:	- 0 or LAMERROR
 */
int
_shm_attach_area(int nshm, struct c2c_proc *ps, struct nmsg *msg)
{
	ps->cp_shm = (int) msg->nh_data[1];
	ps->cp_sem = (int) msg->nh_data[2];
	semaphores = ps->cp_sem;
/*
 * Attach the shared memory segment and mark it for removal.  It will be
 * automatically removed once all processes exit.
 */
	_shm_membase = (char *) shmat(ps->cp_shm, (char *) 0, 0);
	if (_shm_membase == (char *) -1) {
	    shmctl(ps->cp_shm, IPC_RMID, (struct shmid_ds *) 0);
	    semctl_arg.val = 0;
	    semctl(ps->cp_sem, 0, IPC_RMID, semctl_arg);
	    return(LAMERROR);
	}

#ifdef PTHREAD_FREELOCK
	freelock = (pthread_mutex_t *) _shm_membase;
#endif
	shminit(_shm_membase + BOXTOTSIZE, _shm_poolsize, FALSE);

	return(0);
}


#ifdef PTHREAD_FREELOCK

/*
 *	shmlock
 *
 *	Function:	- lock the shared memory area
 */
static void
shmlock(void)

{
    pthread_mutex_lock(freelock);
}


/*
 *	shmunlock
 *
 *	Function:	- unlock the shared memory area
 */
static void
shmunlock(void)

{
    pthread_mutex_unlock(freelock);
}

#else  /* use SYSV semaphore */

/*
 *	shmlock
 *
 *	Function:	- lock the shared memory area
 */
static void
shmlock(void)

{
	do {
		if (semop(semaphores, &shm_lock, 1) == 0) {
			return;
		} else if (errno != EINTR) {
			lam_err_comm(MPI_COMM_NULL, MPI_ERR_OTHER, errno,
				"locking shared memory area");
		}
	} while (1);
}


/*
 *	shmunlock
 *
 *	Function:	- unlock the shared memory area
 */
static void
shmunlock(void)

{
	semop(semaphores, &shm_unlock, 1);
}

#endif
