/*
 * Copyright (c) 2001-2002 The Trustees of Indiana University.  
 *                         All rights reserved.
 * Copyright (c) 1998-2001 University of Notre Dame. 
 *                         All rights reserved.
 * Copyright (c) 1994-1998 The Ohio State University.  
 *                         All rights reserved.
 * 
 * This file is part of the LAM/MPI software package.  For license
 * information, see the LICENSE file in the top level directory of the
 * LAM/MPI source distribution.
 * 
 *
 *	$Id: lamhalt.c,v 1.8.2.1 2002/10/09 19:49:58 brbarret Exp $
 *
 *	Function:	- kill an entire running LAM run time system
 */

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#include <args.h>
#include <dl_inet.h>
#include <portable.h>
#include <net.h>
#include <events.h>
#include <priority.h>
#include <hreq.h>
#include <laminternal.h>
#include <etc_misc.h>


/*
 * Local variables
 *
 * Link table of the running LAM, filled in once by ldogetlinks() in
 * main() and consulted by nodename() to map a node identifier to its
 * Internet address.
 */
static int4		nlinks;		/* number of entries in links[] */
static struct dolink	*links;		/* link table, indexed by node id */


/*
 *	nodename
 *
 *	Function:	- build a printable name for a LAM node from the
 *			  link table (links[] / nlinks)
 *			- yields "invalid node" when the identifier is out
 *			  of range or its link entry is NOTLINKID
 *	Accepts:	- node identifier (index into links[])
 *			- non-zero to prefer the resolved host name; zero
 *			  (or a failed lookup) yields the dotted-quad address
 *	Returns:	- pointer to a static buffer that is overwritten by
 *			  every call; always NUL-terminated
 */
static char *
nodename(int node, int want_name)
{
  static char name[1024];
  struct hostent *hent = NULL;

  if (node < 0 || node >= nlinks
      || links[node].dol_link == NOTLINKID) {
    snprintf(name, sizeof(name), "%s", "invalid node");
    return name;
  }

  /* Only pay for the reverse lookup when the caller wants a host name. */
  if (want_name)
    hent = gethostbyaddr((char *)&links[node].dol_addr.sin_addr,
                         sizeof(struct in_addr), AF_INET);

  /*
   * snprintf() always terminates the buffer, unlike strncpy(), which
   * leaves it unterminated when the source fills it completely.
   */
  if (hent != NULL && hent->h_name != NULL)
    snprintf(name, sizeof(name), "%s", hent->h_name);
  else
    snprintf(name, sizeof(name), "%s",
             inet_ntoa(links[node].dol_addr.sin_addr));

  return name;
}

/*
 *	main
 *
 *	Function:	- halt a running LAM: ping every remote daemon,
 *			  tell each one that ACKs to die, then tell the
 *			  local daemon to die
 *	Options:	- -h	show usage and exit
 *			- -H	do not print the version banner
 *			- -v	verbose
 *			- -d	debug output (implies -v)
 *	Returns:	- 0 once the final halt message has been sent
 *			- exits early via exit()/kexit()/lamfail() when
 *			  argument parsing, attaching to the local daemon,
 *			  or a send/receive fails
 */
int
main(int argc, char **argv)
{
  char *data;
  int4 node;
  int i, count = 0, n_index, n_flags, local;
  int fl_debug = 0;
  int fl_verbose = 0;
  int saved_errno;
  struct nmsg sendhead, recvhead;
  struct hreq *request = (struct hreq*) sendhead.nh_data;
  int my_argc = 0;
  char **my_argv = 0;

  validopts("Hhvd");
  if (do_args(&argc, argv)) {
    /* show_help() may overwrite errno; keep the parse error for exit. */
    saved_errno = errno;
    show_help("lamhalt", "usage", NULL);
    exit(saved_errno);
  }

  if (opt_taken('h')) {
    show_help("lamhalt", "usage", NULL);
    exit(0);
  }

/*
 * Get flags
 */
  fl_debug = opt_taken('d');
  fl_verbose = opt_taken('v') || fl_debug;

/*
 * Announce
 */
  if (!opt_taken('H'))
    lam_show_version(0);

/*
 * Attach to the local LAM daemon
 */
  if (kinit(PRCMD)) {
    show_help(NULL, "no-lamd", "lamhalt", NULL);
    exit(LAM_EEXIT);
  }
  if (ldogetlinks(&links, &nlinks))
    lamfail("lamhalt (ldogetlinks)");

  if (fl_verbose)
    printf("Shutting down LAM\n");

/*
 * Setup the halt ping messages.  The first message just pings the
 * remote daemon (to which it will respond with an ACK).  The second
 * message will have the lamd actually kill itself.  Need to break
 * this into two parts so that the dying LAM daemon will be guaranteed
 * to be able to send us the ACK.
 */
  LAM_ZERO_ME(sendhead);
  data = get_batchid();
  sendhead.nh_event = EVHALTD;
  sendhead.nh_type = LAM_HALT_PING;
  sendhead.nh_length = strlen(data) + 1;
  sendhead.nh_flags = 0;
  sendhead.nh_msg = data;

  /* Tell the remote daemons where (node, event) to send their ACK. */
  request->hq_node = getnodeid();
  request->hq_event = (-getpid()) & 0xBFFFFFFF;

  /*
   * Fake using "N" to specify all the nodes.  If the lamd's are in
   * fault tolerant mode and one or more have disappeared, using "N"
   * will get the Right Things.
   */
  sfh_argv_add(&my_argc, &my_argv, "ignored argument");
  sfh_argv_add(&my_argc, &my_argv, "N");
  if (nid_parse(&my_argc, my_argv) || (errno = (argc == 1) ? 0 : EUSAGE)) {
    /* show_help() may overwrite errno; keep the failure code for kexit. */
    saved_errno = errno;
    show_help("ALL", "unknown", NULL);
    kexit(saved_errno);
  }
  sfh_argv_free(my_argv);
  local = getnodeid();
/*
 * Send shutdown notices to all LAM daemons except the local one (who
 * we need to keep up to pass messages for us)
 */
  nid_get(&n_index, &node, &n_flags);
  do {
    if (node != local) {
      count++;
      if (fl_debug)
        printf("lamhalt: sending HALT to n%d (%s)\n", n_index,
               nodename(node, 1));
      sendhead.nh_node = node;
      if (nsend(&sendhead))
        lamfail("lamhalt (nsend)");
    }

    nid_get(&n_index, &node, &n_flags);
  } while (n_index);
/*
 * Wait for ACKs.  Once we get an ACK, send the actual DIE message to
 * the node that answered.
 */
  /* Zero the header so nrecv() is never handed an indeterminate field. */
  LAM_ZERO_ME(recvhead);
  recvhead.nh_event = request->hq_event;
  /* From here on, request views the payload of the received message. */
  request = (struct hreq *) recvhead.nh_data;
  sendhead.nh_type = LAM_HALT_DIE;
  if (fl_debug && nlinks > 1)
    printf("lamhalt: waiting for HALT ACKs from remote LAM daemons\n");
  for (i = 0; i < count; i++) {
    recvhead.nh_type = 0;
    recvhead.nh_length = 0;
    recvhead.nh_flags = 0;
    if (nrecv(&recvhead))
      lamfail("lamhalt (nrecv)");

    if (fl_debug)
      printf("lamhalt: received HALT ACK from n%d (%s)\n",
             request->hq_node, nodename(request->hq_node, 1));

    sendhead.nh_node = request->hq_node;
    if (nsend(&sendhead))
      lamfail("lamhalt (nsend)");
  }
/*
 * Send one more message, telling local LAM daemon that it's a good
 * day to die.
 */
  sendhead.nh_node = LOCAL;
  sendhead.nh_event = EVHALTD;
  sendhead.nh_type = LAM_HALT_DIE;
  sendhead.nh_length = strlen(data) + 1;
  sendhead.nh_flags = 0;
  sendhead.nh_msg = data;
  if (fl_debug)
    printf("lamhalt: sending final HALT to n%d (%s)\n", getnodeid(),
           nodename(getnodeid(), 1));
  if (nsend(&sendhead))
    lamfail("lamhalt (nsend)");
/*
 * Do not attempt to receive final ACK message
 */

  if (fl_debug)
    printf("lamhalt: local LAM daemon halted\n");
  if (fl_verbose)
    printf("LAM halted\n");

  return 0;
}
