/* ``The contents of this file are subject to the Erlang Public License,
 * Version 1.0, (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.erlang.org/EPL1_0.txt
 * 
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 * 
 * The Original Code is Erlang-4.7.3, December, 1998.
 * 
 * The Initial Developer of the Original Code is Ericsson Telecom
 * AB. Portions created by Ericsson are Copyright (C), 1998, Ericsson
 * Telecom AB. All Rights Reserved.
 * 
 * Contributor(s): ______________________________________.''
 */
/* Copyright (C) 1993, Ellemtel Telecommunications Systems Laboratories */
/* Author: Claes Wikstrom  klacke@erix.ericsson.se */
/* Purpose:   Implement the Erlang distribution protocol  */


#define WANT_NONBLOCKING    /* must define this to pull in defs from sys.h */
#include "sys.h"
#define MEM_COST_GROUP 9
#ifndef VXWORKS
#include <sys/time.h>	    /* Needed for FD_CLR etc on Linux */
#endif
#include <sys/types.h>
#include <signal.h>
#include <sys/socket.h>
#include <netdb.h>
#include <errno.h>
#include <netinet/in.h>
#include <errno.h>

#ifdef ISC32
#include <net/errno.h>
#include <sys/bsdtypes.h>
#endif

#ifdef SYS_SELECT_H   /* on AIX */
#include <sys/select.h>
#endif

#include <netinet/tcp.h>

#include <stdio.h>
#include <limits.h>  /* INT_MAX, used when bounding message sizes */

#ifdef __STDC__
#include <stdarg.h>   /* For tcp_error */
#else
#include <varargs.h>
#endif

#include "proto.h"
#include "epmd.h"
#include "driver.h"
#include "erl_resolv.h"

extern char *strchr();

static int SOCKET_TAB_SIZE;
#define MAXHOSTLEN 255
#define MAXALIVELEN 63
#define LARGE_BUFFER 66000
#define KEEP_LARGE_MESS 4
#define MAX_READ_MESSAGES 35
#define MAX_LONG (0xffffffff - 1)


/* State kept for one peer socket; conn_tab[] is indexed by file descriptor. */
typedef struct connection {
    char nodename[MAXHOSTLEN+1];    /* "alive@host" as received by get_name() */
    char alivename[MAXALIVELEN+1];  /* the part of nodename before the '@' */
    struct in_addr ip_addr;         /* address carried in the peer's name message */
    char *buf;                      /* receive buffer, sized by challoc() */
    char large;                     /* countdown until an oversized buf is freed */
    unsigned int status;            /* bit mask of the status flags */
    int sz;                         /* allocated size of buf */
    int cpos;                       /* current fill position in buf */
    int remain;                     /* bytes still missing of the current message */
    int end;                        /* length of the completed message in buf */
    int count;                      /* consecutive incomplete nb_read() attempts */
    int ticked;                     /* value of the tick counter when data was last seen */
} Connection;

#define FREE       1		/* bits for the status field */
#define CONNECTED  2
#define TICK_READ  4
#define TICK_WRITE 8
#define HIDDEN    16

/* Both macros expand to a block that RETURNS -1 from the enclosing */
/* function, so they may only be used inside functions returning int. */
/* CLOSE_CLEAR additionally releases the conn_tab entry for fd. */
#define CLOSE_RET(fd) {close(fd); return(-1);}
#define CLOSE_CLEAR(fd) {close(fd); clear_entry(fd); return(-1);}

/* We maintain a linked FIFO queue of these structs in order */
/* to manage unfinished writes on different fds. */
/* Each node is allocated as one chunk of sizeof(Pend) + payload length */
/* (see schedule_write()), so buf really extends past its declared size. */

typedef struct pend {
    int cpos;          /* offset in buf of the first byte not yet written */
    int fd;            /* socket the data is destined for */
    int remain;        /* number of bytes still to write */
    struct pend *next; /* next queued write, or NULL for the last one */
    char buf[1];   /* this is a trick to be able to malloc one chunk */
} Pend;

/* Forward declarations */
/* (old-style, without parameter lists, so the definitions must stay */
/* old-style as well) */
static int cnct(), schedule_write();
static void bad_fd(), clear_entry();
static Connection* create_conn_entry();
static int new_connection();

/* driver interface */

static long tcp_start();
static int tcp_init(), tcp_stop(), 
    tcp_from_erlang(), tcp_net_inp(), do_scheduled_write();

/* Driver entry for the "tcp" driver. */
/* NOTE(review): the meaning of each slot is fixed by struct driver_entry */
/* in driver.h, which is not visible here -- confirm the order there. */
const struct driver_entry tcp_driver_entry = {
    tcp_init, tcp_start, tcp_stop, tcp_from_erlang, tcp_net_inp, 
    do_scheduled_write, "tcp"
};


/* Return codes of nb_read() */
#define READY 1
#define CONTINUE 2
#define READ_ERROR 3
static int erlang_port;      /* port given to tcp_start(); -1 when not started */
static char rbuf[BUFSIZ];    /* shared scratch buffer for short control messages */

static int use_fully_qual_names;   /* 1 = long names, 0 = short names; set in pub() */
static struct timeval tv_connect_timeout   = {2,0};
static struct timeval tv_handshake_timeout = {1,0};
static struct timeval tv_message_timeout   = {2,0};
static char thishostname[MAXHOSTLEN+1];
static char thisnodename[MAXHOSTLEN+1+MAXALIVELEN+1];
static char thisalivename[MAXALIVELEN+1];

static struct in_addr this_ipaddr;  /* stored in network byte order */
static fd_set pending_reads;        /* Unfinished read business */
static fd_set pending_write_mask;   /* Unfinished write business */
static int listensock; 
static int mappersock;              /* Leading to epmd */
static Connection **conn_tab;/* table of active connections */
static const int one = 1;    /* option value handed to setsockopt() in get_name() */
static Pend *pending;        /* Queue of pending operations  */

/* Standard set of integer macros  .. */
/* All of them read or write big-endian (most significant byte first) */
/* values at the byte address s.  The put_ variants expand to a brace */
/* block, not an expression, and evaluate their arguments several times. */

#define get_int32(s) ((((unsigned char*) (s))[0] << 24) | \
		      (((unsigned char*) (s))[1] << 16) | \
		      (((unsigned char*) (s))[2] << 8)  | \
		      (((unsigned char*) (s))[3]))

#define put_int32(i, s) {((unsigned char*)(s))[0] = ((i) >> 24) & 0xff; \
                        ((unsigned char*)(s))[1] = ((i) >> 16) & 0xff; \
                        ((unsigned char*)(s))[2] = ((i) >> 8)  & 0xff; \
                        ((unsigned char*)(s))[3] = (i)         & 0xff;}

#define get_int16(s) ((((unsigned char*)  (s))[0] << 8) | \
		      (((unsigned char*)  (s))[1]))


#define put_int16(i, s) {((unsigned char*)(s))[0] = ((i) >> 8) & 0xff; \
                        ((unsigned char*)(s))[1] = (i)         & 0xff;}

/* Format a printf-style message and send it to the Erlang side, */
/* prefixed with the DISTRIBUTION_ERROR op code.  The two variants */
/* below differ only in how the variable argument list is started. */
/* NOTE(review): vsprintf() is unbounded; a formatted message longer */
/* than BUFSIZ-2 bytes would overflow the local buffer t. */
#ifdef __STDC__
static void tcp_error(char *fmt,...)
{
    va_list args;
    char t[BUFSIZ];

    va_start(args, fmt);
#else
static void tcp_error(va_alist)
va_dcl
{
    va_list args;
    char *fmt;
    char t[BUFSIZ];

    va_start(args);
    fmt = va_arg(args, char *);
#endif
    *t = DISTRIBUTION_ERROR;      /* op code goes in the first byte */
    vsprintf(t+1,fmt,args);
    driver_output(erlang_port, t, strlen(t));
    va_end(args);
}

/* Check alloc mem */
static int challoc(fd, sz)
int fd, sz;
{
    if (sz > conn_tab[fd]->sz) {
	conn_tab[fd]->sz  = sz + BUFSIZ;
	if (conn_tab[fd]->buf) 
	    free(conn_tab[fd]->buf);
	if ((conn_tab[fd]->buf = (char* ) malloc(sz + BUFSIZ)) == NULL)
	    return(-1);
	if (sz > LARGE_BUFFER) 
	    conn_tab[fd]->large = KEEP_LARGE_MESS;
    }
    return(0);
}

/* Host names must be either fully qualified everywhere or nowhere, */
/* otherwise node names could not be guaranteed to be unique. */
/* Every name acquired through get_name() or given to cnct_node() is */
/* therefore checked against the mode this node was started in. */
/* Returns 1 if hname matches the mode, 0 (after reporting the */
/* problem through tcp_error()) if it does not. */

static int check_host_name(hname)
char* hname;
{
    int dotted = (strchr(hname, '.') != NULL);
    char *complaint;

    if (dotted) {
	if (use_fully_qual_names == 1)  /* std case */
	    return(1);
	complaint = "System NOT running to use fully qualified hostnames";
    }
    else {
	if (use_fully_qual_names == 0)  /* short names wanted and given */
	    return(1);
	complaint = "System running to use fully qualified hostnames";
    }
    tcp_error("\n** %s **\n** Hostname %s is illegal **\n",
	      complaint, hname);
    return(0);
}

static int find_connection(nodename)
char *nodename;
{
    int s;

    for (s = 0; s < SOCKET_TAB_SIZE; s++) {
	if ((conn_tab[s] != NULL) &&
	    (strcmp(conn_tab[s]->nodename, nodename) == 0))
	    return s;
    }
    return -1;
}

/* Read exactly len bytes from fd into buf, waiting at most *timeout */
/* for each chunk to arrive. */
/* Returns len when everything was read, otherwise the number of */
/* bytes obtained before a timeout, end of file or error. */
/* The caller's timeval is never modified: select() is allowed to */
/* update its timeout argument (Linux stores the time left), and all */
/* callers pass the address of a shared static, so a private copy is */
/* used for every wait.  A NULL timeout waits without limit. */
static int read_fill_timeout(fd, buf, len, timeout)
int fd;
char *buf;
int len;
struct timeval* timeout;
{
    int i, res, got = 0;
    fd_set fds_r;
    struct timeval tv, *tvp;

    do {
        FD_ZERO(&fds_r);
        FD_SET(fd, &fds_r);
	tvp = (struct timeval*) 0;
	if (timeout) {
	    tv = *timeout;
	    tvp = &tv;
	}
	res = select(fd+1, &fds_r, (fd_set*)0, (fd_set*) 0, tvp);
	if (res == 0)             /* timed out */
            return got;
	if (res < 0) {
	    /* Do not fall into a (possibly blocking) read() when */
	    /* select() itself failed; just retry if interrupted. */
	    if (errno == EINTR)
		continue;
	    return got;
	}
	if ((i = read(fd, buf+got, len-got)) <= 0) {
	    if (i == 0 || errno != EINTR)
		return got;       /* EOF or hard error */
	    i = 0;
	}
	got += i;
    } while (got < len);
    return (len);
}
    
/* read_fill_timeout() with the standard per-message timeout. */
static int read_fill(fd, buf, len)
int fd;
char *buf;
int len;
{
    struct timeval *limit = &tv_message_timeout;

    return read_fill_timeout(fd, buf, len, limit);
}

/* Write all len bytes of buf to fd, restarting after EINTR. */
/* Returns len on success, or the negative result of write() on */
/* any other error. */
static int write_fill(fd, buf, len)
int fd;
char *buf;
int len;
{
    int sent = 0;
    int n;

    for (;;) {
	n = write(fd, buf + sent, len - sent);
	if (n >= 0)
	    sent += n;
	else if (errno != EINTR)
	    return (n);
	/* an interrupted write simply counts as zero bytes */
	if (sent >= len)
	    break;
    }
    return (len);
}


/* All messages on sockets come with an initial 4 byte header */
/* indicating the length of the rest of the message. */
/* gfbh = "get four byte header": read that header from the */
/* non-blocking socket fd and return the length it carries. */
/* Returns -1 on end of file, on a read error, when the header could */
/* not be completed within the message timeout, or -- only if */
/* skip_if_none is set -- when no data at all was available. */
/* A header with the top bit set also comes out negative and is */
/* therefore treated as an error by the caller. */
/* EINTR is handled like "would block", which is how nb_read() and */
/* nb_write() treat it; it must not tear down a healthy connection. */
static int gfbh(fd,skip_if_none)		    
int fd;
int skip_if_none;
{

    int i,j;
    unsigned char fourbytes[4];

    conn_tab[fd]->status |= TICK_READ;
    i = read(fd, fourbytes, 4);
    if (i == 4)
	return(get_int32(fourbytes));
    if (i == 0)    /* EOF */
	return -1;
    if (i < 0) {
	if (errno != ERRNO_BLOCK && errno != EINTR)
	    return -1;
	i = 0;     /* nothing read yet */
    }
    if (i == 0 && skip_if_none)
	return -1;

    /* Here 0 <= i < 4 (read() never returns more than was asked for). */
    /* Worst case: block and read the rest of the header. */
    SET_BLOCKING(fd);
    j = read_fill(fd, fourbytes+i, 4-i);
    SET_NONBLOCKING(fd); 
    if (j < 4-i)
	return -1;
    return(get_int32(fourbytes));
}


/*    need to be called with sz supplied as either the value */
/*    returned by gfbh() or the value that remains to be read */
/*    on fd  */
/*    The idea here is that we have set the socket to non blocking, */
/*    we then try to read() once, if we get it all, all is fine  */
/*    Otherwise we will automatically read the rest when it arrives. */
/*    Returns READY when the message is complete (its length is then */
/*    in conn_tab[fd]->end), CONTINUE when more data is awaited (fd is */
/*    then marked in pending_reads), or READ_ERROR after the */
/*    connection has been dropped through bad_fd(). */
   
static int nb_read(fd, sz) 
int fd, sz;
{
    int i;

    conn_tab[fd]->status |= TICK_READ;
    i = read(fd, conn_tab[fd]->buf + conn_tab[fd]->cpos, sz);
    if (i == sz) {
	/* Whole message is in: publish its length and reset the state */
	conn_tab[fd]->remain = 0;
	conn_tab[fd]->end = conn_tab[fd]->cpos + i;
	conn_tab[fd]->cpos = 0;
	conn_tab[fd]->count = 0;
	FD_CLR(fd, &pending_reads);
	return READY;
    }
    /* count is only reset by a complete read, so this trips after */
    /* 10000 consecutive incomplete attempts on the same message */
    if (conn_tab[fd]->count++ == 10000) {  /* Probably looping madly */
#ifdef DEBUG
	fprintf(stderr, "tcp: too many read tries on %d sz == %d\n\r", fd, sz);
#endif
	bad_fd(fd);
	return(READ_ERROR);
    }
    if (i == 0) { /* EOF */
	bad_fd(fd);
	return(READ_ERROR);
    }
    if (i < 0) {
	if (errno == ERRNO_BLOCK || errno == EINTR) {
	    i = 0;    /* nothing read this time, try again later */
	}
	else {
#ifdef DEBUG
	    fprintf(stderr, "tcp: nb_read failed \n\r");
	    perror("read failed ");
#endif
	    bad_fd(fd);
	    return(READ_ERROR);
	}
    }
    if (i < sz) {
	/* Partial read: remember how far we got and wait for more */
	FD_SET(fd, &pending_reads);
	conn_tab[fd]->remain = sz - i;
	conn_tab[fd]->cpos += i;
	return CONTINUE;
    }
    else {   
	/* Only reachable if read() returned more than was asked for */
#ifdef DEBUG
	fprintf(stderr, "tcp: nb_read failed  \n\r");
#endif
	bad_fd(fd);
	return(READ_ERROR);
    }
}


/* Append p at the tail of the pending queue. */
static void add_to_queue(p)
Pend* p;
{
    Pend **link = &pending;

    p->next = (Pend*) 0;
    /* walk to the link that currently terminates the list */
    while (*link != (Pend*) 0)
	link = &(*link)->next;
    *link = p;
}


static void remove_from_queue(p)
Pend* p;
{
    Pend *ptr;
    Pend **prev;
    
    prev = &pending;
    ptr = pending;
    while(ptr) {
	if (ptr == p) {
	    *prev = ptr->next;
	    free(p);   /* will get rid of the buf as well */
	    return;
	}
	else {
            prev = &ptr->next;
            ptr = ptr->next;
        }
    }
}


/*   connects to a remote node (via epmd) and interchanges */
/*   hand shake messages with it */
/*   nodename has the form "alive@host". */
/*   Returns the connected socket on success and a negative value */
/*   on failure.  On success conn_tab[s] has been filled in by */
/*   get_name(). */

static int cnct_node(nodename)
char *nodename;
{
    int i, len, s, rval ;
    uint16 rp;
    char *hostname, alivename[BUFSIZ];
    struct hostent *hp;
    struct in_addr *ip_addr;

    /* first extract the host part from nodename */
    i = 0;
    hostname = nodename;
    while(*hostname != '@') {
	if (*hostname == '\0') {
	    tcp_error("** Nodename %s illegeal, no '@' character ** \n", 
		      nodename);
	    return(-1);
	}
	alivename[i++] = *hostname++;
    }
    alivename[i] = '\0';
    hostname++;

    /* now hostname points to a host name and alivename 
       contains the name of the node */

    if (check_host_name(hostname) == 0) 
	return(-1);

    if ((hp = erl_gethostbyname(hostname)) == NULL) {
	tcp_error("\n** Can't find host for nodename %s\n%s\n%s\n",
		  nodename,"** maybe named/resolver configuration error",
		  "** Or unknown host \n");
	return(-1);
    }

    /* NOTE(review): ip_addr points into the resolver's result and is */
    /* used until the end of this function -- presumably nothing on */
    /* the way calls the resolver again; confirm. */
    ip_addr = (struct in_addr*) *hp->h_addr_list;

    /* ip_addr is now in network byte order */
    /* first we have to get hold of the portnumber to 
       the node through epmd at that host */

    if ((s = cnct(ERLANG_DAEMON_PORT, ip_addr, 
		  sizeof (struct in_addr), nodename)) < 0)
	return(s);

    /* Request layout: 2 byte length, op code, alive name */
    rbuf[2] = EPMD_PORT_PLEASE;
    strcpy(&rbuf[3], alivename);
    len = strlen(&rbuf[2]);
    put_int16(len, &rbuf[0]);
    len += 2;
    if (write_fill(s, rbuf, len) != len) 
	CLOSE_RET(s);
    if((rval = read_fill_timeout(s, rbuf, 2, &tv_handshake_timeout)) != 2)
	CLOSE_RET(s);
    rp = get_int16(rbuf);	/* got the portnumber now in hbo */

    /* epmd stores all port numbers in host byte order */
    close(s);

    if((s = cnct(rp, ip_addr, sizeof(struct in_addr), nodename)) < 0)
	return(s);

    /* tell_name() and get_name() close s themselves when they fail */
    if (tell_name(s) == -1)
	return(-1);

    if ((i = get_name(s)) < 0) 
	return(i);

    /* The peer must present exactly the name we asked for */
    if (strcmp(conn_tab[s]->nodename, nodename) != 0) {
	tcp_error("** Ambigous nodename %s **\n", nodename);
	CLOSE_CLEAR(s);
    }
    return(s);
}  /* end cnct_node() */


/* socket() wrapper that refuses descriptors which do not fit in */
/* conn_tab.  Returns the new descriptor, or -1 on any failure; */
/* running out of descriptors is additionally reported to Erlang. */
static int open_socket(domaine,type,protocoll)
int domaine, type, protocoll;
{
    int sock = socket(domaine,type,protocoll);
    int exhausted;

    exhausted = (sock >= SOCKET_TAB_SIZE);
    if (!exhausted && sock < 0)
	exhausted = (errno == EMFILE) || (errno == ENOBUFS);

    if (!exhausted)
	return (sock < 0) ? -1 : sock;

    /* We have run out of file descriptors in our unix process, */
    /* let the system print out a warning. */
    close(sock);   /* No harm */
    tcp_error("\n** run out of filedecriptors (max = %d)", 
	      SOCKET_TAB_SIZE);
    return(-1);
}


/* connects to port at ip-address ip_addr 
   and returns a blocking fd to socket 
   port has to be in host byte order.
   nodename is the node we are trying to reach; it is only used to
   recognise that very node connecting to us while we wait.
   Returns -1 on failure.  While waiting for the connect to finish the
   listen socket is served as well, so the function may instead return
   the fd of a connection accepted from the peer (simultaneous
   connect).
   The wait uses a private copy of tv_connect_timeout for every
   select() call, because select() may overwrite its timeout argument
   (Linux stores the time left), which would otherwise shrink the
   global timeout for all later connects. */

static int cnct(port, ip_addr,addr_len, nodename)
uint16 port;
struct in_addr *ip_addr;
int addr_len;
char *nodename;
{
    int max, res, i, s;
    int sim_connect = 0;
    struct sockaddr_in iserv_addr;
    fd_set fds_r, fds_w;
    struct timeval tv;

    if ((s = open_socket(AF_INET, SOCK_STREAM, 0)) < 0)
	return(-1);
    SET_NONBLOCKING(s);
    memzero((char*)&iserv_addr, sizeof(struct sockaddr_in));
    memcpy((char*)&iserv_addr.sin_addr, (char*)ip_addr, addr_len);
    iserv_addr.sin_family = AF_INET;
    iserv_addr.sin_port = htons(port);
    
    /*
     * We have set the connect socket to non-blocking
     * This code handles the case with simultaneous connects
     */

    if((res = connect(s, (struct sockaddr*)&iserv_addr, sizeof iserv_addr))
       == 0)  {
	SET_BLOCKING(s);
	return(s);
    }
    if (errno == EINPROGRESS) {
	while(1) {
	    FD_ZERO(&fds_r);
	    FD_ZERO(&fds_w);
	    FD_SET(listensock, &fds_r); 
	    FD_SET(s, &fds_w);
	    max = s > listensock ? s + 1 : listensock + 1;
	    tv = tv_connect_timeout;      /* keep the global intact */
	    res = select(max, &fds_r, &fds_w, (fd_set*) 0, &tv);
	    if (res == 0)                 /* timed out */
		goto cnct_fail;
	    if (res < 0) {
		/* The fd sets are not valid after an error */
		if (errno == EINTR)
		    continue;
		goto cnct_fail;
	    }

	    if (FD_ISSET(listensock, &fds_r)) { /* remote connect */
		sim_connect = 0;
		i = new_connection(ip_addr, port, &sim_connect);
		if (( i > 0) && (conn_tab[i] != NULL) &&
		    (port == ERLANG_DAEMON_PORT) &&
		    (strcmp(conn_tab[i]->nodename, nodename) == 0)) {
		    close(s);  /* We're done */
		    return -1;
		}

		if (sim_connect == 0)
		    continue;   /* someone else connected */
		if (sim_connect == 1) 
		    /* gory case and we are done */
		    /* we want to return s, but not just yet */
		    close(i);
		if (sim_connect == 2) {
		    /* We have accepted the other end which is now */
		    /* in the middle of the cnct message exchange */
		    close(s);
		    return(i);
		}
	    }
		
	    if (FD_ISSET(s, &fds_w)) {  /* Check connection */
		/* A second connect() reports the outcome of the */
		/* first one; EISCONN means it succeeded. */
		if (connect(s, (struct sockaddr*)&iserv_addr, 
			    sizeof iserv_addr) != 0) {
		    if (errno != EISCONN) 
			goto cnct_fail; 
		}
		SET_BLOCKING(s);      
		return(s);
	    }
	}
    }
 cnct_fail:
    close(s);
    return(-1);
} 

/* tells our local daemon about our existence            */
/* and sets the value of the global thisnodename         */
/* port is in host byte order as returned by             */
/* get_and_open_local_port()                             */
/* name is a string of the form                          */
/* "alivename [longnames|shortnames]  [connect_timeout] " */
/* name is modified in place (split at the blanks).      */
/* Returns 1 on success and -1 on failure.               */

static int pub(name, port)
char *name;   /* as given to alive */
uint16 port;
{
    int len, s;
    char *longname, *ct;

    struct hostent *hp;
    /* NOTE: this checks the whole argument string, options included */
    if(strlen(name) > MAXALIVELEN)
        return(-1);
    if ((longname = strchr(name, ' ')) == NULL) {
        /* No options given: long names and the default timeout */
        use_fully_qual_names = 1;
	tv_connect_timeout.tv_sec = 2;  /* global variable */
	tv_connect_timeout.tv_usec = 0;
    }
    else {
        *longname++ = '\0';          /* terminate the alive name */
	ct = strchr(longname, ' ');
	if (ct != NULL)
	    *ct = '\0';              /* terminate the name mode word */
        if (strcmp(longname, "shortnames") == 0)
            use_fully_qual_names = 0;
        else if ( strcmp(longname, "longnames") == 0)
            use_fully_qual_names = 1;
        else if (*longname == '\0')
            use_fully_qual_names = 1;
        else {
            tcp_error("Bad args to tcp driver \n");
            return(-1);
        }
	if (ct != NULL) {
	    /* Third word: connect timeout in seconds, must be non-zero */
	    tv_connect_timeout.tv_usec = 0;
	    if ((tv_connect_timeout.tv_sec = atoi(ct+1)) == 0) {
		tcp_error("Bad args to tcp driver \n");
		return(-1);
	    }
	}
    }

    strcpy(thisalivename,name);

    /* thishostname was filled in by gethostname() in tcp_start() */
    if ((hp = erl_gethostbyname(thishostname)) == NULL) {
	tcp_error("\n** Fatal error, Host %s not found in nameserver\n", 
		  thishostname);
	return(-1);
    }

    /* For short names the domain part is cut off in the resolver's */
    /* own result buffer */
    if (use_fully_qual_names == 0) /* shortnames */
	if ((ct = strchr(hp->h_name, '.')) != NULL)
	    *ct = '\0';  

    /*   Now set thishostname again  */
    strcpy(thishostname,hp->h_name);

    if ((s = cnct(ERLANG_DAEMON_PORT, hp->h_addr,hp->h_length, "")) < 0) 
	return(-1);
    memcpy(&this_ipaddr.s_addr, *hp->h_addr_list, sizeof(struct in_addr));

    /* The socket to epmd is kept open; tcp_stop() closes it */
    mappersock = s;
    
    sprintf(thisnodename, "%s@%s", name, hp->h_name);
    /* Request layout: 2 byte length, op code, 2 byte port, alive name */
    rbuf[2] = EPMD_ALIVE;
    put_int16(port, &rbuf[3]);
    strcpy(&rbuf[5], name);
    len = 3 + strlen(name);
    put_int16(len, rbuf);
    write_fill(s, rbuf, len+2);
    if( read_fill_timeout(s, rbuf, 3, &tv_handshake_timeout) != 3) {
	tcp_error("** Can't go distributed, epmd rejects us\n** maybe "
		  "name %s is already occupied !! \n", thisnodename);
	return(-1);
    }
    /* Reply to Erlang: op code, then rbuf[1..2] as received from epmd, */
    /* then our host name */
    rbuf[0] = ALIVE_REQUEST_OK;
    strcpy(&rbuf[3], thishostname);
    driver_output(erlang_port, rbuf, strlen(&rbuf[3]) + 4);
    /* 4 == 3 + the NULL sign */
    return(1);
}


/* 
 * Opens the global listen socket (listensock) to the world on a port
 * chosen by the system, registers it for input with the emulator
 * and returns the port number assigned to it in host byte order.
 * Returns -1, after reporting a driver failure, if the socket cannot
 * be created, bound, put in listening state or queried.
*/
	    
static sint32  get_and_open_local_port() 
{

    struct sockaddr_in iserv_addr;
    int length;

    /* open_socket() has already reported descriptor exhaustion */
    if ((listensock = open_socket(AF_INET, SOCK_STREAM, 0)) < 0) {
	driver_failure(erlang_port, -1);
	return(-1);
    }
    memzero((char*) &iserv_addr, sizeof(iserv_addr));
    iserv_addr.sin_family = AF_INET;
    iserv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
    iserv_addr.sin_port = htons(0);   /* let the system pick the port */
    if(bind(listensock, (struct sockaddr*) &iserv_addr, 
	    sizeof(iserv_addr)) <0) {
	perror("bind bad portnumber");
	driver_failure(erlang_port, -1);
	return(-1);
    }
    /* Without a working listen() nobody could ever connect to us */
    if (listen(listensock, 5) < 0) {
	perror("listen");
	driver_failure(erlang_port, -1);
	return(-1);
    }
    driver_select(erlang_port, listensock, DO_READ, 1);

     /* find out assigned portnumber */
    length = sizeof iserv_addr;
    if (getsockname(listensock, 
		    (struct sockaddr *)&iserv_addr, 
		    &length) < 0) {
	perror("getsockname");
	driver_failure(erlang_port, -1);
	return(-1);
    }
    /* iserv_addr.sin_port shall now be in network byte order
       as returned by the syscall*/
    
    return(ntohs(iserv_addr.sin_port));
}


/* Try to write len bytes of buf on the non-blocking socket fd. */
/* Whatever cannot be written at once is queued by schedule_write(). */
/* Returns 1 if everything was written, the result of */
/* schedule_write() if data had to be queued, or -1 on a write error. */
static int nb_write(fd, buf, len)
int fd, len;
char *buf;
{
    int written;

    /* Data already queued for this fd must go out first */
    if (FD_ISSET(fd, &pending_write_mask))
	return(schedule_write(fd, buf, len));

    conn_tab[fd]->status |= TICK_WRITE;
    written = write(fd, buf, len);
    if (written == len)
	return(1);

    if (written < 0) {
	if (errno != ERRNO_BLOCK && errno != EINTR) {
#ifdef DEBUG
	    fprintf(stderr, "nb_write failed\n\r");
#endif
	    return (-1);
	}
	written = 0;      /* nothing went out, queue it all */
    }
    return(schedule_write(fd, buf + written, len - written));
}


/* Drop the connection to nodename, if there is one, and tell Erlang */
/* (BAD_SYSTEM followed by the node name) that the node is unusable. */
static void bad_node(nodename)	
char* nodename;
{
    int fd;

    for (fd = 0; fd < SOCKET_TAB_SIZE; fd++) {
	if (conn_tab[fd] != NULL &&
	    strcmp(conn_tab[fd]->nodename, nodename) == 0) {
	    driver_select(erlang_port, fd, DO_READ|DO_WRITE, 0);
	    clear_entry(fd);
	    break;      /* only the first match is removed */
	}
    }
    rbuf[0] = BAD_SYSTEM;
    strcpy(&rbuf[1], nodename);
    driver_output(erlang_port, rbuf, strlen(rbuf));
}


/* Like bad_fd(), but reports the loss with SYNC_BAD_CHANNEL. */
/* Negative descriptors are ignored. */
static void sync_bad_fd(fd)
int fd;
{
    if (fd >= 0) {
	clear_entry(fd);
	rbuf[0] = SYNC_BAD_CHANNEL;
	put_int16(fd,&rbuf[1]);
	driver_select(erlang_port, fd, DO_READ|DO_WRITE, 0);
	driver_output(erlang_port, rbuf, 3);
    }
}

/* Give up the connection on fd: release its table entry, stop */
/* selecting on it and send BAD_CHANNEL plus the 16 bit fd to Erlang. */
/* Negative descriptors are ignored. */
static void bad_fd(fd)
int fd;
{
    if (fd >= 0) {
	clear_entry(fd);
	rbuf[0] = BAD_CHANNEL;
	put_int16(fd,&rbuf[1]);
	driver_select(erlang_port, fd, DO_READ|DO_WRITE, 0);
	driver_output(erlang_port, rbuf, 3);
    }
}



static Connection * create_conn_entry(fd)
int fd;
{
    if ((conn_tab[fd] = (Connection*) malloc(sizeof(Connection))) == NULL)
	return(NULL);
    
    conn_tab[fd]->buf = NULL;
    conn_tab[fd]->status = FREE;
    conn_tab[fd]->sz = 0;
    conn_tab[fd]->large  =  conn_tab[fd]->end = 0;
    conn_tab[fd]->cpos = conn_tab[fd]->remain = conn_tab[fd]->count = 0;
    conn_tab[fd]->ticked = 0;
    return(conn_tab[fd]);
}


static void clear_entry(fd)
int fd;
{

    Pend *tmp, *ptr;
    Pend **prev;

    if ((fd < 0) || (!conn_tab[fd]))
	return;
    close(fd);
    FD_CLR(fd, &pending_reads);
    FD_CLR(fd, &pending_write_mask);    

    /* Go through the list of pending writes and delete */
    prev = &pending;
    ptr = pending;
    while(ptr) {
	if (ptr->fd == fd) {
	    *prev = ptr->next;
	    tmp = ptr;
            ptr = ptr->next;
	    free(tmp);
	}
	else {
	    prev = &ptr->next;
            ptr = ptr->next;
        }
    }
    if (conn_tab[fd]->buf) 
	free(conn_tab[fd]->buf);
    free(conn_tab[fd]);
    conn_tab[fd] = NULL;
}


/* Input is available on the connected socket fd: read up to */
/* MAX_READ_MESSAGES messages and hand each complete one to Erlang */
/* as [PASS_THROUGH, 16 bit fd, payload].  A zero length message is */
/* a tick and is just consumed.  Reading stops early when no further */
/* header is available or a message is only partly there (nb_read() */
/* then arranges for it to be completed later).  The connection is */
/* dropped through bad_fd() when the first header cannot be read, */
/* when the announced length is unreasonable or when memory runs out. */
static void new_net_mess(fd)
int fd;
{
    int i;
    char *t;
    int allow_reads;
    int allow_skip;
    
    /* the idea here is that if a very large message has come on an fd */
    /* mem has been allocated for that message, let's keep the buffer for */
    /* a while and then after some time release it */

    if (conn_tab[fd] == NULL) return;  /* pathological case  */
    if (conn_tab[fd]->large > 0) {
	conn_tab[fd]->large--;
	if (conn_tab[fd]->large == 0)  {
	    if (conn_tab[fd]->buf) 
		free(conn_tab[fd]->buf);
	    conn_tab[fd]->buf = NULL;
	    conn_tab[fd]->sz  = 0;
	    conn_tab[fd]->large = 0;
	}
    }
    allow_skip = 0;
    for(allow_reads=MAX_READ_MESSAGES; allow_reads; allow_reads--) {
      i = gfbh(fd,allow_skip);  /* eat four byte header */
      if( i < 0 ) {
	if( allow_skip )
	  return;               /* simply nothing more to read now */
	bad_fd(fd);
	return;
      }
      allow_skip = 1;
      if (i == 0) /* The tick */
	continue;
      /* The length comes straight from the peer.  If i + 4 (plus the */
      /* slack added by challoc()) overflowed an int, the buffer check */
      /* would be bypassed and the read below would overrun the buffer. */
      if (i > INT_MAX - 4 - BUFSIZ) {
	bad_fd(fd);
	return;
      }
      if (challoc(fd, i + 4) < 0) {
	bad_fd(fd);
	return;
      }
      t = conn_tab[fd]->buf;
      t[0] = PASS_THROUGH;
      put_int16(fd,t+1);
      conn_tab[fd]->cpos = 3;   /* payload goes after the 3 byte prefix */
      i = nb_read(fd, i);
      if (i == READY) {
	driver_output(erlang_port, conn_tab[fd]->buf, conn_tab[fd]->end);
      } else
	break;
    }
    return;
}

/* Accept one connection on the listen socket. */
/* other and port are the ip address and port that we are */
/* simultaneously connecting to when called from within cnct(). */
/* If the accepted peer is that very address (and port is not the */
/* epmd port), no handshake is done here: *sim_connect is set to 2 */
/* when our own address compares smaller than the peer's and to 1 */
/* otherwise, and the accepted fd is returned for cnct() to sort out. */
/* In all other cases the name handshake is run and the connection */
/* announced to Erlang.  Returns the fd, or a negative value on failure. */

static int new_connection(other, port, sim_connect)
struct in_addr *other;
uint16 port;
int *sim_connect;

{
    int rc, fd;
    struct sockaddr_in peer;         /* workaround for QNX bug - can not */
    int peer_len;                    /* handle NULL pointers to accept.  */

    peer_len = sizeof(peer);
    fd = accept(listensock, (struct sockaddr*) &peer, (int*) &peer_len );
    if (fd < 0 || fd >= SOCKET_TAB_SIZE) {
	int exhausted = (errno == EMFILE) || (errno == ENOBUFS) ||
	    (fd >= SOCKET_TAB_SIZE);

	if (exhausted) {
	    close(fd);   /* No harm */
	    tcp_error("\n** run out of filedecriptors (max == %d)\n", 
		      SOCKET_TAB_SIZE);
	}
	perror("failed in accept()");
	return(-1);
    }

    /* If port is ERLANG_DAEMON_PORT, we always accept any incoming */
    /* connection attempts */
    if (port != ERLANG_DAEMON_PORT &&
	memcmp((char*) &(peer.sin_addr), (char*) other, 
	       sizeof(struct in_addr)) == 0) {
	/* Gory case, *we* are sitting in a non-blocking connect to */
	/* the same node which is now connecting to us. */
	/* If we are smaller, we lose (2). */
	*sim_connect = (memcmp((char*) &this_ipaddr, (char*) &(peer.sin_addr), 
			       sizeof(struct in_addr)) < 0) ? 2 : 1;
	return(fd);
    }

    if (tell_name(fd) == -1) 
	return (-1);
    rc = get_name(fd);
    if (rc < 0)
	return(rc);
    activate_name(fd);
    return(fd);
}



int activate_name(fd)
int fd;
{
    char buf[BUFSIZ];

    if (conn_tab[fd]->status & HIDDEN)
	buf[0] = NEW_HIDDEN_CONNECTION;
    else
	buf[0] = NEW_CONNECTION;
    put_int16(fd, &buf[1]);
    strcpy(&buf[3], conn_tab[fd]->nodename);
    driver_select(erlang_port, fd, DO_READ, 1);
    driver_output(erlang_port, buf, strlen(&buf[3])+1+3);
    return(1);
}



/* Send our own name message on fd: a 2 byte length followed by */
/* MY_NAME, the 4 address bytes of this_ipaddr and thisnodename. */
/* Returns 1 on success; on a write failure fd is closed and -1 */
/* returned. */
int tell_name(fd)
int fd;

{
    char msg[BUFSIZ];
    int body_len, total;

    msg[2] = MY_NAME;
    memcpy(&msg[3], &this_ipaddr.s_addr, 4);
    strcpy(&msg[7], thisnodename);   /* node name here */
    body_len = 5 + strlen(&msg[7]);  /* op code + address + name */
    put_int16(body_len, msg);
    total = body_len + 2;            /* plus the length field itself */
    if (write_fill(fd, msg, total) == total)
	return(1);

    close(fd);
    return(-1);
}


/* Here we expect to get a message produced by tell_name() */
/* This means that we expext a */
/* [Type, IpAddr, NodeName] message */
/* This function is called by both sides */

get_name(fd) 
int fd;
{
    int rval, i;
    char buf[BUFSIZ],  *cp;
    struct in_addr ip_addr;

    if ((rval = read_fill_timeout(fd, buf, 2, &tv_handshake_timeout)) != 2) 
	CLOSE_RET(fd);
    if ((rval = get_int16(buf)) >= BUFSIZ)
	CLOSE_RET(fd);
    if ((read_fill_timeout(fd, buf, rval, &tv_handshake_timeout)) != rval) 
	CLOSE_RET(fd);
    if ((buf[0] != MY_NAME) && (buf[0] != MY_HIDDEN_NAME))  
	CLOSE_RET(fd);
    
    /* Now we create Connection entry and install */
    /* all the data we have received */


    if ((conn_tab[fd] = create_conn_entry(fd)) == NULL) 
       CLOSE_RET(fd); 
    buf[rval] = '\0';

    cp = buf + 1 + sizeof(ip_addr);
    cp = strchr(cp, '@');
    if (check_host_name(cp+1) == 0) 
	CLOSE_CLEAR(fd);

    *cp = '\0';
    strcpy(conn_tab[fd]->alivename, buf + 1 + sizeof(struct in_addr));
    *cp = '@';
    strcpy(conn_tab[fd]->nodename, buf + 1 + sizeof(struct in_addr));

    memcpy(&(conn_tab[fd]->ip_addr), buf+1, sizeof(struct in_addr));


    for (i=0; i < SOCKET_TAB_SIZE; i++) {
        if (!conn_tab[i] || i ==fd)
	    continue;
	if (conn_tab[i]->ip_addr.s_addr != conn_tab[fd]->ip_addr.s_addr)
	    continue;
	if (strcmp(conn_tab[i]->alivename, conn_tab[fd]->alivename) == 0) {
	    sync_bad_fd(i);/* Gory case, this node has not yet understood */
	    break;      /* that the other node is down, too short tick time */
	}               /* And the other node is now restarted !!      */
    }

    conn_tab[fd]->status = (TICK_READ | TICK_WRITE | CONNECTED);
    if (buf[0] == MY_HIDDEN_NAME)
	conn_tab[fd]->status |= HIDDEN;

#ifdef VXWORKS
    /* VxWorks workaround - socket fd is corrupted sometimes and
       getpeername is used to see if this has happened */
    {
	struct sockaddr peeraddr;
	int peersize = sizeof(struct sockaddr);
	int res;

	if((res = getpeername(fd, &peeraddr, &peersize)) < 0)
	    CLOSE_CLEAR(fd);
    }
#endif

    setsockopt (fd, IPPROTO_TCP, TCP_NODELAY, (char *)&one, sizeof(one));

    SET_NONBLOCKING(fd); 
    return(1);
}


/* Queue len bytes of buf for later writing on fd. */
/* Erlang is told to back off (REAL_BUSY + 16 bit fd), the data is */
/* copied into a new Pend element at the tail of the pending queue */
/* and output readiness is selected on fd. */
/* Returns 1, or -1 if no memory could be obtained. */
static int schedule_write(fd, buf, len)    
int fd, len;
char *buf;
{

    byte busy_msg[3];
    Pend *item = (Pend*) malloc(len + sizeof(Pend));

    if (item == NULL)
	return(-1);

    busy_msg[0] = REAL_BUSY;
    put_int16(fd, busy_msg+1); /* Tell erlang to back off */
    driver_output(erlang_port, busy_msg, 3);

    item->fd = fd;
    item->cpos = 0;
    item->remain = len;
    memcpy(item->buf, buf, len);
    FD_SET(fd, &pending_write_mask);
    add_to_queue(item);
    driver_select(erlang_port, fd, DO_WRITE, 1);
    return(1);
}



/* driver interface */  

/* Driver init callback: mark the driver as not started. */
/* Always returns 0. */
static int tcp_init()
{
    erlang_port = -1;   /* tcp_start() refuses to run unless this is -1 */
    return 0;
}


/* ARGSUSED */
/* Driver start callback: allocate an empty connection table with */
/* one slot per possible file descriptor, reset all driver state and */
/* look up the local host name. */
/* Returns port on success, -1 if the driver is already started or */
/* the table cannot be allocated.  buf is not used. */
static long tcp_start(port, buf) 
int port;
char *buf;
{
    int i;

    SOCKET_TAB_SIZE = sys_max_files();  /* Call direct here */
    if (erlang_port != -1 ||
	(conn_tab =
	 (Connection **)malloc(SOCKET_TAB_SIZE * sizeof (Connection*)))
	== NULL)
	return(-1);
    
    erlang_port = port;
    mappersock = listensock = 0;
    pending = (Pend*)0;
    FD_ZERO(&pending_reads);
    FD_ZERO(&pending_write_mask);
    gethostname(thishostname, MAXHOSTLEN+1);
    /* gethostname() need not NUL terminate a name that was truncated */
    /* to fit, and thishostname is used as a C string from here on */
    thishostname[MAXHOSTLEN] = '\0';
    for(i=0; i<SOCKET_TAB_SIZE; i++) 
	conn_tab[i] = NULL;
    return(port);
}

/* ARGSUSED */
static int tcp_stop(port)
long port;
{
    int i;
    Pend *p, *tmpp;
    erlang_port = -1;
    if (listensock > 0 )
	driver_select(erlang_port, listensock, DO_READ|DO_WRITE, 0);
    if (mappersock > 0) 
	close(mappersock);
    for(i=0; i<SOCKET_TAB_SIZE; i++) {
	if(!conn_tab[i]) continue;
	bad_fd(i);
    }
    free((char *)conn_tab);

    p = pending;
    while(p) {
	tmpp = p;
	p = p->next;
	free(tmpp);
    }

    return(0);
}


/* Tick counter, cycles 1..4 (reset to 0 after the 4th tick) */
static int tick = 0;

/* ARGSUSED */
/* Driver output callback: a command of count bytes from Erlang. */
/* The first byte of buf is the op code: */
/*   ALIVE_REQUEST   open the listen socket and register with epmd */
/*   NEW_CONNECTION  connect to the named node, send the attached */
/*                   message and announce the connection */
/*   PASS_THROUGH    send the message on the socket given by the */
/*                   16 bit fd that follows the op code */
/*   BAD_CHANNEL     drop the connection on the given fd */
/*   TICK            periodic supervision of all connections */
/* Anything else makes the driver fail.  buf is modified in place */
/* to build the outgoing packets.  Returns 0, or -1 when PASS_THROUGH */
/* names a descriptor outside the connection table. */
/* NOTE(review): NEW_CONNECTION copies the node name into a */
/* MAXHOSTLEN+1 byte buffer with strcpy(); this relies on the Erlang */
/* side never sending a longer name. */
static int tcp_from_erlang(port, buf, count)
long port;
char *buf;
int count;
{
    int  s, len;
    char *mess;
    char *t, tmp[MAXHOSTLEN+1];
    unsigned char fourbytes[4];
    t = buf;

    switch(*t) {
    case ALIVE_REQUEST:
	if (((s = get_and_open_local_port()) < 0) || (pub(t+1, s) <0)) {
	    rbuf[0] = ALIVE_REQUEST_ERROR;
	    driver_output(erlang_port, rbuf, 1);
	    driver_failure(erlang_port, -1);
	}
	return(0);
    case NEW_CONNECTION:
	strcpy(tmp, t+1);
	if ((s = cnct_node(tmp)) <0) {
	    /* We may fail due to simultaneous connections */
	    /* check if we already are connected */
	    if ((s = find_connection(tmp)) < 0) {
#ifdef MESSDEBUG
		fprintf(stderr,"tcp_drv: failed to send message to %s\n\r",
			tmp);
		/* bin_write is an internal emulator function... */
		bin_write(stderr, (t+1)+strlen(t+1) +1,30);
#endif
		bad_node(tmp);
		return(0);
	    }
	}

	mess = (t+1) + strlen(t+1) + 1;  /* beyond the NUL sign */
	while(*mess == '\0')
	    mess++; /* might be padded  */
	
	/* have to send the message as well */
	/* The packet header (4 byte length + PASS_THROUGH) is built */
	/* in the bytes just before the message */
	if ((mess - buf) <= count) {
	    *(mess-1) = PASS_THROUGH;
	    len = count -  (mess - buf) + 1;
	    put_int32(len, mess-5);
	    if (nb_write(s, mess-5, len+4) < 0 ) {
		bad_node(tmp);  /* NO valid fd in erlang yet */
		return(0);
	    }
	}
	
	activate_name(s);

	return(0);
    case PASS_THROUGH:
	s = get_int16(t+1);
	/* conn_tab has SOCKET_TAB_SIZE slots, so the last valid */
	/* index is SOCKET_TAB_SIZE - 1 */
	if (s < 0 || s >= SOCKET_TAB_SIZE) return(-1);
	if (!conn_tab[s]) {
	    bad_fd(s); 
	    return(0);
	}
	/* Overwrite the 3 byte prefix (and one more byte) with the */
	/* 4 byte length, followed by the PASS_THROUGH tag */
	put_int32(count-4, t);
	t[4] = PASS_THROUGH;
	if (nb_write(s, t, count) < 0) {
	    bad_fd(s); 
	    return(0);
	} 
	return(0);
    case BAD_CHANNEL:  /* Erlang  is not pleased with a channel */
	s = get_int16(t+1);
	bad_fd(s);
	return(0);
    case TICK:  

	/* This will happen every 15 seconds (by default) */
	/* The idea here is that every 15 secs, we write a little */
	/* something on all fd's we haven't written anything on for */
	/* the last 15 secs */
	/* This will ensure that nodes that are not responding due to */
	/* hardware errors (Or being suspended by means of ^Z) will */
	/* be considered to be down. If we do not want to have this  */
	/* we must start the net_kernel (in erlang) without its */
	/* ticker process, In that case this code will never run */

	/* And then every 60 seconds per fd we also check the fds and */
	/* close it if we haven't received anything on it for the */
	/* last 60 secs. If ticked == tick we haven't received anything */
        /* on this fd the last 60 secs. */

        /* The detection time interval is thus, by default, 45s < DT < 75s */

        /* A HIDDEN node is always (if not a pending write) ticked if */
        /* no TICK_READ exists as a hidden node only ticks when it receives */
        /* a TICK !! */
	
	tick++;

	put_int32(0, fourbytes);   /* a tick is an empty message */
	for(s=0; s<SOCKET_TAB_SIZE; s++) {
	    if (!conn_tab[s]) 
		continue;

	    if (!(conn_tab[s]->status & TICK_READ)) {
	        if (conn_tab[s]->ticked == tick) {
                    /* We found a dead fd  */
		    
		    tcp_error("** Node %s not responding **\n",
		              conn_tab[s]->nodename);
		    tcp_error("** Removing (timedout) connection **\n");
		    bad_fd(s);   /* Dead */
		    continue;
		}
		else if ((conn_tab[s]->status & HIDDEN) &&
			 (!FD_ISSET(s, &pending_write_mask))) {
		    nb_write(s, fourbytes, 4);
		    conn_tab[s]->status &= ~TICK_WRITE;   /* Clear our TICK write */
		    continue;
		}
	    }
	    else {
	      conn_tab[s]->ticked = tick;
	      conn_tab[s]->status &= ~TICK_READ;  /* Clear flag */
	    }

	    if (conn_tab[s]->status & TICK_WRITE) {
		conn_tab[s]->status &= ~TICK_WRITE;   /* Clear */
		continue;
	    }
	    if (FD_ISSET(s, &pending_write_mask))
	        continue;

	    nb_write(s, fourbytes, 4);
	    conn_tab[s]->status &= ~TICK_WRITE;   /* Clear our TICK write */
	}

	if (tick == 4) tick = 0;

	return(0);
    default:
#ifdef DEBUG
	fprintf(stderr, "tcp_drv: unknown op code from erlang %c\n\r", *t);
#endif
	driver_failure(erlang_port, -1);
	return(0);
    }
}


/* ARGSUSED */
/* Driver input callback: fd is readable. */
/* A half read message is continued first; otherwise input on the */
/* listen socket means a new connection, and input on any other */
/* socket a new message.  Always returns 0. */
static int tcp_net_inp(port, fd)
long port;
int fd;
{
    int zero = 0;

    if (!FD_ISSET(fd, &pending_reads)) {
	if (fd == listensock)
	    new_connection(&zero, 0, &zero); 
	else
	    new_net_mess(fd);	
	return(0);
    }

    /* unfinished business */
    if (conn_tab[fd]->remain && nb_read(fd, conn_tab[fd]->remain) == READY)
	driver_output(erlang_port, conn_tab[fd]->buf, conn_tab[fd]->end);
    return(0);
}


/* ARGSUSED */
static int do_scheduled_write(port, fd)
long port;
int fd;
{
    int wval;
    
    Pend* p = pending;
    while(p) {
	if (p->fd == fd) {
	    conn_tab[fd]->status |= TICK_WRITE;
	    wval = write(fd, p->buf + p->cpos, p->remain);
	    if (wval == p->remain) {                /* phuu all gone */
		Pend *p2;
		int found = 0;
		p2 = p->next;
		while(p2) {
		    if(p2->fd == p->fd)
			found = 1;
		    p2 = p2->next;
		}
		if (!found) {
		    byte buf[3];
		    *buf = NOT_REAL_BUSY;
		    put_int16(fd, buf+1);
		    driver_output(erlang_port, buf, 3);
		    FD_CLR(fd, &pending_write_mask);
		    driver_select(erlang_port, fd, DO_WRITE, 0);
		}
		remove_from_queue(p);
		return(0);
	    }
	    if (wval < p->remain && wval >= 0) {
		p->cpos += wval;
		p->remain -= wval;
		return(0);
	    }
	    if (wval < 0 && (errno == ERRNO_BLOCK || errno == EINTR)) {
		return(0);
	    }
#ifdef DEBUG
	    fprintf(stderr, "tcp: do_scheduled_write failed on %d\n\r", fd);
	    fprintf(stderr, "remain = %d , cpos = %d ", p->remain, p->cpos);
#endif
	    bad_fd(fd);
	    return(0);
	}
	p = p->next;
    }
#ifdef DEBUG
    fprintf(stderr, "tcp: no pend queue\n\r");
#endif
    return(0);
}
