/*+++++++++++
  tokenize.c: functions to split a command line into tokens and stash them
  into an array similar to argc/argv
  markus@mhoenicka.de 2-17-00

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "strfncs.h"
#include "linklist.h"
#include "tokenize.h"

/* Fallback prototypes: if the build configuration does not define
   HAVE_ATOLL / HAVE_STRTOLL, the functions are declared here. No
   definition is visible in this file; presumably replacements are
   supplied elsewhere in the project -- TODO confirm */
#ifndef HAVE_ATOLL
long long atoll(const char *str);
#endif

#ifndef HAVE_STRTOLL
long long strtoll(const char *nptr, char **endptr, int base);
#endif

/* forward declaration of local functions */
static char* next_unescaped_char_in_string(const char* string, const char* chars);


/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  cmdln_end_token(): local helper for cmdln_tokenize(). Terminates
  the current token at the given delimiter and locates the start of
  the token that follows.

  static char* cmdln_end_token returns a ptr to the start of the next
  token (past its opening quotation mark, if it has one), or NULL if
  the current token was the last one

  char *delimiter ptr to the character that ends the current token;
  it is overwritten with \0. May be NULL if the current token runs to
  the end of the command line

  const char *eostring ptr to the terminating \0 of the command line

  int *ptr_quot0 incremented if the next token starts with a single
  quote

  int *ptr_quot1 incremented if the next token starts with a double
  quote

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
static char* cmdln_end_token(char *delimiter, const char *eostring, int *ptr_quot0, int *ptr_quot1) {
  char *next;

  if (delimiter == NULL) { /* current token runs to the end of the line */
    return NULL;
  }

  *delimiter = '\0';
  if (delimiter+1 == eostring) { /* nothing follows the delimiter */
    return NULL;
  }

  next = stripwhite(delimiter+1, 1, 0);
  if (*next == '\'') {
    (*ptr_quot0)++;
    next++;
  }
  else if (*next == '\"') {
    (*ptr_quot1)++;
    next++;
  }
  return next;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  cmdln_tokenize(): splits a command line into tokens and stashes them
  in an array similar to argc/argv. Single and double quotes are
  respected to group tokens. A quotation mark preceded by an odd
  number of backslashes is considered escaped and does not end a
  token.

  int cmdln_tokenize returns 0 if successful, 1 if out of memory, 2 if
  command line empty. In the latter case no token is added to the
  array.

  int *inargc ptr to a counter for the tokens. Should be initialized
  to 0 before you call this function, unless you add to an existing
  array.

  char ***ptr_inargv ptr to the array of ptrs to the token
  strings. This array must be allocated with malloc() before you call
  this function. It must be able to hold at least inargcmax string
  entries. If more than inargcmax tokens are found, the array grows
  dynamically. The stored ptrs point into inbuffer, they are not
  copies.

  int inargcmax size of ***ptr_inargv array (number of entries). The array
  grows in increments of inargcmax, i.e. selecting a higher inargcmax
  reduces the number of calls to realloc().

  char *inbuffer buffer holding the command line. This buffer will be
  modified while parsing, so keep a copy before you call this function
  if you need the original afterwards

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int cmdln_tokenize(int *inargc, char ***ptr_inargv, int inargcmax, char* inbuffer) {
  int quot0_detect = 0; /* > 0: the next token started with a single quote */
  int quot1_detect = 0; /* > 0: the next token started with a double quote */
  int inargc_increment;
  char *this_token;
  char *next_token = NULL;
  char *delimiter;
  char *start;
  char *eostring;
  char **resizedinargv; /* temporary inargv for realloc */

  /* todo: honour escaping backslashes outside of the delimiter search */

  /* clean up the end of the line first (stripwhite() is defined
     elsewhere; mode 2 presumably trims trailing whitespace in place) */
  stripwhite(inbuffer, 2, 0);

  /* strpbrk() never returns a ptr to the terminating \0, so an empty
     line has to be detected explicitly */
  if (*inbuffer == '\0') {
    return 2;
  }

  /* save the end of the original string; the string proper will be
     chopped into pieces below */
  eostring = &inbuffer[strlen(inbuffer)];

  inargc_increment = inargcmax; /* save increment for later use */

  /* in the first round we look for the command, this must not
     have quotes, so we ignore them for now. Only a space actually
     splits off the command; if ~, \n, or \r is found first, the
     whole line is treated as a single token */
  this_token = inbuffer;
  delimiter = strpbrk(inbuffer, " ~\n\r");
  if (delimiter != NULL && *delimiter == ' ') {
    *delimiter = '\0';
    if (*(delimiter+1) != '\0') { /* string continues */
      next_token = stripwhite(delimiter+1, 1, 0);
      if (*next_token == '\'') { /* single quote detected */
        quot0_detect++;
        *next_token = '\0';
        next_token += 1;
      }
      else if (*next_token == '\"') { /* double quote detected */
        quot1_detect++;
        *next_token = '\0';
        next_token += 1;
      }
    }
  }

  while (this_token != NULL) { /* loop until string ends */

    /* save token in token array */
    (*ptr_inargv)[(*inargc)++] = stripwhite(this_token, 1, 0);

    /* check size of array and adjust if necessary */
    if (*inargc == inargcmax) {
      inargcmax += inargc_increment;
      resizedinargv = (char**)realloc(*ptr_inargv,
                                      (size_t)inargcmax*sizeof(char*));
      if (resizedinargv == NULL) {
        return 1; /* the old array is still valid and owned by the caller */
      }
      *ptr_inargv = resizedinargv;
    }

    /* prepare next round */
    if (next_token == NULL) { /* if string ends */
      this_token = NULL;
    }
    else if (quot0_detect) { /* token is enclosed in single quotes */
      quot0_detect--;
      delimiter = next_unescaped_char_in_string(next_token, "\'");
      this_token = next_token;
      next_token = cmdln_end_token(delimiter, eostring,
                                   &quot0_detect, &quot1_detect);
    }
    else if (quot1_detect) { /* token is enclosed in double quotes */
      quot1_detect--;
      delimiter = next_unescaped_char_in_string(next_token, "\"");
      this_token = next_token;
      next_token = cmdln_end_token(delimiter, eostring,
                                   &quot0_detect, &quot1_detect);
    }
    else { /* plain token, ends at whitespace or at a quotation mark */
      delimiter = next_unescaped_char_in_string(next_token, "\"\' \n\r");
      this_token = next_token;

      if (delimiter == NULL) { /* end of string */
        next_token = NULL;
      }
      else if (*delimiter == '\'' || *delimiter == '\"') {
        /* a quotation mark right after the token */
        if (*delimiter == '\'') {
          quot0_detect++;
        }
        else {
          quot1_detect++;
        }
        next_token = cmdln_end_token(delimiter, eostring,
                                     &quot0_detect, &quot1_detect);
      }
      else {
        /* space, \n, or \r. The line breaks are handled like a space;
           if they were left unhandled, this_token would never advance
           and the loop would not terminate */
        *delimiter = '\0';
        if (*(delimiter+1) == '\0') {
          next_token = NULL;
        }
        else {
          start = stripwhite(delimiter+1, 1, 0);
          if (*start == '\'' || *start == '\"') {
            /* the following token is quoted: drop the opening mark */
            if (*start == '\'') {
              quot0_detect++;
            }
            else {
              quot1_detect++;
            }
            *start = '\0';
            if (start+1 == eostring) {
              next_token = NULL;
            }
            else {
              next_token = stripwhite(start+1, 1, 0);
            }
          }
          else {
            next_token = (start == eostring) ? NULL : start;
          }
        }
      }
    }
  }
  return 0;
}


/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  next_unescaped_char_in_string(): scans string for the first
                                  occurrence of any of the characters
                                  in chars that is not escaped. A
                                  character counts as escaped if it
                                  is directly preceded by an odd
                                  number of backslashes

  char* next_unescaped_char_in_string returns a ptr to the character
                                  within string, or NULL if there is
                                  no unescaped occurrence

  const char* string ptr to the string to search in

  const char* chars string containing all characters to look for

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
static char* next_unescaped_char_in_string(const char* string, const char* chars) {
  const char* pos;
  int pending_backslashes = 0; /* length of the backslash run right before pos */

  for (pos = string; *pos != '\0'; pos++) {
    if (*pos == '\\') {
      pending_backslashes++;
      continue;
    }

    if (strchr(chars, (int)*pos) != NULL
        && pending_backslashes % 2 == 0) {
      /* a wanted character, and the backslashes in front of it (if
         any) pair up, i.e. they escape each other */
      return (char*)pos;
    }

    /* either an ordinary character or an escaped match: in both
       cases the backslash run is over */
    pending_backslashes = 0;
  }

  return NULL; /* character not found */
}


/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  sql_tokenize(): splits a pseudo-SQL request into tokens. Call
                  repeatedly and give the sqltoken.next_token of the
                  previous iteration as an argument for inbuffer,
                  until sqltoken.next_token is NULL. The field name
                  separator is currently set to : (colon)

  char* sql_tokenize returns a pointer to the next token, or NULL
                  if no token is found. The token is not
                  \0-terminated, use the length in the structure

  char *inbuffer buffer holding the command line. May be NULL or
                  empty, in which case NULL is returned and the
                  structure is left alone. Backslashes that escape
                  quotation marks in values are removed in place

  struct SQLTOKEN *ptr_sqltoken pointer to a structure which will be
                  filled with the length and the type of the token
                  and with the start of the following token, if any.
                  The types assigned here are:
                  0 value (quoted or plain)
                  1 logical operator ( AND ,  OR ,  NOT )
                  2 opening parenthesis
                  3 closing parenthesis
                  4 field name with comparison operator (:XX:=)
  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
char *sql_tokenize(char* inbuffer, struct SQLTOKEN *ptr_sqltoken) {
  int i = 0;
  size_t j;
  int have_token = 0;
  char *this_token = NULL;
  char *next_token[4];
  char *endtoken;

  if (!inbuffer || ! *inbuffer) {
    return NULL;
  }

  while (!have_token) {

    if (inbuffer[i] == '(') {
      /* an opening paren extends up to the next field name */
      have_token = 1;
      this_token = &inbuffer[i];
      ptr_sqltoken->next_token = strchr(&inbuffer[i], (int)':');
      ptr_sqltoken->type = 2;
      if (ptr_sqltoken->next_token != NULL) {
        ptr_sqltoken->length = (int)(ptr_sqltoken->next_token-&inbuffer[i]);
      }
      else {
        ptr_sqltoken->length = strlen(&inbuffer[i]);
      }
    }
    else if (inbuffer[i] == ')') {
      /* a closing paren extends up to the next logical operator */
      have_token = 1;
      this_token = &inbuffer[i];
      next_token[0] = strstr(&inbuffer[i], " AND ");
      next_token[1] = strstr(&inbuffer[i], " OR ");
      next_token[2] = strstr(&inbuffer[i], " NOT ");

      /* compare_ptr() is defined elsewhere; the code relies on it to
         sort the nearest match to the front */
      qsort(next_token, 3, sizeof(char*), compare_ptr);
      ptr_sqltoken->next_token = next_token[0];
      ptr_sqltoken->type = 3;
      if (ptr_sqltoken->next_token != NULL) {
        ptr_sqltoken->length = (int)(ptr_sqltoken->next_token-&inbuffer[i]);
      }
      else {
        ptr_sqltoken->length = strlen(&inbuffer[i]);
      }
    }
    else if (strncmp(&inbuffer[i], " AND ", 5) == 0
             || strncmp(&inbuffer[i], " OR ", 4) == 0
             || strncmp(&inbuffer[i], " NOT ", 5) == 0) {
      /* a logical operator extends up to the next field name or
         opening paren, whichever comes first */
      have_token = 1;
      this_token = &inbuffer[i];
      next_token[0] = strchr(&inbuffer[i], (int)':');
      next_token[1] = strchr(&inbuffer[i], (int)'(');
      qsort(next_token, 2, sizeof(char*), compare_ptr);
      ptr_sqltoken->next_token = next_token[0];
      ptr_sqltoken->type = 1;
      if (ptr_sqltoken->next_token != NULL) {
        ptr_sqltoken->length = (int)(ptr_sqltoken->next_token-&inbuffer[i]);
      }
      else {
        ptr_sqltoken->length = strlen(&inbuffer[i]);
      }
    }
    else if (inbuffer[i] == ':') {
      size_t fieldname_len;

      /* the shortest valid token is :XX:=, followed by at least one
         more character */
      if (strlen(&inbuffer[i]) > 5) {
        /* check the length of the field name. Currently only 2 or 3 are
           permitted */
        if (inbuffer[i+3] == ':') {
          fieldname_len = 2;
        }
        else {
          fieldname_len = 3;
        }
        /* ToDo: use strcspn() if we ever allow longer field names */

        /* inbuffer[i+fieldname_len+2] is the first character of the
           comparison operator; != !~ = ~ < <> <= <~ > >= >~ are
           recognized */
        if (inbuffer[i+fieldname_len+2] == '!') {
          if (inbuffer[i+fieldname_len+3] == '='
              || inbuffer[i+fieldname_len+3] == '~') {
            have_token = 1;
            this_token = &inbuffer[i];
            ptr_sqltoken->length = fieldname_len+4;
          }
        }
        else if (inbuffer[i+fieldname_len+2] == '='
                 || inbuffer[i+fieldname_len+2] == '~') {
          have_token = 1;
          this_token = &inbuffer[i];
          ptr_sqltoken->length = fieldname_len+3;
        }
        else if (inbuffer[i+fieldname_len+2] == '<') {
          have_token = 1;
          this_token = &inbuffer[i];
          if (inbuffer[i+fieldname_len+3] == '>'
              || inbuffer[i+fieldname_len+3] == '='
              || inbuffer[i+fieldname_len+3] == '~') {
            ptr_sqltoken->length = fieldname_len+4;
          }
          else {
            ptr_sqltoken->length = fieldname_len+3;
          }
        }
        else if (inbuffer[i+fieldname_len+2] == '>') {
          have_token = 1;
          this_token = &inbuffer[i];
          if (inbuffer[i+fieldname_len+3] == '='
              || inbuffer[i+fieldname_len+3] == '~') {
            ptr_sqltoken->length = fieldname_len+4;
          }
          else {
            ptr_sqltoken->length = fieldname_len+3;
          }
        }
      }
      if (have_token) {
        ptr_sqltoken->next_token = &inbuffer[i+ptr_sqltoken->length];
        ptr_sqltoken->type = 4;
      }
      /* else: not a valid field specification, skip the colon and
         keep looking */
    }
    else if (inbuffer[i] == '\'') {
      /* a quoted value; the token starts after the opening quote */
      have_token = 1;
      ptr_sqltoken->type = 0;
      this_token = &inbuffer[i+1];

      /* find the closing quote, skipping escaped ones (\'). The
         search gives up at the end of the string, so a missing
         closing quote does not hang the function */
      endtoken = strchr(this_token, (int)'\'');
      while (endtoken != NULL && *(endtoken-1) == '\\') {
        endtoken = strchr(endtoken+1, (int)'\'');
      }

      if (endtoken != NULL) {
        next_token[0] = strstr(endtoken, " AND ");
        next_token[1] = strstr(endtoken, " OR ");
        next_token[2] = strstr(endtoken, " NOT ");
        next_token[3] = strchr(endtoken, (int)')');
        qsort(next_token, 4, sizeof(char*), compare_ptr);

        ptr_sqltoken->next_token = next_token[0];
        ptr_sqltoken->length = (int)(endtoken-this_token);
      }
      else {
        /* unterminated quote: the value is the remainder of the
           string and nothing can follow it */
        ptr_sqltoken->next_token = NULL;
        ptr_sqltoken->length = strlen(this_token);
      }
    }
    else { /* this is obviously some value */
      have_token = 1;
      ptr_sqltoken->type = 0;
      this_token = &inbuffer[i];
      next_token[0] = strstr(&inbuffer[i], " AND ");
      next_token[1] = strstr(&inbuffer[i], " OR ");
      next_token[2] = strstr(&inbuffer[i], " NOT ");
      next_token[3] = strchr(&inbuffer[i], (int)')');
      /* make sure this is not an escaped paren
         TODO: this is not foolproof as we could have \\) */
      while (next_token[3]
             && *(next_token[3]-1) == '\\') {
        next_token[3] = strchr(next_token[3]+1, (int)')');
      }
      qsort(next_token, 4, sizeof(char*), compare_ptr);

      ptr_sqltoken->next_token = next_token[0];
      if (ptr_sqltoken->next_token != NULL) {
        ptr_sqltoken->length = (int)(ptr_sqltoken->next_token-&inbuffer[i]);
      }
      else {
        ptr_sqltoken->length = strlen(&inbuffer[i]);
      }
    }
    i++;
    if (inbuffer[i] == '\0' && !have_token) { /* don't read past the end */
      this_token = NULL;
      have_token = 1;
      ptr_sqltoken->type = 0;
      ptr_sqltoken->next_token = NULL;
      ptr_sqltoken->length = 0;
    }
  }

  if (this_token && ptr_sqltoken->type == 0) {
    /* strip escaping backslashes from values. Each removal shifts the
       rest of the token one position to the left and shortens it */
    for (j = 0; j < ptr_sqltoken->length; j++) {
      if (*(this_token+j) == '\\'
          && (*(this_token+j+1) == '\''
              || *(this_token+j+1) == '\"')) {
        memmove(this_token+j, this_token+j+1, ptr_sqltoken->length-j);
        ptr_sqltoken->length--;
      }
    }
  }
  return this_token;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  link_tokenize(): splits a link request into tokens. Call
                  repeatedly and give the sqltoken.next_token of the
                  previous iteration as an argument for inbuffer,
                  until sqltoken.next_token is NULL. The field name
                  separator is currently set to : (colon)

  char* link_tokenize returns a pointer to the next token, or NULL
                  if no token is found. The token is not
                  \0-terminated, use the length in the structure

  char *inbuffer buffer holding the command line. If it is NULL, the
                  function returns NULL and leaves the structure
                  alone

  struct SQLTOKEN *ptr_sqltoken pointer to a structure which will be
                  filled with the length and the type of the token
                  and with the start of the following token, if any.
                  The types assigned here are:
                  0 value (quoted or plain)
                  4 field name with comparison operator (:XX:=)
  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
char *link_tokenize(char* inbuffer, struct SQLTOKEN *ptr_sqltoken) {
  int i = 0;
  int have_token = 0;
  char *this_token = NULL;
  char *endtoken = NULL;

  if (!inbuffer) {
    return NULL;
  }

  while (!have_token) {
    if (inbuffer[i] == ':') {
      size_t fieldname_len;

      /* the shortest valid token is :XX:=, followed by at least one
         more character */
      if (strlen(&inbuffer[i]) > 5) {
        /* check the length of the field name. Currently only 2 or 3 are
           permitted */
        if (inbuffer[i+3] == ':') {
          fieldname_len = 2;
        }
        else {
          fieldname_len = 3;
        }
        /* ToDo: use strcspn() if we ever allow longer field names */

        /* inbuffer[i+fieldname_len+2] is the first character of the
           comparison operator; != = < <> <= > >= are recognized */
        if (inbuffer[i+fieldname_len+2] == '!') {
          if (inbuffer[i+fieldname_len+3] == '=') {
            have_token = 1;
            this_token = &inbuffer[i];
            ptr_sqltoken->length = fieldname_len+4;
          }
        }
        else if (inbuffer[i+fieldname_len+2] == '=') {
          have_token = 1;
          this_token = &inbuffer[i];
          ptr_sqltoken->length = fieldname_len+3;
        }
        else if (inbuffer[i+fieldname_len+2] == '<') {
          have_token = 1;
          this_token = &inbuffer[i];
          if (inbuffer[i+fieldname_len+3] == '>' || inbuffer[i+fieldname_len+3] == '=') {
            ptr_sqltoken->length = fieldname_len+4;
          }
          else {
            ptr_sqltoken->length = fieldname_len+3;
          }
        }
        else if (inbuffer[i+fieldname_len+2] == '>') {
          have_token = 1;
          this_token = &inbuffer[i];
          if (inbuffer[i+fieldname_len+3] == '=') {
            ptr_sqltoken->length = fieldname_len+4;
          }
          else {
            ptr_sqltoken->length = fieldname_len+3;
          }
        }
      }
      if (have_token) {
        ptr_sqltoken->next_token = &inbuffer[i+ptr_sqltoken->length];
        ptr_sqltoken->type = 4;
      }
      /* else: not a valid field specification, skip the colon and
         keep looking */
    }
    else if (inbuffer[i] == '\'' && (i == 0 || inbuffer[i-1] != '\\')) {
      /* an unescaped opening quote. The very first character has no
         predecessor inside inbuffer, so it cannot be escaped */
      char *end;

      have_token = 1;
      ptr_sqltoken->type = 0;
      this_token = &inbuffer[i+1];
      end = this_token;

      /* loop until we find an unescaped matching end quote. A quote
         that directly follows a backslash is skipped even if
         next_unescaped_char_in_string() accepts it */
      while (end != NULL) {
        endtoken = next_unescaped_char_in_string(end, "'"); /* jump to next ' */
        if (!endtoken || *(endtoken-1) != '\\') {
          end = NULL;
        }
        else {
          end = endtoken+1;
        }
      }

      if (endtoken != NULL) {
        ptr_sqltoken->next_token = strstr(endtoken, " ");
        ptr_sqltoken->length = (int)(endtoken-this_token);
      }
      else {
        /* unterminated quote: the value is the remainder of the
           string and nothing can follow it */
        ptr_sqltoken->next_token = NULL;
        ptr_sqltoken->length = strlen(this_token);
      }
    }
    else if (inbuffer[i] == ' ') {
      /* do nothing, will skip */
    }
    else { /* this is obviously some value */
      /* a plain value extends up to the next space */
      have_token = 1;
      ptr_sqltoken->type = 0;
      this_token = &inbuffer[i];
      ptr_sqltoken->next_token = strstr(&inbuffer[i], " ");

      if (ptr_sqltoken->next_token != NULL) {
        ptr_sqltoken->length = (int)(ptr_sqltoken->next_token-&inbuffer[i]);
      }
      else {
        ptr_sqltoken->length = strlen(&inbuffer[i]);
      }
    }
    i++;
    if (inbuffer[i] == '\0' && !have_token) { /* don't read past the end */
      this_token = NULL;
      have_token = 1;
      ptr_sqltoken->type = 0;
      ptr_sqltoken->next_token = NULL;
      ptr_sqltoken->length = 0;
    }
  }
  return this_token;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  nstrtok(): a strtok() replacement that keeps no state between
             calls. Pass the string itself in the first call and
             previous_token+len in all subsequent calls.
             Unlike strtok(), nstrtok() leaves the string untouched,
             so the token is not terminated. Copy it if you need a
             terminated string, e.g.:
             strncpy(buffer, token, len);
             buffer[len] = '\0';
             Just like with strtok(), each call may use a different
             set of delimiters

  char* nstrtok returns a pointer to the start of the next token, or
             NULL if value is empty or contains nothing but
             delimiters

  char* value the string to search in

  size_t* ptr_len ptr to a variable that will receive the length of
             the token, or 0 if no token is found

  char* delim ptr to a string consisting of the token-delimiting
             characters

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
char* nstrtok(char* value, size_t* ptr_len, char* delim) {
  char *token_start;

  /* step over any delimiters in front of the token */
  token_start = value + strspn(value, delim);

  if (*token_start == '\0') {
    /* the string was empty or held delimiters only */
    *ptr_len = 0;
    return NULL;
  }

  /* the token runs up to the next delimiter or to the end of the
     string, whichever comes first */
  *ptr_len = strcspn(token_start, delim);
  return token_start;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  ris_strtok(): works like nstrtok() above, but a delimiter ends the
                token only if the text after it looks like the start
                of a new RIS line, i.e. if the delimiter is followed
                by two arbitrary characters and the sequence "  - ".
                Lines that do not start with a tag are thus glued to
                the previous line, e.g. the following lines:
                N1  - This is a note
                written in two lines

                will be returned as a single token. A delimiter with
                six or less characters up to the end of the string
                always ends the token.
                In contrast to nstrtok(), this function may modify
                the string: a CR right in front of a delimiter that
                does not end the token is replaced by a space

  char* ris_strtok returns a pointer to the next token, or NULL
                if no token is found. The token is not terminated

  char* value the string to search in

  size_t* ptr_len ptr to a variable that will receive the length of a token
                or 0 if no token is found

  char* delim ptr to a string consisting of the token-delimiting
                characters

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
char* ris_strtok(char* value, size_t* ptr_len, char* delim) {
  char *token_start;
  char *scan;
  char *eostring;

  /* step over any delimiters in front of the token */
  token_start = value + strspn(value, delim);

  if (*token_start == '\0') {
    /* the string was empty or held delimiters only */
    *ptr_len = 0;
    return NULL;
  }

  eostring = token_start + strlen(token_start); /* position of terminating \0 */

  /* token_start is not a delimiter, so scan-1 below never leaves
     the token */
  for (scan = token_start; scan < eostring; scan++) {
    if (strchr(delim, (int)*scan) == NULL) {
      continue; /* ordinary character */
    }

    /* too close to the end of the string for another tag */
    if (eostring - scan <= 6) {
      break;
    }

    /* the next line starts with a RIS tag */
    if (strncmp(scan+3, "  - ", 4) == 0) {
      break;
    }

    /* continuation line: replace any CR preceding the LF by a
       space and carry on */
    if (*(scan-1) == '\r') {
      *(scan-1) = ' ';
    }
  }

  *ptr_len = scan-token_start;
  return token_start;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  string_tokenize(): splits a string into tokens and stashes them
                     in an array similar to argc/argv. This implementation
                     relies on the fact that all printable characters
                     except the space have ASCII values 33 and up. The
                     tokens are assumed to be separated by whitespace,
                     which is \t (9), \n (10), \r (13), SPC (32); in
                     fact every byte with a value of 32 or less is
                     treated as a separator. Bytes with values of 128
                     and up (e.g. the bytes of UTF-8 multibyte
                     characters) are part of a token, regardless of
                     whether char is signed on the platform. Do not
                     use this function where this coarse separation is not
                     sufficient

  int string_tokenize returns 0 if successful, 1 if out of memory, 2 if
                     string empty or whitespace only

  int *inargc ptr to a counter for the tokens. Should be initialized
                     to 0 before you call this function, unless you add
                     to an existing array.

  char ***ptr_inargv ptr to the array of ptrs to the token strings. This
                     array must be allocated with malloc() before you call
                     this function. It must be able to hold at least
                     inargcmax string entries. If more than inargcmax
                     tokens are found, the array grows dynamically.
                     The stored ptrs point into inbuffer, they are
                     not copies.

  int inargcmax size of ***ptr_inargv array (number of entries). The array
                     grows in increments of inargcmax, i.e. selecting a
                     higher inargcmax reduces the number of calls to realloc().

  char *inbuffer buffer holding the string. This buffer will be
                     modified while parsing, so keep a copy before you call
                     this function if you need the original afterwards

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int string_tokenize(int *inargc, char ***ptr_inargv, int inargcmax, char* inbuffer) {
  int num_tokens = 0;
  int inargc_increment;
  char *token;
  char *eof_string;
  char **new_inargv;

  inargc_increment = inargcmax; /* save increment for later use */
  token = inbuffer;
  eof_string = &inbuffer[strlen(inbuffer)]; /* points to the terminating \0 */

  while (1) {
    /* search for the start of a token. The comparison is done on
       unsigned char, otherwise bytes >= 128 would be negative on
       platforms with a signed char and pass for whitespace */
    while (token < eof_string && (unsigned char)*token < 33) {
      token++;
    }

    if (token == eof_string) {
      /* no further token; it's an error only if there was none at all */
      return num_tokens ? 0 : 2;
    }

    /* we obviously are at the start of a token. Save a pointer */
    (*ptr_inargv)[(*inargc)++] = token;
    num_tokens++;

    /* check size of array and adjust if necessary */
    if (*inargc == inargcmax) {
      inargcmax += inargc_increment;
      new_inargv = (char**)realloc(*ptr_inargv,
                                   (size_t)inargcmax*sizeof(char*));
      if (new_inargv == NULL) {
        return 1; /* the old array is still valid and owned by the caller */
      }
      *ptr_inargv = new_inargv;
    }

    /* Look for the end */
    while (token < eof_string && (unsigned char)*token > 32) {
      token++;
    }

    if (token == eof_string) {
      /* the last token is terminated by the end of the string */
      return 0;
    }

    *token = '\0'; /* terminate token */
    token++;
  }
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  tokenize_author(): fills a STRUCT with pointers to the parts of a name
                     The name is expected in the form
                     "Surname,First Middle...,Suffix"; everything
                     after the first comma is optional. The author
                     string is modified in place: commas, separating
                     spaces, and the periods of initials are
                     overwritten with \0.
                     If a part is missing, the corresponding ptr will
                     point to an empty string (actually the end of
                     the author string). Thus the calling function
                     could just concatenate the parts without testing
                     for their presence if this makes any sense. If you
                     want to test for their presence, check the length
                     The middle names are implemented as a linked list in
                     case there is more than one (the NO researchers
                     among us certainly remember "Triple-H" Schmidt!)
                     The list sentinel is allocated here, call
                     clean_authortokens() when you are done, also if
                     adding a middle name failed

  char* tokenize_author returns a ptr to the surname or NULL in case
                     of an error (author is NULL or out of memory)

  char* author ptr to the string containing the name. Will be
                     modified, see above

  struct AUTHORTOKENS* ptr_atoken ptr to a struct which will receive
                     the ptrs

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
char* tokenize_author(char* author, struct AUTHORTOKENS* ptr_atoken) {
  char* term;
  char* suffix;
  char* middle;
  char* the_end;

  if (!author) {
    return NULL;
  }

  /* let the_end point to an empty string */
  the_end = &(author[strlen(author)]);

  /* skip leading spaces */
  while (*author == ' ') {
    author++;
  }

  /* initialize the nameparts structure */
  ptr_atoken->sur = author; /* always the first, always assumed to be
                               present */
  ptr_atoken->first = the_end; /* all other parts may be missing */

  ptr_atoken->ptr_middlelist = malloc(sizeof(Lilistring));
  if (!ptr_atoken->ptr_middlelist) {
    return NULL;
  }
  ptr_atoken->ptr_middlelist->ptr_next = NULL;
  middle = the_end;

  ptr_atoken->lineage = the_end;

  if ((term = strchr(author, (int)',')) != NULL) {
    *term = '\0';
    ptr_atoken->first = term+1;

    /* first check for a suffix, starting after a second comma */
    if ((suffix = strchr(ptr_atoken->first, (int)',')) != NULL) {
      *suffix = '\0';
      ptr_atoken->lineage = suffix+1;
      while (*(ptr_atoken->lineage) == ' ') {
        ptr_atoken->lineage++;
      }
    }
    else {
      suffix = the_end;
    }

    /* suffix now points to the end of the name proper */

    if (*(ptr_atoken->first)) { /* string doesn't end here */
      /* skip leading spaces */
      while (*(ptr_atoken->first) == ' ') {
        ptr_atoken->first++;
      }

      /* now there's three options:
         - the next char is the first initial, followed by a period
         - the next char is the first initial of a hyphenated double
           name
         - the next word is the full first name */

      if (*(ptr_atoken->first) == '\0') {
        /* there were only spaces after the comma. We must not look
           at first+1 in this case as this is either past the end of
           the string or the start of the suffix */
        ptr_atoken->first = the_end;
      }
      else if (*(ptr_atoken->first + 1) == '.') {
        if (*(ptr_atoken->first + 2) == '-') {
          /* now we've got to find the end of the hyphenated name, remove
             the periods, and terminate the string appropriately */
          middle = ptr_atoken->first + 3;
          while (*middle && *middle != ' ') {
            middle++;
          }

          if (*(middle-1) == '.') {
            *(middle-1) = '\0';
          }
          else if (*middle == ' ') {
            *middle = '\0';
            middle++;
          }

          /* close the gap left by the first period, "J.-P" -> "J-P" */
          memmove(ptr_atoken->first + 1, ptr_atoken->first + 2, strlen(ptr_atoken->first + 2)+1);
        }
        else {
          /* plain initial: the period makes way for the terminator */
          *(ptr_atoken->first + 1) = '\0';
          middle = ptr_atoken->first + 2;
        }
      }
      else if (*(ptr_atoken->first + 1)) { /* string doesn't end here */
        /* got a full first name, search for end of word */
        term = ptr_atoken->first + 1;

        while (*term && *term != ' ') { /* we don't have to check for a comma
                                           as the suffix comma was already
                                           replaced by \0 if there was one */
          term++;
        }

        if (*term) { /* found space */
          *term = '\0';
          middle = term + 1;
        }

        /* else: no middle name */
      }

      if (*middle) {
        char* next_middle;

        while (middle < suffix) {
          /* skip leading spaces */
          while (*middle == ' ') {
            middle++;
          }

          /* trailing spaces may have taken us to the end of the name
             proper. Stop here, middle+1 does not belong to the name
             any more */
          if (middle >= suffix || !*middle) {
            break;
          }

          /* middle name could be abbreviated, so look for a dot */
          if (*(middle + 1) == '.') {
            if (*(middle + 2) == '-') {
              /* now we've got to find the end of the hyphenated name, remove
                 the periods, and terminate the string appropriately */
              next_middle = middle + 3;
              while (*next_middle && *next_middle != ' ') {
                next_middle++;
              }

              if (*(next_middle-1) == '.') {
                *(next_middle-1) = '\0';
              }
              else if (*next_middle == ' ') {
                *next_middle = '\0';
                next_middle++;
              }

              memmove(middle + 1, middle + 2, strlen(middle + 2)+1);
            }
            else {
              *(middle + 1) = '\0';
              next_middle = middle + 2;
            }
          }
          else {
            /* full middle name, ends at the next space or at the end
               of the name proper */
            next_middle = middle;
            while (*next_middle && *next_middle != ' ') {
              next_middle++;
            }
            if (*next_middle == ' ') {
              *next_middle = '\0';
              next_middle++;
            }
          }

          if (middle < suffix && *middle) {
            if (append_lilistring(ptr_atoken->ptr_middlelist, middle)) {
              return NULL;
            }
          }
          middle = next_middle;
        }
      }
      /* else: there was only a first initial */
    }
    else {
      ptr_atoken->first = the_end; /* no firstname in spite of comma */
    }
  }
  /* else: some cultures like India use only one name, so lastname is all we have */

  /* we strip whitespace only at the ends, the starts have been taken
     care of before */
  if (*(ptr_atoken->sur)) {
    stripwhite(ptr_atoken->sur, 2, 0);
  }
  if (*(ptr_atoken->first)) {
    stripwhite(ptr_atoken->first, 2, 0);
  }
  if (*(ptr_atoken->lineage)) {
    stripwhite(ptr_atoken->lineage, 2, 0);
  }

  return ptr_atoken->sur;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  clean_authortokens(): releases the linked list of middle names that
                        hangs off a tokenized author, including the
                        list sentinel. The ptr to the list is reset
                        to NULL, so calling the function twice on the
                        same struct is harmless

  void clean_authortokens returns nothing

  struct AUTHORTOKENS* ptr_atoken ptr to a struct which holds the
                     the ptrs to the author name parts

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
void clean_authortokens(struct AUTHORTOKENS* ptr_atoken) {
  if (!ptr_atoken->ptr_middlelist) {
    return; /* no list, nothing to do */
  }

  /* first the entries, then the sentinel itself */
  delete_all_lilistring(ptr_atoken->ptr_middlelist);
  free(ptr_atoken->ptr_middlelist);
  ptr_atoken->ptr_middlelist = NULL;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  string_tokenize_lili(): splits a string containing whitespace-separated
                     numbers into tokens and stashes them
                     into a linked list. Numeric ranges ("3-7") are
                     expanded, tokens that do not convert to a positive
                     number are ignored (this includes the value 0).
                     This implementation relies on the fact that all
                     printable characters except the space have ASCII
                     values 33 and up. The tokens are assumed to be
                     separated by whitespace, which is \t (9), \n (10),
                     \r (13), SPC (32). Do not use this function where
                     this coarse separation is not sufficient

                     For a range the values start+1 through end are
                     inserted first, followed by start itself. A range
                     is expanded only if both ends are positive; if the
                     end is missing, zero, or negative, only the start
                     value is inserted.

  int string_tokenize_lili returns 0 if successful, 1 if out of memory,
                     2 if string empty (no token found, or inbuffer
                     is NULL)

  Lilid *ptr_first ptr to the sentinel of a linked list that will
                     store the numbers

  char *inbuffer buffer holding the string. This buffer will be
                     modified while parsing, so keep a copy before you call
                     this function if you need the original afterwards

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int string_tokenize_lili(Lilid *ptr_first, char *inbuffer) {
  int retval = -1;
  int num_tokens = 0;
  long long n_rangestart;
  long long n_rangeend;
  long long n_value;
  unsigned long long ulonglong_i;
  char *token;
  char *token_start;
  char *eof_string;
  char *rangesep;

  if (inbuffer == NULL) {
    return 2; /* nothing to parse, treat like an empty string */
  }

  token = inbuffer;
  eof_string = &inbuffer[strlen(inbuffer)]; /* points to the terminating \0 */

  while (retval < 0) {
    /* search for the start of a token. All whitespace has ASCII values
       32 or less. NOTE: where plain char is signed, bytes >= 0x80
       compare as negative and are therefore skipped like whitespace */
    while (token < eof_string && *token < 33) {
      token++;
    }

    if (token == eof_string) {
      /* ran out of input: ok if we saw at least one token before */
      return (num_tokens) ? 0 : 2;
    }

    /* we obviously are at the start of a token */
    token_start = token;
    num_tokens++;

    /* Look for the end */
    while (token < eof_string && *token > 32) {
      token++;
    }

    if (token == eof_string) {
      /* last token; it is already terminated by the end of the string.
         Process it below, then leave the loop */
      retval = 0;
    }
    else {
      *token = '\0'; /* terminate token */
      token++;
    }

    /* see whether token is a range. The first hyphen is taken as the
       range separator and overwritten, so token_start holds only the
       range start afterwards */
    if ((rangesep = strchr(token_start, '-')) != NULL) {
      *rangesep = '\0';

      /* convert as signed values: a second hyphen ("5--3") yields a
         negative end which must not wrap around to a huge unsigned
         number and blow up the expansion loop. strtoll() clamps on
         overflow instead of invoking undefined behaviour */
      n_rangestart = strtoll(token_start, NULL, 10);
      n_rangeend = strtoll(rangesep + 1, NULL, 10);

      if (n_rangestart > 0 && n_rangeend > 0) {
        /* both bounds are at most LLONG_MAX here, so neither the
           increment nor the loop condition can wrap */
        for (ulonglong_i = (unsigned long long)n_rangestart + 1;
             ulonglong_i <= (unsigned long long)n_rangeend;
             ulonglong_i++) {
          if (insert_lilid(ptr_first, ulonglong_i)) {
            return 1; /* out of memory */
          }
        }
      }
    }

    /* plain number, or the start of a range */
    n_value = strtoll(token_start, NULL, 10);
    if (n_value > 0) {
      ulonglong_i = (unsigned long long)n_value;
      if (insert_lilid(ptr_first, ulonglong_i)) {
        return 1; /* out of memory */
      }
    }
  } /* end while */

  return retval;
}
