%{
/*
 *   Copyright (C) 1997, 1998
 *   	Free Software Foundation, Inc.
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
/*
 * Extract the words from an HTML document, translating HTML character
 * entities to their ISO-8859-1 codes and passing every character through
 * the iso2letter conversion table.
 */

/* Head */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#include <ctype.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <isomap.h>
#include <file_size.h>

#include <bodyparse.h>

/* Smaller of two values. Function-like macro: arguments are evaluated more than once. */
#define min(a,b) ((a) < (b) ? (a) : (b))

/*
 * Start condition to resume when OTHER_TAG or COMMENT_TAG is left;
 * set by the rule that enters one of these states.
 */
static int previous_condition = INITIAL;

/* Buffer to hold the content of the text. */
static char* body = 0;
/* Total size of the area allocated in body. */
static int body_size = 0;
/* Actual length of data contained in body. */
static int body_length = 0;

/* Buffer to hold the content of the title. */
static char* title = 0;
/* Total size of the area allocated in title. */
static int title_size = 0;
/* Actual length of data contained in title. */
static int title_length = 0;

/* Buffer to hold the content of the meta_name. */
static char* meta_name = 0;
/* Total size of the area allocated in meta_name. */
static int meta_name_size = 0;
/* Actual length of data contained in meta_name. */
static int meta_name_length = 0;

#if 0
static char meta_name_tag[] = "name";
static int meta_name_tag_size = 4;
#endif 

/* Buffer to hold the content of the meta_content. */
static char* meta_content = 0;
/* Total size of the area allocated in meta_content. */
static int meta_content_size = 0;
/* Actual length of data contained in meta_content. */
static int meta_content_length = 0;

#if 0
static char meta_content_tag[] = "content";
static int meta_content_tag_size = 7;
#endif

/*
 * Meta names recognized by meta_end() and their lengths (terminating
 * null not counted). The comparison is case insensitive.
 */
static char meta_name_description[] = "description";
static int meta_name_description_size = 11;
/* keyword instead of keywords allows misspelling */
static char meta_name_keywords[] = "keyword";
static int meta_name_keywords_size = 7;

/*
 * Iso char codes conversion table. Indexed by fill() with an unsigned
 * char; bodyparse() and bodyparse_string() point it to
 * iso2letter_unaccent or iso2letter_transparent before parsing.
 */
char* iso2letter = 0;

/*
 * Meta values kept for the caller: meta_values[i] is a malloc'ed, null
 * terminated copy made by meta_end() and released by meta_end() or
 * bodyparse_init_common(); meta_values_length[i] is its length.
 */
#define META_DESCRIPTION 0
#define META_KEYWORDS 1
#define META_MAX 2
static char* meta_values[2] = { 0, 0, };
static int meta_values_length[2];

/* Current buffer being filled in. */
static char* buffer;
/* Actual length of data contained in buffer. */
static int buffer_length;

/* True if previous char was a space */
static int white_space;

/* True if body should go to output_fp */
static int in_file;

/* The file descriptor where the output should go. */
static FILE* output_fp = 0;
/*
 * Stop writing to file if this limit is reached.
 * NOTE(review): in this file the limit is only applied to the lengths
 * returned by bodyparse_string(); fill() does not test it when writing
 * to output_fp.
 */
static int output_limit = 1 * 1024 * 1024;

/* Debugging. Messages go to stderr when non zero, per character traces when > 10. */
static int verbose = 0;

/* Helpers defined after the rules section and used by the rule actions. */
static int activate(char* new_buffer, int* new_length);
static int initactivation();
static int deactivate();
static int meta_end();
static void fill_white();
static void fill(unsigned char c);
%}

%option caseless noyywrap 8bit prefix="body" outfile="lex.yy.c" nounput

%x COMMENT_TAG TITLE_TAG STYLE_TAG SCRIPT_TAG META_TAG OTHER_TAG META_NAME META_CONTENT_DOUBLEQUOTE META_CONTENT_QUOTE META_CONTENT_NOQUOTE

OPTWS		[[:blank:]\n\r]*

/*
  BUG: the string functions assume null terminated strings, but
       HTML documents sometimes contain nulls.

  The redundancy of some specifications (the dot rule .) is mandatory
  for precedence considerations. If the dot rule (.) were shared by many
  start conditions, it would prevent some of them from matching a single
  char, because the parser would put the rule from the shared start
  condition (for .) in the first position.
*/

%%
<INITIAL>{
	"<"{OPTWS}meta {
	   /*
	    * INITIAL is the state for ordinary document text: characters
	    * are handed to fill, tags switch to a dedicated state.
	    * The attributes of a meta tag are analysed in META_TAG.
	    */
	   if(verbose) fprintf(stderr, "meta found \n");
	   BEGIN(META_TAG);
	}
	"<"{OPTWS}script[^<>]*">" {
	   /* The whole opening tag is matched; what follows is skipped in SCRIPT_TAG. */
	   if(verbose) fprintf(stderr, "script found \n");
	   BEGIN(SCRIPT_TAG);
	 }
	"<"{OPTWS}style[^<>]*">" {
	   /* The whole opening tag is matched; what follows is skipped in STYLE_TAG. */
	   if(verbose) fprintf(stderr, "style found \n");
	   BEGIN(STYLE_TAG);
	 }
	"<"{OPTWS}title[^<>]*">" {
	  /* From now on fill stores characters in the title area. */
	  if(verbose) fprintf(stderr, "title found\n");
          activate(title, &title_length);
          BEGIN(TITLE_TAG);
	}
	"<!--" {
	  /* Remember where to come back once the comment is closed. */
	  if(verbose) fprintf(stderr, "comment found\n");
          previous_condition = INITIAL;
          BEGIN(COMMENT_TAG);
	}
	"<" {
	  /*
	   * Any other tag acts as a separator: emit one space, then skip
	   * the tag in OTHER_TAG and come back here.
	   */
	  if(verbose) fprintf(stderr, "other start tag found\n");
	  fill_white();
          previous_condition = INITIAL; BEGIN(OTHER_TAG);
	}
	[[:blank:]\n\r]+	fill_white();	/* a run of white space yields at most one space */
	[\001-\037] ;	/* remaining control characters are dropped */
	.      fill(bodytext[0] & 0xff);	/* any other character is kept */
}

<OTHER_TAG>{
        ">" BEGIN(previous_condition);	/* end of the tag: resume the interrupted state */
	[[:blank:]\n\r]+ ;	/* everything inside the tag is discarded */
	. ;
}

<COMMENT_TAG>{
        "-->" BEGIN(previous_condition);	/* end of the comment: resume the interrupted state */
	[[:blank:]\n\r]+ ;	/* everything inside the comment is discarded */
	. ;
}

<TITLE_TAG>{
	"<"{OPTWS}"/"{OPTWS}title[^>]*">" {
	  /*
	   * TITLE_TAG collects the text of the title element in the title
	   * area. The closing tag hands the output back to the body.
	   */
	  if(verbose) fprintf(stderr, "finished title\n");
	  deactivate();
	  BEGIN(INITIAL);
	}
	"<"{OPTWS}body{OPTWS}">" {
	  /* A body tag without attributes also ends the title (closing tag missing). */
	  if(verbose) fprintf(stderr, "body in title: break\n");
	  deactivate();
	  BEGIN(INITIAL);
	}
	"<" {
	  /* Any other tag inside the title: emit one space and skip it. */
	  if(verbose) fprintf(stderr, "other start tag found\n");
	  fill_white();
          previous_condition = TITLE_TAG; BEGIN(OTHER_TAG);
	}
	[[:blank:]\n\r]+	fill_white();	/* a run of white space yields at most one space */
	[\001-\037] ;	/* remaining control characters are dropped */
	.      fill(bodytext[0] & 0xff);	/* any other character is kept */
}

<TITLE_TAG,INITIAL,META_CONTENT_DOUBLEQUOTE,META_CONTENT_QUOTE,META_CONTENT_NOQUOTE>{
	[[:blank:]\n\r]+	{
	  /*
	   * Rules shared by every state that collects text.
	   *
	   * Character entities are translated to their ISO-8859-1 code.
	   * The codes are written numerically so that this file does not
	   * depend on the encoding it is stored in. The trailing semicolon
	   * of an entity is optional.
	   *
	   * The scanner is caseless: a pattern matches both the capital
	   * and the small form of an entity name. For letters, the case of
	   * the first character of the name found in yytext selects the
	   * capital or the small letter code.
	   *
	   * font, blink, b and i tags are dropped without emitting a space.
	   */
	  fill_white();
	}
	[\001-\037] ;
	\\&                     fill(bodytext[1]);
	"&"quot";"? fill('"');
	"&"amp";"? fill('&');
	"&"lt";"? fill('<');
	"&"gt";"? fill('>');
	"&"iexcl";"? fill(0xa1);
	"&"cent";"? fill(0xa2);
	"&"pound";"? fill(0xa3);
	"&"curren";"? fill(0xa4);
	"&"yen";"? fill(0xa5);
	"&"brkbar";"? fill(0xa6);
	"&"sect";"? fill(0xa7);
	"&"uml";"? fill(0xa8);
	"&"copy";"? fill(0xa9);
	"&"ordf";"? fill(0xaa);
	"&"laquo";"? fill(0xab);
	"&"not";"? fill(0xac);
	"&"shy";"? fill(0xad);
	"&"reg";"? fill(0xae);
	"&"hibar";"? fill(0xaf);
	"&"deg";"? fill(0xb0);
	"&"plusmn";"? fill(0xb1);
	"&"sup2";"? fill(0xb2);
	"&"sup3";"? fill(0xb3);
	"&"acute";"? fill(0xb4);
	"&"micro";"? fill(0xb5);
	"&"para";"? fill(0xb6);
	"&"middot";"? fill(0xb7);
	"&"cedil";"? fill(0xb8);
	"&"sup1";"? fill(0xb9);
	"&"ordm";"? fill(0xba);
	"&"raquo";"? fill(0xbb);
	"&"frac14";"? fill(0xbc);
	"&"frac12";"? fill(0xbd);
	"&"frac34";"? fill(0xbe);
	"&"iquest";"? fill(0xbf);
	"&"Agrave";"? fill(yytext[1] == 'A' ? 0xc0 : 0xe0);
	"&"Aacute";"? fill(yytext[1] == 'A' ? 0xc1 : 0xe1);
	"&"Acirc";"? fill(yytext[1] == 'A' ? 0xc2 : 0xe2);
	"&"Atilde";"? fill(yytext[1] == 'A' ? 0xc3 : 0xe3);
	"&"Auml";"? fill(yytext[1] == 'A' ? 0xc4 : 0xe4);
	"&"Aring";"? fill(yytext[1] == 'A' ? 0xc5 : 0xe5);
	"&"AElig";"? fill(yytext[1] == 'A' ? 0xc6 : 0xe6);
	"&"Ccedil";"? fill(yytext[1] == 'C' ? 0xc7 : 0xe7);
	"&"Egrave";"? fill(yytext[1] == 'E' ? 0xc8 : 0xe8);
	"&"Eacute";"? fill(yytext[1] == 'E' ? 0xc9 : 0xe9);
	"&"Ecirc";"? fill(yytext[1] == 'E' ? 0xca : 0xea);
	"&"Euml";"? fill(yytext[1] == 'E' ? 0xcb : 0xeb);
	"&"Igrave";"? fill(yytext[1] == 'I' ? 0xcc : 0xec);
	"&"Iacute";"? fill(yytext[1] == 'I' ? 0xcd : 0xed);
	"&"Icirc";"? fill(yytext[1] == 'I' ? 0xce : 0xee);
	"&"Iuml";"? fill(yytext[1] == 'I' ? 0xcf : 0xef);
	"&"Dstrok";"? fill(0xd0);
	"&"Ntilde";"? fill(yytext[1] == 'N' ? 0xd1 : 0xf1);
	"&"Ograve";"? fill(yytext[1] == 'O' ? 0xd2 : 0xf2);
	"&"Oacute";"? fill(yytext[1] == 'O' ? 0xd3 : 0xf3);
	"&"Ocirc";"? fill(yytext[1] == 'O' ? 0xd4 : 0xf4);
	"&"Otilde";"? fill(yytext[1] == 'O' ? 0xd5 : 0xf5);
	"&"Ouml";"? fill(yytext[1] == 'O' ? 0xd6 : 0xf6);
	"&"times";"? fill(0xd7);
	"&"Oslash";"? fill(yytext[1] == 'O' ? 0xd8 : 0xf8);
	"&"Ugrave";"? fill(yytext[1] == 'U' ? 0xd9 : 0xf9);
	"&"Uacute";"? fill(yytext[1] == 'U' ? 0xda : 0xfa);
	"&"Ucirc";"? fill(yytext[1] == 'U' ? 0xdb : 0xfb);
	"&"Uuml";"? fill(yytext[1] == 'U' ? 0xdc : 0xfc);
	"&"Yacute";"? fill(yytext[1] == 'Y' ? 0xdd : 0xfd);	/* was tested against U: capital never produced */
	"&"THORN";"? fill(yytext[1] == 'T' ? 0xde : 0xfe);
	"&"szlig";"? fill(0xdf);
	"&"eth";"? fill(yytext[1] == 'E' ? 0xd0 : 0xf0);	/* ETH is the capital form, eth the small one */
	"&"divide";"? fill(0xf7);
	"&"yuml";"? fill(0xff);
	"&#"[0-9]{1,3}";"? {
	  /* Numeric reference: only the low eight bits of the value are kept. */
	  fill(atoi(bodytext + 2) & 0xff);
	}
	"&"nbsp";"? fill_white();
	"<"{OPTWS}"font"[^>]*">"    ;
	"</"{OPTWS}"font"[^>]*">"    ;
	"<"{OPTWS}"blink"[^>]*">"    ;
	"</"{OPTWS}"blink"[^>]*">"    ;
	"<"{OPTWS}[bi]{OPTWS}">"    ;
	"</"{OPTWS}[bi]{OPTWS}">"    ;
}

<SCRIPT_TAG>{
	"<"{OPTWS}"/"{OPTWS}script[^<>]*">" {
	  /* Everything between the script tags is discarded; the closing tag resumes text. */
	  if(verbose) fprintf(stderr, "finished script\n");
	  BEGIN(INITIAL);
	}
	[[:blank:]\n\r]+ ;	/* script content is discarded */
	. ;
}

<STYLE_TAG>{
	"<"{OPTWS}"/"{OPTWS}style[^<>]*">" {
	  /* Everything between the style tags is discarded; the closing tag resumes text. */
	  if(verbose) fprintf(stderr, "finished style\n");
	  BEGIN(INITIAL);
	}
	[[:blank:]\n\r]+ ;	/* style content is discarded */
	. ;
}

<META_TAG>{
	name{OPTWS}={OPTWS}["']|name{OPTWS}={OPTWS} {
	    /*
	     * META_TAG scans the inside of a meta tag.
	     * A name attribute, with or without an opening quote, starts
	     * the collection of its value in the meta_name area.
	     */
	    if(verbose) fprintf(stderr, "found meta name\n");
            activate(meta_name, &meta_name_length);
            BEGIN(META_NAME);
        }
	content{OPTWS}={OPTWS}\" {
	    /* The value is collected in meta_content up to the closing double quote. */
	    if(verbose) fprintf(stderr, "found meta content (double quote)\n");
            activate(meta_content, &meta_content_length);
            BEGIN(META_CONTENT_DOUBLEQUOTE);
        }
	content{OPTWS}={OPTWS}\' {
	    /* The value is collected in meta_content up to the closing single quote. */
	    if(verbose) fprintf(stderr, "found meta content (single quote)\n");
            activate(meta_content, &meta_content_length);
            BEGIN(META_CONTENT_QUOTE);
        }
	content{OPTWS}={OPTWS} {
	    /* Unquoted value: collected in meta_content by META_CONTENT_NOQUOTE. */
	    if(verbose) fprintf(stderr, "found meta content (no quote)\n");
            activate(meta_content, &meta_content_length);
            BEGIN(META_CONTENT_NOQUOTE);
        }
	">" { meta_end(); BEGIN(INITIAL); }	/* end of tag: record the name and content pair, back to text */
	[[:blank:]\n\r]+ ;	/* anything else in the tag is ignored */
	. ;
}

<META_NAME>{
	['">[:blank:]\n\r] { 
	  /*
	   * META_NAME collects the value of the name attribute.
	   * The value ends at a quote, at white space or at the end of
	   * the tag. Newline and carriage return are listed explicitly:
	   * the dot rule below does not match a newline, which would
	   * otherwise be handled by the default rule and echoed to the
	   * scanner output.
	   */
	  if(verbose) fprintf(stderr, "finished meta name, length = %d, string = %.*s\n", buffer_length, buffer_length, buffer);
	  deactivate();
	  BEGIN(META_TAG);
	  /*
	   * When the value is ended by the end of the tag, give the
	   * character back so that META_TAG sees it, calls meta_end and
	   * returns to INITIAL instead of skipping the following text.
	   */
	  if(bodytext[0] == '>') yyless(0);
	}
	.      fill(bodytext[0] & 0xff);
}

<META_CONTENT_DOUBLEQUOTE>{
	\" { 
	  /* Closing double quote: the content value is complete, back to the tag. */
	  if(verbose) fprintf(stderr, "finished meta content (double quote)\n");
	  deactivate();
	  BEGIN(META_TAG);
	}
	.      fill(bodytext[0] & 0xff);	/* any other character belongs to the value */
}

<META_CONTENT_QUOTE>{
	\' { 
	  /* Closing single quote: the content value is complete, back to the tag. */
	  if(verbose) fprintf(stderr, "finished meta content (single quote)\n");
	  deactivate();
	  BEGIN(META_TAG);
	}
	.      fill(bodytext[0] & 0xff);	/* any other character belongs to the value */
}

<META_CONTENT_NOQUOTE>{
	\> { 
	  /*
	   * An unquoted content value runs up to the end of the tag.
	   * The closing character is given back with yyless so that
	   * META_TAG sees it, calls meta_end and returns to INITIAL.
	   * Without this the text following the tag was skipped up to
	   * the next closing character.
	   */
	  if(verbose) fprintf(stderr, "finished meta content (no quote)\n");
	  deactivate();
	  BEGIN(META_TAG);
	  yyless(0);
	}
	.      fill(bodytext[0] & 0xff);
}

<<EOF>> {
  /*
   * Reset the start condition before terminating: yyterminate returns
   * from the scanner, so nothing placed after it is executed. Without
   * the reset, a document ending inside a tag, a comment or a title
   * would leave the scanner in that state for the next document.
   */
  BEGIN(INITIAL);
  yyterminate();
}

%%

/*
 * Address of the length variable that belongs to the area buffer
 * currently designates. activate() and deactivate() store buffer_length
 * through it before switching to another area.
 */
static int* old_length = 0;

/*
 * Emit one space through fill(), unless the previously emitted
 * character already was a space.
 */
static void fill_white()
{
  if(!white_space) {
    white_space = 1;
    fill(' ');
  }
}

static void fill(unsigned char c)
{
  if(c != ' ') white_space = 0;
  if(verbose > 10) fprintf(stderr, "initialy %c 0x%x => ", c, (c & 0xff));
  c = iso2letter[c];
  if(verbose > 10) fprintf(stderr, "now %c 0x%x\n", c, (c & 0xff));
  if(in_file && buffer == body)
    fputc(c, output_fp);
  else
    buffer[buffer_length++] = c;
}

/*
 * Make new_buffer the area filled by fill().
 *
 * The length reached in the area being left is saved through old_length,
 * then *new_length and buffer_length restart from 0 and old_length is
 * pointed to new_length so that deactivate() can save the length reached
 * in new_buffer.
 *
 * Always returns 1.
 */
static int activate(char* new_buffer, int* new_length)
{
  if(old_length == 0) {
    /*
     * initactivation() has not been called: report it, but do not
     * store through the null pointer.
     */
    fprintf(stderr, "old_length null\n");
  } else {
    *old_length = buffer_length;
  }

  *new_length = 0;
  buffer_length = 0;
  buffer = new_buffer;

  old_length = new_length;

  return 1;
}

/*
 * Select the body as the area filled by fill(), continuing at
 * body_length. Always returns 1.
 */
static int initactivation()
{
  old_length = &body_length;
  buffer = body;
  buffer_length = body_length;

  return 1;
}

/*
 * Leave the area selected by activate(): save the length reached in it
 * (when old_length is set) and make the body the current area again.
 * Always returns 1.
 */
static int deactivate()
{
  if(old_length != 0)
    *old_length = buffer_length;

  old_length = &body_length;
  buffer = body;
  buffer_length = body_length;

  return 1;
}

/*
 * Called at the end of a meta tag: if the collected name starts with
 * "description" or "keyword" (case insensitive), keep a null terminated
 * copy of the collected content in meta_values[], replacing any previous
 * value. The name and content lengths are reset in every case.
 *
 * The name must be at least as long as the keyword it is compared with:
 * a truncated name such as "d" or "k" is not accepted, while trailing
 * characters ("keywords") still are.
 *
 * Always returns 1; exits the process if memory cannot be allocated.
 */
static int meta_end()
{
  if(meta_name_length) {
    int index = -1;

    if(meta_name_length >= meta_name_description_size &&
       !strncasecmp(meta_name_description, meta_name, meta_name_description_size)) {
      index = META_DESCRIPTION;
    } else if(meta_name_length >= meta_name_keywords_size &&
	      !strncasecmp(meta_name_keywords, meta_name, meta_name_keywords_size)) {
      index = META_KEYWORDS;
    }
    if(verbose) {
      fprintf(stderr, "meta_end: found %d chars for %.*s\n", index, meta_name_length, meta_name);
    }
    if(index >= 0) {
      /* free() accepts a null pointer, no test needed. */
      free(meta_values[index]);
      meta_values[index] = malloc(meta_content_length + 1);
      if(meta_values[index] == 0) {
	fprintf(stderr, "meta_end: cannot allocate %d bytes\n", meta_content_length + 1);
	exit(3);
      }
      memcpy(meta_values[index], meta_content, meta_content_length);
      meta_values[index][meta_content_length] = '\0';
      meta_values_length[index] = meta_content_length;
    }
  }
  meta_content_length = meta_name_length = 0;

  return 1;
}

/*
 * Reset the parser state and make sure the work areas are large enough
 * for a document of size bytes.
 *
 *   size   number of bytes of the document about to be parsed
 *   limit  when positive, becomes the new output_limit
 *   flag   not used by this function
 *
 * Every area is made strictly larger than size: the parser stores at
 * most one character per input character, and bodyparse_string() adds a
 * terminating null after the data.
 *
 * Always returns 0; exits the process if memory cannot be allocated.
 */
static int bodyparse_init_common(int size, int limit, int flag)
{
  white_space = 0;
  in_file = 0;
  /*
   * Init accent table. 
   */
  unaccent("");

  /*
   * Zero means keep the current limit. A negative limit is ignored too:
   * it would yield negative lengths in bodyparse_string().
   */
  if(limit > 0)
    output_limit = limit;
  /* First time thru, allocate the buffers. */
  if(body_size == 0) {
    body_size = 50 * 1024;
    body = (char*)malloc(body_size);
    if(body == 0) {
      fprintf(stderr, "body allocation out of memory\n");
      exit(2);
    }
    title_size = 10 * 1024;
    title = (char*)malloc(title_size);
    if(title == 0) {
      fprintf(stderr, "title allocation out of memory\n");
      exit(2);
    }
    meta_name_size = 10 * 1024;
    meta_name = (char*)malloc(meta_name_size);
    if(meta_name == 0) {
      fprintf(stderr, "meta_name allocation out of memory\n");
      exit(2);
    }
    meta_content_size = 10 * 1024;
    meta_content = (char*)malloc(meta_content_size);
    if(meta_content == 0) {
      fprintf(stderr, "meta_content allocation out of memory\n");
      exit(2);
    }
  }
  /* 
   * Allocate enough space in the variables to hold the
   * result of the parsing. The test is <= (not <) so that an area
   * of exactly size bytes is enlarged: one more byte is needed for
   * the terminating null.
   */
  {
    if(body_size <= size) {
      body_size = size + 10 * 1024;
      body = (char*)realloc(body, body_size);
      if(body == 0) {
	fprintf(stderr, "body reallocation out of memory\n");
	exit(2);
      }
    }
    if(title_size <= size) {
      title_size = size + 10 * 1024;
      title = (char*)realloc(title, title_size);
      if(title == 0) {
	fprintf(stderr, "title reallocation out of memory\n");
	exit(2);
      }
    }
    if(meta_name_size <= size) {
      meta_name_size = size + 10 * 1024;
      meta_name = (char*)realloc(meta_name, meta_name_size);
      if(meta_name == 0) {
	fprintf(stderr, "meta_name reallocation out of memory\n");
	exit(2);
      }
    }
    if(meta_content_size <= size) {
      meta_content_size = size + 10 * 1024;
      meta_content = (char*)realloc(meta_content, meta_content_size);
      if(meta_content == 0) {
	fprintf(stderr, "meta_content reallocation out of memory\n");
	exit(2);
      }
    }
  }
  /*
   * Reset pointers to begin parsing.
   */
  body_length = 0;
  title_length = 0;
  meta_name_length = 0;
  meta_content_length = 0;
  /* Release the meta values kept from the previous document. */
  {
    int i;
    for(i = 0; i < META_MAX; i++) {
      free(meta_values[i]);
      meta_values[i] = 0;
    }
  }
  if(verbose) fprintf(stderr, "init done\n");
  return 0;
}

/*
 * Prepare the parsing of the file filename with output written to the
 * file output: reset the parser state for the size of filename, then
 * open output (appending when BODY_PARSE_APPEND is set in flag,
 * truncating otherwise) and direct the body to it.
 *
 * Returns 0 on success, -1 if output cannot be opened.
 */
static int bodyparse_init(char* filename, char* output, int limit, int flag)
{
  int appending = (flag & BODY_PARSE_APPEND) != 0;
  char* mode = appending ? "a" : "w";

  bodyparse_init_common(file_size(filename), limit, flag);

  /* Change/create the output file descriptor. */
  output_fp = fopen(output, mode);
  if(!output_fp) {
    fprintf(stderr, "bodyparse_init(%s, %s): cannot open output for %s\n", filename, output, mode);
    return -1;
  }

  /*
   * When appending, a space keeps the last word of the previous
   * document apart from the first word of this one.
   */
  if(appending)
    fputs(" ", output_fp);

  in_file = 1;
  return 0;
}

/*
 * Test if the first string_length characters of string contain at
 * least one character that is not white space.
 *
 * Returns 1 if such a character exists, 0 otherwise (including when
 * string_length is 0 or negative).
 */
int not_empty(char* string, int string_length)
{
  int i;
  for(i = 0; i < string_length; i++) {
    /*
     * isspace() is only defined for values representable as unsigned
     * char (and EOF); a plain char holding an accented ISO-8859-1
     * character may be negative.
     */
    if(!isspace((unsigned char)string[i]))
      return 1;
  }
  return 0;
}

/*
 * Entry point : parse filename, output result to output, stop at limit bytes.
 */
int bodyparse_string(string, string_size, limit, min, flag,
		     rbody, rbody_length,
		     rkeywords, rkeywords_length,
		     rdescription, rdescription_length,
		     rtitle, rtitle_length)
char* string;
int string_size;
int limit;
int min;
int flag;
char** rbody; int* rbody_length;
char** rkeywords; int* rkeywords_length;
char** rdescription; int* rdescription_length;
char** rtitle; int* rtitle_length;
{
  if(verbose) fprintf(stderr, "bodyparse_string: %d bytes\n", string_size);
  if(bodyparse_init_common(string_size, limit, flag) < 0) {
    return -3;
  }

  {
    YY_BUFFER_STATE yy_buffer = yy_scan_bytes(string, string_size);
    yy_switch_to_buffer(yy_buffer);

    iso2letter = (flag & BODY_PARSE_UNACCENT) ? iso2letter_unaccent : iso2letter_transparent;

    /*
     * Initialy buffer points to body
     */
    initactivation();
    /*
     * Run the parser
     */
    bodylex();

    yy_delete_buffer(yy_buffer);
    /*
     * Save body state
     */
    if(body == buffer) {
      body_length = buffer_length;
    } else {
      /* We lost track somewhere, no body at all. */
      body_length = 0;
    }
#if 0
    printf("body_length = %d\n", body_length);
    {
      int i;
      for(i = 0; i < body_length; i++)
	printf(">%c<", body[i]);
      printf("\n");
    }
#endif
  }
  /*
   * Parsing done, output data
   */
  {
    /* Ignore small documents. */
    if(body_length < min) {
      return 0;
    }
    /*
     * Ouput title or URL if no title
     */
    if(title_length > 0 && not_empty(title, title_length)) {
      *rtitle = title; 
      *rtitle_length = title_length > output_limit ? output_limit : title_length;
      (*rtitle)[*rtitle_length] = '\0';
    } else {
      *rtitle = "";
      *rtitle_length = 0;
    }
    /*
     * Output meta description, if any, first chars of the body by default
     */
    if(meta_values[META_DESCRIPTION]) {
      *rdescription = meta_values[META_DESCRIPTION];
      *rdescription_length = meta_values_length[META_DESCRIPTION] > output_limit ? output_limit : meta_values_length[META_DESCRIPTION];
      (*rdescription)[*rdescription_length] = '\0';
    } else {
      *rdescription = "";
      *rdescription_length = 0;
    }
    if(meta_values[META_KEYWORDS]) {
      *rkeywords = meta_values[META_KEYWORDS];
      *rkeywords_length = meta_values_length[META_KEYWORDS] > output_limit ? output_limit : meta_values_length[META_KEYWORDS];
      (*rkeywords)[*rkeywords_length] = '\0';
    } else {
      *rkeywords = "";
      *rkeywords_length = 0;
    }

    {
      /*
       * Striping leading white spaces. 
       */
      char* p = body;
      while(body_length > 0 && *p == ' ') {
	p++;
	body_length--;
      }
      
      if(verbose > 10) fprintf(stderr, "bodyparse_string: %.*s\n", body_length, p);
      if(body_length > 0) {
	*rbody = p;
	*rbody_length = body_length > output_limit ? output_limit : body_length;
	(*rbody)[*rbody_length] = '\0';
      } else {
	*rbody = "";
	*rbody_length = 0;
      }
    }
  }
  return 1;
}

/*
 * Entry point : parse the HTML file filename and write the text of its
 * body to the file output.
 *
 *   limit  when not zero, replaces output_limit
 *   flag   BODY_PARSE_APPEND appends to output instead of replacing it,
 *          BODY_PARSE_UNACCENT selects the unaccent conversion table
 *   url, min  not used by this function
 *
 * Returns 0 on success, -3 if the initialization fails (output cannot
 * be opened), -1 if filename cannot be opened for reading.
 */
int bodyparse(char* filename, char* output, char* url, int limit, int min, int flag)
{
  FILE* fp;
  YY_BUFFER_STATE file_buffer;

  if(bodyparse_init(filename, output, limit, flag) < 0)
    return -3;

  /* 
   * Open the input file
   */
  fp = fopen(filename, "r");
  if(fp == 0) {
    /*
     * Keep the reason of the failure: fprintf and fclose may change
     * errno before perror reports it.
     */
    int open_errno = errno;

    fprintf(stderr, "bodyparse: cannot open %s for reading\n", filename);
    fclose(output_fp);
    output_fp = 0;
    errno = open_errno;
    perror("");
    return -1;
  }

  file_buffer = yy_create_buffer(fp, YY_BUF_SIZE);
  yy_switch_to_buffer(file_buffer);

  iso2letter = (flag & BODY_PARSE_UNACCENT) ? iso2letter_unaccent : iso2letter_transparent;

  /*
   * Initialy buffer points to body
   */
  initactivation();
  /*
   * Run the parser
   */
  bodylex();

  /*
   * Save body state
   */
  if(body == buffer) {
    body_length = buffer_length;
  } else {
    /* We lost track somewhere, no body at all. */
    body_length = 0;
  }

  yy_delete_buffer(file_buffer);
  fclose(fp);

  fclose(output_fp);
  /* The stream is closed: do not keep a dangling pointer. */
  output_fp = 0;
  return 0;
}


/*
 Local Variables: ***
 mode: C ***
 End: ***
*/
