%{
/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
/*
 * Extract the text content of an HTML document, translating HTML
 * character entities to their ISO-8859-1 equivalents (and, on request,
 * mapping every character through the unaccent table).
 */
/* Head */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include <string.h>
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <isomap.h>
#include <split.h>
#include <salloc.h>

#include <html_content.h>

struct content_info_t {
  /* Buffer in which the scanner accumulates the current piece of text. */
  char *content;
  /* Number of bytes allocated for the content buffer. */
  int content_size;
  /* Number of bytes of the content buffer currently in use. */
  int content_length;
  /* Non-zero while the last character stored was a collapsed blank. */
  int white_space;
  /* Translation table indexed by byte value, or null for no translation. */
  char *iso2letter;
  /* Token the scanner returns for the text being accumulated. */
  int token;
  /* Token kinds gathered so far for the next callback invocation. */
  int tokens;
  /* Once content_length reaches chunk_size, the text is returned. */
#define HTML_CONTENT_CHUNK	10240
  int chunk_size;
};

/*
 * Scanner state shared by the rule actions and the C helpers below.
 * Every field starts at zero.
 */
static struct content_info_t content_info = { 0 };

/*
 * Verbosity level; any non-zero value enables traces on stderr.
 */
static int verbose;

/*
 * Start condition to go back to once a nested tag or comment is closed.
 */
static int previous_condition;

/*
 * Forward declarations of helpers defined after the rules section.
 * The init prototype spells out its parameter so that it matches the
 * definition instead of relying on an old-style empty parameter list.
 */
static void init(html_content_t* arg);
static int html_content_lexer(html_content_t* arg);

/*
 * Append one character to the content buffer.
 *
 * The character is first mapped through content_info.iso2letter when a
 * table is installed. The buffer is grown BEFORE the chunk-limit test:
 * the chunk-limit test may return from the scanner, and returning first
 * would skip the growth check and let later stores run past the buffer.
 * When a blank is stored and the chunk limit has been reached, the
 * macro returns content_info.token from the enclosing function.
 *
 * Must be used as a statement followed by a semicolon.
 */
#define fill(inchar) \
do { \
  unsigned char c = (inchar); \
  if(verbose > 10) fprintf(stderr, "initialy %c 0x%x => ", c, (c & 0xff)); \
  if(content_info.iso2letter) c = content_info.iso2letter[c]; \
  if(verbose > 10) fprintf(stderr, "now %c 0x%x\n", c, (c & 0xff)); \
  content_info.content[content_info.content_length++] = c; \
  if(content_info.content_length >= content_info.content_size - 1) \
    static_alloc(&content_info.content, &content_info.content_size, content_info.content_size + 1); \
  if(c != ' ') content_info.white_space = 0; \
  else if(content_info.content_length >= content_info.chunk_size) \
     return content_info.token; \
} while(0)

/*
 * Store a single blank unless the previous stored character was already
 * a collapsed blank. Must be used as a statement followed by a semicolon.
 */
#define fill_white \
do { \
  if(content_info.white_space == 0) { \
    content_info.white_space = 1; \
    fill(' '); \
  } \
} while(0)
%}

/*
 * Flex configuration
 * The prefix option renames the generated symbols: the scanner entry
 * point is htmlcontentlex() and the matched text is htmlcontenttext.
 */
%option nounput noyywrap prefix="htmlcontent" outfile="lex.yy.c"
/*
 * Input related options
 */
%option case-insensitive 8bit
/*
 * Performance tuning options
 * fast : object code 4 times bigger, 20% faster
 * full : object code 6 times bigger, 20% faster
 * 
 */
%option fast

/*
 * Exclusive start conditions: while one of them is active, only the
 * rules explicitly listed for it (and the <<EOF>> rule) apply.
 */
%x COMMENT_TAG TITLE_TAG STYLE_TAG SCRIPT_TAG META_TAG OTHER_TAG META_NAME META_CONTENT_DOUBLEQUOTE META_CONTENT_QUOTE META_CONTENT_NOQUOTE

/*
 * Optional white space: any run of blanks, newlines and carriage returns.
 */
OPTWS		[[:blank:]\n\r]*

/*
  BUG: in string functions null terminated strings are assumed but
       html documents sometimes contain nulls.

  The redundancy of some specifications (the dot rule) is required for
  precedence reasons. If a single dot (.) rule were shared by several
  start conditions, it would prevent some of them from matching their
  own single-character rules, because the scanner would put the shared
  dot rule in the first position.
*/

%%
<INITIAL>{
	/*
	 * Rules for ordinary document text (outside title, meta, script,
	 * style and comments).
	 */
	"<"{OPTWS}meta {
	   /* Enter the meta tag and return the TEXT token first. */
	   if(verbose) fprintf(stderr, "meta found \n");
	   BEGIN(META_TAG);
	   return HTML_CONTENT_TEXT;
	}
	"<"{OPTWS}script[^<>]*">" {
	   /* Everything up to the closing script tag is skipped. */
	   if(verbose) fprintf(stderr, "script found \n");
	   BEGIN(SCRIPT_TAG);
	 }
	"<"{OPTWS}style[^<>]*">" {
	   /* Everything up to the closing style tag is skipped. */
	   if(verbose) fprintf(stderr, "style found \n");
	   BEGIN(STYLE_TAG);
	 }
	"<"{OPTWS}title[^<>]*">" {
	  /*
	   * Return the TEXT token for what precedes the title; from now
	   * on the pending token is TITLE.
	   */
	  if(verbose) fprintf(stderr, "title found\n");
          BEGIN(TITLE_TAG);
	  content_info.token = HTML_CONTENT_TITLE;
	  return HTML_CONTENT_TEXT;
	}
	"<!--" {
	  /* Skip the comment, then come back to INITIAL. */
	  if(verbose) fprintf(stderr, "comment found\n");
          previous_condition = INITIAL;
          BEGIN(COMMENT_TAG);
	}
	"<" {
	  /* Any other tag is skipped and replaced by one blank. */
	  if(verbose) fprintf(stderr, "other start tag found\n");
          previous_condition = INITIAL; BEGIN(OTHER_TAG);
	  fill_white;
	}
	/* Runs of white space collapse to one blank. */
	[[:blank:]\n\r]+	fill_white;
	/* Remaining control characters are dropped. */
	[\001-\037] ;
	/* Any other character is stored as is. */
	.      fill(htmlcontenttext[0] & 0xff);
}

<OTHER_TAG>{
	/*
	 * Inside a tag that is not handled specially: discard everything
	 * up to the next '>' and resume the condition that was active
	 * when the tag was opened.
	 */
        ">" BEGIN(previous_condition);
	[[:blank:]\n\r]+ ;
	. ;
}

<COMMENT_TAG>{
	/*
	 * Inside an HTML comment: discard everything up to the closing
	 * marker and resume the condition saved in previous_condition.
	 */
        "-->" BEGIN(previous_condition);
	[[:blank:]\n\r]+ ;
	. ;
}

<TITLE_TAG>{
	/*
	 * Rules for the text found between the opening and the closing
	 * title tags.
	 */
	"<"{OPTWS}"/"{OPTWS}title[^>]*">" {
	  /* End of title: return TITLE, the pending token is TEXT again. */
	  if(verbose) fprintf(stderr, "finished title\n");
	  BEGIN(INITIAL);
	  content_info.token = HTML_CONTENT_TEXT;
	  return HTML_CONTENT_TITLE;
	}
	"<"{OPTWS}body{OPTWS}">" {
	  /* A body tag inside the title ends the title as well. */
	  if(verbose) fprintf(stderr, "body in title: break\n");
	  BEGIN(INITIAL);
	  content_info.token = HTML_CONTENT_TEXT;
	  return HTML_CONTENT_TITLE;
	}
	"<" {
	  /* Any other tag is skipped and replaced by one blank. */
	  if(verbose) fprintf(stderr, "other start tag found\n");
          previous_condition = TITLE_TAG; BEGIN(OTHER_TAG);
	  fill_white;
	}
	/* Runs of white space collapse to one blank. */
	[[:blank:]\n\r]+	fill_white;
	/* Remaining control characters are dropped. */
	[\001-\037] ;
	/* Any other character is stored as is. */
	.      fill(htmlcontenttext[0] & 0xff);
}

<TITLE_TAG,INITIAL,META_CONTENT_DOUBLEQUOTE,META_CONTENT_QUOTE,META_CONTENT_NOQUOTE>{
	/*
	 * Rules shared by every condition that collects text: white space
	 * collapsing, character entities and a few inline tags that are
	 * dropped without inserting a blank.
	 *
	 * Entity values are ISO-8859-1 codes written as octal escapes so
	 * that this file does not depend on the encoding it is stored in.
	 * The scanner is case insensitive: where both cases exist, the
	 * case of the first letter of the entity name selects the upper
	 * or lower case character. The trailing semicolon is optional.
	 */
	[[:blank:]\n\r]+	fill_white;
	[\001-\037] ;
	/* A backslash followed by an ampersand stores the ampersand. */
	\\&                     fill(htmlcontenttext[1]);
	"&"quot";"? fill('"');
	"&"amp";"? fill('&');
	"&"lt";"? fill('<');
	"&"gt";"? fill('>');
	"&"iexcl";"? fill('\241');
	"&"cent";"? fill('\242');
	"&"pound";"? fill('\243');
	"&"curren";"? fill('\244');
	"&"yen";"? fill('\245');
	"&"brkbar";"? fill('\246');
	"&"sect";"? fill('\247');
	"&"uml";"? fill('\250');
	"&"copy";"? fill('\251');
	"&"ordf";"? fill('\252');
	"&"laquo";"? fill('\253');
	"&"not";"? fill('\254');
	"&"shy";"? fill('\255');
	"&"reg";"? fill('\256');
	"&"hibar";"? fill('\257');
	"&"deg";"? fill('\260');
	"&"plusmn";"? fill('\261');
	"&"sup2";"? fill('\262');
	"&"sup3";"? fill('\263');
	"&"acute";"? fill('\264');
	"&"micro";"? fill('\265');
	"&"para";"? fill('\266');
	"&"middot";"? fill('\267');
	"&"cedil";"? fill('\270');
	"&"sup1";"? fill('\271');
	"&"ordm";"? fill('\272');
	"&"raquo";"? fill('\273');
	"&"frac14";"? fill('\274');
	"&"frac12";"? fill('\275');
	"&"frac34";"? fill('\276');
	"&"iquest";"? fill('\277');
	"&"Agrave";"? fill(yytext[1] == 'A' ? '\300' : '\340');
	"&"Aacute";"? fill(yytext[1] == 'A' ? '\301' : '\341');
	"&"Acirc";"? fill(yytext[1] == 'A' ? '\302' : '\342');
	"&"Atilde";"? fill(yytext[1] == 'A' ? '\303' : '\343');
	"&"Auml";"? fill(yytext[1] == 'A' ? '\304' : '\344');
	"&"Aring";"? fill(yytext[1] == 'A' ? '\305' : '\345');
	"&"AElig";"? fill(yytext[1] == 'A' ? '\306' : '\346');
	"&"Ccedil";"? fill(yytext[1] == 'C' ? '\307' : '\347');
	"&"Egrave";"? fill(yytext[1] == 'E' ? '\310' : '\350');
	"&"Eacute";"? fill(yytext[1] == 'E' ? '\311' : '\351');
	"&"Ecirc";"? fill(yytext[1] == 'E' ? '\312' : '\352');
	"&"Euml";"? fill(yytext[1] == 'E' ? '\313' : '\353');
	"&"Igrave";"? fill(yytext[1] == 'I' ? '\314' : '\354');
	"&"Iacute";"? fill(yytext[1] == 'I' ? '\315' : '\355');
	"&"Icirc";"? fill(yytext[1] == 'I' ? '\316' : '\356');
	"&"Iuml";"? fill(yytext[1] == 'I' ? '\317' : '\357');
	"&"Dstrok";"? fill('\320');
	"&"Ntilde";"? fill(yytext[1] == 'N' ? '\321' : '\361');
	"&"Ograve";"? fill(yytext[1] == 'O' ? '\322' : '\362');
	"&"Oacute";"? fill(yytext[1] == 'O' ? '\323' : '\363');
	"&"Ocirc";"? fill(yytext[1] == 'O' ? '\324' : '\364');
	"&"Otilde";"? fill(yytext[1] == 'O' ? '\325' : '\365');
	"&"Ouml";"? fill(yytext[1] == 'O' ? '\326' : '\366');
	"&"times";"? fill('\327');
	"&"Oslash";"? fill(yytext[1] == 'O' ? '\330' : '\370');
	"&"Ugrave";"? fill(yytext[1] == 'U' ? '\331' : '\371');
	"&"Uacute";"? fill(yytext[1] == 'U' ? '\332' : '\372');
	"&"Ucirc";"? fill(yytext[1] == 'U' ? '\333' : '\373');
	"&"Uuml";"? fill(yytext[1] == 'U' ? '\334' : '\374');
	/* The first letter of this entity is Y, not U. */
	"&"Yacute";"? fill(yytext[1] == 'Y' ? '\335' : '\375');
	"&"THORN";"? fill(yytext[1] == 'T' ? '\336' : '\376');
	"&"szlig";"? fill('\337');
	"&"eth";"? fill('\360');
	"&"divide";"? fill('\367');
	"&"yuml";"? fill('\377');
	/* Numeric reference: one to three decimal digits, low 8 bits kept. */
	"&#"[0-9]{1,3}";"? {
	  unsigned char c1 = atoi(htmlcontenttext + 2) & 0xff;
	  fill(c1);
	}
	"&"nbsp";"? fill_white;
	/* Inline formatting tags vanish without inserting a blank. */
	"<"{OPTWS}"font"[^>]*">"    ;
	"</"{OPTWS}"font"[^>]*">"    ;
	"<"{OPTWS}"blink"[^>]*">"    ;
	"</"{OPTWS}"blink"[^>]*">"    ;
	"<"{OPTWS}[bi]{OPTWS}">"    ;
	"</"{OPTWS}[bi]{OPTWS}">"    ;
}

<SCRIPT_TAG>{
	/*
	 * Inside a script element: nothing is collected, only the closing
	 * script tag is recognized.
	 */
	"<"{OPTWS}"/"{OPTWS}script[^<>]*">" {
	  if(verbose) fprintf(stderr, "finished script\n");
	  BEGIN(INITIAL);
	}
	[[:blank:]\n\r]+ ;
	. ;
}

<STYLE_TAG>{
	/*
	 * Inside a style element: nothing is collected, only the closing
	 * style tag is recognized.
	 */
	"<"{OPTWS}"/"{OPTWS}style[^<>]*">" {
	  if(verbose) fprintf(stderr, "finished style\n");
	  BEGIN(INITIAL);
	}
	[[:blank:]\n\r]+ ;
	. ;
}

<META_TAG>{
	/*
	 * Inside a meta tag: look for the name and content attributes,
	 * skip everything else up to the closing '>'.
	 */
	name{OPTWS}={OPTWS}["']|name{OPTWS}={OPTWS} {
	    /* The name value follows, with or without an opening quote. */
	    if(verbose) fprintf(stderr, "found meta name\n");
            content_info.token = HTML_CONTENT_METANAME;
            BEGIN(META_NAME);
        }
	content{OPTWS}={OPTWS}\" {
	    if(verbose) fprintf(stderr, "found meta content (double quote)\n");
            content_info.token = HTML_CONTENT_METACONTENT;
            BEGIN(META_CONTENT_DOUBLEQUOTE);
        }
	content{OPTWS}={OPTWS}\' {
	    if(verbose) fprintf(stderr, "found meta content (single quote)\n");
            content_info.token = HTML_CONTENT_METACONTENT;
            BEGIN(META_CONTENT_QUOTE);
        }
	content{OPTWS}={OPTWS} {
	    if(verbose) fprintf(stderr, "found meta content (no quote)\n");
            content_info.token = HTML_CONTENT_METACONTENT;
            BEGIN(META_CONTENT_NOQUOTE);
        }
	">" {
	    /*
	     * End of the tag: forget any name or content still waiting
	     * for its counterpart.
	     */
            BEGIN(INITIAL);
            content_info.tokens = 0;
        }
	[[:blank:]\n\r]+ ;
	. ;
}

<META_NAME>{
	/*
	 * The value of the name attribute ends at a quote, a blank, a
	 * line break or the '>' closing the tag.
	 *
	 * Line breaks are listed explicitly because the dot rule below
	 * does not match a newline; without them a newline would fall
	 * through to the default rule of the scanner.
	 *
	 * A closing '>' is pushed back with yyless(0) so that the META_TAG
	 * rules see it and leave the tag, instead of skipping document
	 * text up to the next '>'.
	 */
	['">[:blank:]\n\r] {
	  if(htmlcontenttext[0] == '>') yyless(0);
	  BEGIN(META_TAG);
          content_info.token = HTML_CONTENT_TEXT;
	  return HTML_CONTENT_METANAME;
	}
	.      fill(htmlcontenttext[0] & 0xff);
}

<META_CONTENT_DOUBLEQUOTE>{
	/*
	 * Value of a content attribute opened with a double quote: the
	 * closing double quote returns METACONTENT and resumes META_TAG.
	 * White space and entities are handled by the rules shared with
	 * the other text collecting conditions.
	 */
	\" { 
	  if(verbose) fprintf(stderr, "finished meta content (double quote)\n");
	  BEGIN(META_TAG);
          content_info.token = HTML_CONTENT_TEXT;
	  return HTML_CONTENT_METACONTENT;
	}
	.      fill(htmlcontenttext[0] & 0xff);
}

<META_CONTENT_QUOTE>{
	/*
	 * Value of a content attribute opened with a single quote: the
	 * closing single quote returns METACONTENT and resumes META_TAG.
	 * White space and entities are handled by the rules shared with
	 * the other text collecting conditions.
	 */
	\' { 
	  if(verbose) fprintf(stderr, "finished meta content (single quote)\n");
	  BEGIN(META_TAG);
          content_info.token = HTML_CONTENT_TEXT;
	  return HTML_CONTENT_METACONTENT;
	}
	.      fill(htmlcontenttext[0] & 0xff);
}

<META_CONTENT_NOQUOTE>{
	/*
	 * Value of an unquoted content attribute: it runs up to the '>'
	 * closing the tag.
	 *
	 * The token returned is METACONTENT, like the quoted variants.
	 * The '>' is pushed back with yyless(0) so that the META_TAG
	 * rules see it and leave the tag, instead of skipping document
	 * text up to the next '>'.
	 */
	\> {
	  if(verbose) fprintf(stderr, "finished meta content (no quote)\n");
	  yyless(0);
	  BEGIN(META_TAG);
          content_info.token = HTML_CONTENT_TEXT;
	  return HTML_CONTENT_METACONTENT;
	}
	.      fill(htmlcontenttext[0] & 0xff);
}

<<EOF>> {
  /*
   * Go back to the INITIAL condition before terminating: yyterminate()
   * returns from the scanner, so nothing placed after it is executed.
   * Without the reset, the next document would be scanned starting in
   * whatever condition was active at the end of this one.
   */
  BEGIN(INITIAL);
  yyterminate();
}

%%

/*
 * Prepare the shared scanner state for a new parse.
 *
 * Makes sure the content buffer exists, drops whatever a previous parse
 * may have left behind (buffered text, white space flag, pending token
 * kinds), selects the unaccent table when arg requests HTML_UNACCENT
 * and restores the default token and chunk size.
 */
static void init(html_content_t* arg)
{
  static_alloc(&content_info.content, &content_info.content_size, 1024);
  content_info.content_length = 0;
  content_info.white_space = 0;
  content_info.tokens = 0;
  content_info.iso2letter = (arg->parser.info & HTML_UNACCENT) ? iso2letter_unaccent : 0;
  content_info.token = HTML_CONTENT_TEXT;
  content_info.chunk_size = HTML_CONTENT_CHUNK;
}

/*
 * Run the parser. Backend of all other interfaces.
 *
 * Initializes the scanner state from arg, then hands html_content_lexer
 * to html_parser_run. Returns the value of ret, which this function
 * itself never changes from 1.
 *
 * NOTE(review): html_parser_run is not defined in this file; it looks
 * like it may be a macro that refers to the locals arg and ret, so
 * keep both names as they are -- confirm against its definition.
 */
int html_content_parse(html_content_t* arg)
{
  int ret = 1;

  init(arg);

  html_parser_run(html_content_lexer);

  return ret;
}

/*
 * Handle one token returned by the scanner.
 *
 * The text accumulated in content_info.content is copied to
 * parsed->buffer0 (meta name, title, text) or parsed->buffer1 (meta
 * content) and the accumulation buffer is emptied. A meta name and a
 * meta content are grouped: the callback is only invoked once both
 * have been seen. Title and text trigger the callback immediately.
 * The callback is skipped when buffer0 holds nothing but white space.
 *
 * Returns 1 when the callback returned zero, 0 otherwise.
 */
static int html_content_lexer_handle(int token, html_content_parsed_t *parsed, html_content_t* arg)
{
  int done = 0;
  int call = 1;

  /*
   * Group METACONTENT and METANAME together.
   */
  switch(token) {
  case HTML_CONTENT_METANAME:
    if(content_info.tokens != HTML_CONTENT_METACONTENT) {
      call = 0;
      content_info.tokens = token;
    } else
      content_info.tokens |= token;
    break;
  case HTML_CONTENT_METACONTENT:
    if(content_info.tokens != HTML_CONTENT_METANAME) {
      call = 0;
      content_info.tokens = token;
    } else
      content_info.tokens |= token;
    break;
  case HTML_CONTENT_TITLE:
  case HTML_CONTENT_TEXT:
    content_info.tokens = token;
    break;
  default:
    break;
  }
  /*
   * Memorize data. One byte more than the data length is requested
   * because a terminating null is stored right after the data.
   */
  switch(token) {
  case HTML_CONTENT_METANAME:
  case HTML_CONTENT_TITLE:
  case HTML_CONTENT_TEXT:
    static_alloc(&parsed->buffer0, &parsed->buffer0_size, content_info.content_length + 1);
    memcpy(parsed->buffer0, content_info.content, content_info.content_length);
    parsed->buffer0_length = content_info.content_length;
    parsed->buffer0[parsed->buffer0_length] = '\0';
    break;
  case HTML_CONTENT_METACONTENT:
    static_alloc(&parsed->buffer1, &parsed->buffer1_size, content_info.content_length + 1);
    memcpy(parsed->buffer1, content_info.content, content_info.content_length);
    parsed->buffer1_length = content_info.content_length;
    parsed->buffer1[parsed->buffer1_length] = '\0';
    break;
  default:
    break;
  }

  /*
   * Reset parsing conditions
   */
  content_info.content_length = 0;
  content_info.white_space = 0;

  /*
   * Notify tag found if complete (metaname + metacontent or title or text)
   */
  if(call) {
    int i;
    /*
     * Do not call the callback if we only collected white spaces.
     * The cast matters: the buffer holds 8 bit characters and isspace
     * must not be given a negative value.
     */
    int only_white = 1;
    for(i = 0; i < parsed->buffer0_length; i++) {
      if(!isspace((unsigned char)parsed->buffer0[i])) {
	only_white = 0;
	break;
      }
    }
    if(!only_white) {
      if(arg->content_callback != 0 && !arg->content_callback(content_info.tokens, parsed, arg->content_data))
	done = 1;
    }
    /*
     * Reset context for next token
     */
    content_info.tokens = 0;
    parsed->buffer0_length = 0;
    parsed->buffer1_length = 0;
  }

  return done;
}

/*
 * Drive the scanner over the current input.
 *
 * Every token returned by htmlcontentlex() is passed to
 * html_content_lexer_handle unless its kind is listed in
 * arg->parser.ignore. The loop stops at end of input or as soon as the
 * handler reports that the callback asked to stop. Text still buffered
 * at end of input is flushed with the pending token.
 *
 * Always returns 1.
 */
static int html_content_lexer(html_content_t* arg)
{
  int token;
  html_content_parsed_t parsed;
  int done = 0;
  
  memset(&parsed, '\0', sizeof(html_content_parsed_t));

  /*
   * Call the lex parser that returns when content is found. done is
   * tested first so that the scanner is not run once more after the
   * callback asked to stop.
   */
  while(!done && (token = htmlcontentlex())) {
    if(token & arg->parser.ignore) {
      /*
       * Throw the ignored text away, otherwise it would be prepended
       * to the text of the next token.
       */
      content_info.content_length = 0;
      content_info.white_space = 0;
      continue;
    }
    done = html_content_lexer_handle(token, &parsed, arg);
  }

  /*
   * Flush what is left at end of input, unless the callback asked to
   * stop or the pending token kind is ignored.
   */
  if(!done) {
    if(content_info.token & arg->parser.ignore) {
      content_info.content_length = 0;
      content_info.white_space = 0;
    } else {
      html_content_lexer_handle(content_info.token, &parsed, arg);
    }
  }

  free(parsed.buffer0);
  free(parsed.buffer1);

  return 1;
}

/*
 * Return the symbolic name of token, or an empty string when token is
 * none of HTML_CONTENT_META, HTML_CONTENT_TITLE, HTML_CONTENT_TEXT.
 * The result lives in a static buffer overwritten by the next call.
 */
char* html_content_token2string(int token)
{
  static char string[512];
  const char* name = "";

  switch(token) {
  case HTML_CONTENT_META:
    name = "HTML_CONTENT_META";
    break;
  case HTML_CONTENT_TITLE:
    name = "HTML_CONTENT_TITLE";
    break;
  case HTML_CONTENT_TEXT:
    name = "HTML_CONTENT_TEXT";
    break;
  }

  strcpy(string, name);

  return string;
}

/*
 * Convert a comma separated list of token names into the bitwise OR of
 * the matching token values. Names that are not recognized are skipped.
 * Only the first string_length bytes of string are considered.
 */
int html_content_string2token(char* string, int string_length)
{
  int token = 0;
  int count = 0;
  char** splitted = 0;
  int i;
  /* Work on a private, null terminated copy that is split in place. */
  char* tmp = (char*)smalloc(string_length + 1);

  strncpy(tmp, string, string_length);
  tmp[string_length] = '\0';
  split_inplace(tmp, string_length, &splitted, &count, ',', SPLIT_TRIM);

  for(i = 0; i < count; i++) {
    const char* word = splitted[i];

    if(strcmp(word, "HTML_CONTENT_META") == 0)
      token |= HTML_CONTENT_META;
    else if(strcmp(word, "HTML_CONTENT_TITLE") == 0)
      token |= HTML_CONTENT_TITLE;
    else if(strcmp(word, "HTML_CONTENT_TEXT") == 0)
      token |= HTML_CONTENT_TEXT;
  }
  
  free(tmp);
  return token;
}

/*
 * Callback that prints one parsed item on stderr: the token name, then
 * both buffers between parentheses. Always returns 1.
 *
 * A buffer that is still null is printed as an empty string rather
 * than being handed to fprintf as a null pointer.
 */
static int html_content_parse_print_one(int info, html_content_parsed_t* parsed, void* data)
{
  const char* first = parsed->buffer0 ? parsed->buffer0 : "";
  const char* second = parsed->buffer1 ? parsed->buffer1 : "";
  int first_length = parsed->buffer0 ? parsed->buffer0_length : 0;
  int second_length = parsed->buffer1 ? parsed->buffer1_length : 0;

  (void)data;

  fprintf(stderr, "%s: (%.*s) (%.*s)\n", html_content_token2string(info),
	  first_length, first,
	  second_length, second
	  );
  return 1;
}

/*
 * Parse with the printing callback installed in arg; returns what
 * html_content_parse returns.
 */
static int html_content_parse_print(html_content_t* arg)
{
  int status;

  arg->content_callback = html_content_parse_print_one;
  status = html_content_parse(arg);

  return status;
}

/*
 * Clear every byte of arg, then set parser.input_limit to -1.
 */
void html_content_reset(html_content_t* arg)
{
  memset(arg, 0, sizeof(*arg));
  arg->parser.input_limit = -1;
}

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>

// If we have this, we probably want it.
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif
#include <timer.h>

/*
 * Command line settings of html_content_main.
 */
typedef struct
{
  char *file;	/* -f: name of the file to parse */
  int ignore;	/* -i: mask of token kinds to ignore */
  int bench;	/* -B: number of times the parse is repeated */
  int quiet;	/* -Q: non-zero to parse without printing */
} params_t;

/* Forward declarations: both functions are defined after html_content_main. */
static void usage();
static void docontent(params_t* params);

/*
 * Command line entry point: read the options into a params_t, run
 * docontent on it and return 0.
 *
 * Options: -v raises the verbosity, -f names the file to parse, -i
 * gives the token kinds to ignore, -B the number of repetitions, -Q
 * turns printing off. An unknown option calls usage().
 */
int html_content_main(int ac, char **av)
{
  extern char		*optarg;
  extern int		optind;
  params_t		params;
  int			opt;

  /* NOTE(review): presumably restarts getopt scanning -- confirm. */
  optind = 0;

  params.file = strdup("???");
  params.ignore = 0;
  params.bench = 0;
  params.quiet = 0;

  for(;;) {
    opt = getopt(ac, av, "vf:i:B:Q");
    if(opt == -1)
      break;

    if(opt == 'v') {
      verbose++;
    } else if(opt == 'f') {
      /* Replace the previous file name. */
      free(params.file);
      params.file = strdup(optarg);
    } else if(opt == 'i') {
      params.ignore = html_content_string2token(optarg, strlen(optarg));
    } else if(opt == 'B') {
      params.bench = atoi(optarg);
    } else if(opt == 'Q') {
      params.quiet = 1;
    } else if(opt == '?') {
      usage();
    }
  }

  docontent(&params);

  free(params.file);

  return 0;
}

/*
 * Parse the file named in params, params->bench times (at least once).
 * Results are printed unless params->quiet is set. When more than one
 * run is requested, the overall and per call execution times are
 * reported on stderr.
 */
static void docontent(params_t* params)
{
  html_content_t arg;
  html_content_reset(&arg);
  
  arg.parser.info = HTML_SOURCE_FILENAME;
  arg.parser.source = params->file;
  arg.parser.source_length = strlen(params->file);
  arg.parser.ignore = params->ignore;

  if(params->bench < 1)
    params->bench = 1;

  {
    int i;
    time_register(0);
    for(i = 0; i < params->bench; i++) {
      if(params->quiet)
	html_content_parse(&arg);
      else
	html_content_parse_print(&arg);
    }
    if(params->bench > 1) {
       time_t seconds = time_show("overall execution time: ", 0);
       /* Cast to long: the type of a time_t expression need not match %ld. */
       fprintf(stderr, "per call execution time %ld milli-seconds\n", (long)((seconds * 1000) / params->bench));
    }
  }
  
}

//*****************************************************************************
//
//   Display program usage information
//
static void usage()
{
    /* Help text, one entry per output line, written to stdout in order. */
    static const char* const lines[] = {
	"usage: t_htmlcontent [options]\n",
	"Options:\n",
	"\t-v\tIncreases the verbosity\n",
	"\t-B n\trepeat <n> times and report statistics\n",
	"\t-Q\tquiet, only parsing, no printing\n",
	"\t-f file\tname of HTML file to parse\n",
	"\t-i context\tIgnore URLs found in context (HTML_CONTENT_META,HTML_CONTENT_TITLE,HTML_CONTENT_TEXT)\n"
    };
    unsigned int i;

    for(i = 0; i < sizeof(lines) / sizeof(lines[0]); i++)
	fputs(lines[i], stdout);

    /* The program ends here with status 0. */
    exit(0);
}

/*
 Local Variables: ***
 mode: C ***
 End: ***
*/
