%{
/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
/*
 * Extract hypertext links from an HTML file.
 * The links understood are
 *    <a href=
 *    <frame src=
 *    <base href=
 *    <meta url=
 *    <area href=
 *    <img src=
 */
/* Head */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include <string.h>
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <uri.h>
#include <split.h>
#include <salloc.h>

#include <html_href.h>

/*
 * Short aliases for the HTML_URI_* token values. The scanner rules store
 * one of them in href_info.token when the corresponding tag is opened and
 * return it to the caller once a link has been extracted from that tag.
 */
#define T_BASE	HTML_URI_BASE
#define T_A	HTML_URI_A
#define T_META	HTML_URI_META
#define T_FRAME	HTML_URI_FRAME
#define T_AREA	HTML_URI_AREA
#define T_IMG	HTML_URI_IMG

/*
 * State shared between the scanner rules and fill_href().
 */
struct href_info_t {
  /*
   * Set to 1 by the scanner when a <frame tag is matched and cleared
   * by init(). Nothing in this file reads it back.
   */
  int in_frame;
  /*
   * Hold the href string: fill_href() stores the cleaned up,
   * null terminated link here.
   */
  char* href;
  /* 
   * Total size of the buffer pointed by href. Passed by address to
   * static_alloc() together with href (presumably updated by it when
   * the buffer grows -- confirm against salloc.h).
   */
  int href_size;
  /*
   * Token to be returned: one of the T_* values, recorded when the
   * opening tag is matched.
   */
  int token;
};

/*
 * Single scanner state instance used by the rules and by fill_href().
 */
static struct href_info_t href_info;

/*
 * Verbosity level. Incremented by each -v option in html_href_main();
 * fill_href() prints diagnostics on stderr when it is greater than 3.
 */
static int verbose;

/*
 * Forward declarations (old style, unprototyped for fill_href and init).
 * fill_href fills the href buffer when a link has been detected and is
 * called with one int argument telling whether the value is quoted.
 * init prepares href_info before a parse.
 * html_href_lexer drives the scanner and reports links to the callback.
 */
static void fill_href();
static void init();
static int html_href_lexer(html_href_t* arg);
%}

/*
 * Flex configuration
 */
%option nounput noyywrap prefix="htmlhref" outfile="lex.yy.c"
/*
 * Input related options
 */
%option case-insensitive 8bit
/*
 * Performance tuning options
 * fast : object code 4 times bigger, 20% faster
 * full : object code 6 times bigger, 20% faster
 * 
 * It takes 51 milli-seconds to parse (html_href_parse + %option fast) a 400k html file 
 * containing 1000 href on a PII 350 with enough memory to hold everything 
 * in memory.
 */
%option fast

%x COMMENT_TAG FRAME_TAG ADDRESS_TAG BASE_TAG META_TAG AREA_TAG IMG_TAG IGNORING_TAG

OPTWS		[[:blank:]\n\r]*

%%
<INITIAL>{
	"<"{OPTWS}a { /* record the tag kind, then scan its attributes for href */ href_info.token = T_A; BEGIN(ADDRESS_TAG); }
	"<"{OPTWS}frame {
	  /* remember that the page uses frames, then look for src */
	  href_info.in_frame = 1;
	  href_info.token = T_FRAME;
	  BEGIN(FRAME_TAG);
	}
	"<"{OPTWS}base { /* look for href */ href_info.token = T_BASE; BEGIN(BASE_TAG); }
	"<"{OPTWS}meta { /* look for url */ href_info.token = T_META; BEGIN(META_TAG); }
	"<"{OPTWS}area { /* look for href */ href_info.token = T_AREA; BEGIN(AREA_TAG); }
	"<"{OPTWS}img { /* look for src */ href_info.token = T_IMG; BEGIN(IMG_TAG); }
	"<"{OPTWS}(script|style)[^<>]*">" {
	   /* skip everything up to the matching closing script or style tag */
	   BEGIN(IGNORING_TAG);
	 }
	"<!--" BEGIN(COMMENT_TAG); /* skip the content of HTML comments */
	.	; /* any other character is ignored */
	[\n\r]	; /* line ends are ignored too */
}

<IGNORING_TAG>{
	"<"{OPTWS}"/"{OPTWS}(script|style)[^<>]*">" {
	  /* closing script or style tag: resume normal scanning */
	  BEGIN(INITIAL);
	}
	[[:blank:]\n\r]+ ; /* discard runs of white space in one match */
	. ; /* discard any other character */
}

<FRAME_TAG,IMG_TAG>{
	src{OPTWS}={OPTWS}\"[^>"]*[>"]|src{OPTWS}={OPTWS}'[^>']*[>']	{
		/*
		 * Quoted value. The match ends on the closing quote, or on >
		 * when the closing quote is missing. The start condition is
		 * left unchanged, so scanning resumes inside the same tag.
		 */
                fill_href(1);
		return href_info.token;
	}
	src{OPTWS}={OPTWS}[^"' >]+	{
		/* Unquoted value: runs up to the next quote, space or > */
                fill_href(0);
		return href_info.token;
	}
	">"	BEGIN(INITIAL); /* end of tag */
	\n	/* ignore */
	.	/* ignore */
}

<META_TAG>{
	url{OPTWS}={OPTWS}\"[^>"]*[>"]|url{OPTWS}={OPTWS}'[^>']*[>']	{
		/*
		 * Quoted value. The match ends on the closing quote, or on >
		 * when the closing quote is missing.
		 */
                fill_href(1);
		return href_info.token;
	}
	url{OPTWS}={OPTWS}[^"' >]+	{
		/* Unquoted value: runs up to the next quote, space or > */
                fill_href(0);
		return href_info.token;
	}
	">"	BEGIN(INITIAL); /* end of tag */
	\n	/* ignore */
	.	/* ignore */
}

<ADDRESS_TAG,AREA_TAG,BASE_TAG>{
	href{OPTWS}={OPTWS}\"[^>"]*[>"]|href{OPTWS}={OPTWS}\'[^'>]*[>']	{
		/*
		 * Quoted value. The match ends on the closing quote, or on >
		 * when the closing quote is missing. The start condition is
		 * left unchanged, so scanning resumes inside the same tag.
		 */
                fill_href(1);
		return href_info.token;
	}
	href{OPTWS}={OPTWS}[^"' >]+	{
		/* Unquoted value: runs up to the next quote, space or > */
                fill_href(0);
		return href_info.token;
	}
	">"	BEGIN(INITIAL); /* end of tag */
	\n	/* ignore */
	.	/* ignore */
}

<COMMENT_TAG>{
        "-->" BEGIN(INITIAL);
        "<html>" BEGIN(INITIAL);
        "<head>" BEGIN(INITIAL);
        "<title>" BEGIN(INITIAL);
	[[:blank:]\n\r]+ ;
	. ;
}

<<EOF>> {
        /*
         * End of input in any start condition: go back to INITIAL
         * before terminating the scan.
         */
        BEGIN(INITIAL);
        yyterminate();
}

%%

/*
 * Fill href_info.href with the link detected by the scanner.
 *
 * The current match (htmlhreftext, htmlhrefleng) looks like
 *   attribute = "value"    or    attribute = value
 * has_quote is non zero when the value is enclosed in ' or " (the closing
 * delimiter may also be > when the closing quote is missing) and zero when
 * the value directly follows the = sign.
 *
 * The match is modified in place (numeric character references are
 * decoded and htmlhrefleng is shortened accordingly), then the value is
 * stripped of its delimiters and surrounding white space, http:/path is
 * rewritten to /path, newlines are dropped and the result is stored as a
 * null terminated string in href_info.href.
 *
 * Exits the process with status 2 when the delimiter cannot be found and
 * with status 3 when the computed length is negative.
 */
static void fill_href(int has_quote)
{
  char* start;
  char* out;
  int length;
  int from;
  int to;
  int i;

  /*
   * Make room for the whole match plus the terminating null
   * (static_alloc is expected to grow the buffer if necessary).
   */
  static_alloc(&href_info.href, &href_info.href_size, htmlhrefleng + 1);
  if(verbose > 3) fprintf(stderr, "fill_href: htmlhreftext = %.*s, leng = %d\n", htmlhrefleng, htmlhreftext, htmlhrefleng);

  /*
   * Translate &#199; sequences in place: at most three decimal digits
   * and an optional trailing ; are replaced by a single character.
   */
  for(from = 0, to = 0; from < htmlhrefleng;) {
    if(htmlhreftext[from] == '&' &&
       (from + 1 < htmlhrefleng && htmlhreftext[from + 1] == '#')) {
      char value = 0;
      if(verbose > 3) fprintf(stderr, "found %.6s\n", &htmlhreftext[from]);
      for(i = 0;
	  from + 2 + i < htmlhrefleng && i < 3 &&
	  htmlhreftext[from + 2 + i] >= '0' && htmlhreftext[from + 2 + i] <= '9';
	  i++) {
	value = value * 10 + (char)(htmlhreftext[from + 2 + i]) - '0';
      }
      if(from + 2 + i < htmlhrefleng && htmlhreftext[from + 2 + i] == ';')
	i++;
      from += 2 + i;
      if(verbose > 3) fprintf(stderr, "value = %c\n", value);
      htmlhreftext[to++] = value;
    } else {
      htmlhreftext[to++] = htmlhreftext[from++];
    }
  }
  htmlhrefleng = to;
  if(verbose > 3) fprintf(stderr, "fill_href: translated htmlhreftext = %.*s, leng = %d\n", htmlhrefleng, htmlhreftext, htmlhrefleng);

  /*
   * Locate the delimiter that precedes the value: the opening quote
   * (" or ') if any, the = sign otherwise.
   */
  if(has_quote) {
    char* start_alt;
    start = (char*)(strchr(htmlhreftext, '"'));
    start_alt = (char*)(strchr(htmlhreftext, '\''));
    /*
     * If both ' and " are found, take the closest one
     */
    if(start == 0 || (start_alt != 0 && start > start_alt))
      start = start_alt;
    if(start == 0) {
      fprintf(stderr, "fill_href: cannot find delimiter ' or \"\n");
      exit(2);
    }
  } else {
    start = (char*)(strchr(htmlhreftext, '='));
    if(start == 0) {
      fprintf(stderr, "fill_href: cannot find delimiter = \n");
      exit(2);
    }
  }

  /* Go over first delimiter. */
  start++;

  /*
   * Strip leading spaces. The cast to unsigned char keeps isspace()
   * defined for characters with the high bit set.
   */
  while(*start && isspace((unsigned char)*start))
    start++;
  length = htmlhrefleng - (start - htmlhreftext);
  if(has_quote) {
    /* Skip last delimiter: ' or ", or > when <A href="url> occurs. */
    length--;
  }
  /*
   * A negative length means nothing is left after stripping : error
   */
  if(length < 0) {
    fprintf(stderr, "fill_href: length < 0 for %.*s\n", htmlhrefleng, htmlhreftext);
    exit(3);
  }
  /* 
   * Strip trailing spaces.
   */
  while(length > 0 && isspace((unsigned char)start[length - 1]))
    length--;

  /* 
   * Auto patch the http:/dir/file -> /dir/file. 
   */
  if(length > 7 && !strncmp("http:/", start, 6) && start[6] != '/') {
    start += 5;
    length -= 5;
  }

  /*
   * Remove newlines, yes links sometimes contain newlines.
   * The number of characters to copy is taken after the http:/ patch
   * above so that the loop never reads past the end of the value.
   */
  out = href_info.href;
  for(i = 0; i < length; i++) {
    if(start[i] != '\n')
      *out++ = start[i];
  }

  /*
   * href is a null terminated string.
   */
  *out = '\0';
}

/*
 * Prepare href_info for a parse: make sure the href buffer is at least
 * 1024 bytes (static_alloc is expected to allocate it the first time
 * thru) and forget any frame seen during a previous run.
 */
static void init()
{
  enum { initial_href_size = 1024 };

  static_alloc(&href_info.href, &href_info.href_size, initial_href_size);
  href_info.in_frame = 0;
}

/*
 * Run the parser. Backend of all other interfaces.
 *
 * Resets the scanner state with init() and hands html_href_lexer to
 * html_parser_run. Always returns 1.
 *
 * NOTE(review): arg is not visibly forwarded to html_parser_run here;
 * confirm how html_parser_run obtains the html_href_t it passes to the
 * lexer.
 */
int html_href_parse(html_href_t* arg)
{
  init();
  html_parser_run(html_href_lexer);
  return 1;
}

/*
 * Pull links out of the scanner one at a time and report them.
 *
 * For every token returned by htmlhreflex() the link left in
 * href_info.href is skipped when:
 *   - the token is part of arg->parser.ignore,
 *   - uri_realloc does not return URI_CANNONICAL for it,
 *   - HTML_STRIP_RELATIVE is set in arg->parser.info and the uri is
 *     flagged URI_INFO_RELATIVE.
 * Otherwise arg->href_callback (if any) is called with the token, the uri
 * and arg->href_data; a zero return from the callback stops the scan.
 *
 * Always returns 1.
 */
static int html_href_lexer(html_href_t* arg)
{
  int token;
  uri_t* uri = uri_alloc("/", 1);
  int done = 0;
  
  uri_mode_set(URI_MODE_FLAG_DEFAULT);
  
  /*
   * Call the lex parser that returns when a link is found.
   * done is tested first so that the scanner is not run again (and the
   * next link silently dropped) once the callback asked to stop.
   */
  while(!done && (token = htmlhreflex())) {
    int href_length = strlen(href_info.href);
    if(token & arg->parser.ignore)
      continue;
    if(uri_realloc(uri, href_info.href, href_length) != URI_CANNONICAL)
      continue;
    if((arg->parser.info & HTML_STRIP_RELATIVE) && (uri_info(uri) & URI_INFO_RELATIVE))
      continue;
    if(arg->href_callback != 0 && !arg->href_callback(token, uri, arg->href_data))
      done = 1;
  }

  uri_free(uri);

  return 1;
}

/*
 * Return the symbolic name of a link context token.
 *
 * The result lives in a static buffer that is overwritten by the next
 * call. A token that is not exactly one of the HTML_URI_* values yields
 * an empty string.
 */
char* html_href_token2string(int token)
{
  static char string[512];
  const char* name = "";

  switch(token) {
  case HTML_URI_BASE:
    name = "HTML_URI_BASE";
    break;
  case HTML_URI_A:
    name = "HTML_URI_A";
    break;
  case HTML_URI_META:
    name = "HTML_URI_META";
    break;
  case HTML_URI_FRAME:
    name = "HTML_URI_FRAME";
    break;
  case HTML_URI_AREA:
    name = "HTML_URI_AREA";
    break;
  case HTML_URI_IMG:
    name = "HTML_URI_IMG";
    break;
  }

  strcpy(string, name);

  return string;
}

/*
 * Convert a comma separated list of HTML_URI_* names into the bitwise OR
 * of the corresponding values.
 *
 * string need not be null terminated, string_length bytes of it are
 * copied to a scratch buffer that split_inplace cuts on commas
 * (SPLIT_TRIM is requested). Names that are not recognized are ignored.
 * Returns 0 when no name matches.
 */
int html_href_string2token(char* string, int string_length)
{
  static const struct {
    const char* name;
    int value;
  } contexts[] = {
    { "HTML_URI_BASE", HTML_URI_BASE },
    { "HTML_URI_A", HTML_URI_A },
    { "HTML_URI_META", HTML_URI_META },
    { "HTML_URI_FRAME", HTML_URI_FRAME },
    { "HTML_URI_AREA", HTML_URI_AREA },
    { "HTML_URI_IMG", HTML_URI_IMG }
  };
  const unsigned int contexts_count = sizeof(contexts) / sizeof(contexts[0]);
  char* scratch = (char*)smalloc(string_length + 1);
  char** words = 0;
  int words_count = 0;
  int result = 0;
  int w;

  strncpy(scratch, string, string_length);
  scratch[string_length] = '\0';
  split_inplace(scratch, string_length, &words, &words_count, ',', SPLIT_TRIM);

  for(w = 0; w < words_count; w++) {
    unsigned int c;
    /* Only the first matching name contributes, as names are unique. */
    for(c = 0; c < contexts_count; c++) {
      if(strcmp(words[w], contexts[c].name) == 0) {
	result |= contexts[c].value;
	break;
      }
    }
  }

  free(scratch);
  return result;
}

/*
 * Link callback that prints "<context name>: <uri>" on stderr.
 * info is the link context token, data is not used.
 * Always returns 1.
 */
int html_href_parse_print_one(int info, uri_t* uri, void* data)
{
  const char* context = html_href_token2string(info);

  fprintf(stderr, "%s: %s\n", context, uri_uri(uri));

  return 1;
}

/*
 * Parse with html_href_parse_print_one installed as the link callback of
 * arg (the callback stays installed afterwards) and return the result of
 * html_href_parse.
 */
int html_href_parse_print(html_href_t* arg)
{
  int status;

  arg->href_callback = html_href_parse_print_one;
  status = html_href_parse(arg);

  return status;
}

/*
 * Put arg in a known state: every byte is cleared, then the embedded
 * parser description is reset with html_parser_reset.
 */
void html_href_reset(html_href_t* arg)
{
  memset(arg, 0, sizeof(*arg));
  html_parser_reset(&arg->parser);
}

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>

// If we have this, we probably want it.
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif
#include <timer.h>

/*
 * Command line parameters of html_href_main.
 */
typedef struct 
{
  /* -f : name of the HTML file to parse (heap allocated copy) */
  char* file;
  /* -h : set to 1 by the option, not read anywhere in this file */
  int href;
  /* -i : contexts to ignore, as returned by html_href_string2token */
  int ignore;
  /* -B : number of times the parse is repeated (raised to 1 if lower) */
  int bench;
  /* -a : when set, HTML_STRIP_RELATIVE is added to the parser info */
  int absolute;
  /* -Q : when set, parse without printing the links */
  int quiet;
} params_t;

/* Print the option summary and exit. */
static void usage();
/* Run the parser according to the command line parameters. */
static void dohref(params_t* params);

/*
 * Definition, with an explicit initializer, of the verbosity level that
 * is already declared near the top of this file.
 */
static int verbose = 0;

/*
 * Command line entry point of the link extractor.
 *
 * Recognized options: -v (more verbose), -f file, -i contexts, -B count,
 * -Q (quiet), -h, -a (skip relative URLs). An unknown option prints the
 * usage and exits. The file defaults to "???" when -f is not given.
 * Returns 0.
 */
int html_href_main(int ac, char **av)
{
  extern char *optarg;
  extern int optind;
  params_t params;
  int opt;

  /* Restart option parsing from scratch. */
  optind = 0;

  memset(&params, 0, sizeof(params));
  params.file = strdup("???");

  for(;;) {
    opt = getopt(ac, av, "vf:hai:B:Q");
    if(opt == -1)
      break;

    if(opt == 'v') {
      verbose++;
    } else if(opt == 'f') {
      /* Replace the previous file name, releasing the old copy. */
      free(params.file);
      params.file = strdup(optarg);
    } else if(opt == 'i') {
      params.ignore = html_href_string2token(optarg, strlen(optarg));
    } else if(opt == 'B') {
      params.bench = atoi(optarg);
    } else if(opt == 'Q') {
      params.quiet = 1;
    } else if(opt == 'h') {
      params.href = 1;
    } else if(opt == 'a') {
      params.absolute = 1;
    } else if(opt == '?') {
      usage();
    }
  }

  dohref(&params);

  free(params.file);

  return 0;
}

/*
 * Parse the file named in params, params->bench times (at least once).
 *
 * The links are printed on stderr unless params->quiet is set. Contexts
 * listed in params->ignore are skipped and relative URLs are skipped when
 * params->absolute is set. When more than one run is requested, the
 * overall and per call execution times are reported.
 *
 * params->bench is raised to 1 if it is lower.
 */
static void dohref(params_t* params)
{
  html_href_t arg;
  int i;

  html_href_reset(&arg);
  
  /* The source is a file name, not the HTML text itself. */
  arg.parser.info = HTML_SOURCE_FILENAME;
  arg.parser.source = params->file;
  arg.parser.source_length = strlen(params->file);
  arg.parser.ignore = params->ignore;

  if(params->absolute)
    arg.parser.info |= HTML_STRIP_RELATIVE;

  if(params->bench < 1)
    params->bench = 1;

  time_register(0);
  for(i = 0; i < params->bench; i++) {
    if(params->quiet)
      html_href_parse(&arg);
    else
      html_href_parse_print(&arg);
  }
  if(params->bench > 1) {
    time_t seconds = time_show("overall execution time: ", 0);
    /*
     * time_t is not necessarily a long: convert explicitly so that the
     * argument always matches the %ld conversion.
     */
    fprintf(stderr, "per call execution time %ld milli-seconds\n", (long)((seconds * 1000) / params->bench));
  }
}

//*****************************************************************************
//
//   Print the option summary on stdout, then exit with status 0
//
static void usage()
{
    static const char* const lines[] = {
	"usage: t_htmlhref [options]\n",
	"Options:\n",
	"\t-v\tIncreases the verbosity\n",
	"\t-B n\trepeat <n> times and report statistics\n",
	"\t-Q\tquiet, only parsing, no printing\n",
	"\t-f file\tname of HTML file to parse\n",
	"\t-h\tExercise href extraction\n",
	"\t-a\tSkip relative URLs\n",
	"\t-i context\tIgnore URLs found in context (HTML_URI_BASE,HTML_URI_A,HTML_URI_META,HTML_URI_FRAME,HTML_URI_AREA,HTML_URI_IMG)\n"
    };
    unsigned int i;

    // None of the lines contains a conversion, they are written verbatim
    for(i = 0; i < sizeof(lines) / sizeof(lines[0]); i++)
	fputs(lines[i], stdout);

    exit(0);
}

/*
 Local Variables: ***
 mode: C ***
 End: ***
*/
