%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cut.h"
#include "store.h"
#include "score.h"

/* Scan buffer over the complete input text. init_parsing() creates it and the
 * DOMAINST end-of-buffer rule switches back to it. */
YY_BUFFER_STATE all;
/* Temporary buffer scanned in the DOMAINST state: it holds either a complete
 * tag or a word already rebuilt by the ACCENT rules. */
YY_BUFFER_STATE buf_tag;
/* Temporary buffer over one raw word, scanned in the ACCENT state. */
YY_BUFFER_STATE buf_word;

/* Heap-allocated word being rebuilt by the ACCENT rules; NULL between words. */
char *mot=NULL;
/* Copies of init_parsing()'s isporn and isscoring arguments, read by the rule
 * actions and forwarded to cut_tags() / cut_words(). */
unsigned int porn, scoring;
%}
/* Despite its name, ALPHA matches one decimal digit. None of the rules in
 * this file reference it. */
ALPHA    [0-9]
/* One ASCII letter or digit. */
ALPHANUM [a-zA-Z0-9]
/* One or more ALPHANUM, then a single '-' or '.', then one or more characters
 * drawn from ALPHANUM, '-' and '.'. */
DOMAIN   {ALPHANUM}+("-"|".")({ALPHANUM}|"-"|".")+
/* Optional leading '&', one or more ALPHANUM, an optional run of the
 * punctuation - ! ? @ $ + % = / : . ; & and an optional ALPHANUM tail.
 * Keeping '&' and ';' lets HTML entities stay inside the matched word. */
WORD     &?{ALPHANUM}+{ALPHANUM}*[\-!\?@$\+%=\/:.;&]*{ALPHANUM}*
/* Same shape as WORD without the leading '&' and without ';' and '&' in the
 * punctuation set. */
WORDA    {ALPHANUM}+{ALPHANUM}*[\-!\?@$\+%=\/:.]*{ALPHANUM}*

/* The start-condition stack provides yy_push_state() / yy_pop_state(). */
%option stack
/* Exclusive start conditions: while one is active only rules tagged with it
 * (or with <*>) can match. */
%x DOMAINST ACCENT
%x SCRIPT COMMENT TAG STYLE
%%

<INITIAL>"</"[^>]+">"   { /* a closing tag, from the opening angle bracket and slash up to the next closing angle bracket, is matched and discarded */ }

<INITIAL>"<style"    {
  /* Start of a style element: enter the exclusive STYLE state. */
  yy_push_state(STYLE);
}
<STYLE>"/style>"|">" {
  /* Leave STYLE at the text /style> or at the first closing angle bracket,
     whichever is met first. */
  yy_pop_state();
}
<STYLE>"\n"          { /* ignored while in STYLE */ }
<STYLE>.             { /* ignored while in STYLE */ }
<STYLE><<EOF>>       {
  /* Input ended while still in STYLE. Pop the state, exactly as the SCRIPT
     and COMMENT end-of-input rules do, so that the default end-of-input rule
     can terminate yylex(). An empty action here neither returns nor changes
     state, so the scanner would meet the same end of input again without
     ever finishing. */
  yy_pop_state();
}

<INITIAL>"<script"   { /* start of a script element: enter the exclusive SCRIPT state */ yy_push_state(SCRIPT);}
<SCRIPT>"/script>"   { /* the text /script> ends the element: return to the state that was active before */ yy_pop_state();}
<SCRIPT>"\n"         { /* script content is ignored */ }
<SCRIPT>.            { /* script content is ignored */ } 
<SCRIPT><<EOF>>      { /* input ended inside the script: pop so the default end-of-input rule can finish the scan */ yy_pop_state();}

<INITIAL>"<!--"      { /* start of an HTML comment: enter the exclusive COMMENT state */ yy_push_state(COMMENT);}
<COMMENT>"-->"       { /* end of the comment: return to the state that was active before */ yy_pop_state();}
<COMMENT>"\n"        { /* comment content is ignored */ }
<COMMENT>.           { /* comment content is ignored */ } 
<COMMENT><<EOF>>     { /* input ended inside the comment: pop so the default end-of-input rule can finish the scan */ yy_pop_state();}

<INITIAL>"<" {
  /* Any other opening angle bracket starts a tag. yymore() keeps it in
     yytext so the whole tag accumulates across the TAG rules. */
  yy_push_state(TAG);
  yymore();
}
<TAG>">" {
  /* End of the tag: yytext now holds everything accumulated since the
     opening angle bracket. */
  size_t tag_size = strlen(yytext) + 1;
  char *toCut = malloc(tag_size);

  yy_pop_state();
  if (toCut == NULL) {
    /* Previously the NULL result went straight into strcpy(). */
    perror("malloc");
    exit(EXIT_FAILURE);
  }
  /* cut_tags() works on a private copy so yytext is still intact for the
     second pass below. NOTE(review): presumably cut_tags() modifies its
     argument - confirm in cut.h. */
  memcpy(toCut, yytext, tag_size);
  cut_tags( toCut , porn, scoring);
  free(toCut);
  toCut = NULL;
  /* Second pass: scan the same tag text again in DOMAINST. The DOMAINST
     end-of-input rule deletes buf_tag and resumes the main buffer. */
  BEGIN DOMAINST;
  buf_tag = yy_scan_string( yytext );
  yy_switch_to_buffer(buf_tag);
}
<TAG>(\"[^\"]*\"|\'[^\']*\'|[^\'\"">"\n])* {
  /* Tag body: quoted attribute values are consumed whole, so a closing
     angle bracket inside quotes does not end the tag. */
  yymore();
}
<TAG>"\n" {
  /* Tags may span lines: keep accumulating. */
  yymore();
}
<TAG><<EOF>> {
  /* Input ended inside an unterminated tag: pop so the default end-of-input
     rule can finish the scan. The partial tag is not processed. */
  yy_pop_state();
}

<INITIAL>{WORD}* {
  /* A run of WORD text. Start an empty accumulator in mot and scan the
     matched text again in ACCENT, where accented-letter entities are
     replaced by plain letters. */
  mot = calloc(1, sizeof *mot);
  if (mot == NULL) {
    perror("calloc");
    exit(EXIT_FAILURE);
  }
  BEGIN ACCENT;
  buf_word = yy_scan_string( yytext );
  yy_switch_to_buffer(buf_word);
}

<ACCENT>"&aacute;"|"&agrave;"|"&acirc;" |
<ACCENT>"&eacute;"|"&egrave;"|"&ecirc;" |
<ACCENT>"&iacute;"|"&igrave;"|"&icirc;" |
<ACCENT>"&oacute;"|"&ograve;"|"&ocirc;" |
<ACCENT>"&uacute;"|"&ugrave;"|"&ucirc;" |
<ACCENT>"&ccedil;" {
  /* Shared action for all accented-letter entities listed above: the
     character right after the ampersand, yytext[1], is the unaccented base
     letter (a, e, i, o, u or c), and that letter is appended to mot. */
  size_t len = strlen(mot);
  char *grown = realloc(mot, len + 2);

  if (grown == NULL) {
    perror("realloc");
    exit(EXIT_FAILURE);
  }
  mot = grown;
  mot[len] = yytext[1];
  mot[len + 1] = '\0';
}
<ACCENT>"&nbsp;"|"&lt;"|"&gt;"|" "|"&#"[0-9a-zA-Z]+";" {
  /* These entities, a plain space and numeric character references all
     become one space. The numeric alternative used to be written entirely
     inside quotes, which made it match only the literal characters
     ampersand, hash, w, plus, semicolon. NOTE(review): WORD currently
     never lets a hash sign reach this buffer, so the corrected pattern only
     takes effect if WORD is widened. */
  size_t len = strlen(mot);
  char *grown = realloc(mot, len + 2);

  if (grown == NULL) {
    perror("realloc");
    exit(EXIT_FAILURE);
  }
  mot = grown;
  mot[len] = ' ';
  mot[len + 1] = '\0';
}
<ACCENT>{WORDA}* {
  /* Ordinary text is appended unchanged. Characters that no ACCENT rule
     matches fall through to the catch-all rule and are dropped. */
  size_t len = strlen(mot);
  size_t extra = strlen(yytext);
  char *grown = realloc(mot, len + extra + 1);

  if (grown == NULL) {
    perror("realloc");
    exit(EXIT_FAILURE);
  }
  mot = grown;
  memcpy(mot + len, yytext, extra + 1);
}

<ACCENT><<EOF>> {
  /* The whole word has been rebuilt: hand it to cut_words(), then scan the
     rebuilt text once more in DOMAINST. */
  cut_words(mot, porn, scoring);
  BEGIN DOMAINST;
  yy_delete_buffer(buf_word);  
  /* yy_scan_string() works on its own copy of the text, so mot can be
     released as soon as the new buffer exists. */
  buf_tag = yy_scan_string( mot );
  yy_switch_to_buffer(buf_tag);
  free(mot);
  mot=NULL;
}

<DOMAINST>\/\/{DOMAIN} {
  /* Two slashes followed by DOMAIN text. The matched text, slashes
     included, is recorded as a DOMAINS token: through score_storeToken()
     when scoring is set, through store_storeTempToken() otherwise. */
  if(scoring) score_storeToken(yytext, DOMAINS);
  else store_storeTempToken(yytext, DOMAINS);
}

<DOMAINST><<EOF>> {
  /* End of the temporary buffer: go back to INITIAL, release the temporary
     buffer and resume scanning the main input buffer. */
  BEGIN INITIAL;
  yy_delete_buffer(buf_tag);
  yy_switch_to_buffer(all);
}

<*>.|\n { /* in every start condition, a character that no other rule claims is dropped */ }

<<EOF>> {
  /* End of input in a start condition that has no end-of-input rule of its
     own (in this file that leaves INITIAL): yylex() returns 1. */
  return 1;
}

%%
/**
* Read everything that remains on a stream into one NUL-terminated string.
* The stream is consumed with fgets(), so only the text up to the first NUL
* byte of each chunk is kept.
* @param fp the FILE pointer to read
* @return a malloc'd string owned by the caller (empty for an empty stream),
*         or NULL when memory runs out
*/
static char *read_whole_stream(FILE *fp)
{
  char line[5000];
  size_t capacity = sizeof line;
  size_t used = 0;
  char *text = malloc(capacity);

  if (text == NULL) {
    return NULL;
  }
  text[0] = '\0';

  /* Drive the loop with the fgets() result rather than feof(): feof() only
     turns true after a read has already failed, which made the last line
     appear twice, read an uninitialised buffer on an empty file and never
     ended on a read error. */
  while (fgets(line, sizeof line, fp) != NULL) {
    size_t chunk = strlen(line);

    if (chunk + 1 > capacity - used) {
      char *grown;

      /* One doubling is always enough: capacity starts at sizeof line and a
         chunk is shorter than that. */
      if (capacity > (size_t)-1 / 2) {
        free(text);
        return NULL;
      }
      grown = realloc(text, capacity * 2);
      if (grown == NULL) {
        free(text);
        return NULL;
      }
      text = grown;
      capacity *= 2;
    }
    memcpy(text + used, line, chunk + 1);
    used += chunk;
  }

  return text;
}

/**
* Launch the lexer for a FILE pointer.
* The whole stream is loaded into memory, scanned with yylex(), and the
* result is then either scored or stored.
* The process exits with EXIT_FAILURE if the input cannot be held in memory.
* @param fp the FILE pointer to parse
* @param isporn (1 = "is porn", 0 = "is not porn")
* @param isscoring (1 = "is scoring", 0 = "is learning")
* @param verbose verbose mode (1 = "normal verbose mode", 2 = "extra verbose mode")
*/
void init_parsing(FILE *fp, int isporn, int isscoring, int verbose)
{
  char *all_text = NULL;

  if (!isscoring) {
    /* Learning mode: record one more page for this category. */
    store_storeToken("~~nombre-pages~~", isporn);
  } else {
    score_initScoring();
  }

  /* The rule actions read these globals. */
  porn = isporn;
  scoring = isscoring;

  all_text = read_whole_stream(fp);
  if (all_text == NULL) {
    fprintf(stderr, "init_parsing: out of memory while reading input\n");
    exit(EXIT_FAILURE);
  }

  /* The scanner works on its own copy of the text, held in the global buffer
     that the DOMAINST end-of-input rule switches back to. */
  all = yy_scan_string(all_text);
  yylex();
  yy_delete_buffer(all);
  cut_free();

  free(all_text);
  all_text = NULL;

  if (isscoring) {
    score_getProbability(verbose);
    score_closedb();
  } else {
    store_storeAll(isporn);
  }
}

