%{

/* phtml_parse - parse PHP and ASP combined HTML 

When building this scanner there should be no warning that the default
rule can be matched despite the fact that it uses the nodefault
option.

lib phtml_parse is a library for parsing php and asp files
Copyright (C) 2003 Michael De La Rue

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

*/


/* error(reason): record REASON as the failure message, set error_status
   and leave yylex() through yyterminate().  Because of that hidden
   return, nothing written after error() in the same action is executed. */
#define error(reason) do { phtmlparse_error_message=(reason); error_status=1; \
                           yyterminate(); } while(0)
#ifdef DEBUG
/* Tracing variants: report each start-condition change on stderr, one
   message per line, then perform it.  #state prints the state name as
   it is written at the call site. */
#define asp_push_state(state) do { fputs ("push to state " #state "\n",stderr);\
                                   yy_push_state(state); } while(0)

#define asp_BEGIN(state) do { fputs ("begin in state " #state "\n",stderr);\
                                   BEGIN(state); } while(0)
#else /* not DEBUG */
/* Plain variants: thin wrappers around the flex primitives. */
#define asp_push_state(state) do { yy_push_state(state); } while(0)
#define asp_BEGIN(state) do { BEGIN(state); } while(0)
#endif /* not DEBUG */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "phtml_parse.h"

/* FIXME: it seems that flex already includes string.h, but without the
   POSIX feature-test macro defined; presumably that is why strndup is
   prototyped by hand here. */

char * strndup (const char *S, size_t SIZE);

/* Kind of document being scanned; start_mode holds one of these values
   and selects the start condition the scanner begins in. */
typedef enum {
  ASP,
  PHP
} script_type;

int start_mode; /* ASP or PHP, set by phtmlparse_file from its options */
int short_tags; /* non-zero: PHP mode begins in a state that accepts "<?" */
int asp_tag; /* non-zero: PHP mode accepts "<%" and PHP code ends at "%>" */
char *here_tag; /* strndup'ed label of the here-document being scanned */
int here_len; /* number of characters in here_tag */

/* Error flag for the scanner: set to 1 by the error() macro, reset to 0
   and finally returned by phtmlparse_file. */
int error_status;

/* Passes one token (type, length, text) to the registered callback. */
static inline void put_out(int type, int len, char *string);

/* Emit the text just matched (yytext, yyleng) as a token of kind TYPE. */
#define handle_string(TYPE) put_out(TYPE,yyleng,yytext)


%}

/* <script language="php"> in any letter case, quotes optional: the long
   form of a PHP opening tag. */
PHP_SCRIPT_START \<[sS][cC][rR][iI][pP][tT]\ +[lL][aA][nN][gG][uU][aA][gG][eE]=\"?[pP][hH][pP]\"?\ *\>


/* Any <script ...> tag; used in ASP mode.  The attribute part is [^>]*
   and not a greedy dot-star, so the match stops at the first '>'
   instead of running on to the last '>' of the line (which would
   swallow the script body and a closing tag written on the same line).
   As a side effect the tag may now span several lines. */
ASP_SCRIPT_START \<[sS][cC][rR][iI][pP][tT][^>]*\>

/* "<?" - PHP short opening tag. */
SHORT_START_TAG \<\?

/* "<%" - ASP style opening tag. */
ASP_START_TAG \<\%

/* "<?php" in any letter case. */
PHP_START_TAG \<\?[pP][hH][pP]

/* "</script>" in any letter case, spaces allowed before '>'. */
X_SCRIPT_END \<\/[sS][cC][rR][iI][pP][tT]\ *\>

/* "?>" - PHP closing tag. */
PHP_END_TAG \?\>

/* "%>" - ASP style closing tag. */
ASP_END_TAG \%\>

/* Exclusive start conditions.

   HTML-level states, chosen once when yylex() is entered:
     ahtml     ASP document
     asphtml   PHP document, asp tags and short tags both enabled
     aphtml    PHP document, asp tags only
     sphtml    PHP document, short tags only
     phtml     PHP document, neither

   Script-level states, pushed on top of an HTML-level state:
     php       PHP code
     aphp      PHP code that may also be closed by "%>"
     asp       ASP code closed by "</script>"
     asps      ASP code closed by "%>"

   Lexical sub-states pushed from php and aphp:
     c_comment, php_line_comment, qquote_string, quote_string, here_doc */
%x php
%x aphp
%x ahtml
%x asphtml
%x aphtml
%x sphtml
%x phtml
%x asp
%x asps
%x c_comment
%x qquote_string
%x quote_string
%x php_line_comment
%x here_doc

/* nodefault: no implicit echo rule is generated, so every state has to
   match all of its input explicitly.
   stack: provides yy_push_state() and yy_pop_state(). */
%option nodefault
%option stack

%%

    /* Executed each time yylex() is entered: pick the HTML-level start
       condition from start_mode, short_tags and asp_tag. */
    if (start_mode == ASP) {
	asp_BEGIN(ahtml);
    } else if (start_mode != PHP) {
	error ("unknown case");
    } else if (short_tags && asp_tag) {
	asp_BEGIN(asphtml);
    } else if (short_tags) {
	asp_BEGIN(sphtml);
    } else if (asp_tag) {
	asp_BEGIN(aphtml);
    } else {
	asp_BEGIN(phtml);
    }

    /* throw out all big chunks of text */

<asphtml,aphtml,sphtml,phtml>{
     {PHP_START_TAG} |
     {PHP_SCRIPT_START} {
	   /* The long opening tags are recognised in every PHP
	      HTML-level state.  Report the tag, then enter the PHP
	      code state: aphp when asp tags are enabled, so that "%>"
	      can close the block as well, php otherwise. */
	   handle_string(SCRIPT_START);
	   if (!asp_tag) {
	       asp_push_state(php);
	   } else {
	       asp_push_state(aphp);
	   }
     }
}

<asphtml,sphtml>{SHORT_START_TAG} |
<asphtml,aphtml>{ASP_START_TAG} {
      /* "<?" is only an opener in the short-tag states and "<%" only
	 in the asp-tag states.  Either way the tag is reported and
	 the matching PHP code state is entered. */
      handle_string(SCRIPT_START);
      if (!asp_tag) {
	  asp_push_state(php);
      } else {
	  asp_push_state(aphp);
      }
}

<ahtml>{
   {ASP_SCRIPT_START} {
                        /* A <script ...> block has to run until
                           </script>; the asp state is the one whose
                           closing rule is X_SCRIPT_END. */
                        handle_string(SCRIPT_START);
                        asp_push_state(asp);
                      }
   {ASP_START_TAG} {
                     /* A <% block has to run until %>; the asps state
                        is the one whose closing rule is ASP_END_TAG. */
                     handle_string(SCRIPT_START);
		     asp_push_state(asps);
                   }
}

<ahtml,asphtml,aphtml,sphtml,phtml>{
  [^<]+ handle_string(TEXT); /* a run of ordinary text containing no '<' */
  . handle_string(TEXT); /* a '<' that no opening-tag rule matched: other markup */
}

<ahtml,asphtml,aphtml,sphtml,phtml><<EOF>> { 
	/* End of input while outside any script: the normal way to
	   finish.  Send an empty END_FILE token and stop the scanner. */
	put_out(END_FILE, 0, "");
	yyterminate();
   }

<php,aphp>{
    /* Openers of the lexical sub-states.  Each reports the opening
       characters and pushes a state that is popped again when the
       construct ends. */
  \# |
  \/\/ handle_string(COMMENT); asp_push_state(php_line_comment);
  \/\* handle_string(COMMENT); asp_push_state(c_comment);
  \"   handle_string(QQUOTE); asp_push_state(qquote_string);
  \'   handle_string(QUOTE); asp_push_state(quote_string);
  \<\<\<[A-Za-z0-9]+ {
      /* Here-document opener.  The label is whatever follows the
	 three '<' characters (hence the offset and length of 3); a
	 copy is kept in here_tag / here_len for the here_doc state to
	 compare against.  Only ASCII letters and digits are accepted,
	 so the stored label stops at the first other character, an
	 underscore included. */
      here_len=yyleng - 3;
      here_tag=strndup(yytext+3,here_len);  
      if (here_tag==0)
          error ("virtual memory exhausted");
      handle_string(HERE_START); 
      asp_push_state(here_doc);
  }
    /* Either closing tag ends the PHP block and returns to the state
       that was active when it was opened. */
  {PHP_END_TAG} |
  {X_SCRIPT_END}  handle_string(SCRIPT_END); yy_pop_state();
}
<aphp>{ASP_END_TAG} handle_string(SCRIPT_END); yy_pop_state(); /* in aphp only, "%>" closes the block too */

     /* We do only very simple parsing of ASP.  I can't find a clear
	definition of what goes on inside an ASP block apart from just
	the start and end tags.  I assume that this means that content
	is ignored until an end tag is met.

	Furthermore, according to the current definition, we don't allow
	an ASP block to finish in a different way from the way it started.

     */

<asp>{
  {X_SCRIPT_END}  handle_string(SCRIPT_END); yy_pop_state(); /* </script> closes the block */
  [^<]+ handle_string(SCRIPT); /* script text up to the next '<'; a lone '<' falls to the shared '.' rule */
}

<asps>{
  {ASP_END_TAG} handle_string(SCRIPT_END); yy_pop_state(); /* %> closes the block */
  % | /* a single '%'; the longer end-tag match wins when '>' follows */
  %+[^%>] | /* a run of '%' followed by one character that is neither '%' nor '>' */
  [^%]+ handle_string(SCRIPT); /* script text up to the next '%' */
}

<php,aphp,asps,asp>{
  [^"'<>/%?#]* handle_string(SCRIPT); /* script text free of the characters that can begin another rule; this quote pairs with the one in the pattern " */
  . handle_string(SCRIPT); /* one of those characters on its own, when no longer rule matched */
}

<c_comment>{
  "*"+"/"    handle_string(COMMENT); yy_pop_state(); /* stars followed by a slash: end of the comment */
  "*" | /* eat up lonely stars */
  [^*]+ | /* eat anything that is not a star */
  "*"+[^*/\n]*  handle_string(COMMENT); /* eat up stars that are not followed by a slash, plus the text after them */
}

<qquote_string>{
  \" handle_string(QQUOTE); yy_pop_state(); /* unescaped double quote: end of the string */
  \\. | /* backslash plus the character it escapes, or the start of an octal sequence etc */
  \\   | /* backslash on its own, i.e. one followed by a newline */
  [^\\\"]+ handle_string(SSTRING); /* eat up anything free of backslashes and double quotes */
}

<quote_string>{
  \' handle_string(QUOTE); yy_pop_state(); /* unescaped single quote: end of the string */
  \\\\ | /* protected literal backslash */
  \\\' | /* backslashed quote, which does not end the string */
  \\   | /* backslash on its own, before any other character */
  [^\\\']+ handle_string(STRING); /* eat up anything free of backslashes and single quotes */
}

<php_line_comment>{
  [^%?\n]* handle_string(COMMENT); /* comment text free of the characters that can begin a closing tag */
  \n handle_string(COMMENT); yy_pop_state(); /* end of line: back to the PHP code state */
  . handle_string(COMMENT); /* a '%' or '?' that is not part of a closing tag */
  {PHP_END_TAG} { handle_string(SCRIPT_END); yy_pop_state(); yy_pop_state(); } 
  {ASP_END_TAG} {
      /* With asp tags enabled the enclosing state is aphp, where "%>"
	 closes the script; a one-line comment must not hide it, just
	 as it does not hide "?>".  Pop twice: once out of the comment
	 and once out of the PHP code state.  Without asp tags the two
	 characters are plain comment text; only the '%' is consumed
	 here so that tokens are split exactly as before. */
      if (asp_tag) {
	  handle_string(SCRIPT_END);
	  yy_pop_state();
	  yy_pop_state();
      } else {
	  yyless(1);
	  handle_string(COMMENT);
      }
  }
}

<here_doc>{
  ^[A-Za-z0-9]+ { 
      /* A possible end tag: the letters and digits at the start of a
	 line.  It ends the here-document only when it is exactly the
	 label stored by the opener.  What follows the word on the
	 line is not examined. */
      if ( (yyleng == here_len) && memcmp (yytext, here_tag, here_len) == 0 ) {
	free(here_tag);
	here_tag = NULL; /* the global outlives this action; do not leave it dangling */
	handle_string(HERE_STOP);
	yy_pop_state();
      } else {
	handle_string(HSTRING);
      }
  }
  [^A-Za-z0-9\n][^\n]*  handle_string(HSTRING); /* from a non-alphanumeric character to the end of the line */
  \n handle_string(HSTRING); /* end of lines */ 
}

<<EOF>> { 
    /* This unqualified rule covers every start condition that has no
       end-of-file rule of its own, which here means the input ended
       inside a script, comment, string or here-document.  The error
       is recorded directly instead of through error(): that macro
       returns from yylex() at once, which used to leave the END_FILE
       notification below unreachable. */
    phtmlparse_error_message = "unexpected eof during parsing script";
    error_status = 1;
    put_out(END_FILE, 0, "");
    yyterminate();
  }

%%

/* phtmlparse_file ( in_stream, options, callback_function, callback_data ) -

this function calls the lexer and, every time it receives output, calls
the callback function with the appropriate

  int - type of string
  size_t - length of string
  str - string
  void * - the callback_data pointer that was given to phtmlparse_file

Currently this function is not re-entrant because currently yylex is
not re-entrant.  It may be re-implemented as re-entrant later.

N.B. the return of the callback_function is ignored.  Handle errors
through setjmp/longjmp (read about flex limitations though!!) or by
exiting the program.

*/

/* Token sink for the scan in progress and the opaque pointer handed back
   to it.  Both are set by phtmlparse_file and read by put_out. */
void (*callback_store)(int, size_t, char *, void *);
void *callback_data_store;

/* phtmlparse_file - scan IN_STREAM and report every token to CALLBACK.

   in_stream      stream the scanner reads from (assigned to yyin)
   options        bitwise OR of LEX_* flags:
                    LEX_PHP            scan as PHP; without it, as ASP
                    LEX_ASP_TAG        enable the asp tags in PHP mode
                    LEX_SHORT_TAG      enable the short tag in PHP mode
                    LEX_DEBUG          print the chosen settings on stderr
                    LEX_OUTPUT_ERRORS  print the error message on stderr
                    LEX_FATAL_ERRORS   exit(1) when an error occurred
   callback       called as callback(type, length, text, callback_data)
                  for each token; must not be NULL
   callback_data  passed through to CALLBACK unchanged

   Returns 0 on success and 1 on error; on error the reason is left in
   phtmlparse_error_message.  Not re-entrant: all state is global. */
int 
phtmlparse_file ( FILE* in_stream, int options, 
		  void (*callback)(int, size_t, char *, void *),
		  void *callback_data ) {
  asp_tag=0;
  short_tags=0;
  start_mode=ASP;

  yyin = in_stream;
  if ( options & LEX_PHP )
    start_mode=PHP;
  if ( options & LEX_ASP_TAG )
    asp_tag=1;
  if ( options & LEX_SHORT_TAG )
    short_tags=1;
  if ( options & LEX_DEBUG ) {
    fprintf ( stderr, "Parser starting\n" );
    fprintf ( stderr, "Script type: %s ASP Tag: %s Short Tag %s\n",
	      ( ( start_mode == ASP ) ? "ASP" : "PHP" ),
	      ( ( asp_tag ) ? "yes" : "no" ),
	      ( ( short_tags ) ? "yes" : "no" ) );
  }

  callback_store = callback;
  callback_data_store = callback_data;

  phtmlparse_error_message=0;
  error_status=0;
  if ( callback == NULL ) {
    /* put_out() calls through callback_store without checking it, so
       scanning with no callback would crash on the first token.
       Report it through the normal error path instead. */
    phtmlparse_error_message="no callback function supplied";
    error_status=1;
  } else {
    yylex();
  }

  if ( error_status ) {
    if ( options & LEX_OUTPUT_ERRORS )
      fprintf(stderr, "%s\n", phtmlparse_error_message);
    if ( options & LEX_FATAL_ERRORS )
      exit(1);
  }
  return error_status;
}

/* Forward one token to the callback registered by phtmlparse_file,
   together with the caller's opaque data pointer. */
static inline void
put_out(int type, int len, char *string) {
  void (*deliver)(int, size_t, char *, void *) = callback_store;

  deliver(type, len, string, callback_data_store);
}

/* phtmlparse_is_text - non-zero when CHECK has the TEXT bit set.
   The result is the masked value itself, not a normalised 0 or 1.

   Deliberately not declared inline: under C99 and later a file-scope
   function declared only with plain `inline` provides no external
   definition, so callers in other files could fail to link. */
int
phtmlparse_is_text (int check) {
  return (check & TEXT);
}

/* phtmlparse_is_script - non-zero when CHECK has any of the bits used
   for tokens found inside a script: code, quote marks, here-document
   markers, string contents and comments.  The result is the masked
   value itself, not a normalised 0 or 1.

   Deliberately not declared inline: under C99 and later a file-scope
   function declared only with plain `inline` provides no external
   definition, so callers in other files could fail to link. */
int
phtmlparse_is_script (int check) {
  return (check & (SCRIPT|QUOTE|QQUOTE|HERE_START|HERE_STOP
		   |STRING|SSTRING|HSTRING|COMMENT));
}

/* phtmlparse_is_string - non-zero when CHECK has any of the string
   content bits (STRING, SSTRING, HSTRING).  The result is the masked
   value itself, not a normalised 0 or 1.

   Deliberately not declared inline: under C99 and later a file-scope
   function declared only with plain `inline` provides no external
   definition, so callers in other files could fail to link. */
int
phtmlparse_is_string (int check) {
  return (check & (STRING|SSTRING|HSTRING));
}

/* phtmlparse_is_tag - non-zero when CHECK has the SCRIPT_START or the
   SCRIPT_END bit set.  The result is the masked value itself, not a
   normalised 0 or 1.

   Deliberately not declared inline: under C99 and later a file-scope
   function declared only with plain `inline` provides no external
   definition, so callers in other files could fail to link. */
int
phtmlparse_is_tag (int check) {
  return (check & (SCRIPT_START|SCRIPT_END));
}
