/******************************************************************************
 * program:     wp2latex                                                      *
 * function:    module for conversion of HTML files into LaTeX                *
 * module:      pass1htm.cc                                                   *
 * description: This module contains a parser for HTML documents. It can be   *
 *              optionally compiled with the WP2LaTeX package.                *
 * license:     GPL                                                           *
 ******************************************************************************/
#include<stdio.h>
#include<string.h>
#include<stdlib.h>

#include<strings.h>
#include<lists.h>
#include<dbllist.h>

#include"wp2latex.h"
#include"pass1xml.h"
#include "cp_lib/cptran.h"
#include "cp_lib/html.trn"


/* List built from the Table_html array; ProcessKeyHTML() looks up the names
 * of HTML character entities (&xxx;) in it.
 * NOTE(review): Table_html presumably comes from cp_lib/html.trn - confirm. */
list HTMLChars(Table_html,sizeof(Table_html)/sizeof(char *));

/* Version of the HTML converter; printed in the banner of Convert_first_pass(). */
#define HTMLVersion "0.18"


/** First pass converter for HTML documents.
 *  It derives from the XML first pass converter and only supplies its own
 *  Convert_first_pass() implementation. */
class TconvertedPass1_HTML : public TconvertedPass1_XML
{
public:
  /** Runs the first conversion pass over the HTML input. */
  virtual int Convert_first_pass(void);
};

/* Register translators here */

/** Factory function: allocates a new HTML first pass converter object. */
TconvertedPass1 *Factory_HTML(void)
{
  return new TconvertedPass1_HTML;
}

/* Format translator object constructed with the name "HTML" and the factory above. */
FFormatTranslator FormatHTML("HTML", Factory_HTML);

/* Forward declaration: the tag handlers below call ProcessKeyHTML() recursively. */
static void ProcessKeyHTML(TconvertedPass1_XML *cq);


/* Internal codes that ProcessKeyHTML() stores into cq->by for recognised
 * HTML tags.  The enumerators are listed in ascending numeric order; the
 * values themselves are fixed and must not be changed. */
typedef enum
{
  TAG_ATTR    = 129,	// text attribute switched on  (<B>, <I>, ...)
  TAG_xATTR   = 130,	// text attribute switched off (</B>, </I>, ...)
  TAG_BR      = 131,	// <BR>
  TAG_P       = 132,	// <P>
  TAG_xP      = 133,	// </P>
  TAG_JUST    = 134,	// <CENTER> and </CENTER>
  TAG_H       = 135,	// <H1> .. <H6>
  TAG_xH      = 136,	// </H1> .. </H6>
  TAG_META    = 137,	// <META>
  TAG_SCRIPT  = 138,	// <SCRIPT> and </SCRIPT>
  TAG_HR      = 139,	// <HR>
  TAG_DD      = 140,	// <DD>
  TAG_xINDENT = 141,	// </DD> and </DL>
  TAG_IMG     = 142,	// <IMG>
  TAG_TABLE   = 143,	// <TABLE>, <TR>, <TD>, <TH> and their closing tags
  TAG_ITEMIZE = 144,	// <UL>, <LI> and their closing tags
  TAG_HTML    = 145,	// <HTML>
} HTML_TAGS;



/* Constants used to scale an image width given in pixels: the value is
 * multiplied by PAPER_SIZE/PagePixels before it is stored in the box. */
#define PAPER_SIZE	210.0
#define PagePixels 800.0

/** Converts an <IMG> tag into an image box.
 *  The "src" attribute supplies the file name ("dummy" is used when it is
 *  missing) and the optional "width" attribute adjusts the box width.
 *  The box is then handed over to ImageWP().
 *  @param[in,out] cq  Converter object holding the attributes of the current tag. */
static void ImageHTML(TconvertedPass1_XML *cq)
{
#ifdef DEBUG
  fprintf(cq->log,"\n#ImageHTML() ");fflush(cq->log);
#endif
TBox Box;
int i;
char *FileName=NULL;
char *str;
char terminator=0;

  initBox(Box);
  Box.Width=100; 		//100mm
  Box.Image_type=0;		//Image on disk
  Box.AnchorType = 0; 		/*0-Paragraph, 1-Page, 2-Character*/
  Box.HorizontalPos=2;		/*0-Left, 1-Right, 2-Center, 3-Full */

  if((i=cq->TAG_Args IN "src")>=0 || (i=cq->TAG_Args IN "SRC")>=0)
	FileName=cq->TAG_Args.Member(i,1);

  if((i=cq->TAG_Args IN "width")>=0 || (i=cq->TAG_Args IN "WIDTH")>=0)
	{
	str=cq->TAG_Args.Member(i,1);
	if(str!=NULL)		// an attribute without a value must not reach sscanf()
	  {
	  switch(sscanf(str,"%f%c",&Box.Width,&terminator))
	    {
	    case 1:			// bare number: width in pixels
		 Box.Width *= PAPER_SIZE/PagePixels;
		 break;
	    case 2:			// number followed by one more character
		 if(terminator=='%')	// percentage
		   {
		   // NOTE(review): position code 4 is not listed in the legend above - confirm its meaning.
		   Box.HorizontalPos=4;
		   if(Box.Width==100) Box.HorizontalPos=3;
		   }
		 break;			// any other suffix: the parsed number is kept unscaled
	    default:break;		// nothing could be parsed: keep the defaults
	    }
	  }
	}
  if(FileName==NULL) FileName="dummy";

  ImageWP(cq, FileName, Box, NULL);
return;
}


/** Handles the <DD> tag: places the first tab stop 470 units behind the
 *  side margin and starts an indent through Indent().
 *  @param[in,out] cq  Converter object. */
static void IndentHTML(TconvertedPass1_XML *cq)
{
#ifdef DEBUG
  fprintf(cq->log,"\n#IndentHTML() ");fflush(cq->log);
#endif

  cq->tabpos[0] = cq->WP_sidemargin + 470;
  Indent(cq, 0);
}


/** Converts the content of <UL> ... </UL> into a LaTeX itemize environment.
 *  "\begin{itemize}" is written to the strip file, the following tokens are
 *  processed through ProcessKeyHTML() until the closing </UL> (or end of
 *  file) is met, and "\end{itemize}" is written afterwards.  Every <LI>
 *  produces one "\item ".  When the list is opened while the environment
 *  is 'B' (the value TableHTML() sets), the whole list is wrapped in \vbox{ }.
 *  @param[in,out] cq  Converter object; flag and envir are restored on exit. */
static void ItemizeHTML(TconvertedPass1_XML *cq)
{
#ifdef DEBUG
  fprintf(cq->log,"\n#ItemizeHTML() ");fflush(cq->log);
#endif
  const long StartPos = ftell(cq->wpd);
  const unsigned char SavedFlag = cq->flag;
  const char SavedEnvir = cq->envir;

  cq->flag = Nothing;
  cq->recursion++;

  cq->line_term = 's';			/* Soft return */
  if(cq->char_on_line == -20)		/* Leave one empty line for the new environment */
    {
    fputc('%', cq->table);
    fputc('%', cq->strip);
    NewLine(cq);
    }
  if(cq->char_on_line >= true)
    NewLine(cq);

  cq->envir = '!';
  fputc('%', cq->table);
  fputc('%', cq->strip);
  NewLine(cq);

  cq->envir = ' ';

  if(SavedEnvir == 'B')			/* protect itemize inside a table */
    fputs("\\vbox{", cq->strip);
  fputs("\\begin{itemize}", cq->strip);

  cq->char_on_line = false;
  cq->nomore_valid_tabs = false;
  cq->rownum++;
  Make_tableentry_attr(cq);
  cq->latex_tabpos = 0;

  /* Process the whole content of the list */
  cq->flag = SavedFlag;
  fseek(cq->wpd, StartPos, SEEK_SET);
  for(cq->ReadXMLTag(); !feof(cq->wpd); cq->ReadXMLTag())
    {
    ProcessKeyHTML(cq);
    if(cq->by != TAG_ITEMIZE) continue;

    if(cq->subby == 1) break;		/* </UL> - end of itemize */
    if(cq->subby == 2)			/* <LI> - next item */
      fputs("\\item ", cq->strip);
    }

  fputs("\\end{itemize}", cq->strip);
  if(SavedEnvir == 'B')
    fputs("}", cq->strip);

  if(cq->char_on_line <= -10)		/* Leave one empty line for the ending environment */
    {
    fputc('%', cq->table);
    fputc('%', cq->strip);
    NewLine(cq);
    }
  cq->envir = '^';			/* Ignore environments after the list */
  fputc('%', cq->table);
  NewLine(cq);
  cq->char_on_line = -10;		/* stronger false */

  cq->recursion--;

  cq->flag = SavedFlag;
  cq->envir = SavedEnvir;
  cq->TAG = "ITEMIZE";
  strcpy(cq->ObjType, "Itemize Start");
}


/** Selects the document language according to an HTML "lang" value.
 *  Only the exact string "en-US" is recognised; it calls Language() with
 *  the code 'U'+256*'S'.  A NULL, empty or any other string is ignored.
 *  @param[in,out] cq       Converter object.
 *  @param[in]     LangStr  Value of the lang attribute, may be NULL. */
static void LanguageHTML(TconvertedPass1_XML *cq, const char *LangStr)
{
  if(LangStr==NULL || *LangStr==0)
    return;

  if(strcmp(LangStr,"en-US")==0)
    Language(cq, 'U'+256*'S');
}


/** Extracts the character set from a <META> tag.
 *  The "content" attribute is searched for "CHARSET" or "charset" followed
 *  by '='; the text behind the '=' (leading blanks skipped) is passed to
 *  cq->SelectTranslator().  Nothing happens when the attribute, its value
 *  or the charset key is missing.
 *  @param[in,out] cq  Converter object holding the attributes of the current tag. */
static void MetaHTML(TconvertedPass1_XML *cq)
{
#ifdef DEBUG
  fprintf(cq->log,"\n#MetaHTML() ");fflush(cq->log);
#endif
  int i;
  char *content,*charset;

  strcpy(cq->ObjType, "Meta");

  if((i=cq->TAG_Args IN "content")<0 && (i=cq->TAG_Args IN "CONTENT")<0)
	return;

  content=cq->TAG_Args.Member(i,1);
  if(content==NULL) return;

  charset=strstr(content,"CHARSET");
  if(charset==NULL) charset=strstr(content,"charset");
  if(charset==NULL) return;

  charset+=7;				// skip the keyword itself
  while(*charset==' ') charset++;
  if(*charset++!='=') return;
  while(*charset==' ') charset++;	// allow "charset= xxx" as well

  cq->SelectTranslator(charset);
}


/** Skips the content of a <SCRIPT> element.
 *  cq->flag is switched to Nothing and tokens are consumed through
 *  ProcessKeyHTML() until the closing </SCRIPT> tag is reported or the end
 *  of the input file is reached; the original flag is restored afterwards.
 *  @param[in,out] cq  Converter object. */
static void ScriptXML(TconvertedPass1_XML *cq)
{
#ifdef DEBUG
  fprintf(cq->log,"\n#ScriptXML() ");fflush(cq->log);
#endif
  const unsigned char SavedFlag = cq->flag;

  cq->flag = Nothing;

  for(;;)
    {
    cq->TAG = "";
    ProcessKeyHTML(cq);
    if(cq->by==TAG_SCRIPT && cq->subby==1) break;	// </SCRIPT>
    if(feof(cq->wpd)) break;
    }

  cq->flag = SavedFlag;
}


/** Converts the content of <TABLE> ... </TABLE>.
 *  The function works in two passes over the input:
 *   1. With cq->flag set to Nothing it scans ahead and counts the <TD>/<TH>
 *      cells of the first row, remembering the "align" attribute of each.
 *      The scan stops at a nested <TABLE>, at </TABLE>, at </TR> or at the
 *      next <TR> once at least one cell has been seen.
 *   2. The column specification "{|l|c|...}" is written to the strip file,
 *      the file position is rewound to the start of the table and the
 *      content is processed with the original flag until </TABLE> or end
 *      of file.
 *  Alignments are stored for the first MaxColumns columns only; any further
 *  column is left aligned.
 *  @param[in,out] cq  Converter object; flag and envir are restored on exit. */
static void TableHTML(TconvertedPass1_XML *cq)
{
#ifdef DEBUG
  fprintf(cq->log,"\n#TableHTML() ");fflush(cq->log);
#endif
enum { MaxColumns = 100 };	// capacity of the alignment buffer
unsigned char OldFlag;
char OldEnvir;
long FilePos;
int i,FieldCount=0;
char State=0;
char *Align;
char *FieldPos=NULL,Alignment;

FilePos = ftell(cq->wpd);
OldFlag = cq->flag;
OldEnvir = cq->envir;
cq->flag = Nothing;
cq->recursion++;

	/* Pass 1: count the columns of the first row */
do {
   cq->TAG = "";
   ProcessKeyHTML(cq);
   if(cq->by==TAG_TABLE)
     {
     if(cq->subby == 0) break;			// <TABLE>
     if(cq->subby == 1) break;			// </TABLE>
     if(cq->subby == 2)				// <TR>
	  {
	  if(State==0) {State++;continue;}
	  if(FieldCount==0) continue;
	  break;
	  }
     if(cq->subby == 3) break;			// </TR>
     if(cq->subby == 4 || cq->subby == 6)	// <TD> or <TH>
	  {
	  if(State==0) State++;
	  Alignment='l';
	  if((i=cq->TAG_Args IN "align")>=0 || (i=cq->TAG_Args IN "ALIGN")>=0)
	      {
	      Align=cq->TAG_Args.Member(i,1);
	      if(!StrCmp(Align,"left"))  Alignment='l';
	      if(!StrCmp(Align,"right")) Alignment='r';
	      if(!StrCmp(Align,"center"))Alignment='c';
	      }
	  if(FieldPos==NULL) FieldPos=(char *)malloc(MaxColumns);
	  if(FieldCount<MaxColumns && FieldPos!=NULL)
		{
		FieldPos[FieldCount]=Alignment;
		}
	  FieldCount++;
	  }
     }
   }while(!feof(cq->wpd));


  if(FieldCount>0)
    {
    cq->line_term = 's';   /* Soft return */
    if(cq->char_on_line == -20)    /* Leave one empty line for the new environment */
	{
	fputc('%', cq->table);fputc('%', cq->strip);
	NewLine(cq);
	}
    if(cq->char_on_line>=true) //  if(cq->char_on_line>=-1)
	{
	NewLine(cq);
	}
    cq->envir='!';
    fputc('%', cq->table);fputc('%', cq->strip);
    NewLine(cq);

    cq->envir = 'B';

    fprintf(cq->strip, "{|");
    for (i = 0; i < FieldCount; i++)
	{
	Alignment='l';
	/* Only the first MaxColumns alignments were recorded. */
	if(i<MaxColumns && FieldPos!=NULL) Alignment=FieldPos[i];
	fprintf(cq->strip, "%c|",Alignment);
	}
    putc('}', cq->strip);

    cq->char_on_line = false;
    cq->nomore_valid_tabs = false;
    cq->rownum++;
    Make_tableentry_attr(cq);
    cq->latex_tabpos = 0;
    }

  free(FieldPos);		/* the alignments are no longer needed */
  FieldPos=NULL;

	/* Pass 2: process all content of the table */
  cq->flag=OldFlag;
  fseek(cq->wpd,FilePos,SEEK_SET);
  cq->ReadXMLTag();
  while(!feof(cq->wpd))
	{
	if(cq->by==0 && (cq->subby==10 || cq->subby==13))
		{
		cq->subby=' ';	/* remove \n from cell text */
		}
	ProcessKeyHTML(cq);
	if(cq->by==TAG_TABLE && cq->subby==1) break; /*End of table*/

	cq->ReadXMLTag();
	}
  if(cq->char_on_line <= -10)    /* Leave one empty line for the ending environment */
	{
	fputc('%', cq->table);fputc('%', cq->strip);
	NewLine(cq);
	}
  cq->envir='^';		//Ignore environments after table
  fputc('%', cq->table);
  NewLine(cq);
  cq->char_on_line = -10;		// stronger false;

  cq->recursion--;

  cq->flag=OldFlag;
  cq->envir=OldEnvir;
cq->TAG="TABLE";
strcpy(cq->ObjType, "Table Start");
}


/** Processes one token of the HTML input.
 *  When cq->TAG is empty, a new token is read with cq->ReadXMLTag().
 *  The function then works in three steps:
 *   1. Classification - the token class in cq->by (0 plain character,
 *      1/5 character entity &xxx;, 2 opening tag, 3 closing tag, 4 comment)
 *      is translated to an internal code (by, subby), see HTML_TAGS.
 *   2. Conversion - when cq->flag < Nothing the handler that belongs to
 *      the code is called and produces output.
 *   3. Logging - the token is reported to cq->log when a log file is open.
 *  On return cq->by / cq->subby hold the internal code and cq->ActualPos
 *  the current position in the input file.
 *  @param[in,out] cq  Converter object. */
static void ProcessKeyHTML(TconvertedPass1_XML *cq)
{
#ifdef DEBUG
  fprintf(cq->log,"\n#ProcessKeyHTML() ");fflush(cq->log);
#endif
 string TAG;
 const char *tag = NULL;
 BYTE by,subby;
 int i;

 *cq->ObjType=0;
 if(cq->TAG=="") cq->ReadXMLTag();
 by=cq->by;
 subby=cq->subby;

	/* Step 1: translate the token to an internal code */
 switch(by)
	{
	case 0:switch(cq->subby)
		  {
		  case 10:
		  case 13:by=128;break; //CR
		  case  9:strcpy(cq->ObjType, "!Tab");
			  /* fallthrough - a tab is handled as a space */
		  case 32:by=32;break;  //Space
		  default:break;
		  }
	       break;					//Normal character
	case 1:
	case 5:TAG=copy(cq->TAG,1,length(cq->TAG)-2);   //Extended character &xxx;
	       if( (tag=TAG())==NULL) break;
	       if(TAG=="nbsp")	{by=200;break;}  	//Hard space

	       if((i=(TAG() IN HTMLChars))>0)
		  {
		  i--;
		  by=201;
		  tag = Ext_chr_str(i, cq, cq->ConvertHTML); /*Translate HTML character set to WP5.x one*/
		  }
//	       fprintf(cq->strip," !!!&%s; ",TAG.ch);
	       break;
	case 2:TAG = copy(cq->TAG,1,length(cq->TAG)-2);	//Normal tag <xxx>
	       if( (tag=TAG())==NULL) break;
	       if(TAG=="B")    {by=TAG_ATTR;subby=12;break;}	//ATTR on bold
	       if(TAG=="BIG")  {by=TAG_ATTR;subby=2;break;}	//ATTR on large
	       if(TAG=="BR")   {by=TAG_BR;break;}		//HRt
	       if(TAG=="DD")   {by=TAG_DD;break;}		//Indented definition
	       if(TAG=="CENTER"){by=TAG_JUST;subby=0x82;break;}//Justification Center
	       if(TAG=="H1")   {by=TAG_H;subby=1;break;}	//section level 1
	       if(TAG=="H2")   {by=TAG_H;subby=2;break;}	//section level 2
	       if(TAG=="H3")   {by=TAG_H;subby=3;break;}	//section level 3
	       if(TAG=="H4")   {by=TAG_H;subby=4;break;}	//section level 4
	       // NOTE(review): H5/H6 are mapped to levels 1/2, not 5/6 - confirm this is intended.
	       if(TAG=="H5")   {by=TAG_H;subby=1;break;}	//section level 5
	       if(TAG=="H6")   {by=TAG_H;subby=2;break;}	//section level 6
	       if(TAG=="HR")   {by=TAG_HR;subby=0;break;}	//horizontal line
	       if(TAG=="HTML") {by=TAG_HTML;break;}		//document root, may carry lang
	       if(TAG=="I")    {by=TAG_ATTR;subby=8;break;}	//ATTR on italic
	       if(TAG=="IMG")  {by=TAG_IMG;break;}		//Image
	       if(TAG=="LI")   {by=TAG_ITEMIZE;subby=2;break;}	//Start of item
	       if(TAG=="META") {by=TAG_META;subby=0;break;}	//meta tag
	       if(TAG=="P")    {by=TAG_P;break;}		//new paragraph
	       if(TAG=="S")    {by=TAG_ATTR;subby=13;break;}	//ATTR on strike out
	       if(TAG=="SMALL"){by=TAG_ATTR;subby=3;break;}	//ATTR on small
	       if(TAG=="SCRIPT"){by=TAG_SCRIPT;subby=0;break;}	//start SCRIPT
	       if(TAG=="SUB")  {by=TAG_ATTR;subby=6;break;}	//ATTR on subscript
	       if(TAG=="SUP")  {by=TAG_ATTR;subby=5;break;}	//ATTR on superscript
	       if(TAG=="TABLE"){by=TAG_TABLE;subby=0;break;}	//Start of the table
	       if(TAG=="TD")   {by=TAG_TABLE;subby=4;break;}	//Start of cell
	       if(TAG=="TH")   {by=TAG_TABLE;subby=6;break;}	//Start of head cell
	       if(TAG=="TR")   {by=TAG_TABLE;subby=2;break;}	//Start of row
	       if(TAG=="TT")   {by=TAG_ATTR;subby=16;break;}	//ATTR on typewriter
	       if(TAG=="U")    {by=TAG_ATTR;subby=14;break;}	//ATTR on underline
	       if(TAG=="UL")   {by=TAG_ITEMIZE;subby=0;break;}	//Start of itemize
	       break;
	case 3:TAG=copy(cq->TAG,2,length(cq->TAG)-3);	//Closing tag </xxx>
	       if( (tag=TAG())==NULL) break;
	       if(TAG=="B")    {by=TAG_xATTR;subby=12;break;}	//ATTR off bold
	       if(TAG=="BIG")  {by=TAG_xATTR;subby=2;break;}	//ATTR off large
	       if(TAG=="CENTER"){by=TAG_JUST;subby=0x81;break;}	//End of center justification
	       if(TAG=="DD")   {by=TAG_xINDENT;break;}		//End of indented definition
	       if(TAG=="DL")   {by=TAG_xINDENT;break;}		//End of definition list
	       if(TAG=="H1")   {by=TAG_xH;subby=1;break;}	//section off level 1
	       if(TAG=="H2")   {by=TAG_xH;subby=2;break;}	//section off level 2
	       if(TAG=="H3")   {by=TAG_xH;subby=3;break;}	//section off level 3
	       if(TAG=="H4")   {by=TAG_xH;subby=4;break;}	//section off level 4
	       if(TAG=="H5")   {by=TAG_xH;subby=1;break;}	//section off level 5 (mapped like <H5>)
	       if(TAG=="H6")   {by=TAG_xH;subby=2;break;}	//section off level 6 (mapped like <H6>)
	       if(TAG=="LI")   {by=TAG_ITEMIZE;subby=3;break;}	//End of item
	       if(TAG=="I")    {by=TAG_xATTR;subby=8;break;}	//ATTR off italic
	       if(TAG=="P")    {by=TAG_xP;break;}		//end paragraph
	       if(TAG=="S")    {by=TAG_xATTR;subby=13;break;}	//ATTR off strike out
	       if(TAG=="SMALL"){by=TAG_xATTR;subby=3;break;}    //ATTR off small
	       if(TAG=="SCRIPT"){by=TAG_SCRIPT;subby=1;break;}	//end SCRIPT
	       if(TAG=="SUB")  {by=TAG_xATTR;subby=6;break;}	//ATTR off subscript
	       if(TAG=="SUP")  {by=TAG_xATTR;subby=5;break;}	//ATTR off superscript
	       if(TAG=="TABLE"){by=TAG_TABLE;subby=1;break;}	//End of Table
	       if(TAG=="TD")   {by=TAG_TABLE;subby=5;break;}	//End of cell
	       if(TAG=="TH")   {by=TAG_TABLE;subby=7;break;}	//End of head cell
	       if(TAG=="TR")   {by=TAG_TABLE;subby=3;break;}	//End of row
	       if(TAG=="TT")   {by=TAG_xATTR;subby=16;break;}	//ATTR off typewriter
	       if(TAG=="U")    {by=TAG_xATTR;subby=14;break;}	//ATTR off underline
	       if(TAG=="UL")   {by=TAG_ITEMIZE;subby=1;break;}	//End of itemize
	       break;
	case 4:				//comment
	       break;
	default:break;			//any other class is passed through unchanged
	}

	/* Step 2: produce output, unless the output is suppressed */
  cq->by=by;
  cq->subby=subby;
  if(cq->flag<Nothing)
    switch(by)
	{
	case 0:tag = Ext_chr_str(subby,cq,cq->ConvertCpg);
	       CharacterStr(cq,tag);
	       break;		//Normal character
	case 4:cq->CommentXML();
	       break;
	case 6:CharacterStr(cq,cq->TAG);
	       break;		//Already expanded unicode character
	case 32:putc(' ', cq->strip);   /*soft space*/
		break;

	case 128:if(cq->TablePos!=1 && cq->TablePos!=3)
		   if(cq->char_on_line)
			SoftReturn(cq);
		 break;
	case TAG_ATTR:	AttrOn(cq->attr,subby);break;
	case TAG_xATTR:	AttrOff(cq,subby);     break;
	case TAG_BR:	HardReturn(cq);        break;
	case TAG_P:	if(cq->char_on_line) HardReturn(cq);  //Paragraph on
			SoftReturn(cq);
			break;
	case TAG_xP:	if(cq->char_on_line) HardReturn(cq);  //Paragraph off
			break;
	case TAG_JUST:	Justification(cq,subby);break;
	case TAG_H:	StartSection(cq,-subby);break;
	case TAG_xH:	EndSection(cq,-subby);	break;
	case TAG_META:	MetaHTML(cq);		break;
	case TAG_SCRIPT:ScriptXML(cq);		break;
	case TAG_HR:	HLine(cq,-16);		break;
	case TAG_DD:	IndentHTML(cq);	break;
	case TAG_xINDENT:End_of_indent(cq);	break;
	case TAG_IMG:	ImageHTML(cq);		break;
	case TAG_TABLE: switch(subby)
		   {
		   case 0:TableHTML(cq);cq->TablePos=1;break;
		   case 1:EndTable(cq);cq->TablePos=0; break;
		   case 2:RowTable(cq);cq->TablePos|=2;break;
		   case 3:cq->TablePos&=~2;break;
		   case 4:
		   case 6:CellTable(cq);cq->TablePos|=4;break;
		   case 5:
		   case 7:cq->TablePos&=~4;break;
		   default:break;
		   }
		 break;
	case TAG_ITEMIZE: switch(subby)
		   {
		   case 0:ItemizeHTML(cq);break;
		   default:break;	// <LI>, </LI> and </UL> are evaluated by ItemizeHTML()
		   }
		 break;
	case TAG_HTML:
		{
		if((i=cq->TAG_Args IN "lang")>=0 || (i=cq->TAG_Args IN "LANG")>=0)
		  {
		  LanguageHTML(cq,cq->TAG_Args.Member(i,1));
		  if(cq->ConvertCpg==NULL) cq->SelectTranslator("iso_8859_1");
		  }

		break;
		}

	case 200:fputc('~', cq->strip);strcpy(cq->ObjType, " ");
		 break;
	case 201:CharacterStr(cq,tag);break; //
	default:break;			// unknown tokens produce no output
	}


	/* The handlers above (TableHTML, ItemizeHTML, ScriptXML) call this
	   function recursively and overwrite cq->by and cq->subby, so the code
	   of the current token has to be stored again. */
 cq->by=by;
 cq->subby=subby;

	/* Step 3: report the token to the log file */
 if (cq->log != NULL)
    {   /**/
    if(by==128) fputc('\n',cq->log);
    else if(by==' ' || by==200) fputc(' ',cq->log);
    else if(by==0 || by==201)
	{
	if(tag)
	  fprintf(cq->log,"%s",tag);
        else
	  fprintf(cq->log,"!%c",cq->subby);
	}
    else
	{
	fprintf(cq->log, _("\n%*s [%s %s]   "),
		  cq->recursion * 2, "", cq->TAG(), cq->ObjType);
//	if(*cq->ObjType==0) UnknownObjects++;
	}
    }

 cq->ActualPos = ftell(cq->wpd);
}


int TconvertedPass1_HTML::Convert_first_pass(void)
{
#ifdef DEBUG
  fprintf(log,"\n#Convert_pass1_HTML() ");fflush(log);
#endif
DWORD fsize;

  if(Verbosing >= 1)
     printf(_("\n>>>HTML2LaTeX<<< Conversion program: From HTML to LaTeX Version %s\n"
	      "      Made by J.Fojtik  (Hosted on WP2LaTeX :))))\n\n"),
	    HTMLVersion);

  ConvertHTML = GetTranslator("htmlTOinternal");
  CharReader = &ch_fgetc;

  DocumentStart=ftell(wpd);

  TablePos=0;

  fsize=filesize(wpd);
  perc.Init(ftell(wpd), fsize,_("First pass HTML:") );

  ActualPos = ftell(wpd);
  while (ActualPos < fsize)
      {
      if(Verbosing >= 1)		//actualise a procentage counter
	      perc.Actualise(ActualPos);

      TAG = "";
      ProcessKeyHTML(this);
      }

  Finalise_Conversion(this);
  return(1);
}
