
#include <string.h>
#include <stdio.h>
#include "catdoc.h"
/* Eight signature bytes that process_file compares against the start of the
 * input when checking for an OLE header; the ninth element is a 0 terminator
 * so the array can also be handled as a C string. */
static char OLE_SIGN[]={0xd0,0xcf,0x11,0xE0,0xA1,0xB1,0x1a,0xe1,0};
/* Shared work area of PARAGRAPH_BUFFER unsigned short elements: process_file
 * collects the characters of one paragraph in it, and copy_out reuses the same
 * storage as a raw byte buffer. */
static unsigned short int buffer[PARAGRAPH_BUFFER];
/**************************************************************************/
/* copy_out - copies the rest of stream f to stdout unchanged. Called     */
/* when the input file is not an OLE stream.                              */
/*                                                                        */
/* The shared paragraph buffer is reused as scratch space and the data is */
/* moved in chunks of PARAGRAPH_BUFFER bytes. Copying stops at end of     */
/* input, on a read error, or as soon as stdout accepts fewer bytes than  */
/* were read.                                                             */
/**************************************************************************/
void copy_out (FILE *f) {
    char *buf=(char *)buffer;
    size_t count; /* fread/fwrite report byte counts as size_t */
    /* fread returns 0 both at end of file and on a read error */
    while ((count = fread(buf,1,PARAGRAPH_BUFFER,f)) > 0) {
        if (fwrite(buf,1,count,stdout) != count) {
            /* stdout is broken - reading the rest of the input is useless */
            break;
        }
    }
}
/**************************************************************************/
/*  process_file - main processing engine. Reads the word file through    */
/*  get_unicode_char, looks for things which look like paragraphs and     */
/*  hands each of them to output_paragraph as a 0-terminated array of     */
/*  unsigned short characters.                                            */
/*                                                                        */
/*  When signature_check is set, the first eight bytes of f are compared  */
/*  with OLE_SIGN first; if they do not match (or the file is shorter     */
/*  than eight bytes) the file is copied to stdout unchanged and the      */
/*  function returns.                                                     */
/**************************************************************************/
void process_file(FILE *f) {
    int bufptr;        /* index of the last element stored in buffer, -1 if none */
    int tabmode=0;     /* 1 while a table separator (0x0007) is pending */
    unsigned short c;
    if (signature_check) {
        /* checking for OLE header */
        char sign_buf[8];
        size_t got = fread(sign_buf,1,sizeof sign_buf,f);
        if (got != sizeof sign_buf ||
            memcmp(sign_buf,OLE_SIGN,sizeof sign_buf) != 0) {
            /* Not an OLE stream. Give back exactly the bytes which were
               consumed by the check - they may contain zero bytes, so they
               must not be treated as a C string - and then copy the rest. */
            fwrite(sign_buf,1,got,stdout);
            copy_out(f);
            return;
        }
    }
    /* Now we are starting to read with get_unicode_char */
    while (!feof(f)) {
        bufptr = -1;
        do {
            c=get_unicode_char(f);
            /* Following symbols below 32 are allowed inside paragraph:
               0x0007 - table separator (converted to tabmode)
               0x0009 - Horizontal tab (printed as is)
               0x000D - return - marks an end of paragraph
               0x001E - IS2, for some reason means a short hyphen in Word
               0x000B - means same as return
               0x0002 - silently skipped
             */
            if (tabmode) {
                /* Previous character was a table separator: two separators
                   in a row are stored as 0x1E, a single one as 0x1C.
                   NOTE(review): presumably these are row/cell markers
                   interpreted by output_paragraph - confirm there. */
                tabmode=0;
                if (c==0x0007) {
                    buffer[++bufptr]=0x1E;
                    continue; /* jumps to the loop condition below */
                } else {
                    buffer[++bufptr]=0x1C;
                }
            }
            if (c<32) {
                switch (c) {
                    case 0x0007:
                        tabmode = 1;
                        break;
                    case 0x000D:
                    case 0x000B:
                        buffer[++bufptr]=0x000A;
                        break;
                    case 0x001E:
                        buffer[++bufptr]='-';
                        break;
                    case 0x0002:
                        break;
                    case 0x0009:
                        buffer[++bufptr]=c;
                        break;
                    default:
                        bufptr=-1; /* Any other control char - discard para */
                        break;
                }
            } else if (c>=0xfeff) {
                bufptr = -1; /* not a valid unicode - discard paragraph */
            } else {
                buffer[++bufptr]=c;
            }
            /* Loop condition notes:
               - one pass stores at most one element, or two when a table
                 separator is pending, and one more slot is needed for the
                 terminating 0; hence the extra "-tabmode" of headroom, which
                 keeps the terminator inside the buffer;
               - buffer[bufptr] may be inspected only when something is
                 stored, i.e. bufptr is not -1. */
        } while (bufptr<PARAGRAPH_BUFFER-2-tabmode && !feof(f) &&
                 (bufptr<0 || buffer[bufptr]!=0x000a));
        /* bufptr>0: at least two characters were collected */
        if (bufptr>0) {
            buffer[++bufptr]=0;
            output_paragraph(buffer);
        }
    }

}
