/* $Cambridge: hermes/src/prayer/session/html_secure.c,v 1.9 2009/01/18 12:03:15 dpc22 Exp $ */
/************************************************
 *    Prayer - a Webmail Interface              *
 ************************************************/

/* Copyright (c) University of Cambridge 2000 - 2008 */
/* See the file NOTICE for conditions of use and distribution. */

#include "prayer_session.h"

/* text/HTML model implemented by sqwebmail.
 *
 * Browser Security - HTML
 *
 *  SqWebMail has the ability to display HTML E-mail, which leads to
 *  several complicated situations regarding embedded Javascript or Java
 *  applets that try to grab the mailboxid of the recipient (amongst other
 *  things). SqWebMail attempts to remove all forms of scripting from HTML
 *  E-mail as follows:
 *    * The following HTML tags are removed: <SCRIPT>, </SCRIPT>, <APP>,
 *      </APP>, <APPLET>, </APPLET>, <SERVER>, </SERVER>, <OBJECT>,
 *      </OBJECT>, <HTML>, </HTML>, <HEAD>, </HEAD>, <BODY>, </BODY>,
 *      <META>, <TITLE>, </TITLE>, <FRAME>, </FRAME>, <LINK>, <IFRAME> and
 *      </IFRAME>.
 *    * The following HTML attributes are stripped from every tag:
 *      ONLOAD=, ONMOUSEOVER=, and all ON*= attributes; TARGET=, CODE=,
 *      CODETYPE=, and LANGUAGE= are removed; TARGET=_blank is added to
 *      all <A> tags.
 *    * The HREF and SRC attributes are stripped, unless the URL starts
 *      with one of the following: http:, https:, ftp:, gopher:, wais:, or
 *      telnet, and cid:.
 *    * The HREF and SRC attribute values are prefixed with a URL that
 *     will resolve to SqWebMail, and with an additional TARGET="_blank"
 *     attribute. A request to that resulting URL will result in a blank
 *     page with a 0-second refresh to the original URL. This method
 *     strips mailbox IDs from Referer: tags sent to external web site.
 *     If the HREF attribute starts with a cid:, it is replaced by an
 *     http: reference to SqWebMail that will return the specified MIME
 *     part.
 *   * IMG tags are removed and replaced with an A tag, in order to keep
 *     the HTTP client from automatically loading any images from
 *     external web sites, upon opening a given message.
 */

/* Additional notes for Prayer implementation
 *   o We allow IMG tags, but the SRC argument is a /redirect to prevent
 *     "Referer:" attacks
 *
 *   o No attempt to interpret multibyte character sequences which some
 *     browsers may recognise as alternatives for '<' etc in some character
 *     sets. We rely on the fact that Prayer should always do 8 bit -> 7 bit
 *     plus entity conversion when generating HTML pages, and will always
 *     set a Charset.
 *
 *   o Like IMP we replace tags with <cleaned_tag>. This way the browser
 *     should quietly ignore the noise inside a tag that we want to kill.
 */

/* Case insensitive string tests. Both yield T on a match, NIL otherwise.
 *
 *   MATCH(string, tag):  string is equal to tag
 *   NMATCH(string, tag): string starts with tag
 *
 * Arguments are parenthesised so that arbitrary expressions can be passed.
 * NMATCH evaluates tag twice, so tag should not have side effects. */
#define MATCH(string, tag)  ((strcasecmp((string), (tag))) ? NIL : T)
#define NMATCH(string, tag) \
    ((strncasecmp((string), (tag), strlen(tag))) ? NIL : T)

/* ====================================================================== */

/* Static support routines */

/* html_isspace() ********************************************************
 *
 * Test for the whitespace characters which separate tokens inside an
 * HTML tag: space, tab, form feed, CR and LF. Form feed is included as
 * HTML parsers treat it as whitespace too: without it a form feed could
 * be used to glue an attribute onto a tag name and slip both past the
 * filters as a single unrecognised token.
 *      c: Character to test
 *
 * Returns: non-zero if c is whitespace, zero otherwise
 ************************************************************************/

static BOOL html_isspace(char c)
{
    return ((c == ' ') || (c == '\t') || (c == '\014') ||
            (c == '\015') || (c == '\012'));
}


/* html_get_token() ******************************************************
 *
 * Split the next token off the front of a string, working in place.
 * A token is a run of characters which contain no whitespace and no
 * quotes, optionally followed by a single '...' or "..." section (which
 * may contain whitespace).
 *     sp: Address of current position in string. Updated to point at the
 *         first non-whitespace character after the token.
 *
 * Returns: Start of NUL terminated token, NIL if no tokens remain. *sp is
 *          left untouched if it was NIL or pointed to an empty string.
 *
 * NB: The character which follows the token is overwritten with the NUL
 * terminator. That is normally whitespace, but anything which directly
 * follows a closing quote gets the same treatment.
 ************************************************************************/

static char *html_get_token(char **sp)
{
    char *cursor = *sp;
    char *start;

    if (!cursor || !*cursor)
        return (NIL);

    /* Leading whitespace is not part of the token */
    for (; html_isspace(*cursor); cursor++)
        ;

    start = cursor;

    /* Unquoted part: stops at whitespace, a quote or the end of string */
    while (*cursor != '\0') {
        if (html_isspace(*cursor) || (*cursor == '"') || (*cursor == '\''))
            break;
        cursor++;
    }

    /* Quoted part: runs through to the matching quote if there is one,
     * otherwise to the end of the string */
    if ((*cursor == '"') || (*cursor == '\'')) {
        char quote = *cursor++;

        while ((*cursor != '\0') && (*cursor != quote))
            cursor++;

        if (*cursor == quote)
            cursor++;
    }

    /* Terminate the token (if the string doesn't already end here) and
     * line up on the start of the next one */
    if (*cursor != '\0') {
        *cursor++ = '\0';

        for (; html_isspace(*cursor); cursor++)
            ;
    }

    *sp = cursor;

    if (*start == '\0')
        return (NIL);

    return (start);
}

/* my_strip_entities() ***************************************************
 *
 * Decode the following character entities in place: &lt; &gt; &amp;
 * &quot; (case insensitive), &#<decimal>; and &#X<hex>;
 *    src: NUL terminated string to decode. Updated in place, which is
 *         safe as the result is never longer than the source.
 *
 * Numeric entities are only decoded if they are terminated by ';' and
 * have a value in the range 1 to 255. Everything else, including
 * unrecognised entities, is passed through unchanged. The string is
 * decoded in a single pass: "&amp;lt;" becomes "&lt;", not "<".
 ************************************************************************/

static void
my_strip_entities(char *src)
{
    char *dst = src;

    if (!strchr(src, '&'))
        return;                 /* Common case: nothing to decode */

    while (*src) {
        if (*src != '&') {
            *dst++ = *src++;
            continue;
        }
        if (!strncasecmp(src, "&lt;", 4)) {
            src += 4;
            *dst++ = '<';
            continue;
        }
        if (!strncasecmp(src, "&gt;", 4)) {
            src += 4;
            *dst++ = '>';
            continue;
        }
        if (!strncasecmp(src, "&amp;", 5)) {
            src += 5;
            *dst++ = '&';
            continue;
        }
        if (!strncasecmp(src, "&quot;", 6)) {
            src += 6;
            *dst++ = '"';
            continue;
        }
        if (src[1] == '#') {
            char *t = src + 2;
            char *digits = NULL;        /* First digit, if any found */
            int base = 10;

            /* The <ctype.h> tests need unsigned char values: passing a
             * negative char (any byte >= 0x80 where char is signed) is
             * undefined behaviour */
            if (((*t == 'x') || (*t == 'X'))
                && isxdigit((unsigned char) t[1])) {
                base = 16;              /* &#X<hexadecimal>; */
                digits = ++t;
                while (isxdigit((unsigned char) *t))
                    t++;
            } else if (isdigit((unsigned char) *t)) {
                digits = t;             /* &#<decimal>; */
                while (isdigit((unsigned char) *t))
                    t++;
            }

            if (digits && (*t == ';')) {
                /* Overlong digit strings saturate at ULONG_MAX and so
                 * fail the range check */
                unsigned long value = strtoul(digits, NULL, base);

                if ((value > 0) && (value < 256)) {
                    *dst++ = (char) value;
                    src = t + 1;
                    continue;
                }
            }
        }
        *dst++ = *src++;  /* Failed to decode entity: just pass it through */
    }
    *dst = '\0';
}

/* ====================================================================== */

/* html_secure() *********************************************************
 *
 * Generate "secure" version of HTML input, with dangerous tags stripped.
 *      b: Output buffer
 *  input: Input string.
 ************************************************************************/

/* Trashes input => caller has to generate temporary copy.
 * Better way to do this? */

void html_secure(struct session *session, struct buffer *b,
                 BOOL show_images, char *input)
{
    struct options *options = session->options;
    struct prefs *prefs = options->prefs;
    char *s = input;
    char *tag, *token;
    char c;
    BOOL endtag, strip_tag;

    while (1) {
        /* Copy verbatim until we find a tag (Removes unmatched '>' chars:
         * apparently some browsers try to fix broken HTML by inferring missing
         * '<' characters. Yuck! */

        while ((c = *s++) && (c != '<')) {
            if (c != '>')
                bputc(b, c);
        }

        if (c == '\0')
            break;

        /* Strip comments entirely to avoid problems with tags within comments */
        if (!strncmp(s, "!--", strlen("!--"))) {
            s += strlen("!--");
            while ((c = *s)) {
                if ((c == '-') && !strncmp(s, "-->", strlen("-->"))) {
                    s += strlen("-->");
                    break;
                }
                s++;
            }
            continue;
        }

        /* Isolate tag */
        tag = s;
        while ((c = *s) && (c != '>'))
            s++;

        if (c == '\0')
            break;

        *s++ = '\0';

        if (!(token = html_get_token(&tag))) {
            /* Empty tag: Malformed HTML? */
            continue;
        }

        if (*token == '/') {
            endtag = T;
            token++;
        } else
            endtag = NIL;

        strip_tag = NIL;
        switch (Utoupper(*token)) {
        case 'A':
            if (MATCH(token, "APP"))
                strip_tag = T;
            else if (MATCH(token, "APPLET"))
                strip_tag = T;
            break;
        case 'B':
            if (MATCH(token, "BASE"))
                strip_tag = T;
            else if (MATCH(token, "BODY"))
                strip_tag = T;
            break;
        case 'F':
            if (MATCH(token, "FRAME"))
                strip_tag = T;
            else if (MATCH(token, "FORM"))
                strip_tag = T;
            break;
        case 'I':
            if (MATCH(token, "IFRAME"))
                strip_tag = T;
            else if (MATCH(token, "IMG") && !show_images)
                strip_tag = T;
            break;
        case 'H':
            if (MATCH(token, "HTML"))
                strip_tag = T;
            else if (MATCH(token, "HEAD"))
                strip_tag = T;
            break;
        case 'M':
            if (MATCH(token, "META"))
                strip_tag = T;
            break;
        case 'O':
            if (MATCH(token, "OBJECT"))
                strip_tag = T;
            break;
        case 'T':
            if (MATCH(token, "TITLE"))
                strip_tag = T;
            break;
        case 'S':
            if (MATCH(token, "SCRIPT"))
                strip_tag = T;
            else if (MATCH(token, "SERVER"))
                strip_tag = T;
            else if (MATCH(token, "STYLE"))
                strip_tag = T;
            break;
        }

        if (strip_tag) {
            if (endtag)
                bputs(b, "</cleaned_tag>");
            else
                bputs(b, "<cleaned_tag>");
            continue;
        }

        /* Tag has been accepted */
        if (endtag) {
            bputs(b, "</");
            bputs(b, token);
        } else {
            bputc(b, '<');
            bputs(b, token);
        }

        /* Check each attribute in tag */
        while ((token = html_get_token(&tag))) {
            switch (Utoupper(*token)) {
            case 'B':
                if (NMATCH(token, "BACKGROUND=") && !prefs->html_remote_images)
                    continue;
                break;
            case 'C':
                if (NMATCH(token, "CODE="))
                    continue;
                if (NMATCH(token, "CODEPAGE="))
                    continue;
                break;
            case 'H':
                if (NMATCH(token, "HREF=")) {
                    char *t = token + strlen("HREF=");

                    while (html_isspace(*t))
                        t++;

                    /* Remove quotes */
                    if ((*t == '"') && t[1] && (t[strlen(t) - 1] == '"')) {
                        t++;
                        t[strlen(t) - 1] = '\0';
                    } else if ((*t == '\'') && t[1]
                               && (t[strlen(t) - 1] == '\'')) {
                        t++;
                        t[strlen(t) - 1] = '\0';
                    }

                    /* Indirect links for HTTP and HTTPS to remove "Referrer:" */
                    my_strip_entities(t);

                    if (NMATCH(t, "http://") ||
                        NMATCH(t, "https://")) {
                        bprintf(b, " target=\"_blank\" href=\"%s\"", t);
                        continue;
                    } else if (((NMATCH(t, "ftp:")) ||
                                (NMATCH(t, "gopher:")) ||
                                (NMATCH(t, "wais:")) ||
                                (NMATCH(t, "telnet:")) ||
                                (NMATCH(t, "cid:")))) {
                        bprintf(b, " target=\"_blank\" href=\"%s\"", t);
                        continue;
                    } else if (NMATCH(t, "#")) {
                        bprintf(b, " href=\"%s\"", t);
                        continue;
                    } else
                        continue;       /* Remove this HREF tag */
                }
                break;
            case 'L':
                if (NMATCH(token, "LANGUAGE="))
                    continue;
                break;
            case 'O':
                if (NMATCH(token, "ONLOAD="))
                    continue;
                if (NMATCH(token, "ONMOUSEOVER="))
                    continue;

                if (NMATCH(token, "ON")) {
                    char *t = token + 2;

                    while (Uisalpha(*t))
                        t++;

                    if (*t == '=')
                        continue;
                }
                break;
            case 'S':
                if (NMATCH(token, "SRC=")) {
                    char *t = token + strlen("SRC=");

                    while (html_isspace(*t))
                        t++;

                    /* Remove quotes */
                    if ((*t == '"') && t[1] && (t[strlen(t) - 1] == '"')) {
                        t++;
                        t[strlen(t) - 1] = '\0';
                    } else if ((*t == '\'') && t[1]
                               && (t[strlen(t) - 1] == '\'')) {
                        t++;
                        t[strlen(t) - 1] = '\0';
                    }

                    /* Indirect links for HTTP and HTTPS to remove "Referrer:" */
                    my_strip_entities(t);

                    if (NMATCH(t, "http://") || NMATCH(t, "https://")) {
                        bprintf(b, " TARGET=\"_blank\" SRC=\"%s\"", t);
                        continue;
                    } else
                        if (((NMATCH(t, "ftp:")) || (NMATCH(t, "gopher:"))
                             || (NMATCH(t, "wais:"))
                             || (NMATCH(t, "telnet:"))
                             || (NMATCH(t, "cid:")))) {
                        bprintf(b, " TARGET=\"_blank\" SRC=\"%s\"", t);
                        continue;
                    } else
                        continue;       /* Remove this SRC tag */
                }
                break;
            case 'T':
                /* Remove target attributes */
                if (NMATCH(token, "TARGET="))
                    continue;
                break;
            }

            /* Default action: Accept this attribute */
            bputc(b, ' ');
            bputs(b, token);
        }
        /* Close this tag */
        bputc(b, '>');
    }
}

/* ====================================================================== */

/* html_secure_strip_all() ***********************************************
 *
 * Remove all HTML tags from source document, to generate something
 * suitable for reply. Really need to work harder at parsing HTML:
 * take a look at just what PINE does.
 *      b: Output buffer
 *  input: Input string. Not modified.
 *
 * Text outside tags is copied with line endings converted to CRLF and a
 * small set of character entities decoded. Tags and comments are dropped.
 * A line break which directly follows a tag is only kept if the text in
 * front of the tag left something other than whitespace on that line.
 * An unterminated tag at the end of the input is discarded.
 ************************************************************************/

/* Decode a single character entity. s points just after the '&'.
 *
 * Returns: Number of characters used (including the final ';') with the
 *          decoded character stored in *result, or 0 if the entity was
 *          not recognised.
 *
 * Understands &amp; &gt; &lt; &nbsp; &pound; &quot; (case insensitive)
 * and &#<decimal>; for values in the range 1 to 255. */
static size_t html_strip_entity(char *s, char *result)
{
    static struct {
        char *name;             /* Entity text which follows the '&' */
        char value;             /* Character which it decodes to */
    } named[] = {
        { "AMP;",   '&'    },
        { "GT;",    '>'    },
        { "LT;",    '<'    },
        { "NBSP;",  ' '    },
        { "POUND;", '\243' },   /* Same byte as &#163; gives below */
        { "QUOT;",  '"'    }
    };
    size_t i;

    if (*s == '#') {
        char *t = s + 1;
        unsigned long value = 0;

        /* Stop accumulating once out of range: can't overflow and the
         * test for ';' below will then fail on the unused digit */
        while (Uisdigit(*t) && (value < 256)) {
            value = (value * 10) + (unsigned long) (*t - '0');
            t++;
        }

        if ((t > (s + 1)) && (*t == ';') && (value > 0) && (value < 256)) {
            *result = (char) value;
            return ((size_t) ((t + 1) - s));
        }
        return (0);
    }

    for (i = 0; i < (sizeof(named) / sizeof(named[0])); i++) {
        size_t len = strlen(named[i].name);

        if (!strncasecmp(s, named[i].name, len)) {
            *result = named[i].value;
            return (len);
        }
    }
    return (0);
}

/* Strip _all_ HTML tags from document */

void html_secure_strip_all(struct buffer *b, char *input)
{
    char *s = input;
    char c;
    BOOL not_empty;

    while (1) {
        /* Set once this run of text has put something other than
         * whitespace or a decoded entity on the current output line */
        not_empty = NIL;

        /* Copy verbatim until we find a tag */
        while ((c = *s++) && (c != '<')) {
            if ((c == '\015') && (*s == '\012')) {
                s++;            /* CRLF */
                bputs(b, "" CRLF);
                not_empty = NIL;
            } else if ((c == '\015') || (c == '\012')) {
                /* CR or LF */
                bputs(b, "" CRLF);
                not_empty = NIL;
            } else if ((c == ' ') || (c == '\t')) {
                bputc(b, c);
            } else if (c == '&') {
                char decoded;
                size_t len = html_strip_entity(s, &decoded);

                if (len > 0) {
                    bputc(b, decoded);
                    s += len;   /* Skip entire entity, including ';' */
                } else {
                    /* Unrecognised '&' escape code: pass it through */
                    bputc(b, '&');
                }
            } else {
                not_empty = T;
                bputc(b, c);
            }
        }

        if (c == '\0')
            break;

        /* Strip out comments completely */
        if (!strncmp(s, "!--", strlen("!--"))) {
            s += strlen("!--");
            while ((c = *s)) {
                if ((c == '-') && !strncmp(s, "-->", strlen("-->"))) {
                    s += strlen("-->");
                    break;
                }
                s++;
            }
            continue;
        }

        /* Find other end of this tag */
        while ((c = *s) && (c != '>'))
            s++;

        if (c == '\0')
            break;              /* Unterminated tag: discard */

        s++;

        /* Swallow a single line break which directly follows the tag,
         * replacing it with CRLF only if the line wasn't empty */
        if ((s[0] == '\015') && (s[1] == '\012')) {
            s += 2;             /* CRLF */
            if (not_empty)
                bputs(b, "" CRLF);
        } else if ((s[0] == '\015') || (s[0] == '\012')) {
            s++;                /* CR or LF */
            if (not_empty)
                bputs(b, "" CRLF);
        }
    }
}
