/* 
 * Copyright (C) 2000-2003 by Oswald Buddenhagen <puf@ossi.cjb.net>
 * based on puf 0.1.x (C) 1999,2000 by Anders Gavare <gavare@hotmail.com>
 *
 * You may modify and distribute this code under the terms of the GPL.
 * There is NO WARRANTY of any kind. See COPYING for details.
 *
 * recurse.c - scan files/buffers for references to other urls
 *
 */

#include "puf.h"


/*
 * Find the first occurrence of rule in buff and return a pointer just
 * past the match, or 0 if there is none.
 *
 * Only the first blen bytes of buff are considered as possible match
 * starts; the match itself may extend beyond blen, so the caller must
 * make sure that strlen(rule) - 1 further bytes are readable.
 * A non-positive blen or an empty rule never matches.
 *
 * This is case-sensitive!
 */
static const char *
matchen(const char *buff, int blen, const char *rule)
{
    int start, i;

    /* an empty rule would otherwise be read past its terminator */
    if (!*rule)
	return 0;

    for (start = 0; start < blen; start++) {
	if (buff[start] != *rule)
	    continue;
	/* first character fits - compare the remainder of the rule, */
	/* touching the buffer only as far as the rule still goes */
	for (i = 1; rule[i] && buff[start + i] == rule[i]; i++)
	    ;
	if (!rule[i])
	    return buff + start + i;
    }
    return 0;
}

/*
 * Look for every occurrence of the attribute prefix 'what' (e.g. "href=")
 * and hand the value following it to parse_add_url().
 *
 * buf is the data as received, lbuf the lower-cased copy the prefix is
 * searched in; both are indexed with the same offsets. Matches may start
 * within the first max bytes only, values are cut at len at the latest.
 * u, is_req and au are passed through to parse_add_url().
 */
static void 
recursen(url_t *u, const char *buf, const char *lbuf, int len, int max, 
	 const char *what, int is_req, aurl_t *au)
{
    const char *hit;
    int pos = 0, start;

    while ((hit = matchen(lbuf + pos, max - pos, what)) != 0) {
	/*  offset of the first character after the prefix  */
	pos = hit - lbuf;

	if (buf[pos] == '\\' && buf[pos + 1] == '"') {
	    /*  value enclosed in \" ... \" - some broken sites do this  */
	    start = pos += 2;
	    for (; pos + 2 < len; pos++) {
		if (buf[pos] == '#' || buf[pos] < ' ')
		    break;
		if (buf[pos] == '\\' && buf[pos + 1] == '"')
		    break;
	    }
	} else if (buf[pos] == '"') {
	    /*  regular quoted value  */
	    start = ++pos;
	    for (; pos + 1 < len; pos++)
		if (buf[pos] == '#' || buf[pos] < ' ' || buf[pos] == '"')
		    break;
	} else {
	    /*  unquoted value (illegal, but it happens); here a space */
	    /*  has to end the value, even though spaces do occur in refs  */
	    start = pos;
	    for (; pos + 1 < len; pos++)
		if (buf[pos] == '#' || buf[pos] <= ' ' || buf[pos] == '>')
		    break;
	}

	/*  empty values are not worth reporting  */
	if (pos > start)
	    parse_add_url("reference", buf + start, pos - start, u,
			  u, u->parm, is_req, 0, u->link_depth + 1, au);
    }
}

/*
 * Scan a buffer for references.
 *
 * The last OVERLAPLEN bytes (if notlast is set) resp. the last 15 bytes
 * (otherwise) are not searched for attribute names, so a match found
 * near the end can still be compared and inspected within the buffer.
 * len must not exceed MAXBUFSIZE + OVERLAPLEN, the size of the local
 * lower-cased copy.
 *
 * src= and background= are scanned if follow_src is above NOT_RECURSIVE,
 * href= if follow_href is; a non-null au whose url has dump_refs set
 * enables all of them. An au without dump_refs is ignored.
 *
 * Returns the number of leading bytes that were searched for attribute
 * names, or 0 if the buffer is too short to be scanned at all.
 */
int 
recurse_buff(url_t *u, const char *buf, int len, int notlast, aurl_t *au)
{
    char lbuf[MAXBUFSIZE + OVERLAPLEN];
    int p, max = len - (notlast ? OVERLAPLEN : 15);

    if (max <= 0)
	return 0;

    dbgu(REF, (u, "recursing $u\n"));

    if (au && !au->url->parm->opt->dump_refs)
	au = 0;

    /*  matchen() is case-sensitive, so search a lower-cased copy.  */
    /*  tolower() is only defined for unsigned char values and EOF; */
    /*  a plain char may be negative for bytes >= 0x80.  */
    for (p = 0; p < len; p++)
	lbuf[p] = tolower((unsigned char)buf[p]);

    if (u->parm->opt->follow_src > NOT_RECURSIVE || au) {
	recursen(u, buf, lbuf, len, max, "src=", 1, au);
	recursen(u, buf, lbuf, len, max, "background=", 1, au);
    }
    if (u->parm->opt->follow_href > NOT_RECURSIVE || au)
	recursen(u, buf, lbuf, len, max, "href=", 0, au);

    return max;
}


/*
 * Scan a partial file for references.
 *
 * Reads from descriptor fi until read() delivers no more data and runs
 * recurse_buff() over it in chunks of MAXBUFSIZE bytes; the last
 * OVERLAPLEN bytes of each full chunk are carried over into the next one.
 * If the ext_dump option is set, everything up to and including the
 * first two consecutive newlines is skipped first; if they are never
 * found, nothing is scanned.
 *
 * If bupo is non-null, the final chunk is scanned as "not last" and the
 * bytes recurse_buff() left unprocessed are copied in front of *bupo:
 * *bupo is moved back and *lepo increased by that amount. The caller
 * must provide the room before *bupo.
 */
void 
recurse_pfile(url_t *u, int fi, char **bupo, int *lepo, aurl_t *au)
{
    int len, off, nnl, got;
    char buf[MAXBUFSIZE];

    dbgu(REF, (u, "recursing $u from file%s\n", bupo ? " (partial)" : ""));
    if (u->parm->opt->ext_dump) {
	nnl = 0;
	while ((len = read(fi, buf, MAXBUFSIZE)) > 0)
	    for (off = 0; off < len; )
		if (buf[off++] == '\n') {
		    if (++nnl == 2)
			goto gotit;
		} else
		    nnl = 0;
	return;
      gotit:
	/*  Keep what follows the empty line. From here on off is the  */
	/*  number of bytes already in buf, not the old scan position.  */
	len -= off;
	memmove(buf, buf + off, len);
	off = len;
    } else
	off = 0;
    /*  Scan file for href's and src's:  */
    for (;;) {
	got = read(fi, buf + off, MAXBUFSIZE - off);
	if (got < 0)
	    got = 0;	/*  treat a read error like end of file  */
	len = off + got;
	if (len != MAXBUFSIZE)
	    break;
	recurse_buff(u, buf, len, 1, au);
	off = OVERLAPLEN;
	memcpy(buf, buf + MAXBUFSIZE - OVERLAPLEN, off);
    }
    off = recurse_buff(u, buf, len, bupo != 0, au);
    if (bupo) {
	/*  hand the unprocessed tail back to the caller  */
	len -= off;
	*bupo -= len;
	*lepo += len;
	memcpy(*bupo, buf + off, len);
    }
}


/*
 * Scan an entire file for references.
 *
 * Opens name read-only via mmfopen(), runs recurse_pfile() over it and
 * closes the descriptors again. If the file cannot be opened, an error
 * message is printed and nothing is scanned.
 */
void 
recurse_file(url_t *u, char *name)
{
    int fi, fi2;

    if ((fi = mmfopen(name, O_RDONLY, &fi2)) < 0) {
	prx(ERR, "cannot scan %s for links: %s\n", name, strerror(errno));
	return;
    }

    recurse_pfile(u, fi, 0, 0, 0);

    if (fi2 != -1)
	close(fi2);
    /*  recurse_pfile() only reads from fi, so it has to be released here.  */
    /*  NOTE(review): assumes mmfopen() passes ownership of the returned  */
    /*  descriptor to the caller - confirm against its definition.  */
    close(fi);
}

/*
 * Tell whether the references of u are to be extracted.
 *
 * That is the case if dump is set and the dump_refs option is on, or if
 * follows_max is above NOT_RECURSIVE and the link depth of u is below
 * max_recurse (a max_recurse of 0 means no depth limit).
 * Returns 1 or 0; the verdict is also reported via dbgu().
 */
int
needs_recurse_u(url_t *u, int dump)
{
    int ret = 0;

    if (dump && u->parm->opt->dump_refs)
	ret = 1;
    else if (u->parm->opt->follows_max > NOT_RECURSIVE &&
	     (!u->parm->opt->max_recurse ||
	      u->link_depth < u->parm->opt->max_recurse))
	ret = 1;

    dbgu(REF, (u, "$u needs%s recurse\n", ret ? "" : " no"));
    return ret;
}

/*
 * Like needs_recurse_u() for the url of au, except that content which is
 * not html is never scanned unless the force_html option is set.
 */
int
needs_recurse_au(aurl_t *au, int dump)
{
    if (au->content_is_html || au->url->parm->opt->force_html)
	return needs_recurse_u(au->url, dump);

    dbgu(REF, (au->url, "$u needs no recurse (not html)\n"));
    return 0;
}

