/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include <hash.h>
#include <split.h>
#include <dirname.h>
#include <dirsel.h>
#include <uri.h>
#include <salloc.h>

/*
 * Matching methods used by dirsel_match().
 * DIRSEL_BOL: the prefix starts with '/' and must match the beginning
 *             of the compared path.
 * DIRSEL_ANY: the prefix may occur anywhere in the compared path.
 */
#define DIRSEL_BOL	'/'
#define DIRSEL_ANY	'A'

/*
 * List of directory prefixes stored as the data of one hash table node.
 */
typedef struct dirsel {
  /* Not referenced by the code visible in this file. */
  int info;
  /* Array of prefix strings, filled by dirsel_insert() from the split result. */
  char** prefix;
  /* Size bookkeeping for prefix, maintained through static_alloc(). */
  int prefix_size;
  /* Number of valid entries in prefix (loop bound in dirsel_match()). */
  int prefix_length;
  /* Private copy of the directory list string given to dirsel_insert(). */
  char* pool;
  /* Size bookkeeping for pool, maintained through static_alloc(). */
  int pool_size;
} dirsel_t;

/*
 * Module state: four selection tables plus a scratch URI object.
 */
typedef struct context {
  /* Manually allowed prefixes, keyed by netpath (see dirsel_allow()). */
  hash_t* allow;
  /* Manually disallowed prefixes, keyed by netpath (see dirsel_disallow()). */
  hash_t* disallow;
  /* Allowed prefixes keyed by netloc (see dirsel_robots_allow()). */
  hash_t* robots_allow;
  /* Disallowed prefixes keyed by netloc (see dirsel_robots_disallow()). */
  hash_t* robots_disallow;
  /*
   * URI object reused for every URL parsed by this module.
   * A null value also means "not initialized" (tested by dirsel_init()).
   */
  uri_t* url_object;
} context_t;

/* Trace level; any non zero value enables the diagnostics sent to stderr. */
static int verbose = 0;

/* Single module-wide state, set up by dirsel_init() and released by dirsel_end(). */
static context_t context;

/*
 * Forward declarations of the helpers private to this file.
 */
static void dirsel_end_1(hash_t* table);
static void dirsel_insert(hash_t* table, char* url, char* dirs, int flag);
static int dirsel_allowed_1(char* comparable, dirsel_t* allow, dirsel_t* disallow);
static char* dirsel_comparable(char* netpath, uri_t* url_object);
static int dirsel_match(dirsel_t* entry, char* comparable);
static dirsel_t* dirsel_alloc();
static char* dirsel_netpath(uri_t* url_object);
static dirsel_t* entry_find(hash_t* table, char* key);

/*
 * Node destructor installed on every table with hash_set_allocator().
 * Releases the dirsel_t attached to the node (prefix array, string pool,
 * then the structure itself) and finally the node.
 * The context argument is not used.
 */
static void hnode_free(hnode_t *node, void *context)
{
  dirsel_t* selector = (dirsel_t*)node->data;

  /* Buffers owned by the entry go first, the entry itself after them. */
  free(selector->prefix);
  free(selector->pool);
  free(selector);

  free(node);
}

void dirsel_init()
{
  if(context.url_object == 0) {
    int size = 257;
    context.allow = hash_create(size, 0, 0);
    hash_set_allocator(context.allow, 0, hnode_free, 0);
    context.disallow = hash_create(size, 0, 0);
    hash_set_allocator(context.disallow, 0, hnode_free, 0);
    context.robots_allow = hash_create(size, 0, 0);
    hash_set_allocator(context.robots_allow, 0, hnode_free, 0);
    context.robots_disallow = hash_create(size, 0, 0);
    hash_set_allocator(context.robots_disallow, 0, hnode_free, 0);
    context.url_object = uri_alloc_1();
  }
}

/*
 * Release the four selection tables and the scratch URI object.
 *
 * Does nothing when the module is not initialized, that is when
 * context.url_object is null (the marker tested by dirsel_init()).
 * Calling dirsel_end() before dirsel_init(), or twice in a row, is
 * therefore harmless instead of handing null or already released
 * pointers to hash_free() and uri_free().
 * Each table pointer is cleared once released so that no stale
 * pointer stays in the context.
 */
void dirsel_end()
{
  if(context.url_object == 0)
    return;

  dirsel_end_1(context.allow);
  context.allow = 0;
  dirsel_end_1(context.disallow);
  context.disallow = 0;
  dirsel_end_1(context.robots_allow);
  context.robots_allow = 0;
  dirsel_end_1(context.robots_disallow);
  context.robots_disallow = 0;

  uri_free(context.url_object);
  /* Null url_object lets dirsel_init() build everything again. */
  context.url_object = 0;
}

/*
 * Add level to the current trace level of the module and report
 * the received value on stderr (the report is unconditional).
 */
void dirsel_verbose(int level)
{
  verbose = verbose + level;

  fprintf(stderr, "\tdirsel_verbose: level = %d\n", level);
}

/*
 * Dispose of one selection table by handing it to hash_free().
 * NOTE(review): the nodes are presumably released through the hnode_free
 * callback installed in dirsel_init() -- confirm against the hash library.
 */
static void dirsel_end_1(hash_t* table)
{
  hash_free(table);
}

/*
 * Register dirs as allowed prefixes for the netpath of url.
 * url is parsed into the shared URI object, the table key is computed
 * by dirsel_netpath() and flag is forwarded to dirsel_insert().
 */
void dirsel_allow(char* url, char* dirs, int flag)
{
  char* key;

  uri_realloc(context.url_object, url, strlen(url));
  key = dirsel_netpath(context.url_object);

  if(verbose) {
    fprintf(stderr, "\tdirsel_allow: %s (%s)\n", key, dirs);
  }

  dirsel_insert(context.allow, key, dirs, flag);
}

/*
 * Register dirs as disallowed prefixes for the netpath of url.
 * url is parsed into the shared URI object, the table key is computed
 * by dirsel_netpath() and flag is forwarded to dirsel_insert().
 */
void dirsel_disallow(char* url, char* dirs, int flag)
{
  char* key;

  uri_realloc(context.url_object, url, strlen(url));
  key = dirsel_netpath(context.url_object);

  if(verbose) {
    fprintf(stderr, "\tdirsel_disallow: %s (%s)\n", key, dirs);
  }

  dirsel_insert(context.disallow, key, dirs, flag);
}

/*
 * Register dirs as allowed prefixes in the robots table.
 * The entry is keyed by netloc as given, no URL parsing is involved;
 * flag is forwarded to dirsel_insert().
 */
void dirsel_robots_allow(char* netloc, char* dirs, int flag)
{
  if(verbose) {
    fprintf(stderr, "\tdirsel_robots_allow: %s (%s)\n", netloc, dirs);
  }

  dirsel_insert(context.robots_allow, netloc, dirs, flag);
}

/*
 * Register dirs as disallowed prefixes in the robots table.
 * The entry is keyed by netloc as given, no URL parsing is involved;
 * flag is forwarded to dirsel_insert().
 */
void dirsel_robots_disallow(char* netloc, char* dirs, int flag)
{
  if(verbose) {
    fprintf(stderr, "\tdirsel_robots_disallow: %s (%s)\n", netloc, dirs);
  }

  dirsel_insert(context.robots_disallow, netloc, dirs, flag);
}

/*
 * Store the directory list dirs under key in table.
 *
 * A missing entry is created. An existing entry is left untouched when
 * flag is DIRSEL_LOAD; with any other flag its content is replaced.
 * dirs is copied in the pool of the entry and the copy is handed to
 * split_inplace() with ' ' as separator; the resulting array is copied
 * in entry->prefix and its length recorded in entry->prefix_length.
 *
 * NOTE(review): presumably split_inplace() returns pointers into the
 * pool and owns the returned array -- confirm against split.h.
 * NOTE(review): key is passed as is to hash_alloc_insert() and some
 * callers give the static buffer of dirsel_netpath(); verify that the
 * hash library copies its keys.
 */
static void dirsel_insert(hash_t* table, char* key, char* dirs, int flag)
{
  dirsel_t* entry;
  char** words;
  hnode_t* node = hash_lookup(table, key);

  if(node) {
    /* Loading never overrides what is already known for this key. */
    if(flag == DIRSEL_LOAD)
      return;
    entry = (dirsel_t*)hnode_get(node);
  } else {
    entry = dirsel_alloc();
    hash_alloc_insert(table, key, (void*)entry);
  }

  /* Private copy of the list, cut in place just below. */
  static_alloc(&entry->pool, &entry->pool_size, strlen(dirs) + 1);
  strcpy(entry->pool, dirs);

  split_inplace(entry->pool, strlen(entry->pool), &words, &entry->prefix_length, ' ', SPLIT_TRIM);

  static_alloc((char**)&entry->prefix, &entry->prefix_size, entry->prefix_length * sizeof(char*));
  memcpy((char*)entry->prefix, words, entry->prefix_length * sizeof(char*));
}

/*
 * Return true (1) if url is allowed considering the restrictions
 * associated to netpath, false (0) otherwise.
 *
 * - A null netpath is reported on stderr and refused (0).
 * - A url that does not parse to URI_CANNONICAL, or whose own netpath
 *   does not start with netpath, is not subject to the restrictions
 *   and is allowed (1).
 * - Otherwise the path of url is checked against the robots tables,
 *   keyed by the netloc part of netpath (what precedes the first '/'),
 *   and, if the robots tables allow it, against the manual tables,
 *   keyed by netpath.
 *
 * A netpath that contains no '/' is taken as a netloc as a whole.
 *
 * NOTE(review): netpath should not be the pointer returned by
 * dirsel_key(): that static buffer is rewritten by dirsel_netpath()
 * while the comparable is computed.
 */
int dirsel_allowed(char* netpath, char* url)
{
  char* comparable;
  static char* netloc = 0;
  static int netloc_size = 0;

  if(!netpath) {
    fprintf(stderr, "dirsel_allowed: null netpath (probably crawl_touch on redirection)\n");
    return 0;
  }
  if(verbose) fprintf(stderr, "\tdirsel_allowed: netpath = %s, url = %s \n", netpath, url);

  if(uri_realloc(context.url_object, url, strlen(url)) != URI_CANNONICAL)
    return 1;

  comparable = dirsel_comparable(netpath, context.url_object);

  if(!comparable)
    return 1;

  {
    /*
     * The netloc stops at the first '/'. When there is none, keep the
     * whole string instead of computing a length from a null pointer.
     */
    char* slash = strchr(netpath, '/');
    int netloc_length = slash ? (int)(slash - netpath) : (int)strlen(netpath);

    /* One more byte than the netloc for the trailing null. */
    static_alloc(&netloc, &netloc_size, netloc_length + 1);
    memcpy(netloc, netpath, netloc_length);
    netloc[netloc_length] = '\0';
  }

  if(verbose) fprintf(stderr, "\tdirsel_allowed: comparable = %s \n", comparable);
  if(verbose) fprintf(stderr, "\tdirsel_allowed: search robots\n");
  {
    int ret = dirsel_allowed_1(comparable, entry_find(context.robots_allow, netloc), entry_find(context.robots_disallow, netloc));
    if(ret) {
      /* Manual restrictions are only consulted for what robots allow. */
      if(verbose) fprintf(stderr, "\tdirsel_allowed: search manual\n");

      ret = dirsel_allowed_1(comparable, entry_find(context.allow, netpath), entry_find(context.disallow, netpath));
    }
    return ret;
  }
}

/*
 * Decide if comparable is allowed by a pair of allow/disallow entries,
 * each of which may be null when the table has nothing for the key.
 *
 * A match in allow wins. Without such a match, comparable is allowed
 * when there is no disallow entry or when the disallow entry does not
 * match. Return 1 when allowed, 0 otherwise.
 */
static int dirsel_allowed_1(char* comparable, dirsel_t* allow, dirsel_t* disallow)
{
  int hit;

  /*
   * Explicitly allowed.
   */
  if(allow != 0) {
    if(verbose) fprintf(stderr, "\tdirsel_allowed_1: allow table found\n");
    if(dirsel_match(allow, comparable) != 0) {
      if(verbose) fprintf(stderr, "\tdirsel_allowed_1: match found in allow table\n");
      return 1;
    }
  }

  /*
   * Nothing can forbid it.
   */
  if(disallow == 0) {
    if(verbose) fprintf(stderr, "\tdirsel_allowed_1: no disallow table, therefore allowed\n");
    return 1;
  }

  /*
   * Allowed only if the disallow entry does not match.
   */
  if(verbose) fprintf(stderr, "\tdirsel_allowed_1: disallow table found for '%s'\n", comparable);
  hit = dirsel_match(disallow, comparable);
  if(verbose) fprintf(stderr, "\tdirsel_allowed_1: match %sfound in disallow table\n", (hit ? "" : "not "));

  return hit == 0;
}

/*
 * Return the string of url_object that must be compared with the
 * prefixes registered for netpath, or null if url_object is not
 * located under netpath.
 *
 * The netpath computed for url_object must start with netpath; when it
 * does, the path of url_object is returned (it may itself be null).
 */
static char* dirsel_comparable(char* netpath, uri_t* url_object)
{
  char* url_netpath = dirsel_netpath(url_object);
  int wanted = strlen(netpath);

  /*
   * If they do not start the same, they do not compare.
   */
  return strncmp(url_netpath, netpath, wanted) == 0 ? url_object->path : 0;
}

/*
 * Search a prefix of entry that matches comparable.
 *
 * A prefix starting with '/' is stripped of that leading '/' and must
 * be found at the beginning of comparable (method "bol"). Any other
 * prefix is taken completely and may be found anywhere in comparable
 * (method "any"). A prefix that is empty once stripped is ignored.
 * Return 1 on the first match, 0 if no prefix matches.
 */
static int dirsel_match(dirsel_t* entry, char* comparable)
{
  int index;

  for(index = 0; index < entry->prefix_length; index++) {
    char* raw = entry->prefix[index];
    char* prefix;
    char* method_name;
    char method;
    int length;
    int found = 0;

    if(raw[0] == DIRSEL_BOL) {
      method = DIRSEL_BOL;
      method_name = "bol";
      prefix = raw + 1;
    } else {
      method = DIRSEL_ANY;
      method_name = "any";
      prefix = raw;
    }
    length = strlen(prefix);

    if(verbose) fprintf(stderr, "\tdirsel_match: compare prefix %s (deduced from %s) with %s using method %s\n", prefix, raw, comparable, method_name);

    /* Nothing to compare with. */
    if(length <= 0)
      continue;

    switch(method) {
    case DIRSEL_BOL:
      found = strncmp(prefix, comparable, length) == 0;
      break;
    case DIRSEL_ANY:
      found = strstr(comparable, prefix) != 0;
      break;
    default:
      fprintf(stderr, "dirsel_match: unknown method %d\n", method);
      break;
    }

    if(found) {
      if(verbose) fprintf(stderr, "\tdirsel_match: %s match found prefix %s (method %s)\n", comparable, raw, method_name);
      return 1;
    }
  }

  if(verbose) fprintf(stderr, "\tdirsel_match: %s no match\n", comparable);
  return 0;
}

/*
 * Build an empty entry: the structure is obtained from smalloc() and
 * zeroed, then static_alloc() is asked for an initial prefix array
 * (10 pointers) and an initial string pool (32 bytes).
 * The entry is released by hnode_free().
 */
static dirsel_t* dirsel_alloc()
{
  dirsel_t* fresh = (dirsel_t*)smalloc(sizeof(*fresh));

  memset((char*)fresh, '\0', sizeof(*fresh));

  static_alloc((char**)&fresh->prefix, &fresh->prefix_size, 10 * sizeof(char*));
  static_alloc(&fresh->pool, &fresh->pool_size, 32);

  return fresh;
}

/*
 * Return the key (netpath) under which the restrictions of url are
 * stored.
 *
 * The constant string "_unlikely_" is returned when url does not parse
 * to URI_CANNONICAL or when it is relative (the latter is reported on
 * stderr). Otherwise the result is the buffer of dirsel_netpath(),
 * which is static: copy it if it must outlive the next call into this
 * module.
 */
char* dirsel_key(char* url)
{
  if(verbose) fprintf(stderr, "\tdirsel_key: %s\n", url);

  if(uri_realloc(context.url_object, url, strlen(url)) != URI_CANNONICAL)
    return "_unlikely_";

  if(context.url_object->info & URI_INFO_RELATIVE) {
    fprintf(stderr, "dirsel_key: unexpected relative url %s\n", url);
    return "_unlikely_";
  }

  return dirsel_netpath(context.url_object);
}

/*
 * Build the netpath of url_object: its netloc, a '/', then the result
 * of dirname() on its path. A null path counts as an empty one and a
 * directory of "." is replaced by the empty string.
 *
 * The result lives in a buffer static to this function: every call
 * overwrites the string returned by the previous one.
 */
static char* dirsel_netpath(uri_t* url_object)
{
  static char* netpath = 0;
  static int netpath_size = 0;
  char* netloc = uri_netloc(url_object);
  char* dir = url_object->path;

  if(!dir)
    dir = "";
  dir = dirname(dir);
  if(strcmp(dir, ".") == 0)
    dir = "";

  /* Room for both parts, the '/' in between and the trailing null. */
  static_alloc(&netpath, &netpath_size, strlen(netloc) + strlen(dir) + 3);
  sprintf(netpath, "%s/%s", netloc, dir);

  return netpath;
}

/*
 * Return the entry stored under key in table, or null if table has
 * no node for key.
 */
static dirsel_t* entry_find(hash_t* table, char* key)
{
  hnode_t* node = hash_lookup(table, key);

  if(node == 0)
    return 0;

  return (dirsel_t*)hnode_get(node);
}
