/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <hash.h>
#include <getopttools.h>
#include <urldirname.h>
#include <split.h>
#include <salloc.h>
#include <uri.h>
#include <file_exists.h>
#include <file_size.h>
#include <rm.h>
#include <creatp.h>
#include <base64.h>

#include <crawl_private.h>
#include <crawl.h>
#include <robots.h>
#include <sqlutil.h>
#include <ftp.h>
#include <mime.h>
#include <dirsel.h>
#include <bodyparse.h>
#ifdef LANGREC
#include <decodage_usr.h>
#endif /* LANGREC */
#include <hooks.h>

/*
 * Verbosity level of this file's diagnostics. Defaults to 1; the
 * -quiet_crawl, -verbose_crawl and -babil_crawl options (see long_options)
 * store 0, 2 and 3 here respectively.
 */
static int verbose = 1;

/*
 * Forward declarations of the file-local (static) helpers, so that they
 * can be used before the point where they are defined.
 */
static void crawl_rehook_1(char* paramsp, webbase_url_t* webbase_url);
static void hp_remove_unloaded_1(crawl_params_t* params, webbase_url_start_t* start, webbase_url_t* webbase_url);
static void hp_remove_unloaded_start(crawl_params_t* params, webbase_url_start_t* start);
static int hp_print_externals_2(char* args, webbase_url_t* base_url, char* url, int flag);
static void hp_print_externals_1(crawl_params_t* params, webbase_url_start_t* start, webbase_url_t* webbase_url);
static void hp_action_2(char* action_argp, MYSQL_RES* res, MYSQL_ROW row);
static void hp_action_1(char* action_argp, webbase_url_start_t* start);
static void noheuristics_1(char* contextp, MYSQL_RES* res, MYSQL_ROW row);
static void noheuristics(crawl_context_t* context);
static void hp_load_mirror(char* paramsp, webbase_url_start_t* start);
static void mirror_explore_exploring(crawl_context_t* context);
static void mirror_cleanup(crawl_context_t* context);
static void mirror_explore_virgin(crawl_context_t* context);
static int internal_bootstrap_start2url(webbase_t* base, char* query, char* url, int start_rowid, int level);
static void mirror_explore_update(crawl_context_t* context);
static void mirror_explore_wait(crawl_context_t* context);
static void mirror_explore_new(crawl_context_t* context);
static int mirror_query(crawl_context_t* context, char* query);
static int mirror_crawl_continue(crawl_context_t* context);
static int mirror_crawl_test(crawl_context_t* context, char* url);
static webbase_url_t* mirror_1(crawl_context_t* context, char* url);
static webbase_url_t* mirror_2(crawl_context_t* context, char* url);
static void mirror_scheme(crawl_context_t* context, webbase_url_t* webbase_url, uri_t* url_object, char* url);
static void ftp_body_parse(webbase_url_t* webbase_url, uri_t* url_object, char* path);
static void mirror_ftp_free();
static int mirror_ftp(crawl_context_t* context, webbase_url_t* webbase_url, uri_t* url_object, char* url);
static int mirror_http(crawl_context_t* context, webbase_url_t* webbase_url, uri_t* url_object, char* url);
static webbase_url_t* mirror_location(crawl_context_t* context, webbase_url_t* webbase_url);
static char* time2str(time_t t);
static char* mirror_request_http(crawl_context_t* context, webbase_url_t* webbase_url, uri_t* url_object);
static void mirror_schedule(crawl_context_t* context, webbase_url_t* webbase_url);
static crawl_params_t* params_alloc();

/*
 * Command line options understood by the crawler, in getopt_long format
 * { name, has_arg, flag, val }. This is the table returned by
 * crawl_options().
 */
static struct option long_options[] =
{
  /* These three options store their value directly in `verbose'. */
  {"babil_crawl", 0, &verbose, 3},
  {"verbose_crawl", 0, &verbose, 2},
  {"quiet_crawl", 0, &verbose, 0},
  /*
   * The remaining options have a null flag pointer and a zero value;
   * crawl_alloc() identifies them by name.
   */
  {"verbose_dirsel", 0, 0, 0},
  {"verbose_hook", 0, 0, 0},
  {"no_hook", 0, 0, 0},
  {"url_max_size", 1, 0, 0},
  {"size_limit", 1, 0, 0},
  {"min", 1, 0, 0},
  {"depth", 1, 0, 0},
  {"level", 1, 0, 0},
  {"timeout", 1, 0, 0},
  {"loaded_delay", 1, 0, 0},
  {"modified_delay", 1, 0, 0},
  {"not_found_delay", 1, 0, 0},
  {"timeout_delay", 1, 0, 0},
  {"robot_delay", 1, 0, 0},
  {"auth", 1, 0, 0},
  {"accept", 1, 0, 0},
  {"filter", 1, 0, 0},
  {"noheuristics", 0, 0, 0},
  {"unescape", 0, 0, 0},
  {"norobot_exclusion", 0, 0, 0},
  {"tags", 1, 0, 0},
  {"allow", 1, 0, 0},
  {"disallow", 1, 0, 0},
  {"agent", 1, 0, 0},
  {"sleepy", 0, 0, 0},
  {"sticky", 0, 0, 0},
  {"update", 0, 0, 0},
  {"update_rebuild", 0, 0, 0},
  {"candidate_limit", 1, 0, 0},
  /*
   * Terminating entry (null name). NOTE(review): the CRAWL_OPTIONS value
   * is presumably a marker read by getopt_merge -- confirm in getopttools.
   */
  {0, 0, 0, CRAWL_OPTIONS}
};

/*
 * Merge into `options' the option tables of the modules the crawler
 * depends on (webbase, webtools, robots, cookies) and return the
 * crawler's own table. The crawler's table is NOT merged here: the
 * caller is expected to do it with the returned pointer, as crawl_init()
 * does.
 */
struct option* crawl_options(struct option options[])
{
  getopt_merge(options, webbase_options(options));
  getopt_merge(options, webtools_options(options));
  getopt_merge(options, robots_options(options));
  getopt_merge(options, cookies_options(options));

  return long_options;
}

/* Number of usable entries in the merged option table built by crawl_init(). */
#define MAX_OPTIONS 100
/* Value stored in the terminating entry of the application level table. */
#define APPLICATION_OPTIONS		0x8000000

/*
 * Build the complete option table (crawler options plus those of the
 * modules it depends on) and create the crawler parameters from the
 * command line with crawl_alloc().
 */
crawl_params_t* crawl_init(int argc, char** argv)
{
  static struct option long_options[MAX_OPTIONS + 1] =
  {
    /*
     * Initially the table only holds its terminating entry.
     * NOTE(review): MAX_OPTIONS in the has_arg slot is presumably the
     * table capacity as read by getopt_merge -- confirm in getopttools.
     */
    {0, MAX_OPTIONS, 0, APPLICATION_OPTIONS}
  };
  getopt_merge(long_options, crawl_options(long_options));
  return crawl_alloc(argc, argv, long_options);
}

/*
 * Allocate the crawler parameters and fill them from the command line.
 *
 * argc, argv: command line, scanned with getopt_long_only.
 * options:    complete option table (see crawl_init).
 *
 * Options recognized by name update either params or the in-core
 * starting point (params->start_in_core). The webtools, webbase, cookies
 * and robots objects are then allocated from the same command line and,
 * unless -no_hook was given, the hooks are initialized.
 *
 * Returns the new parameters. Exits the process with status 1 when
 * getopt_long_only reports anything other than 0 or -1.
 */
crawl_params_t* crawl_alloc(int argc, char** argv, struct option options[])
{
  crawl_params_t* params = params_alloc();
  webbase_url_start_t* start = &params->start_in_core;

  webbase_start_reset(start);
  dirsel_init();

  /* Restart the option scan from the beginning of argv. */
  optind = 0;
  while(1) {
    /* `getopt_long' stores the option index here. */
    int option_index = 0;
    int c;
    /*
     * Left at 1 by every branch that leaves the switch with `break';
     * reset to 0 when control reaches the end of case 0, i.e. for a name
     * not handled here and for no_hook, whose branch has no `break'.
     * When 1, the option is recorded in params->options below.
     * NOTE(review): confirm that no_hook is left out on purpose.
     */
    int found = 1;

    c = getopt_long_only(argc, argv, "", options, &option_index);

    /* Detect the end of the options. */
    if (c == -1)
      break;
     
    switch (c)
      {
      case 0:
	/* If this option set a flag, do nothing else now. */
	
	if (options[option_index].flag != 0)
	  break;
	if(!strcmp(options[option_index].name, "verbose_dirsel")) {
	  dirsel_verbose(1);
	  break;
	} else if(!strcmp(options[option_index].name, "verbose_hook")) {
	  hook_verbose(1);
	  break;
	} else if(!strcmp(options[option_index].name, "url_max_size")) {
	  start->url_max_size = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "size_limit")) {
	  start->size_limit = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "min")) {
	  start->min = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "depth")) {
	  start->depth = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "level")) {
	  start->level = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "timeout")) {
	  start->timeout = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "loaded_delay")) {
	  start->loaded_delay = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "modified_delay")) {
	  start->modified_delay = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "not_found_delay")) {
	  start->not_found_delay = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "timeout_delay")) {
	  start->timeout_delay = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "candidate_limit")) {
	  params->candidate_limit = atoi(optarg);
	  break;
	} else if(!strcmp(options[option_index].name, "noheuristics")) {
	  params->noheuristics = 1;
	  break;
	} else if(!strcmp(options[option_index].name, "sticky")) {
	  params->sticky = 1;
	  break;
	} else if(!strcmp(options[option_index].name, "update")) {
	  params->update = 1;
	  break;
	} else if(!strcmp(options[option_index].name, "update_rebuild")) {
	  params->update_rebuild = 1;
	  break;
	} else if(!strcmp(options[option_index].name, "robot_delay")) {
	  start->robot_delay = atoi(optarg);
	  break;
	/*
	 * S(field, field_length, flag) expands to one more `else if'
	 * branch of the chain above: it copies optarg into start->field
	 * (resized with static_alloc) and sets `flag' in start->info.
	 */
#define S(field,field_length,flag) \
	} else if(!strcmp(options[option_index].name, #field)) { \
	  static_alloc(&start->field, &start->field_length, strlen(optarg) + 1); \
	  strcpy(start->field, optarg); \
          start->info |= flag; \
	  break; 
        S(auth,auth_length,WEBBASE_URL_START_AUTH);
        S(accept,accept_length,WEBBASE_URL_START_ACCEPT);
        S(filter,filter_length,WEBBASE_URL_START_FILTER);
        S(allow,allow_length,WEBBASE_URL_START_ALLOW);
        S(disallow,disallow_length,WEBBASE_URL_START_DISALLOW);
#undef S
	} else if(!strcmp(options[option_index].name, "no_hook")) {
	  params->no_hook = 1;
	}
	/* Reached for no_hook and for names not handled in this function. */
	found = 0;
	break;
      default:
	fprintf(stderr, "option parse error %c, 0x%x\n", c & 0xff, c);
	exit(1);
      }
    if(found) {
      /* Remember the option; a single space stands for "no argument". */
      hash_alloc_insert(params->options, (char*)options[option_index].name, strdup(optarg ? optarg : " "));
    }
  }

  /* Propagate the highest verbosity levels to the HTTP layer. */
  if(verbose > 2) {
    http_verbose(verbose);
  }

  {
    params->http = webtools_alloc(argc, argv, options);
    params->http->callback_http_header = http_header;
    params->http->callback_http_body = http_body;
  }
  params->base = webbase_alloc(argc, argv, options);
  params->cookies = cookies_alloc(argc, argv, options);
  params->cookies->base = params->base;
  params->robots = robots_alloc(argc, argv, options);
  params->robots->base = params->base;
  /* Fall back to the default when no usable -candidate_limit was given. */
  if(params->candidate_limit <= 0)
    params->candidate_limit = CRAWL_PARAMS_CANDIDATE_LIMIT;
  if(!params->no_hook)
    hook_init(params->base);

  return params;
}

/*
 * webbase_walk_url callback used by crawl_rebuild.
 *
 * paramsp:     the crawl_params_t of the crawler, cast to char*.
 * webbase_url: the URL record being visited.
 *
 * If the hooks accept the URL and its file is present on disk, the URL
 * is inserted in the hooked database and its new hookid is saved in the
 * url table. If the file is missing, mtime is reset to 0 in the url
 * table.
 */
static void crawl_rebuild_1(char* paramsp, webbase_url_t* webbase_url)
{
  crawl_params_t* params = (crawl_params_t*)paramsp;

  webbase_lock_ignore(params->base, webbase_url->w_rowid, "url");
  if(hook_ok(params->base, webbase_url)) {
    char query[256];
    char* path = uri_furi_string(webbase_url->w_url, strlen(webbase_url->w_url), URI_FURI_REAL_PATH);
    if(file_exists(path)) {
      hook_insert(params->base, webbase_url);
      if(verbose > 1) fprintf(stderr, "\trebuild %s (%d)\n", webbase_url->w_url, webbase_url->w_hookid);
      sprintf(query, "update url set hookid = %d where rowid = %d\n", webbase_url->w_hookid, webbase_url->w_rowid);
      smysql_query(&params->base->mysql, query);
      hook_insert_watch(params->base);
    } else {
      if(verbose > 1) fprintf(stderr, "\tmissing file for %s\n", webbase_url->w_url);
      sprintf(query, "update url set mtime = 0 where rowid = %d\n", webbase_url->w_rowid);
      /*
       * The query used to be built and then dropped: actually send it
       * so that the record of the missing file is updated.
       */
      smysql_query(&params->base->mysql, query);
    }
  }
  webbase_lock_ignore(params->base, 0, 0);
  if(verbose == 1) {
    static int count = 0;
    /*
     * Progress indicator: print a dot about every 1000 calls.
     */
    if(count > 1000) {
      fprintf(stderr, ".");
      count = 0;
    }
    count++;
  }
}

/*
 * Rebuild the hooked database from the url table: every URL whose
 * hookid is 0 is handed to crawl_rebuild_1.
 *
 * resume: when greater than 1 the rebuild is resumed, i.e.
 *         hook_rebuild_start is not called again.
 *         NOTE(review): 0 and 1 both restart from scratch -- confirm
 *         that `> 1' (rather than `> 0') is what callers expect.
 */
void crawl_rebuild(crawl_params_t* params, int resume)
{
  if(resume <= 1)
    hook_rebuild_start(params->base);
  else
    fprintf(stderr, "resuming rebuild\n");

  webbase_walk_url(params->base,
		   "where hookid = 0",
		   crawl_rebuild_1,
		   (char*)params,
		   WEBBASE_GET_URL_LIGHT);

  hook_rebuild_end(params->base);
}

/*
 * sql_select callback used by crawl_urls: touch the URL found in the
 * first column of the row. `arg' is the crawl_params_t of the crawler.
 */
static void crawl_urls_1(char* arg, MYSQL_RES* res, MYSQL_ROW row)
{
  crawl_touch((crawl_params_t*)arg, row[0]);
}

/*
 * Touch (see crawl_touch) every URL of the url table selected by `urls',
 * which is used verbatim as the SQL where clause.
 */
void crawl_urls(crawl_params_t* params, char* urls)
{
  /* 128 bytes of slack are more than enough for the fixed part of the query. */
  char* select_query = smalloc(strlen(urls) + 128);

  sprintf(select_query, "select url from url where %s", urls);
  sql_select(&params->base->mysql,
	     0,
	     select_query,
	     crawl_urls_1,
	     (char*)params);

  free(select_query);
}

/*
 * webbase_walk_url callback used by crawl_rehook.
 *
 * If the hooks accept the URL, insert it in the hooked database and
 * save the resulting hookid in the url table. `paramsp' is the
 * crawl_params_t of the crawler.
 */
static void crawl_rehook_1(char* paramsp, webbase_url_t* webbase_url)
{
  crawl_params_t* params = (crawl_params_t*)paramsp;

  webbase_lock_ignore(params->base, webbase_url->w_rowid, "url");

  if(hook_ok(params->base, webbase_url)) {
    char update[256];

    hook_insert(params->base, webbase_url);
    if(verbose > 1)
      fprintf(stderr, "\trehook %s (%d)\n", webbase_url->w_url, webbase_url->w_hookid);

    sprintf(update, "update url set hookid = %d where rowid = %d\n", webbase_url->w_hookid, webbase_url->w_rowid);
    smysql_query(&params->base->mysql, update);
  }

  webbase_lock_ignore(params->base, 0, 0);

  /*
   * Progress indicator, only at the default verbosity: a dot is printed
   * once the call counter has gone past 1000, then counting restarts.
   */
  if(verbose == 1) {
    static int calls = 0;

    if(calls > 1000) {
      fprintf(stderr, ".");
      calls = 1;
    } else {
      calls++;
    }
  }
}

/*
 * Bring the hooked database back in sync with the url table.
 */
void crawl_rehook(crawl_params_t* params)
{
  char walk_where[256];
  char forget[256];

  /*
   * Step 1: try to restore as many hookid as possible, and remove from
   * the hooked database the records that should not be there.
   */
  hook_rehook_start(params->base);

  /*
   * Step 2: insert in the hooked database all the records that must be
   * hooked.
   */
  sprintf(walk_where, "where hookid >= %d", HOOK_MAX_RECORDS * 2);
  webbase_walk_url(params->base, walk_where, crawl_rehook_1, (char*)params, WEBBASE_GET_URL_LIGHT);

  /*
   * Step 3: forget the hook info of all the URLs that are still not
   * hooked at this point.
   */
  sprintf(forget, "update url set hookid = 0 where hookid >= %d", HOOK_MAX_RECORDS * 2);
  smysql_query(&params->base->mysql, forget);

  /*
   * Step 4: close the hooked database (index validation etc..)
   */
  hook_rehook_end(params->base);
}

/*
 * hp_action callback: set level -1 in start2url for a URL whose w_info
 * is 0, i.e. that has not been loaded yet. Other URLs are left alone.
 */
static void hp_remove_unloaded_1(crawl_params_t* params, webbase_url_start_t* start, webbase_url_t* webbase_url)
{
  char update[256];

  if(webbase_url->w_info != 0)
    return;

  sprintf(update, "update start2url set level = -1 where start = %d and url = %d", start->rowid, webbase_url->w_rowid);
  if(verbose > 1)
    fprintf(stderr, "\tremove unloaded %s\n", webbase_url->w_url);
  smysql_query(&params->base->mysql, update);
}

/*
 * hp_action start callback: before the URLs of a starting point are
 * visited, reset the level of all its start2url entries to 0 so that
 * none of them is negative.
 */
static void hp_remove_unloaded_start(crawl_params_t* params, webbase_url_start_t* start)
{
  char reset_levels[256];

  sprintf(reset_levels, "update start2url set level = 0 where start = %d", start->rowid);
  smysql_query(&params->base->mysql, reset_levels);
}

/*
 * For the starting point whose URL is `url', mark with level -1 all the
 * URLs that have not been loaded yet (see hp_remove_unloaded_start and
 * hp_remove_unloaded_1).
 */
void hp_remove_unloaded(crawl_params_t* params, char* url)
{
  /*
   * Quote first and size the buffer on the quoted string: quoting may
   * make the string longer than the original URL, and sizing on the raw
   * URL could overflow the buffer.
   */
  char* quoted_url = sql_quote_char_simple(url);
  char* where = smalloc(128 + strlen(quoted_url));

  sprintf(where, "where url = '%s'", quoted_url);
  hp_action(params, where, hp_remove_unloaded_start, hp_remove_unloaded_1);
  free(where);
}

/*
 * Run mirror() on a single URL, outside of any starting point, with the
 * `touch' and `walk_end' flags of the context set.
 *
 * Returns whatever mirror() returns for this URL.
 */
webbase_url_t* crawl_touch(crawl_params_t* params, char* url)
{
  crawl_context_t touch_context;

  /* Everything is zero except the few fields set below. */
  memset(&touch_context, '\0', sizeof(touch_context));
  touch_context.touch = 1;
  touch_context.walk_end = 1;
  touch_context.params = params;
  touch_context.base = params->base;

  return mirror(&touch_context, url);
}

/*
 * Explore or update the starting point `url', using the description
 * built from the command line (params->start_in_core).
 *
 * The start record is read from the database, or created with default
 * values if it does not exist, then merged with the in-core description
 * and written back. With -update the state is forced to EXPLORED so
 * that an update is performed. The starting point is then crawled again
 * and again for as long as the crawl is postponed by a robot delay.
 *
 * URLs longer than WEBBASE_URL_LENGTH are reported and ignored.
 */
void hp_load_in_core(crawl_params_t* params, char* url)
{
  int url_length = strlen(url);
  int exists;
  webbase_t* base = params->base;
  webbase_url_start_t* start_in_core = &params->start_in_core;
  static webbase_url_start_t start;

  if(url_length > WEBBASE_URL_LENGTH) {
    fprintf(stderr, "hp_load_in_core: %s too long (ignored)\n", url);
    return;
  }

  webbase_enable_start(base, url);
  /*
   * Compare against 0 instead of casting the result to int: a cast
   * could truncate the value if it is a pointer.
   */
  exists = (webbase_get_start(base, url, &start) != 0);
  if(!exists) {
    webbase_start_reset(&start);
    webbase_default_start(base, &start);
    strcpy(start.url, url);
  }

  webbase_merge_start(&start, start_in_core);
  start.info |= WEBBASE_URL_START_IN_CORE;
  if(params->update)
    start.info = (start.info & ~WEBBASE_URL_START_STATE_MASK) | WEBBASE_URL_START_EXPLORED;
  webbase_update_start(base, &start);
  /*
   * The test must live inside the #ifdef: when WEBBASE_LOCK is not
   * defined a bare `if(!exists)' would take the crawl loop below as its
   * body and existing starting points would never be crawled.
   */
#ifdef WEBBASE_LOCK
  if(!exists)
    webbase_lock(base, start.rowid, "start");
#endif /* WEBBASE_LOCK */

  do {
    params->robot_delayed = 0;
    hp_load_mirror((char*)params, &start);
  } while(params->robot_delayed > 0);
#ifdef WEBBASE_LOCK
  webbase_unlock(base, start.rowid, "start");
#endif /* WEBBASE_LOCK */
  if(verbose) fprintf(stderr, "exploration done\n");
}

/*
 * Print all external links contained in the pages
 * of a starting point.
 *
 * hp_print_externals_2 is the webbase_url_walk_href callback: it prints
 * `url' on its own line on stdout and always returns 1. The other
 * arguments are not used.
 */
static int hp_print_externals_2(char* args, webbase_url_t* base_url, char* url, int flag)
{
  printf("%s\n", url);
  return 1;
}

/*
 * hp_action callback: walk the links of `webbase_url' with the
 * WEBBASE_URL_WALK_ABSOLUTE selector and print each of them with
 * hp_print_externals_2. `start' is not used.
 */
static void hp_print_externals_1(crawl_params_t* params, webbase_url_start_t* start, webbase_url_t* webbase_url)
{
  webbase_url_walk_href(webbase_url, WEBBASE_URL_WALK_ABSOLUTE, hp_print_externals_2, (char*)params);
}

/*
 * Print the links found in the pages of every enabled starting point
 * selected by `where_arg', which is used verbatim as an SQL condition
 * on the start table.
 */
void hp_print_externals(crawl_params_t* params, char* where_arg)
{
  /* 128 bytes of slack cover the fixed part of the clause. */
  char* start_where = smalloc(128 + strlen(where_arg));

  sprintf(start_where, "where disabled = 'no' and %s", where_arg);
  /* No per starting point preparation is needed, hence the null callback. */
  hp_action(params, start_where, 0, hp_print_externals_1);

  free(start_where);
}

/*
 * Run a function on every URL found on a given starting point.
 *
 * action_arg_t carries the state of hp_action through the walk
 * callbacks hp_action_1 and hp_action_2.
 */
typedef struct action_arg {
  /* Crawler parameters, handed to both callbacks. */
  crawl_params_t* params;
  /* Called for each URL of the starting point. */
  hp_action_callback func;
  /* Called once per starting point before its URLs; may be null. */
  hp_action_start_callback func_start;
  /* Starting point currently being walked, set by hp_action_1. */
  webbase_url_start_t* start;
} action_arg_t;

/*
 * sql_select callback of hp_action_1: load the URL named in the first
 * column of the row and, if it could be loaded, run the user callback
 * on it. `action_argp' is the action_arg_t of the walk.
 */
static void hp_action_2(char* action_argp, MYSQL_RES* res, MYSQL_ROW row)
{
  /* Storage handed to webbase_get_url for the record. */
  static webbase_url_t url_storage;
  action_arg_t* arg = (action_arg_t*)action_argp;
  webbase_url_t* loaded;

  loaded = webbase_get_url(arg->params->base, row[0], &url_storage, WEBBASE_GET_URL_ALL);
  if(!loaded)
    return;

  (*arg->func)(arg->params, arg->start, loaded);
}

/*
 * webbase_walk_start callback of hp_action: remember the starting point
 * in the action argument, run the optional start callback, then run
 * hp_action_2 on every URL linked to the starting point in start2url.
 */
static void hp_action_1(char* action_argp, webbase_url_start_t* start)
{
  action_arg_t* arg = (action_arg_t*)action_argp;
  crawl_params_t* params = arg->params;
  char select_urls[128];

  arg->start = start;

  if(arg->func_start)
    (*arg->func_start)(params, start);

  sprintf(select_urls, "select url.url from start2url,url where start2url.start = %d and start2url.url = url.rowid", start->rowid);
  sql_select(&params->base->mysql, 0, select_urls, hp_action_2, (char*)arg);
}

/*
 * Run `func' on every URL of every starting point selected by `where'.
 *
 * where:      clause handed as is to webbase_walk_start.
 * func_start: if not null, called once for each starting point before
 *             its URLs are visited.
 * func:       called for each URL of the starting point.
 */
void hp_action(crawl_params_t* params, char* where, hp_action_start_callback func_start, hp_action_callback func)
{
  action_arg_t walk_state;

  /* The `start' field is filled by hp_action_1 for each starting point. */
  walk_state.func_start = func_start;
  walk_state.func = func;
  walk_state.params = params;

  webbase_walk_start(params->base, where, hp_action_1, (char*)&walk_state);
}

/*
 * Remove the starting point `url' and all the URLs linked to it.
 *
 * keep_start: when not zero the start record is kept, with its info and
 *             count fields reset to 0; otherwise it is deleted.
 *
 * A URL longer than WEBBASE_URL_LENGTH or unknown to the start table is
 * reported on stderr and nothing is done.
 */
void hp_unload(crawl_params_t* params, char* url, int keep_start)
{
  static webbase_url_start_t start;
  webbase_t* base = params->base;
  crawl_context_t context;
  char query[256];
  int length = strlen(url);

  if(length > WEBBASE_URL_LENGTH) {
    fprintf(stderr, "hp_unload: %s too long (ignored)\n", url);
    return;
  }

  webbase_enable_start(base, url);
  if(webbase_get_start(base, url, &start) == 0) {
    fprintf(stderr, "cannot unload %s because it does not exist in start\n", url);
    return;
  }

  /* Minimal context: it is only used by mirror_cleanup. */
  memset(&context, '\0', sizeof(crawl_context_t));
  context.start = &start;
  context.params = params;
  context.base = params->base;

  /*
   * Put every start2url entry of the starting point at level -1 ...
   */
  sprintf(query, "update start2url set level = -1 where start = %d", start.rowid);
  smysql_query(&base->mysql, query);

  /*
   * ... so that the cleanup, which removes the entries with a negative
   * level, effectively deletes all of them.
   */
  mirror_cleanup(&context);

  if(!keep_start) {
    sprintf(query, "delete from start where rowid = %d", start.rowid);
  } else {
    sprintf(query, "update start set info = 0, count = 0 where rowid = %d", start.rowid);
  }
  smysql_query(&base->mysql, query);

#ifdef WEBBASE_LOCK
  webbase_unlock(base, start.rowid, "start");
#endif /* WEBBASE_LOCK */
}

/*
 * Explore or update all the starting points: same as hp_load_1 with an
 * empty constraint.
 */
void hp_load(crawl_params_t* params)
{
  hp_load_1(params, "");
}

/*
 * Explore or update the enabled starting points that match the `where'
 * constraint (an SQL condition; null or empty selects all of them).
 *
 * The starting points are walked again and again for as long as at
 * least one of them reports a pending delay. From the second walk on,
 * only the starting points with a delay are selected, ordered by delay.
 */
void hp_load_1(crawl_params_t* params, char* where)
{
  int constrained = (where && where[0] != '\0');
  int delay_clause_built = 0;
  char* walk_where = smalloc(128 + (where ? strlen(where) : 0));

  if(constrained)
    sprintf(walk_where, "where %s and disabled = 'no' ", where);
  else
    strcpy(walk_where, "where disabled = 'no' ");

  for(;;) {
    params->robot_delayed = 0;
    if(verbose > 1) fprintf(stderr, "\thp_load_1: %s\n", walk_where);
    webbase_walk_start(params->base, walk_where, hp_load_mirror, (char*)params);

    /* Nobody is waiting anymore: we are done. */
    if(!params->robot_delayed)
      break;

    if(verbose) fprintf(stderr, "still %d home pages delayed\n", params->robot_delayed);

    /* Switch once and for all to the clause that selects delayed starts. */
    if(!delay_clause_built) {
      if(constrained)
	sprintf(walk_where, " where delay is not null and disabled = 'no' and %s order by delay ", where);
      else
	strcpy(walk_where, " where delay is not null and disabled = 'no' order by delay ");
      delay_clause_built = 1;
    }
  }

  free(walk_where);
  if(verbose) fprintf(stderr, "exploration done\n");
}

/*
 * sql_select callback of noheuristics: set the crawl date of the URL
 * whose rowid is in the first column of the row to from_unixtime(0).
 * `contextp' is the crawl context.
 */
static void noheuristics_1(char* contextp, MYSQL_RES* res, MYSQL_ROW row)
{
  crawl_context_t* context = (crawl_context_t*)contextp;
  char reset_crawl[256];

  sprintf(reset_crawl, "update url set crawl = from_unixtime(0) where rowid = %d", atoi(row[0]));
  smysql_query(&context->base->mysql, reset_crawl);
}

/*
 * When the -noheuristics option is set, reset the crawl date of every
 * URL linked to the starting point of the context (see noheuristics_1).
 * Otherwise do nothing.
 */
static void noheuristics(crawl_context_t* context)
{
  if(context->params->noheuristics != 0) {
    char select_rowids[256];

    if(verbose > 1)
      fprintf(stderr, "\tnoheuristics: reset crawl time for %s\n", context->start->url);

    sprintf(select_rowids, "select url.rowid from url,start2url where start2url.start = %d and start2url.url = url.rowid", context->start->rowid);
    sql_select(&context->base->mysql, 0, select_rowids, noheuristics_1, (char*)context);
  } else {
    /* Option not set: the crawl dates are left untouched. */
    if(verbose > 2)
      fprintf(stderr, "\tnoheuristics: active\n");
  }
}

/*
 * webbase_walk_start callback: explore or update one starting point.
 *
 * paramsp: the crawl_params_t of the crawler, cast to char*.
 * start:   the starting point to work on.
 *
 * A crawl context is built for the starting point, then the exploration
 * function that matches the state recorded in start->info is run. If
 * the starting point still has a positive delay afterwards,
 * params->robot_delayed is incremented.
 */
static void hp_load_mirror(char* paramsp, webbase_url_start_t* start)
{
  crawl_params_t* params = (crawl_params_t*)paramsp;
  crawl_context_t context;
  int state;
  int announce;

  memset(&context, '\0', sizeof(crawl_context_t));
  context.start = start;
  context.params = params;
  context.base = params->base;
  context.home = strdup(urldirname(start->url));
  context.home_length = strlen(context.home);

  noheuristics(&context);

  if(start->info & WEBBASE_URL_START_ALLOW)
    dirsel_allow(start->url, start->allow, DIRSEL_LOAD);
  if(start->info & WEBBASE_URL_START_DISALLOW)
    dirsel_disallow(start->url, start->disallow, DIRSEL_LOAD);

  /*
   * Key to directory selection (robots & manual)
   */
  context.dirsel_key = strdup(dirsel_key(context.home));

  hook_info_set(start->hook_info);

  /* The progress message is skipped when the starting point has a delay set. */
  announce = (verbose && !start->delay);
  state = start->info & WEBBASE_URL_START_STATE_MASK;

  if(state == WEBBASE_URL_START_EXPLORING) {
    if(announce)
      fprintf(stderr, "resuming exploration %s\n", start->url);
    mirror_explore_exploring(&context);
  } else if(state == WEBBASE_URL_START_EXPLORED) {
    if(announce)
      fprintf(stderr, "update %s\n", start->url);
    mirror_explore_update(&context);
  } else if(state == WEBBASE_URL_START_UPDATING) {
    if(announce)
      fprintf(stderr, "resuming update %s\n", start->url);
    mirror_explore_update(&context);
  } else {
    /* WEBBASE_URL_START_VIRGIN and any unexpected state. */
    if(announce)
      fprintf(stderr, "explore virgin %s\n", start->url);
    mirror_explore_virgin(&context);
  }

  if(start->delay > 0)
    params->robot_delayed++;

  free(context.home);
  free(context.dirsel_key);
}

/*
 * Resume the exploration of a starting point.
 *
 * If the exploration was interrupted from the outside (no delay set),
 * the URL count of the starting point is first recomputed from the
 * start2url table. If the interruption is internal, such as a delay,
 * the count is left alone.
 */
static void mirror_explore_exploring(crawl_context_t* context)
{
  webbase_url_start_t* start = context->start;

  if(start->delay == 0) {
    MYSQL_RES* result;
    MYSQL_ROW record;
    char count_query[256];

    /*
     * Restore url count.
     */
    sprintf(count_query, "select count(*) from start2url where start2url.start = %d", start->rowid);
    smysql_query(&context->base->mysql, count_query);
    result = smysql_store_result(&context->base->mysql);
    record = mysql_fetch_row(result);
    start->count = atoi(record[0]);
    mysql_free_result(result);

    if(verbose > 1)
      fprintf(stderr, "\tmirror_explore_exploring: restore count %d\n", start->count);
  }

  mirror_explore_new(context);
}

/*
 * Remove the URLs of the starting point that were not visited, i.e.
 * those whose start2url entry has a negative level.
 */
static void mirror_cleanup(crawl_context_t* context)
{
  webbase_t* base = context->base;
  webbase_url_start_t* start = context->start;
  MYSQL_RES* result;
  MYSQL_ROW record;
  char sql[256];
  int unreferenced = 0;
  int deleted = 0;

  /*
   * Pass 1: URLs out of scope for this starting point but still
   * referenced by another one. Only the start2url entry is removed.
   */
  sprintf(sql, "select a.url from start2url a,start2url b where a.start = %d and a.url = b.url and b.start != %d and a.level < 0", start->rowid, start->rowid);
  if(verbose > 2) fprintf(stderr, "\tmirror_cleanup: %s ...", sql);
  smysql_query(&base->mysql, sql);
  result = smysql_store_result(&base->mysql);
  if(verbose > 2) fprintf(stderr, "done\n");
  if(mysql_num_rows(result)) {
    for(record = mysql_fetch_row(result); record; record = mysql_fetch_row(result)) {
      int url_rowid = atoi(record[0]);

      sprintf(sql, "delete from start2url where start = %d and url = %d", start->rowid, url_rowid);
      smysql_query(&base->mysql, sql);
      unreferenced++;
    }
  }
  mysql_free_result(result);

  /*
   * Pass 2: URLs out of scope that only this starting point references
   * (what pass 1 left behind). All their data is deleted.
   */
  sprintf(sql, "select url.rowid,url.complete_rowid,url.hookid,url.url from start2url,url where start2url.start = %d and start2url.url = url.rowid and start2url.level < 0", start->rowid);
  if(verbose > 2) fprintf(stderr, "\tmirror_cleanup: %s ...", sql);
  smysql_query(&base->mysql, sql);
  result = smysql_store_result(&base->mysql);
  if(verbose > 2) fprintf(stderr, "done\n");
  if(mysql_num_rows(result)) {
    for(record = mysql_fetch_row(result); record; record = mysql_fetch_row(result)) {
      int url_rowid = atoi(record[0]);
      /* complete_rowid and hookid may be null in the database. */
      int complete_rowid = record[1] ? atoi(record[1]) : 0;
      int hookid = record[2] ? atoi(record[2]) : 0;
      char* url = record[3];

      if(hookid && !context->params->no_hook)
	hook_delete_id(base, hookid, url, strlen(url));

      sprintf(sql, "delete from url where rowid = %d", url_rowid);
      smysql_query(&base->mysql, sql);

      if(complete_rowid) {
	sprintf(sql, "delete from url_complete where rowid = %d", complete_rowid);
	smysql_query(&base->mysql, sql);
      }
      deleted++;
    }
  }
  mysql_free_result(result);

  /* Finally drop the start2url entries of the URLs deleted in pass 2. */
  sprintf(sql, "delete from start2url where start = %d and level < 0", start->rowid);
  smysql_query(&base->mysql, sql);

  if(verbose > 1) fprintf(stderr, "\tmirror_cleanup: %d deletions, %d unref\n", deleted, unreferenced);
}

/*
 * First exploration of a starting point.
 * Explore new URLs up to limit defined by start.
 * Do not reconsider existing URLs.
 */
static void mirror_explore_virgin(crawl_context_t* context)
{
  /*
   * Unconditionally collect the URL of the starting point itself (there
   * is no referring page, hence the null webbase_url), then let
   * mirror_explore_new do the exploration.
   */

  mirror_collect((char*)context, (webbase_url_t*)0, context->start->url, WEBBASE_URL_WALK_RELATIVE);
  mirror_explore_new(context);
}

/*
 * Link `url' to the starting point `start_rowid' in start2url, at the
 * given level.
 *
 * query: scratch buffer provided by the caller, in which the select
 *        statement is built. It must be large enough for the quoted URL
 *        plus the fixed part of the statement.
 *
 * Returns 0 on success, -1 if the URL is not in the url table.
 */
static int internal_bootstrap_start2url(webbase_t* base, char* query, char* url, int start_rowid, int level)
{
  MYSQL_RES* res;
  MYSQL_ROW row;
  int status = -1;
  char* quoted = sql_quote_char_simple(url);

  sprintf(query, "select rowid,url from url where url = '%s'", quoted);
  if(verbose > 1)
    fprintf(stderr, "\tbootstrap_start2url: query = %s\n", query);

  if(!webbase_get_url_2(base, query, quoted, &res, &row)) {
    fprintf(stderr, "bootstrap_start2url: could not find %s in url table\n", quoted);
  } else {
    webbase_insert_start2url(base, start_rowid, atoi(row[0]), level);
    status = 0;
  }

  /*
   * NOTE(review): res is released on the failure path too, which assumes
   * that webbase_get_url_2 always stores a result in it -- confirm.
   */
  mysql_free_result(res);
  return status;
}

/*
 * Update a starting point that was already explored.
 *
 * The work is done in several steps:
 *  - reload the known URLs that are due (skipped with -update_rebuild),
 *    and stop there if nothing was reloaded and -update is not set,
 *  - set the level of all the start2url entries to -1,
 *  - walk the graph again from the starting point without loading
 *    anything, which gives a level >= 0 to the URLs still in scope,
 *  - delete the URLs left with a negative level (mirror_cleanup),
 *  - continue as a regular exploration (mirror_explore_new).
 *
 * The function returns early, leaving the start in the UPDATING state,
 * when the first step is interrupted by a delay.
 */
static void mirror_explore_update(crawl_context_t* context)
{
  /* Query buffer, kept and grown by static_alloc from one call to the next. */
  static char* query = 0;
  static int query_size = 0;
  webbase_t* base = context->base;
  char* url = context->start->url;
  int url_length = strlen(url);
  
  mirror_explore_wait(context);
  webbase_start_state(context->base, context->start, WEBBASE_URL_START_UPDATING);
  static_alloc(&query, &query_size, 1024 + url_length);

  /*
   * Update all the URLs that need to but do not collect new URLs from them.
   * There is a chance that updated URLs will change the structure of the
   * graph in such a way that some URLs we know become out of scope and
   * others become in scope.
   */
  /*
   * -update_rebuild must only be used manually: crawler must be 
   * run once more to make sure that the tree is consistent. Mainly
   * useful when removing part of a tree to prevent reloading of the rest.
   */
  if(!context->params->update_rebuild) {
    int updated = 0;
    if(verbose > 1) fprintf(stderr, "\tmirror_explore_update: crawl what needs to\n");
    sprintf(query, "select url.url from url,start2url where start2url.start = %d and start2url.url = url.rowid and url.crawl < now() limit %d", context->start->rowid, context->params->candidate_limit);
    context->walk_end = 1;
    updated = mirror_query(context, query);
    /*
     * Abort for now if the mirror did not finish because of robot delay.
     * we will be back soon :-)
     */
    if(context->start->delay) {
      /*
       * To update the delay field.
       */
      webbase_start_state(context->base, context->start, WEBBASE_URL_START_UPDATING);
      return;
    }

    context->walk_end = context->empty = 0;
    /*
     * Do not go any further if no URL was reloaded. If the update is
     * necessary because depth or level changed in start, use -update.
     */
    if(updated == 0 && !context->params->update) {
      if(verbose > 1) fprintf(stderr, "\tmirror_explore_update: no URL reloaded, update finished\n");
      webbase_start_state(context->base, context->start, WEBBASE_URL_START_EXPLORED);
      return;
    }
  }

  /*
   * Reset level to -1 for every URL associated to start point.
   * It allows us to find out which URLs become out of scope (those
   * that still have level = -1 after recursion).
   */
  if(verbose > 1) fprintf(stderr, "\tmirror_explore_update: set start to -1\n");
  sprintf(query, "update start2url set level = -1 where start = %d", context->start->rowid);
  smysql_query(&base->mysql, query);

  /*
   * Set the depth of the starting point in start2url so that recursion
   * can begin. And set the robot file level too so that it is not deleted.
   * robots.txt is at artificial level 0 so that it will be first when
   * walking the graph. If robots.txt is not first, a href in the home
   * page may be collected before it comes.
   */
  {
    uri_t* url_object = uri_object(url, url_length);
    /* Only done when the scheme of the starting point begins with "http". */
    if(!strncasecmp(uri_scheme(url_object), "http", 4))
      internal_bootstrap_start2url(base, query, uri_robots(url_object), context->start->rowid, 0);
  }

  /*
   * First URL is always there
   */
  if(internal_bootstrap_start2url(base, query, url, context->start->rowid, 1) >= 0) {

    /*
     * Forget everything about the current URLs known, it is about to
     * be recomputed from scratch. 2 counts for the robots.txt file
     * and the home page.
     */
    context->start->count = 2;

    /*
     * Recursively walk the hypertext graph, beginning at the starting point.
     * Prevent effective loading of the URLs. explore_new will take care 
     * of that and restart gracefully if interrupted. If the crawl is 
     * interrupted while in the following loop, interruption will
     * imply to recursively walk the graph again.
     */
    if(verbose > 1) fprintf(stderr, "\tmirror_explore_update: walk the graph\n");
    context->no_crawl = 1;
    {
      int level_count = 0;
      /*
       * Each pass selects the URLs at level_count or deeper; the loop
       * ends when context->empty is set.
       */
      do {
	sprintf(query, "select url.url,start2url.level from url,start2url where start2url.start = %d and start2url.url = url.rowid and start2url.level >= %d order by start2url.level", context->start->rowid, level_count);
	mirror_query(context, query);
	level_count++;
      } while(!context->empty);
    }
    context->no_crawl = 0;
    context->empty = 0;
  } else {
    /*
     * Ho! ho! first URL is not there after all. Cleanup everything
     * and do as if it was all new.
     */
    context->start->count = 0;
    mirror_collect((char*)context, (webbase_url_t*)0, context->start->url, WEBBASE_URL_WALK_RELATIVE);
  }
  /*
   * Kill all the URLs that are out of scope
   */
  if(verbose > 1) fprintf(stderr, "\tmirror_explore_update: cleanup\n");
  mirror_cleanup(context);

  /*
   * The recursive walk possibly left the graph unfinished (URLs may
   * have been collected but not crawled yet, global conditions not
   * exhausted). We are exactly in the situation of an aborted first
   * crawl (mirror_explore_exploring) except that the context information
   * need not to be recalculated as it is in explore_exploring.
   * It is not needed to reset the start state since explore_new will
   * do it. If the exploration is interrupted while in explore_new it
   * will be resumed by explore_exploring.
   */
  if(verbose > 1) fprintf(stderr, "\tmirror_explore_update: continue exploration\n");
  mirror_explore_new(context);
}

/*
 * Honor the delay recorded in the starting point.
 *
 * context->start->delay is compared against time(0), i.e. it is handled
 * as an absolute date. If that date is still in the future, sleep until
 * it is reached (plus one second). In all cases the delay is reset to 0
 * before returning.
 */
static void mirror_explore_wait(crawl_context_t* context)
{
  time_t now = time(0);

  if(context->start->delay > now) {
    int delay = context->start->delay - now;
    if(verbose) fprintf(stderr, "sleep %ds ", delay);
    /*
     * Sometimes sleep() sleeps a bit less than the number of 
     * required seconds.
     */
    sleep(delay + 1);
  } else {
    if(verbose > 1) {
      if(context->start->delay) {
        /*
         * The width of a time_t difference is platform dependent,
         * convert explicitly to the type %ld expects.
         */
        fprintf(stderr, "\tmirror_explore_wait: delay is %ld\n", (long)(context->start->delay - now));
      } else {
        fprintf(stderr, "\tmirror_explore_wait: no delay\n");
      }
    }
  }
  context->start->delay = 0;
}

/*
 * Explore the starting point of the context.
 *
 * Waits for a pending delay (mirror_explore_wait), marks the starting
 * point as WEBBASE_URL_START_EXPLORING, then runs the candidate query
 * again and again until mirror_query leaves context->empty set.
 * On exit the starting point stays in the EXPLORING state if a delay
 * was recorded in the meantime, otherwise it becomes EXPLORED.
 */
static void mirror_explore_new(crawl_context_t* context)
{
  char sql[256];
  int final_state;

  mirror_explore_wait(context);
  webbase_start_state(context->base, context->start, WEBBASE_URL_START_EXPLORING);

  /*
   * Candidates: URLs bound to this starting point whose crawl date
   * is in the past, at most candidate_limit at a time.
   */
  sprintf(sql, "select url.url from url,start2url where start2url.start = %d and start2url.url = url.rowid and url.crawl < now() limit %d", context->start->rowid, context->params->candidate_limit);

  for(;;) {
    mirror_query(context, sql);
    if(context->empty)
      break;
  }

  if(context->start->delay)
    final_state = WEBBASE_URL_START_EXPLORING;
  else
    final_state = WEBBASE_URL_START_EXPLORED;
  webbase_start_state(context->base, context->start, final_state);
}

/*
 * Run query and call mirror() on each URL of the result set, then
 * walk the hrefs of the returned document with mirror_collect unless
 * context->walk_end is already set.
 *
 * ! in query the first field MUST be url
 *
 * The rows are consumed until the result set is exhausted or
 * context->empty becomes true. If the query returns no row at all,
 * context->empty is set.
 *
 * Returns the number of rows found by the query.
 */
static int mirror_query(crawl_context_t* context, char* query)
{
  webbase_t* base = context->base;
  MYSQL_RES* result;
  int candidates;

  if(verbose > 1) fprintf(stderr, "\tmirror_query: %s\n", query);
  smysql_query(&base->mysql, query);
  result = smysql_store_result(&base->mysql);
  candidates = mysql_num_rows(result);

  if(candidates <= 0) {
    if(verbose > 1) fprintf(stderr, "\tquery returns empty set\n");
    context->empty = 1;
  } else {
    if(verbose > 1) fprintf(stderr, "\t%d candidate urls\n", candidates);
    for(;;) {
      MYSQL_ROW row = mysql_fetch_row(result);
      webbase_url_t* loaded;
      char* url;

      /* The row is fetched first, the interruption flag is tested after. */
      if(!row || context->empty)
        break;
      url = row[0];

      robots_load(context->params->robots, context, url);
      loaded = mirror(context, url);
      /*
       * walk_href returns the status of the last 'mirror_crawl_continue'
       * If it returned FALSE, it means that exploration should stop for
       * this starting point.
       */
      if(loaded && !context->walk_end) {
        context->walk_end = !webbase_url_walk_href(loaded, WEBBASE_URL_WALK_ABSOLUTE|WEBBASE_URL_WALK_RELATIVE, mirror_collect, (char*)context);
        if(context->walk_end && verbose > 1) fprintf(stderr, "\tmirror: walking %s hrefs set walk_end\n", url);
      }
    }
    if(verbose > 1) fprintf(stderr, "\texploration of %s %s\n", context->start->url, (context->empty ? "interrupted" : "done"));
  }

  mysql_free_result(result);

  return candidates;
}

/*
 * Callback used when walking hrefs: register url for the current
 * starting point if it is acceptable.
 *
 * contextp is a crawl_context_t* in disguise. The url is accepted
 * if flag is WEBBASE_URL_WALK_ROBOTS or if mirror_crawl_test agrees.
 * An accepted url that is not yet bound to the starting point bumps
 * start->count and gets a start2url entry at level context->level + 1;
 * an url unknown to the base is created with webbase_visited first.
 * context->collected tells the caller if the url was accepted.
 *
 * Return 0 to stop walking
 *        1 to continue walking
 */
int mirror_collect(char* contextp, webbase_url_t* base_url, char* url, int flag)
{
  crawl_context_t* context = (crawl_context_t*)contextp;
  webbase_t* base = context->base;

  context->collected = 0;

  if(verbose > 1) fprintf(stderr, "\tmirror_collect: %s\n", url);
  if(flag == WEBBASE_URL_WALK_ROBOTS || mirror_crawl_test(context, url)) {
    int rowid;
    int url_known;
    int bind_to_start;

#ifdef WEBBASE_LOCK
    webbase_lock(base, 0, url);
#endif /* WEBBASE_LOCK */
    rowid = webbase_exists_url(base, url);
    url_known = (rowid != 0);

    /* A new url always needs binding, a known one only if not counted yet. */
    if(url_known)
      bind_to_start = !webbase_counted(base, context->start, rowid);
    else
      bind_to_start = 1;

    if(bind_to_start) {
      context->start->count++;
      if(!url_known)
        rowid = webbase_visited(base, url);
      webbase_insert_start2url(base, context->start->rowid, rowid, context->level + 1);
    }

#ifdef WEBBASE_LOCK
    webbase_unlock(base, 0, url);
#endif /* WEBBASE_LOCK */
    context->collected = 1;
  }
  return mirror_crawl_continue(context);
}

/*
 * Should exploration continue ? The start structure should keep all the
 * information that allow to answer this question.
 *
 * Returns 1 as long as start->count is lower than start->depth,
 * 0 otherwise.
 */
static int mirror_crawl_continue(crawl_context_t* context)
{
  int go_on = 0;

  if(context->start->count < context->start->depth)
    go_on = 1;

  if(verbose > 1) {
    const char* answer = go_on ? "yes" : "no";
    fprintf(stderr, "\tcontinue: %s count = %d, depth = %d\n", answer, context->start->count, context->start->depth);
  }
  return go_on;
}

/*
 * Should this URL be loaded according to all the options given in the
 * start point ?
 *
 * The checks are run in this order and the first failure rejects
 * the url: start not exhausted, url shorter than WEBBASE_URL_LENGTH,
 * path allowed by dirsel, extension matching the accepted mime
 * types and, unless WEBBASE_URL_START_HOMEFREE is set, url prefixed 
 * by context->home.
 *
 * Returns 1 if the url is accepted, 0 if it is rejected.
 */
static int mirror_crawl_test(crawl_context_t* context, char* url)
{
  webbase_url_start_t* start = context->start;
  int accepted = 0;

  if(!mirror_crawl_continue(context)) {
    if(verbose > 1) fprintf(stderr, "\t%s rejected because start exhausted\n", url);
  } else if(strlen(url) >= WEBBASE_URL_LENGTH) {
    if(verbose > 1) fprintf(stderr, "\t%s rejected because too long\n", url);
  } else if(!dirsel_allowed(context->dirsel_key, url)) {
    if(verbose > 1) fprintf(stderr, "\t%s rejected because path not allowed\n", url);
  } else {
    /* Fall back to a default specification when the start has none. */
    char* mime_spec = start->accept;
    if(!mime_spec || !*mime_spec)
      mime_spec = "text/*,magnus-internal/*";

    if(!mime_accept_ext(context->base, mime_spec, url)) {
      if(verbose > 1) fprintf(stderr, "\textension rejected %s\n", url);
    } else if(!(start->info & WEBBASE_URL_START_HOMEFREE) &&
              strncmp(context->home, url, context->home_length) != 0) {
      /*
       * Discard URLs that do not belong to the same WEB as the start point
       */
      if(verbose > 1) fprintf(stderr, "\t%s rejected because not in same start web\n", url);
    } else {
      accepted = 1;
    }
  }

  return accepted;
}

/*
 * Entry point to load an url: reset the redirection bookkeeping
 * of the context (redirections counter and root_url flag) and
 * hand over to mirror_1. Returns what mirror_1 returns.
 */
webbase_url_t* mirror(crawl_context_t* context, char* url)
{
  webbase_url_t* loaded;

  context->root_url = 0;
  context->redirections = 0;

  loaded = mirror_1(context, url);
  return loaded;
}

/*
 * Load url with mirror_2 and, unless params->no_hook is set,
 * keep the hooked base in sync with the result:
 *   . data already hooked (w_hookid > 0) and hook_ok : hook_update
 *   . data already hooked and !hook_ok              : hook_delete
 *   . data not hooked and hook_ok                   : hook_insert
 *   . data not hooked and !hook_ok                  : nothing
 * After hook_delete or hook_insert the hookid field of the url table
 * is rewritten from webbase_url->w_hookid.
 *
 * Returns the result of mirror_2 (0 if the url could not be handled).
 */
static webbase_url_t* mirror_1(crawl_context_t* context, char* url)
{
  webbase_url_t* webbase_url = mirror_2(context, url);

  if(webbase_url == 0)
    return webbase_url;

  if(!context->params->no_hook) {
    int wanted = hook_ok(context->base, webbase_url);
    /*
     * Find out if the current data has been inserted already
     */
    int present = (webbase_url->w_hookid > 0);
    int store_hookid = 0;

    /*
     * Inform hooked base that a change occured. Set webbase_lock_ignore
     * to prevent double locking when mirror is recursivly called by
     * hook_delete, hook_insert or hook_update
     */
    webbase_lock_ignore(context->base, webbase_url->w_rowid, "url");
    if(present) {
      if(wanted) {
        hook_update(context->base, webbase_url);
      } else {
        hook_delete(context->base, webbase_url);
        store_hookid = 1;
      }
    } else if(wanted) {
      hook_insert(context->base, webbase_url);
      store_hookid = 1;
    }
    webbase_lock_ignore(context->base, 0, 0);

    if(store_hookid) {
      char sql[256];
      sprintf(sql, "update url set hookid = %d where rowid = %d", webbase_url->w_hookid, webbase_url->w_rowid);
      smysql_query(&context->base->mysql, sql);
    }
  }
#ifdef WEBBASE_LOCK
  webbase_unlock(context->base, webbase_url->w_rowid, "url");
#endif /* WEBBASE_LOCK */

  return webbase_url;
}

/*
 * Load url from its server if needed and record the result in the base.
 *
 * Steps, in order:
 *   . reject urls that are too long or relative,
 *   . read the url record from the base (it must already exist),
 *   . find the starting point if the context has none,
 *   . return early (through mirror_location) if it is not time to
 *     crawl yet, if the server timed out too often or if 
 *     context->no_crawl is set,
 *   . for http, return early if the robots module asks for a delay,
 *   . otherwise load the document (mirror_scheme), schedule the next
 *     crawl (mirror_schedule) and store the record (webbase_insert_url).
 *
 * Returns 0 if the url cannot be handled. Otherwise returns the url
 * record or, when mirror_location follows a redirection, whatever
 * mirror_location returns.
 *
 * The record and the uri object are static: the returned pointer
 * refers to storage that is overwritten by the next call. This function
 * is re-entered by mirror_location when following redirections.
 */
static webbase_url_t* mirror_2(crawl_context_t* context, char* url)
{
  static webbase_url_t webbase_url_object;
  webbase_url_t* webbase_url = &webbase_url_object;
  static uri_t* url_object = 0;
  int url_length = strlen(url);

  /*
   * Sanity checks
   */
  if(url_length >= WEBBASE_URL_LENGTH) {
    fprintf(stderr, "crawl_mirror: url too long %d > %d\n", url_length, WEBBASE_URL_LENGTH);
    return 0;
  }

  /*
   * Build an url object from url. The object is allocated once and
   * recycled by the following calls.
   */
  if(!url_object) {
    url_object = uri_alloc(url, url_length);
  } else {
    uri_realloc(url_object, url, url_length);
  }
  if(url_object->info & URI_INFO_RELATIVE) {
    fprintf(stderr, "mirror: cannot crawl relative URL %s\n", url);
    return 0;
  }
  
  /*
   * Retrieve all available information about the url
   */
  webbase_get_url(context->base, url, webbase_url, WEBBASE_GET_URL_ALL);
  if(webbase_url->w_rowid == 0) {
    fprintf(stderr, "mirror_2: could not find %s in url table\n", url);
    return 0;
  }
  if(context->start == 0) {
    context->start = webbase_get_start_of_url(context->base, webbase_url);
    if(context->start == 0) {
      fprintf(stderr, "mirror_2: could not get start for url %s\n", webbase_url->w_url);
#ifdef WEBBASE_LOCK
      webbase_unlock(context->base, webbase_url->w_rowid, "url");
#endif /* WEBBASE_LOCK */
      return 0;
    } else {
      /*
       * If start was not known when entering mirror function, it is
       * only retrieved for crawl parameters but will not change any fields
       * nor will it change the conditions on which the values of the
       * fields depend. It is therefore safe to unlock it at this point.
       */
#ifdef WEBBASE_LOCK
      webbase_unlock(context->base, context->start->rowid, "start");
#endif /* WEBBASE_LOCK */
    }
  }
  webbase_get_start2url(context->base, &context->start->rowid, &webbase_url->w_rowid, &context->level);

  /*
   * If 'touching', the content of the document needs to be in the cache.
   * Force crawling if the document is not in the cache.
   */
  if(context->touch && (webbase_url->w_info & WEBBASE_URL_INFO_CONTENT)) {
    char* path = uri_furi_string(webbase_url->w_url, strlen(webbase_url->w_url), URI_FURI_REAL_PATH);
    if(path) {
      if(!file_exists(path)) {
	if(verbose > 1) fprintf(stderr, "\tmirror: touching\n");
	/* A crawl date in the past defeats the "not time to crawl" test below. */
	webbase_url->w_crawl = time(0) - WEBBASE_TIME_DAY;
      }
    } else {
      fprintf(stderr, "cannot uncache %s, unable to resolve path (ignored)\n", webbase_url->w_url);
    }
  }

  /*
   * It's not time to crawl yet
   */
  if(!context->params->noheuristics && (webbase_url->w_crawl > time(0))) {
    if(verbose > 1) fprintf(stderr, "\tnot time to crawl %s yet\n", url);
    return mirror_location(context, webbase_url);
  }

  /*
   * This server timeouts too much, automatically give up.
   */
  if(context->timeouts > CRAWL_MAX_TIMEOUTS && context->params->sticky == 0) {
    if(verbose > 1) fprintf(stderr, "\ttoo many timeouts %s\n", url);
    webbase_url->w_info |= WEBBASE_URL_INFO_TIMEOUT;
    webbase_url->w_crawl = time(0) + (context->start->timeout_delay * WEBBASE_TIME_DAY);

    webbase_insert_url(context->base, webbase_url);

    return mirror_location(context, webbase_url);
  }

  /*
   * Do nothing if all we want is the webbase_url structure
   */
  if(context->no_crawl)
    return mirror_location(context, webbase_url);

  /*
   * Robot exclusion compliance. The comparison is limited to the
   * first 4 characters, any scheme starting with http is concerned.
   */
  if(!strncasecmp(uri_scheme(url_object), "http", 4)) {
    time_t delay = 0;
    int delayed = 0;
    /*
     * Do not apply robot delay when handling redirections otherwise
     * it breaks the redirections hops limit check.
     */
    int robot_delay = context->redirections <= 0 ? context->start->robot_delay : 0;

    robots_info(context->params->robots, robot_delay, url_object, &delay, &delayed);

    if(delayed == ROBOTS_DELAYED) {
      if(verbose > 1) {
	time_t s = delay - time(0);
	fprintf(stderr, "\tmirror: delayed %d sec\n", (int)s);
      }
      /*
       * Interrupt the current exploration: empty ends the query loops,
       * walk_end prevents href walking and the delay is saved in the
       * starting point. The url record is returned without calling
       * mirror_location.
       */
      context->empty = 1;
      context->walk_end = 1;
      context->start->delay = delay;
      return webbase_url;
    }
  }

  if(verbose) fprintf(stderr, "loading %s\n", url);

  /*
   * Load the URL from the server
   */
  mirror_scheme(context, webbase_url, url_object, url);

  /*
   * Calculate the date of the next crawl
   */
  mirror_schedule(context, webbase_url);

  /*
   * Update information in database
   */
  webbase_insert_url(context->base, webbase_url);

  return mirror_location(context, webbase_url);
}

/*
 * Dispatch the loading of url according to its scheme and translate
 * a failure into a status stored in webbase_url.
 *
 * A scheme starting with http (4 characters compared, case insensitive)
 * is handled by mirror_http, ftp by mirror_ftp. Any other scheme
 * is silently ignored. The loaders return 0 or an errno value:
 *   . connection errors set WEBBASE_URL_CODE_CONNECTION_REFUSED,
 *   . timeouts set WEBBASE_URL_CODE_CONNECTION_TIMED_OUT and increment
 *     context->timeouts,
 *   . any other error is reported on stderr and handled as a timeout
 *     (without incrementing context->timeouts).
 * In all error cases WEBBASE_URL_INFO_TIMEOUT is set in w_info.
 */
static void mirror_scheme(crawl_context_t* context, webbase_url_t* webbase_url, uri_t* url_object, char* url)
{
  int error = 0;

  if(!strncasecmp(uri_scheme(url_object), "http", 4))
    error = mirror_http(context, webbase_url, url_object, url);
  else if(!strcasecmp(uri_scheme(url_object), "ftp"))
    error = mirror_ftp(context, webbase_url, url_object, url);
  
  if(error) {
    if(verbose > 1) {
      errno = error;
      perror("mirror: ");
    }
    switch(error) {
    case ECONNREFUSED:
    case EHOSTDOWN:
    case EHOSTUNREACH:
    case EADDRNOTAVAIL:
      webbase_url->w_code = WEBBASE_URL_CODE_CONNECTION_REFUSED;
      webbase_url->w_info |= WEBBASE_URL_INFO_TIMEOUT;
      break;
    case ETIMEDOUT:
    case ETIME:
      context->timeouts++;
      webbase_url->w_code = WEBBASE_URL_CODE_CONNECTION_TIMED_OUT;
      webbase_url->w_info |= WEBBASE_URL_INFO_TIMEOUT;
      break;
    default:
      /* Unknown non fatal error, consider it timeout but print it. */
      fprintf(stderr, "mirror: %s failed because ", webbase_url->w_url);
      /*
       * perror reports the current value of errno: reload it from
       * error, it may have been modified since the loader returned.
       */
      errno = error;
      perror("");
      webbase_url->w_code = WEBBASE_URL_CODE_CONNECTION_TIMED_OUT;
      webbase_url->w_info |= WEBBASE_URL_INFO_TIMEOUT;
      break;
    }
  }
}

/*
 * Parse the FTP directory listing stored in the file path and store
 * the names of the entries in webbase_url->w_relative, each name being
 * followed by a white space.
 *
 * Each line is expected to look like the output of ls -l: at least
 * 4 fields, the first one being a 10 characters permission string and
 * the last one the file name. The first character of the permissions
 * selects the treatment:
 *   . d or D : directory, a / is appended to the name,
 *   . - or F : regular file, name kept as is,
 *   . l      : symbolic link, ignored.
 * The . and .. entries are ignored.
 *
 * If at least one name was collected, WEBBASE_URL_INFO_RELATIVE and 
 * WEBBASE_URL_INFO_COMPLETE are set in w_info.
 * The url_object argument is not used.
 */
static void ftp_body_parse(webbase_url_t* webbase_url, uri_t* url_object, char* path)
{
#define BUFFER_SIZE	10240
  int relative_length = 0;
  FILE* fp = fopen(path, "r");
  /*
   * One byte more than what fgets is allowed to fill: a / may be
   * appended to the last field of the line (directory entries) and,
   * if the line fills the buffer, the new terminating null lands one
   * byte after the original one.
   */
  char buffer[BUFFER_SIZE + 1];
  if(!fp) {
    fprintf(stderr, "ftp_body_parse: cannot open %s for reading\n", path);
    return;
  }
  while(fgets(buffer, BUFFER_SIZE, fp)) {
    int buffer_length = strlen(buffer);
    char** splitted;
    int count;
    char* file;
    int file_length;
    /* Reduce white spaces to single space */
    {
      char* from = buffer;
      char* to = buffer;
      *to++ = *from++;
      while(from - buffer < buffer_length) {
        /*
         * isspace is only defined for unsigned char values and EOF,
         * file names may contain 8 bits characters.
         */
        int space = isspace((unsigned char)*from);
        if(to[-1] == ' ' && space) {
          from++;
        } else if(space) {
          *to++ = ' ';
          from++;
        } else {
          *to++ = *from++;
        }
      }
      *to = '\0';
    }
    buffer_length = strlen(buffer);
    split_inplace(buffer, buffer_length, &splitted, &count, ' ', SPLIT_TRIM);
    if(count < 4) {
      if(verbose > 2) fprintf(stderr, "\tftp_body_parse: %s too few fields (%d) rejected\n", buffer, count);
      continue;
    }
    {
      char* permissions = splitted[0];
      if(strlen(permissions) != 10) {
        if(verbose > 2) fprintf(stderr, "\tftp_body_parse: %s permissions size is not 10 small\n", buffer);
        continue;
      }
      if(!strchr("dDlF-", permissions[0])) {
        if(verbose > 2) fprintf(stderr, "\tftp_body_parse: %s type %c rejected\n", buffer, permissions[0]);
        continue;
      }
    }
    file = splitted[count - 1];
    if(!strcmp(file, ".") ||
       !strcmp(file, "..")) {
      if(verbose > 2) fprintf(stderr, "\tftp_body_parse: %s ignore . or ..\n", buffer);
      continue;
    }

    /*
     * Select according to file type
     */
    {
      char perm = splitted[0][0];
      file_length = strlen(file);
      switch(perm) {
        /*
         * Terminate directory file name with a /
         * (presumably file points inside buffer since the split is
         * done in place, hence the spare byte of buffer).
         */
      case 'd':
      case 'D':
        file[file_length++] = '/';
        file[file_length] = '\0';
        break;
      case 'l':
        /*
         * Should do something smart to handle symbolic links
         * because we cannot easily guess if they are directories
         * or files. Need to check in the directory listing of
         * their parent directory to find out.
         */
        continue;
      case '-':
      case 'F':
        break;
      default:
        if(verbose > 2) fprintf(stderr, "\tbody_ftp_parse: unknown file type %c for %s\n", perm, buffer);
        continue;
      }
    }

    /*
     * When we get here, the file name is ready to be collected.
     */
    if(verbose > 2) fprintf(stderr, "\tbody_ftp_parse: got %s\n", file);
    /*
     * + 1 is for white space. + 1 is for null termination.
     */
    static_alloc(&webbase_url->w_relative, &webbase_url->w_relative_length, relative_length + file_length + 1 + 1);
    sprintf(webbase_url->w_relative + relative_length, "%s ", file);
    relative_length += file_length + 1;
  }
  if(relative_length)
    webbase_url->w_info |= WEBBASE_URL_INFO_RELATIVE | WEBBASE_URL_INFO_COMPLETE;
  fclose(fp);
#undef BUFFER_SIZE
}

/*
 * Name of the host of the current FTP session (allocated with strdup
 * by mirror_ftp), 0 if there is none.
 */
static char* mirror_ftp_host = 0;

/*
 * Forget the current FTP session, if any: release the host name and
 * call ftpQuit. The pointer is reset so that calling this function 
 * again is harmless (no double free, no second ftpQuit).
 */
static void mirror_ftp_free()
{
    if(mirror_ftp_host != 0) {
        free(mirror_ftp_host);
        mirror_ftp_host = 0;
        ftpQuit();
    }
}

/*
 * Load url using the FTP protocol and store the result in the file
 * designated by uri_furi_string(..., URI_FURI_REAL_PATH).
 *
 * The session is kept between calls as long as the host does not
 * change (mirror_ftp_host). The login defaults to ftp with the password
 * someone@somewhere.com if the url does not provide them.
 * An url ending with a / (or an empty url) is handled as a directory:
 * the listing is retrieved with ftpDir and parsed by ftp_body_parse.
 * Any other url is retrieved with ftpGet in 'I' mode.
 *
 * Returns 0 on success, ECONNREFUSED if the connection or the login
 * failed, ETIME if the cache path cannot be resolved or the transfer
 * failed. On failure a code is also stored in webbase_url by
 * webbase_url_code_set, except when the connection cannot be opened.
 */
static int mirror_ftp(crawl_context_t* context, webbase_url_t* webbase_url, uri_t* url_object, char* url)
{
    int url_length = strlen(url);

    webbase_url->w_info |= WEBBASE_URL_INFO_FTP;

    if(mirror_ftp_host == 0 || strcmp(mirror_ftp_host, url_object->host)) {
        mirror_ftp_free();

        /*
         * NOTE(review): the host is remembered before the connection
         * and login succeed. After a failure the next url of the same
         * host skips ftpOpen/ftpLogin. Confirm that this is intended.
         */
        mirror_ftp_host = strdup(url_object->host);
    
        if(!ftpOpen(url_object->host)) {
            fprintf(stderr, "mirror_ftp: %s cannot connect to %s\n", url, url_object->host);
            return ECONNREFUSED;
        }
        {
            char* user = url_object->user ? url_object->user : "ftp";
            char* passwd = url_object->passwd ? url_object->passwd : "someone@somewhere.com";
            if(!ftpLogin(user, passwd)) {
                fprintf(stderr, "mirror_ftp: %s cannot login to %s with %s:%s\n", url, url_object->host, user, passwd);
                webbase_url_code_set(webbase_url, WEBBASE_URL_CODE_UNAUTHORIZED);
                return ECONNREFUSED;
            }
        }
    }

    {
        int isdir;
        char* path = uri_furi_string(webbase_url->w_url, strlen(webbase_url->w_url), URI_FURI_REAL_PATH);
        int code;

        if(path) {
            creatp(path);
        } else {
            fprintf(stderr, "mirror_ftp: cannot cache %s, unable to resolve path (ignored)\n", webbase_url->w_url);
            webbase_url_code_set(webbase_url, WEBBASE_URL_CODE_BAD_REQUEST);
            return ETIME;
        }

        /*
         * Test for the empty url first: url[url_length - 1] must not
         * be read when url_length is 0.
         * NOTE(review): an url with an empty path and no trailing /
         * is not handled as a directory although ftpDir below copes
         * with an empty path. Confirm that this is intended.
         */
        isdir = (url_length == 0 || url[url_length - 1] == '/') ? 1 : 0;

        if(isdir) 
            code = ftpDir(path, (url_object->path[0] == '\0' ? "/" : url_object->path));
        else
            code = ftpGet(path, url_object->path, 'I');

        if(!code) {
            fprintf(stderr, "mirror_ftp: %s cannot get/ls `%s'\n", url, url_object->path);
            webbase_url_code_set(webbase_url, WEBBASE_URL_CODE_NOT_FOUND);
            return ETIME;
        }

        webbase_url_code_set(webbase_url, WEBBASE_URL_CODE_OK);
        if(isdir)
            webbase_url->w_info |= WEBBASE_URL_INFO_FTP_DIR;
        webbase_url->w_info |= WEBBASE_URL_INFO_FTP | WEBBASE_URL_INFO_CONTENT;
        webbase_url->w_content_length = file_size(path);
        /*
         * Until better handling (parse directory containing file to find
         * the date of the file) the last modified time is the date of the
         * last successfull crawl.
         * If touching (only reload the file) do not change the date.
         */
        if(!context->touch)
            webbase_url->w_mtime = time(0);
        if(isdir) {
            strcpy(webbase_url->w_content_type, "text/ftp-dir-listing");
            ftp_body_parse(webbase_url, url_object, path);
        } else {
            strcpy(webbase_url->w_content_type, mime_ext2mime(context->base, url_object->path, "text/plain"));
        }
    }

    return 0;
}

/*
 * Load url using the HTTP protocol.
 *
 * The request is built by mirror_request_http and sent either to the
 * proxy named by the http_proxy environment variable or to the host
 * of the url. The header is read first. The body is read only if the
 * return code allows it (WEBBASE_URL_INFO_LOAD_BODY) and the content
 * type matches the accept specification of the starting point
 * (text and magnus-internal types by default). If the body is not
 * loaded, the cached file of the url is removed.
 *
 * Returns 0 on success or the value of errno found after the failure
 * of the connection, of the header reader or of the body reader.
 * WEBBASE_URL_INFO_HTTP is set in w_info before returning.
 */
static int mirror_http(crawl_context_t* context, webbase_url_t* webbase_url, uri_t* url_object, char* url)
{
  webtools_params_t* http = context->params->http;
  http_context_t http_context;
  char* get;
  int sd;
  int error = 0;
  get = mirror_request_http(context, webbase_url, url_object);
  /* The reader gets the url record and the cookies through callback_arg. */
  http_context.webbase_url = webbase_url;
  http_context.cookies = context->params->cookies;
  http->callback_arg = (char*)&http_context;
  {
    char* proxy = getenv("http_proxy");
    if(proxy) {
      uri_t* url_proxy = uri_alloc(proxy, strlen(proxy));
      if(verbose > 1) fprintf(stderr, "\tusing proxy %s\n", proxy);
      sd = webtools_open(http, url_proxy->host, uri_port(url_proxy));
      uri_free(url_proxy);
    } else {
      sd = webtools_open(http, url_object->host, uri_port(url_object));
    }
  }
  if(sd >= 0) {
    if(verbose > 1) fprintf(stderr, "\tloading %s : socket opened\n", url);
    webtools_write(http, sd, get, strlen(get));
    http->timeout = context->start->timeout;
    http->mode = WEBTOOLS_READER_HTTP_HEADER;
    if(webtools_reader(http, sd) == WEBTOOLS_READER_OK) {
      char* spec = context->start->accept;
      /* Static: must survive the recursive call below. */
      static int cookie_loop = 0;
      if(spec == 0 || spec[0] == '\0')
        spec = "text/*,magnus-internal/*";
      if(verbose > 1) fprintf(stderr, "\tloading %s : header read %d\n", url, webbase_url->w_code);
      /*
       * When an URL is redirected and sets a cookie, retry
       * with cookie set, there is a chance that redirection
       * disapear. Only do this once, however.
       */
      if(!cookie_loop &&
	 (webbase_url->w_info & WEBBASE_URL_INFO_COOKIE) &&
	 (webbase_url->w_code == WEBBASE_URL_CODE_MOVED_PERMANENTLY ||
	  webbase_url->w_code == WEBBASE_URL_CODE_MOVED_TEMPORARILY)) {
	if(verbose) fprintf(stderr, "\tmirror_2: cookie loop\n");
	webtools_close(http, sd);
	cookie_loop = 1;
	return mirror_http(context, webbase_url, url_object, url);
      }
      cookie_loop = 0;
      if(WEBBASE_URL_INFO_LOAD_BODY(webbase_url->w_code) && mime_accept_type(context->base, spec, webbase_url->w_content_type)) {
	webbase_url->w_info &= ~WEBBASE_URL_INFO_READING;
	http->mode = WEBTOOLS_READER_HTTP_BODY;
	if(webtools_reader(http, sd) == WEBTOOLS_READER_OK) {
	  if(verbose > 1) fprintf(stderr, "\tloading %s : body read\n", url);
	  /* NOTE(review): webbase_url->path is presumably set by the reader callback - confirm. */
	  webbase_url_hrefs_set(webbase_url, webbase_url->path, context->start->size_limit);
	  {
	    /*
	     * Convert the document to text in a temporary file
	     * (name of the document followed by R). The text is only
	     * used for language recognition and removed afterwards.
	     */
	    char* raw_path = webbase_url->path;
	    char* path = smalloc(strlen(raw_path) + 2);
	    sprintf(path, "%sR", raw_path);
	    if(bodyparse(raw_path, path, webbase_url->w_url, 100000000, 0, BODY_PARSE_UNACCENT) < 0) {
	      fprintf(stderr, "mirror_http: cannot convert to text %s\n", raw_path);
	    } else {
#ifdef LANGREC
	      char language[DECODAGE_TAILLE_MAX];
	      textlang_in_file(path, language);
	      memcpy(webbase_url->w_language, language, WEBBASE_LANGUAGE_LENGTH);
	      webbase_url->w_language[WEBBASE_LANGUAGE_LENGTH] = '\0';
	      if(verbose > 1) fprintf(stderr, "\tlanguage = %s\n", webbase_url->w_language);
#else /* LANGREC */
	      webbase_url->w_language[0] = '\0';
#endif /* LANGREC */
	      unlink(path);
	    }
	    free(path);
	  }
	  webbase_url_content_length_fix(webbase_url, webbase_url->path);
	  webbase_url_mtime_fix(webbase_url);
	  webbase_url->path = 0;
	  webbase_url->w_info &= ~(WEBBASE_URL_INFO_TRUNCATED | WEBBASE_URL_INFO_READING);
	} else {
	  webbase_url->w_info |= WEBBASE_URL_INFO_TRUNCATED;
	  error = errno;
	  if(verbose > 1) fprintf(stderr, "\terror while reading body (%d)\n", error);
	}
      } else {
	/* The body is not loaded: make sure no stale copy stays in the cache. */
	char* path = uri_furi_string(webbase_url->w_url, strlen(webbase_url->w_url), URI_FURI_REAL_PATH);
	if(path) {
	  if(file_exists(path))
	    rm(path);
	} else {
	  fprintf(stderr, "cannot uncache %s, unable to resolve path (ignored)\n", webbase_url->w_url);
	}
      }
    } else {
      error = errno;
      if(verbose > 1) fprintf(stderr, "\terror while reading header (%d)\n", error);
    }
    webtools_close(http, sd);
  } else {
    error = errno;
  }

  webbase_url->w_info |= WEBBASE_URL_INFO_HTTP;
  return error;
}

/*
 * Follow the redirection of webbase_url, if any.
 *
 * If context->location_no_loop is set or if the url is not redirected
 * (WEBBASE_URL_INFO_LOCATION not set), webbase_url is returned.
 * Otherwise the target (w_location) is collected with mirror_collect
 * and loaded by calling mirror_2 again; the result of mirror_2 is
 * returned.
 *
 * The first url of a redirection chain is copied in 
 * context->webbase_url. A pointer to this copy is returned if the
 * chain is longer than CRAWL_MAX_REDIRECTIONS or if the target cannot 
 * be collected.
 */
static webbase_url_t* mirror_location(crawl_context_t* context, webbase_url_t* webbase_url)
{
  if(context->location_no_loop)
    return webbase_url;

  if(webbase_url->w_info & WEBBASE_URL_INFO_LOCATION) {
    /* True if webbase_url is the first url of the redirection chain. */
    int is_root = context->root_url == 0;
    /*
     * Keep the starting point of redirection
     */
    if(context->root_url == 0) {
      webbase_copy_url(&context->webbase_url, webbase_url);
      context->root_url = 1;
    }

    if(verbose > 1) fprintf(stderr, "\t%d redirections %s\n", context->redirections, webbase_url->w_location);
#ifdef WEBBASE_LOCK
    /* Intermediate urls are unlocked here, the first one is unlocked later. */
    if(!is_root)
      webbase_unlock(context->base, webbase_url->w_rowid, "url");
#endif /* WEBBASE_LOCK */
    if(context->redirections > CRAWL_MAX_REDIRECTIONS) {
      fprintf(stderr, "mirror_location: too many redirections (%d) for %s\n", CRAWL_MAX_REDIRECTIONS, context->webbase_url.w_url);
      return &context->webbase_url;
    }
    /* mirror_collect reports acceptance of the target in context->collected. */
    context->walk_end = !mirror_collect((char*)context, webbase_url, webbase_url->w_location, WEBBASE_URL_WALK_ABSOLUTE);
    if(context->collected == 0) {
      if(verbose > 1) fprintf(stderr, "\tmirror_location: redirections for %s cannot be followed because %s cannot be collected\n", context->webbase_url.w_url, webbase_url->w_location);
      return &context->webbase_url;
    }
    context->redirections++;
    {
      /*
       * Save the location because webbase_url is volatile.
       */
      char* location = strdup(webbase_url->w_location);
      webbase_url_t* ret;
      ret = mirror_2(context, location);
      free(location);
      return ret;
    }
  } else {
    /*
     * At end point of a redirection, got the real one, can unlock the 
     * first redirection point.
     */
#ifdef WEBBASE_LOCK
    if(context->root_url) {
      webbase_unlock(context->base, context->webbase_url.w_rowid, "url");
    }
#endif /* WEBBASE_LOCK */
    return webbase_url;
  }
}

/*
 * Convert t to an HTTP date string such as
 *   Thu, 01 Jan 1970 00:00:00 GMT
 *
 * The date is broken down with gmtime since the string is labeled GMT
 * (HTTP dates are always expressed in GMT, whatever the local 
 * time zone is). Day and month names are fixed english abbreviations,
 * independent of the locale.
 *
 * Returns a pointer to a static buffer, overwritten by each call.
 * Exits if t cannot be converted.
 */
static char* time2str(time_t t)
{
  struct tm* tmp = gmtime(&t);
  static char str[50];
  static char* days[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" };
  static char* monthes[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };

  if(tmp == 0) {
    fprintf(stderr, "time2str: gmtime returned 0\n");
    exit(1);
  }
  sprintf(str, "%s, %02d %s %04d %02d:%02d:%02d GMT",
	  days[tmp->tm_wday], tmp->tm_mday, monthes[tmp->tm_mon],
	  tmp->tm_year + 1900, tmp->tm_hour, tmp->tm_min, tmp->tm_sec);

  return str;
}

/*
 * Build the HTTP/1.0 GET request for url_object.
 *
 * The request always contains the Host and User-Agent fields and,
 * when appropriate:
 *   . If-Modified-Since, if heuristics are enabled, the context is
 *     not 'touching' and the modification time of the url is known,
 *   . Cookie, unless the starting point has WEBBASE_URL_START_NOCOOKIE
 *     set, if cookies_match finds something for the url,
 *   . Authorization, if the url contains authentication information.
 * If the http_proxy environment variable is set the complete url
 * is requested, otherwise only its path.
 *
 * Returns a pointer to a static buffer that is reused by each call.
 * All buffers are sized according to their content.
 */
static char* mirror_request_http(crawl_context_t* context, webbase_url_t* webbase_url, uri_t* url_object)
{
  static char* request = 0;
  static int request_size = 0;
  /*
   * "If-Modified-Since: " (19) + date from time2str (29) + \r\n (2) 
   * + null (1) is 51 characters, leave some room.
   */
  char modified_since[64];
  static char* cookie = 0;
  static int cookie_size = 0;
  static char* auth = 0;
  static int auth_size = 0;
  char* path;
  char* netloc;

  modified_since[0] = '\0';
  
  /*
   * Fill If-Modified-Since field if appropriate.
   */
  if(!context->params->noheuristics && !context->touch && (webbase_url->w_mtime > 0)) {
    sprintf(modified_since, "If-Modified-Since: %s\r\n", time2str(webbase_url->w_mtime));
    if(verbose > 1) fprintf(stderr, "\tmirror_request_http: %.*s\n", (int)(strlen(modified_since) - 2), modified_since);
  }
  /*
   * Fill Cookie field if appropriate
   */
  if(cookie)
    cookie[0] = '\0';
  if((context->start->info & WEBBASE_URL_START_NOCOOKIE) == 0) {
    char* tmp = cookies_match(context->params->cookies, url_object);
    if(tmp) {
      static_alloc(&cookie, &cookie_size, strlen(tmp) + 128);
      sprintf(cookie, "Cookie: %s\r\n", tmp);
    }
  }
  /*
   * Fill Authorization field if appropriate
   */
  {
    char* tmp = uri_auth(url_object);
    if(tmp && tmp[0] != '\0') {
      /*
       * The encoded string is longer than the original, size the
       * buffer according to the encoded form.
       */
      char* encoded = base64_encode(tmp);
      static_alloc(&auth, &auth_size, strlen(encoded) + 128);
      sprintf(auth, "Authorization: Basic %s\r\n", encoded);
    } else {
      if(auth)
        auth[0] = '\0';
    }
  }
  /*
   * Fill path according to proxy spec
   */
  {
    char* proxy = getenv("http_proxy");
    if(proxy) {
      path = uri_uri(url_object);
    } else {
      path = uri_all_path(url_object);
    }
  }
  netloc = uri_netloc(url_object);

  /*
   * The size of the request depends on the length of the url, of the 
   * cookies and of the authentication. The sizeof of the constant
   * part of the format includes the terminating null.
   */
  static_alloc(&request, &request_size,
               sizeof("GET  HTTP/1.0\r\nHost: \r\nUser-Agent: "CRAWL_USER_AGENT"\r\n\r\n")
               + strlen(path)
               + strlen(netloc)
               + strlen(modified_since)
               + (cookie ? strlen(cookie) : 0)
               + (auth ? strlen(auth) : 0));

  sprintf(request, "GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: "CRAWL_USER_AGENT"\r\n%s%s%s\r\n",
          path,
          netloc,
          modified_since,
          (cookie ? cookie : ""),
          (auth ? auth : ""));
  if(verbose > 2) fprintf(stderr, "request = %s", request);
  return request;
}

/*
 * Compute the date of the next crawl of webbase_url (w_crawl).
 *
 * The delay depends on the first matching flag of w_info, tested in 
 * this order: TIMEOUT (timeout_delay), NOT_MODIFIED (modified_delay),
 * NOT_FOUND (not_found_delay), OK (loaded_delay), ERROR 
 * (not_found_delay). The delays of the starting point are multiplied
 * by WEBBASE_TIME_DAY. If no flag matches the delay is 
 * WEBBASE_TIME_WEEK.
 */
static void mirror_schedule(crawl_context_t* context, webbase_url_t* webbase_url)
{
  webbase_url_start_t* start = context->start;
  int wait;

  if(verbose > 1) fprintf(stderr, "\tschedule %s\n", webbase_url->w_url);

  if(webbase_url->w_info & WEBBASE_URL_INFO_TIMEOUT)
    wait = start->timeout_delay * WEBBASE_TIME_DAY;
  else if(webbase_url->w_info & WEBBASE_URL_INFO_NOT_MODIFIED)
    wait = start->modified_delay * WEBBASE_TIME_DAY;
  else if(webbase_url->w_info & WEBBASE_URL_INFO_NOT_FOUND)
    wait = start->not_found_delay * WEBBASE_TIME_DAY;
  else if(webbase_url->w_info & WEBBASE_URL_INFO_OK)
    wait = start->loaded_delay * WEBBASE_TIME_DAY;
  else if(webbase_url->w_info & WEBBASE_URL_INFO_ERROR)
    wait = start->not_found_delay * WEBBASE_TIME_DAY;
  else
    wait = WEBBASE_TIME_WEEK;

  webbase_url->w_crawl = time(0) + wait;
}

/*
 * Release params and everything attached to it.
 *
 * hook_end is only called if hooks are enabled (no_hook not set).
 * Then dirsel is terminated, the robots, cookies, options, base and 
 * http members are released, the FTP session is closed and finally 
 * params itself is freed: params must not be used after this call.
 */
void crawl_free(crawl_params_t* params)
{
  if(!params->no_hook)
    hook_end();
  dirsel_end();
  robots_free(params->robots);
  cookies_free(params->cookies);
  hash_free(params->options);
  webbase_free(params->base);
  webtools_free(params->http);
  mirror_ftp_free();
  free(params);
}

/*
 * Node release function for the options hash table (installed
 * by params_alloc): free the data attached to the node, then the 
 * node itself. The context argument is not used.
 */
static void hnode_free(hnode_t *node, void *context)
{
  void* payload = node->data;

  (void)context;
  free(payload);
  free(node);
}

/*
 * Allocate a crawl_params_t with all fields set to zero, except
 * options which is a freshly created hash table whose nodes are 
 * released by hnode_free.
 *
 * Returns the new structure.
 */
static crawl_params_t* params_alloc()
{
  crawl_params_t* fresh = (crawl_params_t*)smalloc(sizeof(crawl_params_t));

  memset(fresh, 0, sizeof(*fresh));

  fresh->options = hash_create(33, 0, 0);
  hash_set_allocator(fresh->options, 0, hnode_free, 0);

  return fresh;
}
