/* Copyright 2002, Jonathan S. Shapiro.
   All Rights Reserved.

   XDCS is a quasi-archiver based on the XDelta copy/insert algorithm
   published by Josh MacDonald.

   XDCS differs from the XDelta in several ways:

   1. It is not intended for use as a streaming delta
      protocol. Therefore, it does limit back-searching when expanding
      matches.

   2. It is intended for use in the same scenarios as RCS/SCCS. It
      therefore assumes that the sequence of inputs is inherently
      self-similar.

   3. The original implementation encodes by taking two versions
      (/base/, /new/) and generating a sequence of copy and insert
      operations relative to /base/ that produce /new/. That is, it is
      conceived as producing one version from another. This
      implementation redefines /base/ to be the entire previous
      insertion history. See the notes below for details of this.

   4. Josh's implementation clobbers hash entries when hashes
      collide. Since the hashes are of relatively short sequences this
      can penalize things quite a bit. We keep all collisions (at
      least for now -- need to experiment) and use the "longest match"
      rule.

   Classical delta algorithms use one of the following strategies:

   a) forward deltas: start from a baseline and then compute all
      deltas relative to the existing baseline.

      PRO: Simple to do.
      PRO: Existing entries never rewritten.
      PRO: Any version can be extracted in time that is linear in
           sizeof(baseline) + sizeof(encoded delta).
      CON: Inserts grow progressively larger, as new content is
	   progressively added.

      The insert problem is usually solved by periodically storing
      a save point and then restarting the sequence from there.

   b) backward deltas: most recent entry is complete, all others are
      computed relative to the most recent.

      PRO: Most recent deltas come out quickly.
      CON: All deltas must be re-encoded every time an insert is
           done, or sequential decoding must be done to extract old
           versions. 
      CON: Because of the sequential decoding requirement, the
           memory required is on the same order as that of two
           entries in the file.

   c) interleaved deltas, in which all encodings are kept in some
      form of weighted tree.

      PRO: Any version extracts in roughly NlogN time, N the size of
           the version.
      CON: I don't know how to do this.
      CON: The algorithms are complicated enough that they do not
           promote confidence.

   The main downside to the forward delta scheme is that version N+1
   is likely to be most similar to version N. Computing a strictly
   forward delta therefore leads to progressively longer diffs.

   Because Xdelta is a copy+insert scheme, the algorithm for archives
   can use a minor *variant* on the forward delta scheme.

   Each entry i in the archive can be thought of as an "insert buffer"
   S_i consisting of bytes and a "command string" C_i. The command
   string is a sequence of operations, each of which is a copy or an
   insert. The "copy" commands copy byte sequences from the base
   buffer, while the insert commands copy successive byte sequences
   from the insert buffer.

   EXPERIMENTS:

   There are two optimizations on this that I plan to try. The first
   takes advantage of self-similarity in any given file (e.g. C
   code). The second takes advantage of self-similarity across the
   entire archive.

   First, note that the *initial* entry in the archive normally
   consists of an insert string that is the entire initial file and a
   single insert command inserting this buffer into the output.

   Proposal 1: Redefine that for any given entry in the archive the
   /base/ buffer should be defined as a (normalized) concatenation of
   all previous insert buffers. This allows later insertions to draw
   their copies from both earlier and later versions, and *should*
   preserve the advantages of backward deltas without introducing the
   extra associated regeneration costs.

   This ought to be pretty simple.

   Proposal 2: More work, but possibly interesting, is to revise the
   construction algorithm such that it considers the insertion buffer
   to be a sequence of individual insertions si_0, si_1,
   si_2. Following each insertion i it appends the resulting si_i to
   the /base/ string, allowing subsequent copy operations to
   self-reference the insertion buffer and thereby achieving some
   degree of compression.

   Proposal 3: Use a slightly trickier encoding of inserts to take
   advantage of short offsets.
 */

#include <opencm.h>

// #define PARANOID
// #define HASH_TEST
// #define VERBOSE

/* Command-byte opcodes of the delta command stream.
 *
 *   bit 0x40 set   => copy from the base buffer; clear => insert.
 *   SHORT_*        => the length (< 64) is packed into the low six
 *                     bits of the command byte itself.
 *   MID_*          => a 16-bit length field follows the command byte.
 *   LONG_*         => a 32-bit length field follows the command byte.
 */
#define SHORT_INS  0u
#define SHORT_COPY 0x40u
#define LONG_INS   0x80u
#define MID_INS    0x81u
#define LONG_COPY  0xC0u
#define MID_COPY   0xC1u

/* Nonzero when the command byte denotes a copy operation. The
   argument is parenthesized so that expression arguments such as
   (a | b) are not torn apart by operator precedence. */
#define ISCOPY(cmd) ((cmd) & 0x40u)

/* Classify an operation length: 'S' fits in the command byte, 'M'
   fits in 16 bits, 'L' needs 32 bits. */
#define CMDTY(len) (((len) < 64) ? 'S' : (((len) < 65536) ? 'M' : 'L'))

/* Size of the window that gets checksummed. LG_CHUNKSZ is
   log2(CHUNKSZ) and is what the progressive checksum is handed, so
   the two must be changed together. SKIPSZ is the distance between
   successive hashed windows of the base buffer. */
#define LG_CHUNKSZ  4
#define CHUNKSZ     16
#define SKIPSZ      CHUNKSZ

#ifdef __CYGWIN__
/* It appears that Cygwin doesn't define these ANYWHERE. If other
   platforms prove to suffer from the same malady we will need to do
   something using limits.h to choose the proper value.

   NOTE(review): these fallbacks assume unsigned short is 16 bits and
   unsigned long is 32 bits. On an LP64 target unsigned long would be
   64 bits wide -- confirm against the toolchains actually in use. */
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned long uint32_t;
#endif

/* One hashed window of the base buffer: the checksum of a CHUNKSZ
   byte window and the offset at which that window starts. */
typedef struct hashpos_t {
  phash_t hash;			/* checksum of the window */
  size_t offset;		/* start of the window in the base buffer */
} hashpos_t ;

/* Vector of window hashes for a base buffer. compute_base_hashes()
   leaves the array sorted by hash value so it can be binary
   searched. */
typedef struct hashvec_t {
  unsigned  nhash;		/* number of entries in hash[] */
  hashpos_t *hash;		/* array */
} hashvec_t;

/* qsort() comparator: orders hashpos_t records by ascending hash
   value. The offset field is not consulted, so records carrying the
   same hash compare equal. */
static int
cmp_hash(const void *v1, const void *v2)
{
  const hashpos_t *lhs = v1;
  const hashpos_t *rhs = v2;

  assert(lhs != 0);
  assert(rhs != 0);

  if (lhs->hash == rhs->hash)
    return 0;

  return (lhs->hash < rhs->hash) ? -1 : 1;
}

/* Binary-search comparator: vkey points at a bare phash_t, vmember
   at a hashpos_t array element. Returns <0, 0 or >0 as the key is
   below, equal to or above the member's hash. */
static int
cmp_hashkey(const void *vkey, const void *vmember)
{
  const phash_t wanted = *(const phash_t *) vkey;
  const hashpos_t *entry = vmember;

  if (wanted == entry->hash)
    return 0;

  return (wanted < entry->hash) ? -1 : 1;
}

#if 0
/* These random numbers are NOT the same as the ones used in the
 * MacDonald libxdelta implementation. We cannot reuse those due to
 * GPL restrictions. Thank heavens those bits are not externally
 * exposed by the XDelta algorithm!
 *
 * The reason to use random numbers is that it flattens the entropy of
 * the input bytes across a larger numeric space, and thereby reduces
 * collisions. Without this, the adler32 of "bcx" and "adx" are the
 * same.
 */

/* Table of 16-bit scrambling constants. sxd_sum() and
   sxd_progressive_sum() look entries up through MAGIC(), using an
   input byte (unsigned char) as the index. */
uint16_t magic[] = {
  0x1be1U, 0x85e3U, 0x93e3U, 0x77eaU, 0x8effU, 0x9af0U, 0x25faU, 0xc9faU,
  0xd6faU, 0xb14fU, 0x7340U, 0x3c47U, 0x0f48U, 0x1d48U, 0x0254U, 0x2a55U, 
  0x6355U, 0xee5cU, 0x7167U, 0xf57dU, 0x687eU, 0x927cU, 0xdb8fU, 0x148eU, 
  0xe58eU, 0x8f88U, 0xf488U, 0xfe8aU, 0x9f8bU, 0x1b93U, 0x2796U, 0xfa97U, 
  0x4aaeU, 0xeaa3U, 0x8ea7U, 0x64beU, 0xa1beU, 0x2ab1U, 0x65b3U, 0x4eb7U, 
  0x14b8U, 0xd9bbU, 0x8ccfU, 0x98cdU, 0xb9c3U, 0x9ccbU, 0xdcccU, 0x5dddU, 
  0x33d3U, 0xf7d4U, 0xf10dU, 0xb000U, 0x4b02U, 0x8505U, 0x5606U, 0x9e0aU, 
  0x971fU, 0xdc1eU, 0x5b15U, 0x9019U, 0x932fU, 0xbb23U, 0xa724U, 0xfb2bU, 
  0x012cU, 0x963eU, 0x2031U, 0x6d36U, 0x5c4dU, 0xb24eU, 0xf142U, 0x0c49U, 
  0x7e50U, 0xa850U, 0xcc50U, 0x1e51U, 0xe156U, 0xbb57U, 0xaf59U, 0xe55aU, 
  0x366dU, 0x176eU, 0x6c67U, 0x0b6bU, 0xd471U, 0x6b76U, 0x6b77U, 0x0f7bU, 
  0x4f7bU, 0xde7bU, 0xc981U, 0x8d83U, 0xe086U, 0x9787U, 0x5c88U, 0x9f8aU, 
  0x9e9eU, 0xf490U, 0x1092U, 0xbc96U, 0x78aeU, 0xf2b3U, 0xe4b4U, 0x5fbbU, 
  0x58cdU, 0x8ecdU, 0xf8cdU, 0xd3c6U, 0xc9ccU, 0x1cd1U, 0xaddaU, 0x63eeU, 
  0xe9e3U, 0xf9e6U, 0xb3ebU, 0x4effU, 0xa0f0U, 0xa5f6U, 0x77f8U, 0x46f8U, 
  0xc6fcU, 0xd50fU, 0x8a04U, 0x2509U, 0x9309U, 0xea09U, 0x0f0bU, 0x9a1eU, 
  0xe711U, 0x9718U, 0xec19U, 0x0e1bU, 0xa81cU, 0xe82fU, 0x372fU, 0x5b28U, 
  0xc32cU, 0x8c3dU, 0x2a31U, 0x1532U, 0xcf38U, 0xfc3cU, 0x124dU, 0x2d45U, 
  0x3547U, 0x6448U, 0xca4cU, 0x6450U, 0xc253U, 0x3d55U, 0x6255U, 0x1d5bU, 
  0x3a5cU, 0x5867U, 0xd667U, 0x4278U, 0xb079U, 0x6888U, 0xad8bU, 0xe19dU, 
  0x4190U, 0x3594U, 0xec96U, 0xfeaeU, 0x23a1U, 0xfba1U, 0x6da2U, 0x48beU, 
  0xd0beU, 0x44b3U, 0x1eb5U, 0x8fbbU, 0xe0bbU, 0xefceU, 0xe1c2U, 0xd8c9U, 
  0x37caU, 0x2dd6U, 0x89e6U, 0x58fdU, 0xebf2U, 0xc1f5U, 0x84f6U, 0x5cfbU, 
  0xfafcU, 0x960fU, 0x1e0dU, 0xcf0eU, 0x0103U, 0x1205U, 0x411aU, 0x961bU, 
  0xe61bU, 0x0b2dU, 0x3221U, 0x0522U, 0xc222U, 0x7b23U, 0xee28U, 0x072aU, 
  0xa432U, 0xc13bU, 0x023cU, 0x5a4eU, 0x8646U, 0x1c5fU, 0x1d53U, 0x3f56U, 
  0x6256U, 0x2664U, 0xed65U, 0x9767U, 0xa96aU, 0x3770U, 0x8f71U, 0xa673U, 
  0xf783U, 0xb885U, 0x5f9eU, 0xef9eU, 0x179fU, 0xd690U, 0x1091U, 0xc4adU, 
  0x80aeU, 0x72a3U, 0x3ea4U, 0x8ea6U, 0xe3a9U, 0x55b2U, 0xeab7U, 0x99b9U, 
  0x29c2U, 0xe8c6U, 0xa3cbU, 0x32d5U, 0xe7d8U, 0x3cedU, 0x50edU, 0xf5e2U, 
  0xb1e5U, 0xbaebU, 0x2cf3U, 0xb203U, 0x0604U, 0x6508U, 0x9408U, 0x830bU, 
  0x850cU, 0x391dU, 0x8510U, 0x792dU, 0xd424U, 0x3e29U, 0x2931U, 0xbc32U,
};

/* Scrambling constant for input byte x. */
#define MAGIC(x) magic[x]

/* This is a sequential checksum based loosely on adler32. 
 *
 * It is NOT the same as the one used in Josh MacDonald's XDFS,
 * because it does not incorporate the same randomly generated 16 bit
 * numbers (because of GPL copyright restrictions). Thank heavens
 * those bits are not externally exposed by the XDelta family of
 * programs!
 *
 * The reason to use random numbers is that it flattens the entropy of
 * the input bytes across a larger numeric space, and thereby reduces
 * collisions. Without this, the adler32 of "bcx" and "adx" are the
 * same.
 */
static phash_t
sxd_sum(const unsigned char *buf, size_t len)
{
  const unsigned char *end = buf + len;
  uint32_t low = 0;		/* running 16-bit sum of MAGIC() values */
  uint32_t hi = 0;		/* running sum of the successive low values */

  for (; buf != end; buf++) {
    low = (low + MAGIC(*buf)) & 0xffffu;
    hi += low;
  }

  /* Pack: hi in the upper half, low in the lower 16 bits. */
  return (hi << 16) | low;
}

/* Slide the sxd_sum() window forward by one byte.
 *
 *   sum   checksum of the current window, as produced by sxd_sum()
 *   cold  byte leaving the window at its left edge
 *   cnew  byte entering the window at its right edge
 *
 * Returns the checksum of the shifted window. The update of the high
 * half assumes the window is exactly (1 << LG_CHUNKSZ) bytes long:
 * the departing byte contributed to every one of the window's partial
 * sums, so its constant is removed that many times.
 */
static phash_t
sxd_progressive_sum(phash_t sum, unsigned char cold, unsigned char cnew)
{
  uint32_t low = sum & 0xffffu;
  uint32_t hi = sum >> 16;

  low -= MAGIC(cold);
  low += MAGIC(cnew);
  low &= 0xffffu;

  hi -= ( ((uint32_t) MAGIC(cold)) << LG_CHUNKSZ );
  hi += low;

  return (hi << 16) | low;
}
#endif

/* Build the searchable hash vector for a base buffer.
 *
 * The buffer is cut into windows of CHUNKSZ bytes that start every
 * SKIPSZ bytes; a trailing partial window is ignored. Each window is
 * checksummed with psum() and recorded together with its offset. The
 * resulting array is sorted by hash value so that
 * find_first_matching_hash() can binary search it.
 *
 * Returns a freshly allocated hashvec_t; nhash is 0 when the buffer
 * is shorter than one window.
 */
static hashvec_t *
compute_base_hashes(Buffer *buf)
{
  unsigned i;
  ocmoff_t bufLen = buffer_length(buf);

#ifdef HASH_TEST
  phash_t phash = 0;		/* for the progressive hash */
#endif
  phash_t hash = 0;		/* for the normal hash */
  hashvec_t *bh;

  /* Windows must not overlap: each read below covers CHUNKSZ bytes
     starting at a multiple of SKIPSZ. */
  assert(SKIPSZ >= CHUNKSZ);

  bh = GC_MALLOC(sizeof(hashvec_t));
  /* Integer division already drops the residual partial window. */
  bh->nhash = bufLen / SKIPSZ;
  bh->hash = GC_MALLOC_ATOMIC(sizeof(hashpos_t) * bh->nhash);

  if (bh->nhash == 0)
    return bh;

  for (i = 0; i < bh->nhash; i++) {
    unsigned char sxdbuf[CHUNKSZ];

    buffer_read(buf, sxdbuf, i * SKIPSZ, CHUNKSZ);

    hash = psum(sxdbuf, CHUNKSZ);

    bh->hash[i].offset = i * SKIPSZ;
    bh->hash[i].hash = hash;

#ifdef HASH_TEST
    /* Self-check: rolling the previous window forward one byte at a
       time must arrive at the directly computed checksum. This uses
       the same progressive routine as gen_sxdelta() so that the check
       exercises the checksum family that is actually in use. */
    if (i == 0) {
      phash = hash;
    }
    else {
      unsigned j;

      for (j = 0; j < CHUNKSZ; j++)
	phash = psum_progressive(phash, LG_CHUNKSZ,
				 buffer_getc(buf, i*SKIPSZ + j-CHUNKSZ),
				 buffer_getc(buf, i*SKIPSZ + j));

      if (phash != hash)
	xprintf("Progressive 0x%x, base 0x%x\n", phash, hash);
    }
#endif
  }

  qsort(bh->hash, bh->nhash, sizeof(hashpos_t), cmp_hash);


#ifdef VERBOSE
  xprintf("# hashes: %d\n", bh->nhash);
#endif

  return bh;
}

/* Locate the lowest-indexed entry of the sorted hash vector whose
   hash equals key. Returns 0 when no entry carries that hash. */
static hashpos_t *
find_first_matching_hash(hashvec_t *bh, phash_t key)
{
  hashpos_t *base = bh->hash;
  hashpos_t *hit = xbsearch(&key, base, bh->nhash, sizeof(*bh->hash),
			    cmp_hashkey);

  if (hit != 0) {
    /* The search may land anywhere inside a run of equal hashes;
       walk back to the start of the run. */
    while (hit != base && (hit - 1)->hash == hit->hash)
      hit--;
  }

  return hit;
}

/* A candidate copy: the byte range [src_start, src_end) of the base
   buffer that reproduces [dst_start, dst_end) of the new content.
   expand_match() sets len to 0 when the candidate turns out not to
   match byte-for-byte. */
typedef struct match_t {
  off_t src_start;		/* inclusive */
  off_t src_end;		/* exclusive */
  off_t dst_start;		/* inclusive */
  off_t dst_end;		/* exclusive */
  size_t len;			/* matched byte count; 0 == no match */
} match_t;

/* Verify a hash-derived candidate and grow it as far as possible.
 *
 * On entry match describes equal-length ranges in src and dst whose
 * checksums agreed. If the bytes differ (hash collision) match->len
 * is set to 0 and nothing else is changed. Otherwise the match is
 * widened backwards while the preceding bytes agree -- never moving
 * dst_start below floor nor src_start below 0 -- and then forwards
 * until either buffer ends or the bytes diverge. match->len tracks
 * the widened length.
 */
static void
expand_match(match_t *match, 
	     Buffer *src, 
	     Buffer *dst, off_t floor)
{
  unsigned probe;
  ocmoff_t srcLen;
  ocmoff_t dstLen;

  /* Confirm that the hashed window really is identical. */
  for (probe = 0; match->src_start + probe < match->src_end; probe++) {
    if (buffer_getc(src, match->src_start + probe) 
	!= buffer_getc(dst, match->dst_start + probe)) {
      match->len = 0;
      return;
    }
  }

  /* Grow towards the front, stopping at floor. */
  while (match->src_start > 0 && match->dst_start > floor
	 && (buffer_getc(src, match->src_start-1)
	     == buffer_getc(dst, match->dst_start-1))) {
    match->src_start--;
    match->dst_start--;
    match->len++;
  }

  /* Grow towards the back, stopping at the end of either buffer. */
  srcLen = buffer_length(src);
  dstLen = buffer_length(dst);

  while (match->src_end < srcLen && match->dst_end < dstLen
	 && (buffer_getc(src, match->src_end)
	     == buffer_getc(dst, match->dst_end))) {
    match->src_end++;
    match->dst_end++;
    match->len++;
  }

  assert(match->len == match->dst_end - match->dst_start);
}

#define TAKE_BEST_MATCH
// #define TAKE_LAST_MATCH

/* NOTE: At the moment this does NOT actually find the best match. It
   finds the *last* match. This is equivalent to the behavior of the
   MacDonald XDelta implementation, though XDelta accomplishes the
   same result by simply throwing away the earlier candidates. */
static OC_bool
find_best_match(hashvec_t *bh, phash_t key, match_t *match,
		Buffer *src, Buffer *dst, off_t floor, 
		off_t pos)
{
  hashpos_t *found;

  found = find_first_matching_hash(bh, key);
  if (!found)
    return FALSE;

  match->dst_start = pos;
  match->dst_end = pos+CHUNKSZ;
  match->src_start = found->offset;
  match->src_end = found->offset + CHUNKSZ;
  match->len = CHUNKSZ;

  expand_match(match, src, dst, floor);

  /* Matching hash found. We now have the correct hash, but the
     actual characters may not match due to hash collision. Also, this
     match needs to be expanded backwards and forwards to maximize the
     match length. */

#if defined(TAKE_BEST_MATCH)
  /* A match is better if (a) found->offset is reduced by using it or
     (b) the total length is longer and the offset doesn't get any
     worse. */
 {
   hashpos_t *last = bh->hash + bh->nhash - 1;

   while (found != last && found[1].hash == found[0].hash) {
      match_t m;

      found++;
      m.dst_start = pos;
      m.dst_end = pos+CHUNKSZ;
      m.src_start = found->offset;
      m.src_end = found->offset + CHUNKSZ;
      m.len = CHUNKSZ;

      expand_match(&m, src, dst, floor);

      /* Take earlier (expanded) match: */
      if (m.len > 0 && m.dst_start < match->dst_start)
	memcpy(match, &m, sizeof(m));
      /* Then take longer match */
      else if (m.dst_start == match->dst_start && m.len > match->len)
	memcpy(match, &m, sizeof(m));
   }
 }
#elif defined(TAKE_LAST_MATCH)
  {
    hashpos_t *last = bh->hash + bh->nhash - 1;

    while (found != last && found[1].hash == found[0].hash) {
      match_t m;

      found++;
      m.dst_start = pos;
      m.dst_end = pos+CHUNKSZ;
      m.src_start = found->offset;
      m.src_end = found->offset + CHUNKSZ;
      m.len = CHUNKSZ;

      expand_match(&m, src, dst, floor);
      if (m.len > 0)
	memcpy(match, &m, sizeof(m));
    }
  }
#else
  {
    hashpos_t *last = bh->hash + bh->nhash - 1;

    while (found != last && found[1].hash == found[0].hash) {
      match_t m;

      found++;
      m.dst_start = pos;
      m.dst_end = pos+CHUNKSZ;
      m.src_start = found->offset;
      m.src_end = found->offset + CHUNKSZ;
      m.len = CHUNKSZ;

      expand_match(&m, src, dst, floor);
      if (m.len > 0) {
	memcpy(match, &m, sizeof(m));
	break;
      }
    }
  }
#endif

   if (match->len == 0)
     return FALSE;

  return TRUE;
}

/* One encoded delta: the literal bytes to be inserted and the
   command stream (copy/insert operations) that consumes them. */
typedef struct sxdelta_t {
  SDR_stream *ins;		/* insert pool: literal bytes, in order */
  SDR_stream *cmd;		/* encoded copy/insert commands */
} sxdelta_t;

/* Append an insert operation for dst[start, end) to the delta: the
   command (with its length, encoded short/mid/long as the length
   requires) goes to xd->cmd and the literal bytes go to xd->ins.
   src is not used. */
static void
emit_insert(sxdelta_t *xd,
	    Buffer *src, Buffer *dst, 
	    off_t start, off_t end)
{
  size_t len = end - start;
  const char ty = CMDTY(len);

  if (ty == 'S') {
    /* Length fits in the low six bits of the command byte. */
    sdr_w_u8("cmd", xd->cmd, SHORT_INS|len);
  }
  else if (ty == 'M') {
    sdr_w_u8("cmd", xd->cmd, MID_INS);
    sdr_w_u16("len", xd->cmd, len);
  }
  else if (ty == 'L') {
    sdr_w_u8("cmd", xd->cmd, LONG_INS);
    sdr_w_u32("len", xd->cmd, len);
  }

  stream_write_partial_buffer(xd->ins, dst, start, end - start);

#ifdef VERBOSE
  xprintf("%cI %d [%d,%d)\n", 
	  CMDTY(len), end - start, start, end);
#endif
}

/* Append a copy operation for the given match to xd->cmd: the
   command (with its length, encoded short/mid/long as the length
   requires) followed by the 32-bit start offset in the base buffer.
   src and dst are not used. */
static void
emit_copy(sxdelta_t *xd,
	  Buffer *src, Buffer *dst, 
	  match_t *match)
{
  off_t len = match->src_end - match->src_start;
  const char ty = CMDTY(len);

  if (ty == 'S') {
    /* Length fits in the low six bits of the command byte. */
    sdr_w_u8("cmd", xd->cmd, SHORT_COPY |len);
  }
  else if (ty == 'M') {
    sdr_w_u8("cmd", xd->cmd, MID_COPY);
    sdr_w_u16("len", xd->cmd, len);
  }
  else if (ty == 'L') {
    sdr_w_u8("cmd", xd->cmd, LONG_COPY);
    sdr_w_u32("len", xd->cmd, len);
  }

  sdr_w_u32("start", xd->cmd, match->src_start);

#ifdef VERBOSE
  xprintf("%cC %d [%d,%d) matches dst [%d,%d)\n", 
	  CMDTY(len), 
	  len,
	  match->src_start, match->src_end, 
	  match->dst_start, match->dst_end);
#endif
}

/* Mark the end of a delta. The command stream currently carries no
   terminator (the emitting code is compiled out), so outside VERBOSE
   builds this does nothing; VERBOSE builds log an "F" line. */
static void
emit_finish(sxdelta_t *xd)
{
#if 0
  stream_printf(xd->cmd, "F\n");
#endif

#ifdef VERBOSE
  xprintf("F\n");
#endif
}

/* NOTE(review): HT_SHA1 is not referenced anywhere in the visible
   source; presumably a hash-type tag -- confirm before relying on or
   removing it. */
#define HT_SHA1  0

/* Directory entry of the archive: where one named version's insert
   pool and command list live inside the archive content. The command
   list is written directly after the insert pool, so the pool's
   length is cmdOffset - insOffset. */
typedef struct xdirent_t {
  unsigned long insOffset;	/* of start of insert pool list */
  unsigned long cmdOffset;	/* of start of CMD list */
  unsigned long cmdLen;		/* of CMD list */
  /* NOTE(review): declared unsigned, yet it is assigned from and
     compared against plain char strings throughout this file. */
  unsigned const char *name;	/* entry name */
} xdirent_t;

/* In-memory form of an archive: the header fields, the directory of
   named entries and the concatenated delta content. */
typedef struct XDeltaArchive_t {
  const char *magic;		/* "sxdl" */
  unsigned short version;	/* currently 1 */
  unsigned short flags;		/* currently 0 */

  unsigned long nDirent;	/* number of entries */
  unsigned long maxDirent;	/* allocated dir space */
  /* Despite the name this is a byte count: it is used as the length
     of the content area when reading and is set to the total content
     length when the archive is rewritten. */
  unsigned long nContent;		/* bytes of stored delta content */
  struct xdirent_t *dir;	/* array of maxDirent slots, nDirent used */
  
  Buffer *content;		/* sequence of serialized xdeltas */
} XDeltaArchive_t;

/* Materialize an archive from its serialized form.
 *
 * A null or zero-length stream yields a fresh, empty archive.
 * Otherwise the header, the directory entries and the content area
 * are read from s in that order.
 *
 * In both cases the directory is allocated with one slot more than
 * needed: an insert adds at most one entry, and having the slot
 * ready avoids reallocating the array later.
 */
static XDeltaArchive_t *
xda_fromStream(SDR_stream *s)
{
  XDeltaArchive_t *xda = GC_MALLOC(sizeof(XDeltaArchive_t));
  const int isFresh = (s == 0 || stream_length(s) == 0);
  unsigned u;

  if (isFresh) {
    xda->magic = "sxdl";
    xda->version = 1;
    xda->flags = 0;
    xda->nDirent = 0;
    xda->nContent = 0;
  }
  else {
    xda->magic = (const char *) sdr_r_bytes("magic", s, 4);
    xda->version = sdr_r_u16("version", s);
    xda->flags = sdr_r_u16("flags", s);
    xda->nDirent = sdr_r_u32("nDirent", s);
    xda->nContent = sdr_r_u32("nContent", s);
  }

  /* One spare slot beyond the entries present. */
  xda->maxDirent = xda->nDirent + 1;
  xda->dir = GC_MALLOC(sizeof(xdirent_t) * xda->maxDirent);

  if (isFresh) {
    xda->content = buffer_create();
    return xda;
  }

  for (u = 0; u < xda->nDirent; u++) {
    xdirent_t *entry = xda->dir + u;

    entry->insOffset = sdr_r_u32("insOff", s);
    entry->cmdOffset = sdr_r_u32("cmdOff", s);
    entry->cmdLen    = sdr_r_u32("cmdLen", s);
    entry->name      = sdr_r_string("name", s);
  }

  /* Whatever follows the directory is the content area. */
  xda->content = buffer_fromBuffer(stream_asBuffer(s), stream_position(s),
				   xda->nContent);

  return xda;
}

/* qsort() comparator: orders directory entries by name, using
   strcmp() ordering. */
static int
xdirent_cmp(const void *v1, const void *v2)
{
  const xdirent_t *lhs = v1;
  const xdirent_t *rhs = v2;

  assert(lhs != 0);
  assert(rhs != 0);

  return strcmp(lhs->name, rhs->name);
}

/* Binary-search comparator: vkey is a NUL-terminated name, vmember a
   directory entry; the result is strcmp() of the key against the
   entry's name. */
static int
xdirent_keycmp(const void *vkey, const void *vmember)
{
  const xdirent_t *entry = vmember;
  const char *wanted = vkey;

  return strcmp(wanted, entry->name);
}

/* Serialize the archive to s with one additional entry appended.
 *
 *   xda   archive as loaded; its spare directory slot is consumed
 *   xd    delta (insert pool + command list) of the new entry
 *   name  name under which the new entry is recorded
 *   s     output stream receiving the complete new archive
 *
 * The new entry's data is placed after the existing content: insert
 * pool first, command list directly behind it. The directory is
 * sorted by name before it is written.
 */
static void
xda_rewriteWith(XDeltaArchive_t *xda, sxdelta_t *xd, 
		const char *name,
		SDR_stream *s)
{
  unsigned u;
  xdirent_t *slot;

  ocmoff_t contentLen = buffer_length(xda->content);
  unsigned long newLen = contentLen;
  newLen +=  stream_length(xd->ins);
  newLen +=  stream_length(xd->cmd);
  xda->nContent = newLen;

  /* Fill the spare slot that was preallocated when the archive was
     loaded. */
  slot = &xda->dir[xda->nDirent];
  slot->insOffset = contentLen;
  slot->cmdOffset = contentLen + stream_length(xd->ins);
  slot->cmdLen = stream_length(xd->cmd);
  slot->name = name;

  xda->nDirent++;

  /* Header */
  sdr_w_bytes("magic", s, 4, xda->magic);
  sdr_w_u16("version", s, xda->version);
  sdr_w_u16("flags", s, xda->flags);
  sdr_w_u32("nDirent", s, xda->nDirent);
  sdr_w_u32("nContent", s, xda->nContent);

  /* Directory, kept sorted so it can be binary searched later */
  qsort(xda->dir, xda->nDirent, sizeof(*xda->dir), xdirent_cmp);

  for (u = 0; u < xda->nDirent; u++) {
    const xdirent_t *entry = xda->dir + u;

    sdr_w_u32("insOff", s, entry->insOffset);
    sdr_w_u32("cmdOff", s, entry->cmdOffset);
    sdr_w_u32("cmdLen", s, entry->cmdLen);
    sdr_w_string("name", s, entry->name);
  }

  /* Existing content first ... */
  stream_write_partial_buffer(s, xda->content, 0, buffer_length(xda->content));

  /* ... then the new entry's insert pool and command list. */
  stream_write_buffer(s, stream_asBuffer(xd->ins));
  stream_write_buffer(s, stream_asBuffer(xd->cmd));
}

/* Encode dst as a delta against the content already in the archive.
 *
 *   src  archive whose content serves as the base buffer
 *   dst  the new version to be stored
 *
 * Returns a delta whose command stream, applied to src->content with
 * the delta's insert pool, reproduces dst. (Under HASH_TEST a failed
 * rolling-checksum self-check returns 0 instead.)
 *
 * Every byte of dst ends up covered by exactly one copy or insert
 * operation, emitted in destination order; `floor' is the number of
 * destination bytes covered so far.
 */
static sxdelta_t *
gen_sxdelta(XDeltaArchive_t *src, Buffer *dst)
{
  sxdelta_t *xd = GC_MALLOC(sizeof(sxdelta_t));
  hashvec_t *bh;

  OC_bool found = FALSE;
  phash_t h;
  off_t len = buffer_length(dst);
  off_t floor = 0;		/* number of bytes we can already generate */
  match_t match = {0, 0, 0, 0, 0};

  xd->ins = stream_createBuffer(SDR_RAW);
  xd->cmd = stream_createBuffer(SDR_RAW);

#if 0
  /* Emit the command sequence format type first: */
  stream_printf(xd->cmd, "T\n");
#endif

  /* Too short to contain even one hashable window. */
  if (len < CHUNKSZ)
    goto finish;

  /* The case where the input archive is empty is easy to pick off,
     and it seems like a lot of unnecessary work to go through
     computing positional hashes when they cannot possibly match. This
     is an especially helpful optimization the first time you check in
     a large file.

     FIX: note that this will no longer be the right thing to do when
     we incorporate self-copy, because in that case the file will be
     likely to have self-similarity that we can exploit. */
  if (src->nContent == 0)
    goto finish;

  /* Hashing and sorting the whole base is the expensive step, so it
     is done only once we know a search will actually take place. */
  bh = compute_base_hashes(src->content);

  /* FIX: It is probably worth the trouble to recognize gzip-format
     input here and simply bypass the hash check, as I rather suspect
     that no matches are likely to be found. Before attempting such
     recognition, however, we should definitely TEST this hypothesis!
  */

  /* Proceed as follows:
   *
   * 1. Slide a CHUNKSZ window forward from floor until its checksum
   *    hits a base window that verifies:
   *    A. Find a match.
   *    B. Extend it backwards (not past floor) and forwards (not past
   *       end).
   *    C. Among several candidates keep the one preferred by
   *       find_best_match().
   * 2. If there is a gap between the beginning of this match and
   *    floor, add an "insert" instruction for the gap.
   * 3. Add the discovered match as a "copy" operation.
   *
   * Invariant at the top of each outer iteration:
   * floor + CHUNKSZ <= len, so a full window can be read at floor.
   */

  do {
    off_t pos = floor;		/* search position in target file */
    found = FALSE;

    {
      unsigned char sxdsum[CHUNKSZ];
      buffer_read(dst, sxdsum, pos, CHUNKSZ);

      h = psum(sxdsum, CHUNKSZ);
    }

    for (;;) {
#ifdef HASH_TEST
      phash_t check;
#endif

      found = find_best_match(bh, h, &match, src->content, dst, floor, pos);
      if (found)
	break;

      /* The window already ends at the last byte: it has been
	 searched, but there is no further byte to roll in. */
      if (pos + CHUNKSZ >= len)
	break;

      h = psum_progressive(h, LG_CHUNKSZ, 
			   buffer_getc(dst, pos), 
			   buffer_getc(dst, pos+CHUNKSZ));
#ifdef HASH_TEST
      {
	unsigned char sxdsum[CHUNKSZ];
	buffer_read(dst, sxdsum, pos + 1, CHUNKSZ);
	check = psum(sxdsum, CHUNKSZ);
      }

      if (h != check) {
	xprintf("Hash fail at pos %d: progressive 0x%x real 0x%x\n", pos, h, check);
	return 0;
      }
#endif

      /* No match found at this position. Proceed forward. */
      pos++;
    }

    if (!found)
      break;

    /* See if we need to output an insert instruction before the next
       copy: */
    if (match.dst_start > floor)
      emit_insert(xd, src->content, dst, floor, match.dst_start);

    emit_copy(xd, src->content, dst, &match);
    
    floor = match.dst_end;
  } while (floor + CHUNKSZ <= len);

 finish:
  /* Whatever was not covered by a copy goes out as one insert. */
  if (floor < buffer_length(dst))
    emit_insert(xd, src->content, dst, floor, buffer_length(dst));

  emit_finish(xd);

#if 0
  xprintf("Yielded %d bytes from %d bytes\n", 
	  xd->cmd->len + xd->ins->len, dst->length);

#ifdef VERBOSE
  xprintf("Commands take %d bytes inserting %d bytes\n", 
	  xd->cmd->len, xd->ins->len);
#endif
#endif

  return xd;
}

/* Read the length operand of a command. theCmd is the command byte
   just read; longOp/midOp are the opcodes of its 32-bit and 16-bit
   length forms. Any other command byte is a short form carrying the
   length in its low six bits. */
static off_t
read_op_len(SDR_stream *cmd, unsigned char theCmd,
	    unsigned char longOp, unsigned char midOp)
{
  if (theCmd == longOp)
    return sdr_r_u32("len", cmd);

  if (theCmd == midOp)
    return sdr_r_u16("len", cmd);

  return theCmd & 0x3fu;
}

/* Replay a delta.
 *
 *   base  buffer that copy commands draw from
 *   xd    delta to apply; both of its streams are rewound first
 *   out   stream receiving the reconstructed content
 *
 * Commands are executed until the command stream is exhausted. Copy
 * commands append base[start, start+len) to out; insert commands
 * append the next len bytes of the insert pool.
 *
 * Returns out.
 */
static SDR_stream *
xdcs_apply_delta(Buffer *base, sxdelta_t *xd, SDR_stream *out)
{
  off_t insertPos = 0;		/* next unconsumed byte of the insert pool */
  
  SDR_stream *cmd = xd->cmd;
  SDR_stream *ins = xd->ins;
  Buffer *insBuf = stream_asBuffer(xd->ins);

  stream_reread(cmd);
  stream_reread(ins);

  while(stream_position(cmd) != stream_length(cmd)) {
    unsigned char theCmd = sdr_r_u8("cmd", cmd);

    {
      off_t len;
#ifdef VERBOSE
      /* Output position before this command, for the trace below. */
      unsigned startPos = out->pos;
#endif

      if (ISCOPY(theCmd)) {
	off_t start;

	len = read_op_len(cmd, theCmd, LONG_COPY, MID_COPY);

	start = sdr_r_u32("start", cmd);
	stream_write_partial_buffer(out, base, start, len);
#ifdef VERBOSE
	xprintf("Applied %cC %d from %d [%d,%d)\n", 
		CMDTY(len), len, start, 
		startPos, out->pos);
#endif
      }
      else {
	len = read_op_len(cmd, theCmd, LONG_INS, MID_INS);

	stream_write_partial_buffer(out, insBuf,  insertPos, len);
	insertPos += len;

#ifdef VERBOSE
	/* One argument per conversion: type, length, output range. */
	xprintf("Applied %cI %d [%d,%d)\n", 
		CMDTY(len), len, startPos, out->pos);
#endif
      }
    }
  }

  return out;
}

/* Add `content' to an archive under `name'.
 *
 *   archive  serialized archive (null or empty for a new archive)
 *   content  the new version to store
 *   name     name under which it is stored
 *   out      stream receiving the rewritten archive
 *
 * If an entry called `name' already exists the function returns
 * without writing anything to out.
 */
void
xdcs_insert(SDR_stream *archive, 
	    SDR_stream *content,
	    const char *name,
	    SDR_stream *out)
{
  XDeltaArchive_t *xda = xda_fromStream(archive);
  sxdelta_t *xd;

  /* The directory is stored sorted by name, so it can be binary
     searched. */
  if (xbsearch(name, xda->dir, xda->nDirent, sizeof(*xda->dir), 
               xdirent_keycmp) != 0)
    return;			/* entity is already stored */

  xd = gen_sxdelta(xda, stream_asBuffer(content));

#if defined(PARANOID)
  /* Round-trip check: replay the delta and compare the result with
     the original content byte for byte. */
  {
    unsigned u;
    SDR_stream *replay = stream_createstring(STREAM_RAW);
    xdcs_apply_delta(xda->content, xd, replay);

    if (replay->len != content->len)
      THROW(ExIntegrityFail, 
	    format("Inserted file length mismatch: %d (orig %d)\n", 
		   replay->len, content->len));

    stream_reread(replay);
    stream_reread(content);

    for (u = 0; u < replay->len; u++) {
      unsigned char oc= stream_getc(replay);
      unsigned char cc= stream_getc(content);

      if (oc != cc)
	THROW(ExIntegrityFail, 
	      format("Content mismatch at position %d\n", u));
    }
  }
#endif

  /* Emit the archive with the new entry appended. */
  xda_rewriteWith(xda, xd, name, out);   
}

/* Reconstruct the entry called `name' from an archive and write it
 * to out.
 *
 * Throws ExNoObject when the archive has no entry of that name.
 */
void
xdcs_extract(SDR_stream *archive,
	     const char *name,
	     SDR_stream *out)
{
  XDeltaArchive_t *xda = xda_fromStream(archive);
  sxdelta_t *xd = GC_MALLOC(sizeof(sxdelta_t));
  xdirent_t *xde = 0;
  unsigned u;

  /* Stop at the first directory entry whose name matches. */
  for (u = 0; xde == 0 && u < xda->nDirent; u++) {
    if (nmequal(xda->dir[u].name, name))
      xde = &xda->dir[u];
  }

  if (xde == 0)
    THROW(ExNoObject, "Requested hash not found\n");

  /* The entry's insert pool ends where its command list starts. */
  xd->ins = stream_fromBuffer(buffer_fromBuffer(xda->content,
						xde->insOffset,
						xde->cmdOffset - xde->insOffset),
			      SDR_RAW);
  xd->cmd = stream_fromBuffer(buffer_fromBuffer(xda->content,
						xde->cmdOffset,
						xde->cmdLen),
			      SDR_RAW);

#ifdef VERBOSE
  xprintf("Extract ins %d bytes from %d\n", xd->ins->len, xde->insOffset);
  xprintf("Extract cmds %d bytes from %d\n", xd->cmd->len, xde->cmdOffset);
#endif

  xdcs_apply_delta(xda->content, xd, out);
}

/* List an archive: one line per directory entry, giving the offset
   of the entry's insert pool followed by its name. */
void
xdcs_ls(SDR_stream *archive)
{
  XDeltaArchive_t *xda = xda_fromStream(archive);
  const xdirent_t *entry = xda->dir;
  const xdirent_t *stop = xda->dir + xda->nDirent;

  for (; entry != stop; entry++)
    xprintf("%-5d %s\n", entry->insOffset, entry->name);
}
