#include <opencm.h>

/* Three-way comparison of two DiffLine records.

   Lines are ordered by their hash first. Only when the two hashes
   are equal is the stored text consulted, in which case the result
   is whatever strcmp() reports for the two content strings. A return
   value of zero therefore means "same hash and same text". */
static int
diffline_cmp(const DiffLine *dl1, const DiffLine *dl2)
{
  if (dl1->hash != dl2->hash)
    return (dl1->hash < dl2->hash) ? -1 : 1;

  /* Equal hashes: let the actual text decide. */
  return strcmp(dl1->content, dl2->content);
}

/* Note that calc_line_hashes currently relies on the fact that lines
   are newline terminated. This is safe for OpenCM because (a) the buffer
   reader canonicalizes all text files that way, and (b) diff doesn't
   work for binary files anyway. */
ObVec *
diff_extract_lines(Buffer *buf)
{
  ObVec *hvec = obvec_create();
  ocmoff_t pos = 0;
  ocmoff_t len = buffer_length(buf);
  unsigned line = 1;
  ocmoff_t end = pos + len;
  phash_t curHash = 0;

  /* For debugging: */
  ocmoff_t lastLineStart = 0;

  /* For each line, compute a hash of that line and append the
     resulting DiffLine structure to the hvec vector */
  while (pos < end) {
    const unsigned char *nl;
    ocmoff_t bcLen;

    BufferChunk bc = buffer_getChunk(buf, pos, end - pos);
    assert(bc.len <= (end - pos));

    bcLen = bc.len;

    /* We have a (possibly empty) residual partial line from the
       previous iteration whose partial hash is carried in
       curHash. Finish that line, consume all complete lines in the
       BufferChunk, and then grab a new (possibly empty) residual line
       from the end of the BufferChunk. Note that the trailing newline
       (if any) is omitted from the hash computation. This simplifies
       treatment of trailing lines of text where the newline has been
       omitted. */

    while((nl = strnchr(bc.ptr, (size_t) bc.len, '\n'))) {
      ocmoff_t lineLen = nl - bc.ptr;
      DiffLine *dl = GC_MALLOC(sizeof(DiffLine));

      curHash = psum_incremental(curHash, bc.ptr, lineLen);

      dl->lineNo = line++;
      dl->hash = curHash;
      dl->base = lastLineStart;
      dl->bound = pos + lineLen;
      {
	char *lineBuf = GC_MALLOC_ATOMIC(lineLen + 1);
	lineBuf[lineLen] = 0;
	buffer_read(buf, lineBuf, dl->base, dl->bound - dl->base);
	dl->content = lineBuf;
      }

      obvec_append(hvec, dl);

      /* Prepare for next line: */
      curHash = 0;
      lineLen ++;		/* eat the newline */
      bc.ptr += lineLen;
      bc.len -= lineLen;
      pos += lineLen;
      lastLineStart = pos;
    }

    /* Check for (possibly empty) residual partial line: */
    if (bc.len) {
      curHash = psum_incremental(0, bc.ptr, bc.len);
      pos += bc.len;
    }
  }

  /* Deal with residual line that may not have been newline
     terminated, if any: */
  if (lastLineStart != end) {
    DiffLine *dl = GC_MALLOC(sizeof(DiffLine));
    ocmoff_t lineLen = end - lastLineStart;

    dl->lineNo = line++;
    dl->hash = curHash;
    dl->base = lastLineStart;
    dl->bound = end;
    {
      char *lineBuf = GC_MALLOC_ATOMIC(lineLen + 1);
      lineBuf[lineLen] = 0;
      buffer_read(buf, lineBuf, dl->base, dl->bound - dl->base);
      dl->content = lineBuf;
    }

    obvec_append(hvec, dl);
  }

  return hvec;
}

/* Debugging aid: print every DiffLine held in lvec, one per output
   line. Each row shows the line number together with its half-open
   [base,bound) offset range, then the line hash in hex, then the
   text of the line. */
void
diff_dump_lines(ObVec *lvec)
{
  unsigned ndx = 0;

  while (ndx < vec_size(lvec)) {
    DiffLine *cur = vec_fetch(lvec, ndx, DiffLine *);

    /* The position column is formatted separately so that it can be
       padded to a fixed width by the outer format string. */
    xprintf("%-20s 0x%08x %s\n",
            format("%4d: [%s,%s)",
                   cur->lineNo,
                   xunsigned64_str(cur->base),
                   xunsigned64_str(cur->bound)),
            cur->hash,
            cur->content);

    ndx++;
  }
}

/* One side of a comparison: a vector of DiffLine records plus a pair
   of indices into it. In this file diff2() fills in base = 0 and
   bound = number of lines, and calc_ses_len() uses bound as the
   length of the sequence; base is not read by the code in this
   file. */
typedef struct DiffRegion {
  ObVec *lines;			/* DiffLine records, one per line */
  size_t base;			/* index of first line in the region */
  size_t bound;			/* index one past the last line */
} DiffRegion;

/* The differencing problem is to find the
   shortest sequence of edits (inserts + deletes) that converts one
   input string into a second. The algorithm used here is derived from

      Eugene W. Myers: An O(ND) Difference Algorithm and its
      Variations

   The "shortest edit script" problem is the dual of the "longest
   common subsequence" (LCS) problem.

   The heart of the algorithm is to imagine that we create a matrix
   indexed across the top by the positions of s1 (the input string)
   and down the left by the positions of s2 (the output string). The
   paper refers to N = len(s1) and M = len(s2). For an illustration,
   see the diagram in the paper on p. 253. We define a set of legal
   "paths" through this graph in which the legal moves are "right by
   one", "down by one" or "diagonally down/right by one". A move to
   the right corresponds to a delete of a character in the input
   string, a move down corresponds to an insert of a character from
   the output string, and a diagonal move corresponds to a "no
   change".

   As suggested by Myers, we will generate the output edit string by
   assigning costs of +1 to right or down moves, and 0 to
   diagonals. We can then view the LCS problem as finding the minimum
   cost path from (0,0) to (N,M) in the graph (which is the single
   source shortest path problem). The trickiness is all in deciding
   how to explore the graph efficiently.

   The algorithm works by diagonalization tricks that are awkward to
   re-describe here. The reader is referred to the paper.

   Our implementation uses the following refinements:

   In order to stick as closely as possible to the notational
   conventions of the paper, I am using 1-relative array indexing into
   strings here.
   */

/* Fetch line number ln of region dr as a const DiffLine *. Line
   numbers are 1-relative, matching the notation of the paper; the
   macro subtracts one to obtain the 0-relative vector index. */
#define getline(dr,ln) (vec_fetch((dr)->lines, (ln)-1, const DiffLine *))

/* Small arithmetic helpers. These are macros, so the argument is
   evaluated more than once: do not pass expressions with side
   effects. The argument is fully parenthesized in the negation so
   that compound expressions such as abs(n - m) negate the whole
   expression rather than just its first operand. */
#define abs(x) ((x) > 0 ? (x) : -(x))
#define odd(x) (abs(x) & 1)
#define even(x) (!odd(x))

/* Find the end of the furthest reaching forward D-path in diagonal k
   and return its x coordinate.

     D     number of non-diagonal edits in the path being extended
     k     diagonal being explored (k = x - y)
     V     furthest x reached so far on each diagonal, indexed by
           diagonal number; entries k-1 and/or k+1 are read
     a, b  the two line sequences being compared
     N, M  number of lines in a and in b respectively

   The return type is long, matching the coordinates stored in V, so
   that the result is not narrowed on its way back to the caller. */
static long
find_D_path(long D, long k, long *V, DiffRegion *a, DiffRegion *b, 
	    long N, long M)
{
  long x, y;

  /* Choose the better of the two neighbouring (D-1)-paths to extend:
     step down from diagonal k+1 (x unchanged) or step right from
     diagonal k-1 (x advances by one).

     Some ambiguity about the binding strengths of and/or here in
     the paper. Assume AND stronger than OR: */
  if (k == -D || (k != D && V[k-1] < V[k+1]))
    x = V[k+1];
  else
    x = V[k-1]+1;

  y = x - k;

  /* Follow the snake: advance along the diagonal for as long as the
     next line of a equals the next line of b. The paper indexes
     strings from 1; getline() takes 1-relative line numbers as well,
     so a_(x+1) and b_(y+1) are written here exactly as in the
     paper. */
  while (x < N && y < M && 
	 diffline_cmp(getline(a,x+1), getline(b,y+1)) == 0) {
    x++;
    y++;
  }

  return x;
}

#if 0
/* Alternative calc_ses_len() that keeps two V vectors, one for
   forward paths (Vf) and one for reverse paths (Vb). This version is
   compiled out by the enclosing #if 0, and every return statement in
   it returns 0.

   NOTE(review): this looks like an unfinished attempt at the
   bidirectional ("middle snake") variant from the paper -- confirm
   before enabling. Points to check first:
     - The two GC_MALLOC_ATOMIC calls are passed MAX*2 + 1, which is
       the number of long elements wanted; presumably the allocator
       expects a size in bytes -- verify.
     - The two "overlap" branches only test whether k lies in a range
       of diagonals; they never compare the forward and reverse
       positions before returning.
     - The second inner loop steps x and y upward exactly like the
       forward search does. */
static ObVec *
calc_ses_len(DiffRegion *a, DiffRegion *b)
{
  long N = a->bound;
  long M = b->bound;
  long MAX = M+N;		/* search limit */
  long D;

  /* Vf and Vb point into the middle of their allocations so that
     they can be indexed by negative as well as positive diagonal
     numbers. */
  long *Vf_base = GC_MALLOC_ATOMIC(MAX*2 + 1);
  long *Vf = Vf_base + MAX;

  long *Vb_base = GC_MALLOC_ATOMIC(MAX*2 + 1);
  long *Vb = Vb_base + MAX;

  long delta = N - M;

  Vf[1] = 0;
  Vb[1] = 0;

  for (D = 0; D <= (MAX+1)/2; D++) {
    long k;
    /* Forward search. Note that this runs for D-1, so the loop body
       is not executed at all when D == 0. */
    for (k = -(D-1); k <= (D-1); k += 2) {
      long x = find_D_path(D-1, k, Vf, a, b, N, M);

      Vf[k] = x;

      if (odd(delta) && k >= (delta - (D - 1)) && k <= (delta +	(D-1))) {
	// If the path overlaps the furthest reaching reverse
	// (D-1)-path in diagonal k then:
	//
	//   Length of an SES is d-1
	//   The last snake of the forward path is the middle snake

	return 0;
      }
    }

    for (k = -D; k <= D; k += 2) {
      /* Find the end of the furthest reaching reverse D-path in
	 diagonal k. */

      long x, y;

      /* Some ambiguity about the binding strengths of and/or here in
	 the paper. Assume AND stronger than OR: */
      if (k == -D || (k != D && Vb[k-1] < Vb[k+1]))
	x = Vb[k+1];
      else
	x = Vb[k-1]+1;

      y = x - k;

      /* Note the paper assumes that strings use 1-relative indices. C
	 uses 0-relative, so a_(x+1) => a_(x+1-1) => a(x) and likewise
	 for b. */
      while (x < N && y < M && 
	     diffline_cmp(getline(a,x+1), getline(b,y+1)) == 0) {
	x++;
	y++;
      }

      Vb[k] = x;

      if (even(delta) && (k+delta) >= -D && (k+delta) <= D) {
	// If the path overlaps the furthest reaching forward
	// (D)-path in diagonal k+delta then:
	//
	//   Length of an SES is d-1
	//   The last snake of the forward path is the middle snake

	return 0;
      }
    }
  }

  return 0;
}

#else
/* Compute the length of the shortest edit script (number of line
   inserts plus line deletes) that turns region a into region b,
   using the basic greedy algorithm from Myers' paper, and print it
   on stdout.

   At present only the length is reported: no edit script is built,
   and the function always returns a null pointer. */
static ObVec *
calc_ses_len(DiffRegion *a, DiffRegion *b)
{
  long N = a->bound;
  long M = b->bound;
  long MAX = M+N;		/* search limit */
  long D;

  /* V is a vector indexed by diagonal number. The search visits
     diagonals -MAX..MAX, and the seed value below is stored at V[1]
     even when MAX is 0, so one guard slot is added on each side:
     valid indices are [-(MAX+1)..MAX+1]. The allocation size is
     given in bytes, hence the multiplication by the element size.
     Keep Vbase alive so that the GC algorithm won't lose track of
     the allocation. */
  long *Vbase = GC_MALLOC_ATOMIC((2*MAX + 3) * sizeof(long));
  long *V = Vbase + (MAX + 1);

  /* Seed for D == 0, k == 0: makes the first path start at x == 0. */
  V[1] = 0;

  for (D = 0; D <= MAX; D++) {
    long k;

    /* Diagonals reachable with exactly D edits have the same parity
       as D, hence the step of 2. */
    for (k = -D; k <= D; k += 2) {
      long x = find_D_path(D, k, V, a, b, N, M);
      long y = x - k;

      V[k] = x;

      /* Reached the bottom-right corner: D is the answer. */
      if (x >= N && y >= M) {
        printf("Length of SES is %ld\n", D);
        return 0;
      }
    }
  }

  printf("Length of SES exceeds MAX\n");

  return 0;
}
#endif

/* Line-wise, text-oriented comparison of two buffers.

   Each buffer is first broken into a vector of DiffLine records by
   diff_extract_lines(); every record carries a hash of its line. The
   two vectors are then treated as strings whose "characters" are
   whole lines, which lets the comparison look at the hashes first
   (see diffline_cmp()).

   Both regions span their entire vector: base is 0 and bound is the
   number of lines. The flags argument is not consulted here. The
   value returned is whatever calc_ses_len() returns for the two
   regions. */
ObVec *
diff2(Buffer *common, Buffer *derived, unsigned flags)
{
  DiffRegion from;
  DiffRegion to;

  from.lines = diff_extract_lines(common);
  to.lines = diff_extract_lines(derived);

  from.base = 0;
  from.bound = vec_size(from.lines);

  to.base = 0;
  to.bound = vec_size(to.lines);

  return calc_ses_len(&from, &to);
}
