
/*****************************************************************************
                Copyright Carnegie Mellon University 1992

                      All Rights Reserved

 Permission to use, copy, modify, and distribute this software and its
 documentation for any purpose and without fee is hereby granted,
 provided that the above copyright notice appear in all copies and that
 both that copyright notice and this permission notice appear in
 supporting documentation, and that the name of CMU not be
 used in advertising or publicity pertaining to distribution of the
 software without specific, written prior permission.

 CMU DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
 ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
 CMU BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
 ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 SOFTWARE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* edit_dist -- returns the minimum edit distance between two strings

	Program by:  Mark Maimone   CMU Computer Science   13 Nov 89
	Last Modified:  28 Jan 90

   If the input strings have length n and m, the algorithm runs in time
   O(nm) and space O(min(m,n)).

HISTORY
   13 Nov 89 (mwm) Created edit_dist() and set_costs().

   28 Jan 90 (mwm) Added view_costs().  Should verify that THRESHOLD
   computations will work even when THRESHOLD is not a multiple of
   sizeof(int).

*/


/* When DEBUG is defined, edit_dist() traces the dynamic-programming matrix
   through eprintf().  eprintf() is not defined in this part of the file;
   presumably it is supplied elsewhere in the project. */
#define DEBUG
#define THRESHOLD 4000		/* worry about allocating more memory only
				   when this # of bytes is exceeded */
/* Longest string whose   2 * length + 3   working ints still fit in
   THRESHOLD bytes. */
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))

/* Store   y   through pointer   x   unless   x   is NULL; the value of the
   expression is   y   either way. */
#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))

/* Exchange two lvalues through the file-scope temporaries declared below.
   The swap_int temporary is an int, so wider operands are truncated. */
#define swap_int(x,y)  (_iswap = (x), (x) = (y), (y) = _iswap)
#define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
/* Minimum of three / two ints.  Each argument is evaluated exactly once
   into the shared temporaries _mx, _my, _mz, so an argument must not itself
   contain another min2/min3 invocation. */
#define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
#define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))


/* Current edit costs; read with view_costs(), changed with set_costs(). */
static int insert_cost = 1;
static int delete_cost = 1;
static int change_cost = 1;
static int swap_cost   = 1;

static int _iswap;			/* swap_int temp variable */
static char *_cswap;			/* swap_char temp variable */
static int _mx, _my, _mz;		/* min2, min3 temp variables */

/* view_costs -- copy the edit costs currently in effect into the locations
   the caller supplies.  Any of the four pointers may be NULL, in which case
   that particular cost is simply not reported. */

void view_costs (ins, del, ch, swap)
int *ins, *del, *ch, *swap;
{
    if (ins != NULL)
	*ins = insert_cost;

    if (del != NULL)
	*del = delete_cost;

    if (ch != NULL)
	*ch = change_cost;

    if (swap != NULL)
	*swap = swap_cost;
} /* view_costs */

/* set_costs -- install new edit costs.  The values are stored as given (no
   range checking) and are used by every later call to edit_dist(). */

void set_costs (ins, del, ch, swap)
int ins;	/* cost of inserting one character */
int del;	/* cost of deleting one character */
int ch;		/* cost of changing one character into another */
int swap;	/* cost of exchanging two adjacent characters */
{
    swap_cost   = swap;
    change_cost = ch;
    delete_cost = del;
    insert_cost = ins;
} /* set_costs */

/* edit_dist -- returns the minimum edit distance between two strings, or
   -1 on failure.

   from, to:  NUL-terminated strings; a NULL pointer is treated like the
	      empty string.

   The distance is the cheapest sequence of insertions, deletions,
   single-character changes and swaps of two adjacent characters that
   turns   from   into   to,   priced with the costs installed by
   set_costs().  -1 is returned only when the working storage cannot be
   allocated.

   Short inputs are processed in a static buffer and the min2/min3 macros
   use file-scope temporaries, so this function is not reentrant. */

int edit_dist (from, to)
char *from, *to;
{
    long int from_len, to_len;		/* length of input strings */
    register int ins, del, ch;		/* local copies of edit costs */
    register int row, col, index;	/* dynamic programming counters */
    register int radix;			/* radix for modular indexing */
    int result;				/* the final distance */
    int *buffer;			/* pointer to storage for one row
					   of the d.p. array */
    static int store[THRESHOLD / sizeof (int)];
					/* a small amount of static
					   storage, to be used when the
					   input strings are small enough */

/* Handle trivial cases when one string is empty */

    if (from == NULL || *from == '\0') {
	if (to == NULL || *to == '\0')
	    return 0;
	return (int) strlen (to) * insert_cost;
    }
    if (to == NULL || *to == '\0')
	return (int) strlen (from) * delete_cost;

/* Initialize registers */

    from_len = strlen (from);
    to_len = strlen (to);
    ins  = insert_cost;
    del  = delete_cost;
    ch   = change_cost;

/* Make   from   short enough to fit in the static storage, if it's at all
   possible.  Exchanging the strings turns every insertion into a deletion
   and vice versa, so those two costs are exchanged as well.  Local
   temporaries of the right type are used so that the lengths are not
   squeezed through an int. */

    if (from_len > to_len && from_len > STRLENTHRESHOLD) {
	long int len_tmp = from_len;
	char *str_tmp = from;
	int cost_tmp = ins;

	from_len = to_len;
	to_len = len_tmp;
	from = to;
	to = str_tmp;
	ins = del;
	del = cost_tmp;
    } /* if from_len > to_len */

/* The radix MUST be derived from the final value of   from_len:   all of
   the modular offsets used below (N, NW, W, NNWW) assume
   radix == 2 * from_len + 3,   and the static buffer is only known to be
   large enough for that many elements. */

    radix = (int) (2 * from_len + 3);

/* Allocate the array storage (from the heap if necessary) */

    if (from_len <= STRLENTHRESHOLD)
	buffer = store;
    else {
	buffer = malloc ((size_t) radix * sizeof *buffer);
	if (buffer == NULL) {
	    fprintf (stderr,
		    "edit_dist:  Couldn't allocate %ld ints, quitting\n",
		    (long) radix);
	    return -1;
	} /* if buffer == NULL */
    }

/* Here's where the fun begins.  We will find the minimum edit distance
   using dynamic programming.  We only need to store two rows of the matrix
   at a time, since we always progress down the matrix.  For example,
   given the strings "one" and "two", and insert, delete and change costs
   equal to 1:

	   _  o  n  e
	_  0  1  2  3
	t  1  1  2  3
	w  2  2  2  3
	o  3  2  3  3

   The dynamic programming recursion is defined as follows:

	ar(x,0) := x * insert_cost
	ar(0,y) := y * delete_cost
	ar(x,y) := min (a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
			a(x - 1, y) + insert_cost,
			a(x, y - 1) + delete_cost,
			a(x - 2, y - 2) + (from[x] == to[y-1] &&
					   from[x-1] == to[y] ? swap_cost :
					   infinity))

   Since this only looks at most two rows and three columns back, we need
   only store the values for the two preceding rows.  In this
   implementation, we do not explicitly store the zero column, so only 2 *
   from_len + 2   words are needed.  However, in the implementation of the
   swap_cost   check, the current matrix value is used as a buffer; we
   can't overwrite the earlier value until the   swap_cost   check has
   been performed.  So we use   2 * from_len + 3   elements in the buffer.

   The buffer is used as a ring:   index   is the cell being written, and
   with   radix == 2 * from_len + 3   the neighbours are found at

	W    = index - 1		== index + radix - 1     (mod radix)
	N    = index - from_len		== index + from_len + 3  (mod radix)
	NW   = index - from_len - 1	== index + from_len + 2  (mod radix)
	NNWW = index - 2*from_len - 2	== index + 1             (mod radix)
*/

#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
	buffer[mod(index)]))
#define NW(x,y)	  ar (x, y, index + from_len + 2)
#define N(x,y)	  ar (x, y, index + from_len + 3)
#define W(x,y)	  ar (x, y, index + radix - 1)
#define NNWW(x,y) ar (x, y, index + 1)
#define mod(x) ((x) % radix)

    index = 0;

#ifdef DEBUG
    eprintf ("      ");
    for (col = 0; (long int) col < from_len;col++)
	eprintf (" %c ", from[col]);
    eprintf ("\n   ");

    for (col = 0; (long int) col <= from_len;col++)
	eprintf ("%2d ", col * del);
#endif

/* Row 0 is handled implicitly; its value at a given column is   col*del.
   The loop below computes the values for Row 1.  At this point we know the
   strings are nonempty.  We also don't need to consider swap costs in row
   1.

   COMMENT:  the indices   row and col   below point into the STRING, so
   the corresponding MATRIX indices are   row+1 and col+1.
*/

    buffer[index++] = min2 (ins + del, (from[0] == to[0] ? 0 : ch));

#ifdef DEBUG
    eprintf ("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
#endif

    for (col = 1; (long int) col < from_len; col++) {
	buffer[index] = min3 (
		col * del + ((from[col] == to[0]) ? 0 : ch),
		(col + 1) * del + ins,
		buffer[index - 1] + del);
	index++;

#ifdef DEBUG
	eprintf ("%2d ", buffer[index - 1]);
#endif

    } /* for col = 1 */

#ifdef DEBUG
    eprintf ("\n %c %2d ", to[1], 2 * ins);
#endif

/* Now handle the rest of the matrix */

    for (row = 1; (long int) row < to_len; row++) {
	for (col = 0; (long int) col < from_len; col++) {
	    buffer[index] = min3 (
		    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
		    N(row, col + 1) + ins,
		    W(row + 1, col) + del);

	    /* A swap is possible only when the last two characters of the
	       two prefixes are each other's mirror image. */
	    if (from[col] == to[row - 1] && col > 0 &&
		    from[col - 1] == to[row])
		buffer[index] = min2 (buffer[index],
			NNWW(row - 1, col - 1) + swap_cost);

#ifdef DEBUG
	    eprintf ("%2d ", buffer[index]);
#endif

	    index = mod (index + 1);
	} /* for col = 0 */
#ifdef DEBUG
	if (row < to_len - 1)
	    eprintf ("\n %c %2d ", to[row+1], (row + 2) * ins);
	else
	    eprintf ("\n");
#endif
    } /* for row = 1 */

/* The answer is the cell written last, i.e. the one just before   index. */

    result = buffer[mod (index + radix - 1)];
    if (buffer != store)
	free (buffer);
    return result;
} /* edit_dist */
