/* wordtable.c -- Copyright 1989, 1990 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file ../COPYRIGHT for full details.
 */

/* Symbol Table Interface to text retrieval database.
 * Handles both the internal and external indexes.
 *
 * This originally used a linked list.  Converting to a hash table reduced
 * the time to index comp.os.vms from nearly an hour to one and a half
 * minutes...
 *
 * Liam Quin, 1989
 */

/* 
 * $Id: wordtable.c,v 2.11 91/02/20 19:07:37 lee Rel1-10 $
 */

#ifndef lint
 static char *Rcs = "$Id: wordtable.c,v 2.11 91/02/20 19:07:37 lee Rel1-10 $";
#endif

#include "globals.h" /* defines and declarations for database filenames */

#ifdef SYSV
extern int _filbuf();
#endif
#include <stdio.h>
#include <malloc.h>
#include <ctype.h>
#include <sys/types.h>
#include <fcntl.h> /* for O_RDWR wtc */
#include "smalldb.h"
#include "fileinfo.h"
#include "wordinfo.h"
#include "pblock.h"
#include "wordrules.h"
#include "emalloc.h"

#define HASHSIZ 32768 /* MUST be a power of two */

#ifndef MAXWORDSINCACHE
# define MAXWORDSINCACHE  (HASHSIZ * 10)
#endif
int MaxWordsInCache = MAXWORDSINCACHE;

extern int AsciiTrace;

/* useful macros */
/* NumberOfElements: element count of `array', given the element type.
 * Only meaningful when `array' is a real array; sizeof a pointer would
 * give the wrong answer.
 */
#define NumberOfElements(array, type) (sizeof(array)/sizeof(type))
/* STRCMP: three-way string compare that looks at the first characters
 * (as plain char) before falling back on strcmp(), so that most
 * mismatches are settled without a function call.
 * Both arguments are evaluated more than once -- do not pass
 * expressions with side effects.
 */
#define STRCMP(a,b) ((*(a) > *(b)) ? 1 : ((*(a) < *(b)) ? -1 : strcmp(a,b)) )
/* #define Hash(WordInfo) \
 *	(dbm_hash(WordInfo->Word, WordInfo->Length) % HashSize)
 */

/** System calls and library functions used in this file: **/

/** Lqtext calls */
extern unsigned int Putpblock();
extern void DeleteWordPlaces();

/** System calls: */

/** Library Functions: */
extern char *strncpy();
extern int strcmp();
extern void perror();
extern void exit();
/**/

#define enew(var, type) (var = (type *) emalloc(sizeof (type)))

extern char *progname;
static int HashSize = HASHSIZ; /* MUST be a power of two */

#ifdef NEWSYM

#define NPLACES 7
/* This is small to optimise the common case -- by far the majority of
 * words are used fewer than 10 times.  In the cases where we've gone
 * wrong, well, there'll be a few thousand.
 */

/* One slot of the hash table.  The same structure is also used for the
 * overflow elements that AddWord() links onto a slot when its Places
 * array is full; in those, only PlacesUsed, Places and Next are used.
 */
typedef struct s_HashEl {
    char *Word;		/* the word itself; a null pointer marks a free slot */
    t_WID WID;		/* (t_WID) -1: not looked up yet; 0: DumpCache() makes
			 * a new index entry; otherwise an existing entry
			 * is updated
			 */
    int PlacesUsed;	/* number of entries of Places[] that are filled in */
    t_WordPlace Places[NPLACES];	/* occurrences recorded so far */
    struct s_HashEl *Next;	/* overflow element holding more places, or 0 */
} t_HashEl;

/* The table itself: HashSize slots, allocated by StartHash() */
static t_HashEl *SymbolTable;
/* One past the final slot of SymbolTable.  It is a null pointer until
 * StartHash() has run, which is how AddWord() knows to call StartHash().
 */
static t_HashEl *LastEl;
/* Incremented by AddWord() and compared there against MaxWordsInCache;
 * reset to zero by DumpCache().
 */
static int WordsInCache = 0;

/* StartHash -- allocate and clear the hash table.
 *
 * If MaxWordsInCache is non-zero the table gets MaxWordsInCache / 16
 * slots; otherwise the compiled-in HashSize is kept.  The size is never
 * allowed to fall below one slot, because Hash() reduces modulo HashSize
 * and a zero (or negative) size would mean a division by zero there and
 * a nonsensical allocation here.
 *
 * On return, LastEl points one past the final slot and MaxWordsInCache
 * has been set to the number of slots, so that AddWord() dumps the cache
 * once it has counted more additions than there are slots.
 *
 * No useful value is returned.
 */
StartHash()
{
    if (MaxWordsInCache) {
	HashSize = MaxWordsInCache / 16;
    }
    if (HashSize < 1) {
	/* MaxWordsInCache was less than 16 (or negative) */
	HashSize = 1;
    }

    SymbolTable = (t_HashEl *) emalloc(sizeof(t_HashEl) * HashSize);

    /* Note that we only need to initialise the Word pointers: the other
     * fields are set by SetElEmpty() when a slot is first used.
     */
    for (LastEl = SymbolTable; LastEl != &SymbolTable[HashSize]; LastEl++) {
	LastEl->Word = (char *) 0;
    }
    /* ASSERT: LastEl == &SymbolTable[HashSize] */
    MaxWordsInCache = HashSize;
}

/* SetElEmpty -- put a hash element into its "nothing stored" state.
 *
 * El is the element to reset.  Afterwards it has no word, no recorded
 * places, no overflow chain, and a WID of (t_WID) -1.  The WID is
 * deliberately not zero, so that "haven't looked it up yet" can be told
 * apart from "looked it up and it is unknown".
 *
 * No useful value is returned.
 */
SetElEmpty(El)
    t_HashEl *El;
{
    El->Next = (t_HashEl *) 0;
    El->PlacesUsed = 0;
    El->WID = (t_WID) -1;
    El->Word = (char *) 0;
}

void DumpCache();

/* AddWord -- record one occurrence of a word in the in-memory cache.
 *
 * WordInfo supplies the word (Word, Length) and the occurrence to record
 * (WordPlace).  A null WordInfo, a null Word or an empty Word produces a
 * warning on stderr and is otherwise ignored.
 *
 * The table is created on the first call.  On later calls the addition
 * is counted, and when the count passes MaxWordsInCache the whole cache
 * is written out with DumpCache(1) before the new occurrence is stored.
 *
 * Words consisting of a `q' followed only by zero or more `x's are
 * silently discarded.
 *
 * Slots are found by linear probing from Hash(WordInfo), wrapping at the
 * end of the table.  If every slot is taken by some other word, the cache
 * is dumped and the word is added again from scratch.
 *
 * Each slot holds NPLACES occurrences; further occurrences go into
 * overflow elements linked in immediately after the slot.
 */
void
AddWord(WordInfo)
    t_WordInfo *WordInfo;
{
    register t_HashEl *HashEl;
    int Slot;
    t_HashEl *FirstEl;

    if (!WordInfo || !WordInfo->Word || !WordInfo->Word[0]) {
	(void) fprintf(stderr, "%s: warning: Null Word in AddWord\n", progname);
	return;
    }

    if (!LastEl) {
	StartHash();
    } else if (MaxWordsInCache && ++WordsInCache > MaxWordsInCache) {
	DumpCache(1);
    }

    if (WordInfo->Word[0] == 'q') {
	register char *xp;

	/* skip over any run of x's following the q */
	for (xp = &WordInfo->Word[1]; *xp == 'x'; xp++) {
	    /*NULLBODY*/
	}
	if (!*xp) {
	    /* nothing but q and x's: not indexed */
	    if (AsciiTrace >= 10) {
		/* Word is a string, so it must be printed with %s */
		(void) fprintf(stderr, "Discard %s\n", WordInfo->Word);
	    }
	    return;
	}
    }

    Slot = Hash(WordInfo);
    FirstEl = HashEl = &SymbolTable[Slot];

    for (;;) {
	if (!HashEl->Word) {
	    extern t_WID Word2WID();

	    if (AsciiTrace > 9) {
		/* the word itself is printed by the trace further down */
		(void) fprintf(stderr, "New ");
	    }
	    /* make a new element */
	    SetElEmpty(HashEl);
	    HashEl->Word = emalloc(WordInfo->Length + 1);
	    /* Bounded copy: the buffer is exactly Length + 1 bytes, so
	     * do not rely on where the caller's terminating NUL is.
	     */
	    (void) strncpy(HashEl->Word, WordInfo->Word,
						(int) WordInfo->Length);
	    HashEl->Word[WordInfo->Length] = '\0';
	    HashEl->WID = Word2WID(HashEl->Word, WordInfo->Length);
	    break;
	} else if (STREQ(HashEl->Word, WordInfo->Word)) {
	    break;
	}

	/* linear probe, wrapping round at the end of the table */
	if (++HashEl == LastEl) HashEl = SymbolTable;

	if (HashEl == FirstEl) {
	    /* Been all the way round: the table is full of other words.
	     * We need to dump the cache and start again.
	     */
	    DumpCache(1);
	    AddWord(WordInfo);
	    return;
	}
    }

    /* If we get here, all we need to do is add the WordPlace */
    if (AsciiTrace > 9) {
	(void) fprintf(stderr, "AddWord %s\n", WordInfo->Word);
    }
    FirstEl = HashEl; /* from now on, the head of this word's chain */

    /* look for an element with room in it */
    while (HashEl->PlacesUsed >= NPLACES && HashEl->Next != (t_HashEl *) 0) {
	HashEl = HashEl->Next;
    }

    if (HashEl->PlacesUsed >= NPLACES) {
	t_HashEl *New;

	/* Everything in the chain is full.  Use emalloc(), like the rest
	 * of this file, so that running out of memory is caught; the
	 * element is released with efree() by DumpCache().
	 */
	New = (t_HashEl *) emalloc(sizeof(t_HashEl));
	SetElEmpty(New);

	/* link it in directly after the head, where it is found first */
	New->Next = FirstEl->Next;
	FirstEl->Next = HashEl = New;
    }
    HashEl->Places[HashEl->PlacesUsed] = WordInfo->WordPlace; /* structure copy */
    HashEl->PlacesUsed++;
    return;
}

void
DumpCache(CallFree)
    int CallFree;
{
    register t_HashEl *HashEl, *MeNext;
    int Progress = 0;

    for (HashEl = SymbolTable; HashEl != LastEl; HashEl++) {
	if (HashEl->Word) {
	    extern t_WordInfo *MakeWordInfo();
	    unsigned len;
	    t_WordInfo *WP;

	    /* We are going to make a new index entry for the word.
	     * There are two cases -- depending on whether the word
	     * is already indexed or not.
	     * In the former case we must merge the new information.
	     * In the latter case we don't have to read the old info,
	     * but we must make a new entry in the WID Index.
	     */

	    len = strlen(HashEl->Word);
	    if (HashEl->WID == (t_WID) -1) {
		HashEl->WID = Word2WID(HashEl->Word, len);
	    }
	    WP = MakeWordInfo(HashEl->WID, len, HashEl->Word);

	    if (HashEl->WID == (t_WID) 0) {
		NewEntry(HashEl, WP);
	    } else {
		UpdateEntry(HashEl, WP);
	    }
	    /* Reclaim storage */
	    if (CallFree) {
		extern void SlayWordInfo();
		register t_HashEl *FreeMe = HashEl;

		(void) SlayWordInfo(WP);

		efree(HashEl->Word);
		FreeMe->Word = (char *) 0;
		FreeMe = FreeMe->Next; /* don't do the first one */
		while (FreeMe) {
		    MeNext = FreeMe->Next;
		    (void) efree((char *) FreeMe);
		    FreeMe = MeNext;
		}
	    }
	}
	if (AsciiTrace > 1) {
	    if (HashEl - SymbolTable >= Progress * (HashSize / 16)) {
		fputc(" 01234567890ABCDEFGHIJKL"[Progress], stderr);
		++Progress;
	    }
	}
    }
    WordsInCache = 0;
}

/* NewEntry -- create the index entry for a word not yet in the database.
 *
 * HashEl: head of the cache chain holding the word's occurrences.
 * WP:     scratch t_WordInfo for the word; Word and Length must already
 *         be set.  The remaining fields are overwritten here.
 *
 * A fresh WID is obtained from GetNextWID(), all the cached places are
 * gathered into a single pblock, and the entry is written: into the main
 * index alone if MkWIB() managed to pack every place, and otherwise via
 * Putpblock() with the chain start recorded in the index.
 *
 * If the index cannot be written, a message is printed and the program
 * exits with status 1.
 *
 * The caller *must* do SlayWordInfo(WP) afterwards.
 * No useful value is returned.
 */
NewEntry(HashEl, WP)
    t_HashEl *HashEl;
    t_WordInfo *WP;
{
    extern t_WID GetNextWID();
    extern int errno;
    t_pblock *pblock;
    long MatchCount;
    t_HashEl *Ep;

    /** Assign a new WID */
    WP->WID = GetNextWID();

    /** make a WIDIndex entry and mark it as invalid (NOTDONE) */

    /* In order to do this, we must make a "pblock", a structure that
     * reflects the physical database.  This is fairly low-level stuff
     * for efficiency's sake...
     */

    /* count the total number of entries we're adding: */
    for (Ep = HashEl, MatchCount = 0; Ep; Ep = Ep->Next) {
	MatchCount += Ep->PlacesUsed;
    }

    /* allocate a pblock structure.  These are rather devious things, a
     * structure with an array tacked onto the end.
     */
    pblock = (t_pblock *) emalloc(sizeof(t_pblock) +
				MatchCount * sizeof(t_WordPlace));

    pblock->WID = WP->WID;
    pblock->ChainStart = 0L; /* address on disk -- not there yet, so 0! */
    pblock->NumberOfWordPlaces = WP->NumberOfWordPlaces = MatchCount;

    /* fill in the WordPlaces */
    for (Ep = HashEl, MatchCount = 0; Ep; Ep = Ep->Next) {
	register int i;

	for (i = 0; i < Ep->PlacesUsed; i++) {
	    pblock->WordPlaces[MatchCount++] = Ep->Places[i]; /* struct copy */
	}
    }

    /* Now fill in enough of WP to let us use the low-level routines: */
    WP->FID = (t_FID) 0;
    WP->Next = (t_WordInfo *) 0;
    WP->DataBlock = (char *) 0;
    WP->WordPlaceStart = (char *) 0;
    WP->WordPlaces = (t_WordPlace *) 0;
    WP->WordPlacesInHere = 0;
    WP->WordPlace.FID = 0;
    WP->WordPlace.Flags = 0;
    WP->Offset = 0;

    /* First, let's make an index entry: */
#ifndef MaxWordPlacesInAWordBlock
# define MaxWordPlacesInAWordBlock ((WIDBLOCKSIZE-(WP->Length+2)/3))
#endif
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock) {
	(void) MkWIB(WP, pblock);
    }

    /** write out the new entry */
    if (WP->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the main index */
	if (PutWordInfoIntoIndex(WP, (unsigned long) 0L) < 0) {
	    int e = errno;

	    fprintf(stderr, "%s: Couldn't insert word \"%s\" into the index",
				progname, WP->Word);
	    /* fprintf() may have changed errno: put back the value that
	     * describes the actual failure before perror() reads it.
	     */
	    errno = e;
	    perror("");
	    exit(1);
	}
    } else {
	(void) Putpblock(WP, pblock);
	if (PutWordInfoIntoIndex(WP, pblock->ChainStart) < 0) {
	    int e = errno;

	    fprintf(stderr, "%s: Couldn't re-insert word \"%s\" into the index",
				progname, WP->Word);
	    errno = e; /* see above */
	    perror("");
	    exit(1);
	}
    }

    /** mark it as valid (NOTDONE) */

    /** reclaim storage */
    (void) efree((char *) pblock);
    /* the caller *must* do SlayWordInfo(WP) */
}

UpdateEntry(HashEl, WP)
    t_HashEl *HashEl;
    t_WordInfo *WP;
{
    extern t_pblock *Getpblock();
    extern t_WordInfo *WID2WordInfo();
    t_pblock *pblock;
    long MatchCount;
    t_HashEl *Ep;
    t_WordInfo *Wpp;

    /** Mark the old entry as invalid (NOTDONE) */

    /** get the old entry */
    if ((Wpp = WID2WordInfo(WP->WID)) == (t_WordInfo *) 0) {
	/* someone else has just deleted it! */
	NewEntry(HashEl, WP);
	return;
    }
    /* It would be best if we could append to the old entry... which is what
     * I had in mind when I designed the disk storage stuff... but you can't.
     */
    pblock = Getpblock(Wpp);

    /** merge the old and new entries */

    /* count the total number of entries we're adding: */
    for (Ep = HashEl, MatchCount = 0; Ep; Ep = Ep->Next) {
	MatchCount += Ep->PlacesUsed;
    }

    pblock = (t_pblock *) erealloc((char *) pblock, sizeof(t_pblock) +
	     (Wpp->NumberOfWordPlaces + MatchCount) * sizeof(t_WordPlace));

    /* delete the old entry from disk */
    if (Wpp->Offset) {
	DeleteWordPlaces(Wpp->Offset, Wpp->WID);
    }

    /* fill in the WordPlaces */
    for (Ep = HashEl, MatchCount = 0; Ep; Ep = Ep->Next) {
	register int i;

	for (i = 0; i < Ep->PlacesUsed; i++) {
	    pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
					Ep->Places[i]; /* struct copy */
	}
    }

    Wpp->Offset = 0L; /* it's invalid now... */
    Wpp->WordPlacesInHere = 0;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock) {
	(void) MkWIB(WP, pblock);
    }

    /** write out the new entry */
    if (Wpp->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the main index */
	if (PutWordInfoIntoIndex(Wpp, (unsigned long) 0L) < 0) {
	    extern int errno;
	    int e = errno;
	    fprintf(stderr, "%s: Couldn't insert word \"%s\" into the index",
				progname, Wpp->Word);
	    perror("");
	    exit(1);
	}
    } else {
	(void) Putpblock(Wpp, pblock);
	if (PutWordInfoIntoIndex(Wpp, pblock->ChainStart) < 0) {
	    extern int errno;
	    int e = errno;
	    fprintf(stderr, "%s: Couldn't re-insert word \"%s\" into the index",
				progname, Wpp->Word);
	    perror("");
	    exit(1);
	}
    }

    /** mark it as valid (NOTDONE) */

    /** reclaim storage */
    (void) efree((char *)pblock);
    /* the caller *must* do SlayWordInfo(WP) */
    (void) SlayWordInfo(Wpp);
}

#else /* NEWSYM */
static t_WordPlaceList *SymbolTable[HASHSIZ]; /* static --> initialised to 0 */
#endif /* NEWSYM */

#ifdef __GNU__
inline
#endif
#ifndef Hash
int
Hash(WordInfo)
    t_WordInfo *WordInfo;
{
    /* Hash -- map a word onto a slot number.
     *
     * The first WordInfo->Length characters of WordInfo->Word are folded
     * together, multiplying the running total by 65599 before each
     * character is added.  The result is reduced modulo HashSize.
     */
    register unsigned long Sum = 0;
    register char *cp = WordInfo->Word;
    register int Remaining = WordInfo->Length;

#ifdef DUFF /* the same loop unrolled eight times, Duff's-device fashion */

#define HASHC	Sum = *cp++ + 65599 * Sum

    if (Remaining > 0) {
	register int Passes = (Remaining + 7) / 8;

	/* jump into the middle of the loop to take care of the odd
	 * Remaining % 8 characters on the first pass
	 */
	switch (Remaining % 8) {
	case 0:
	    do {
		HASHC;
	case 7:
		HASHC;
	case 6:
		HASHC;
	case 5:
		HASHC;
	case 4:
		HASHC;
	case 3:
		HASHC;
	case 2:
		HASHC;
	case 1:
		HASHC;
	    } while (--Passes);
	}
    }
#else /* DUFF */
    for (; Remaining != 0; --Remaining) {
	Sum = 65599 * Sum + *cp++;
    }
#endif /* DUFF */

    /* modulo rather than a mask, so any HashSize will do */
    return Sum % HashSize;
}
#endif

/* HashOK is zero until InitHash() has been called, and one afterwards. */
static int HashOK = 0;

/* InitHash -- note that the hash table is ready for use.
 *
 * All this does is set HashOK.  Calling it more than once is harmless.
 */
void
InitHash()
{
    HashOK = 1;
}

#ifndef NEWSYM
static int WordsInCache = 0;

/* FIXME: this ought to take a WordInfo and a WordPlaceList instead.
 * Using a hash table means that we can end up with really pathological
 * paging behaviour.  Nearly all of lqaddfile is resident when running
 * on a Sun.  Hence, I shall be replacing this code entirely soon with
 * something that has less memory fragmentation, perhaps by coalescing
 * list members or with a tree.
 * For now, MaxWordsInCache is a parameter that you can set to zero if
 * you want.
 *
 * Also, the cache structure should be clever enough to avoid writing
 * out the more common words if it can, so as to minimise the number
 * of data _fetches_ that have to be done.
 * You could also argue that it should be more efficient to add new data,
 * of course.  I couldn't disagree.
 *
 * Next change required is to make AddWord do a little more of the work --
 * in particular, to call Word2WID for each new word, in an attempt to
 * make cache dumping faster.
 */

AddWord(WordInfo) /* old version */
    t_WordInfo *WordInfo;
{
    int Slot;
    int GreaterOrLess = 1;
    t_WordPlaceList *SaveOldNext;
    t_WordPlaceList **WPL;

    if (!HashOK) InitHash();

    /* The following are all awfully serious internal errors.
     * They will only happen if I make a huge coding error, whereupon
     * they tend to happen for every word in the input...
     */
    if (!WordInfo) {
	fprintf(stderr, "AddWord(0)\n");
	return;
    } else if (!WordInfo->Word) {
	fprintf(stderr, "AddWord(Word=0)\n");
	return;
    } else if (!WordInfo->Word[0]) {
	fprintf(stderr, "AddWord(Word[0]=0)\n");
	return;
#ifdef ASCIITRACE
    } else if (AsciiTrace > 20) {
	fprintf(stderr, "[%s.len %d]\n", WordInfo->Word, WordInfo->Length);
#endif
    }

    Slot = Hash(WordInfo);

#ifdef ASCIITRACE
    if (AsciiTrace > 10) {
	fprintf(stderr, "H %d %s\n", Slot, WordInfo->Word);
    }
#endif

    if (WordInfo->Word[0] == 'q') {
	register char *p = WordInfo->Word;

	/* Words of the form qxxxxx* are not indexed.  This is so the filters
	 * can preprocess the files without upsetting the word counts.
	 * If you can think of a better way to do this, well, tell me!
	 * Lee
	 */
	
	for (++p; p - WordInfo->Word < WordInfo->Length; p++) {
	    if (*p != 'x') break;
	}

	if (p - WordInfo->Word == WordInfo->Length) {
#ifdef ASCIITRACE
	    if (AsciiTrace > 10) {
		(void) fprintf(stderr, "rejected %s (too boring)\n",
					WordInfo->Word);
	    }
#endif
	    return;
	}
    }

    for (WPL = &SymbolTable[Slot]; *WPL; WPL = &((*WPL)->Next)) {
	if ((GreaterOrLess = STRCMP((*WPL)->Word, WordInfo->Word)) <= 0) {
	    break;
	}
    }

    /* Insert the new word at the head of the Word Chain,
     * i.e. at the start of the group of similar words
     */
    SaveOldNext = *WPL;

    enew(*WPL, t_WordPlaceList);
    (*WPL)->WordPlace = WordInfo->WordPlace; /* structure copy */
    (*WPL)->WordPlace.FID = WordInfo->WordPlace.FID;
    (*WPL)->Next = SaveOldNext;

    if (GreaterOrLess || !SaveOldNext) {
	(*WPL)->Word = emalloc(WordInfo->Length + 1);
	(void) strncpy((*WPL)->Word, WordInfo->Word, (int) WordInfo->Length);
	(*WPL)->Word[WordInfo->Length] = '\0';
    } else {
	/* The word is already saved, so we only need to link to it */
	(*WPL)->Word = SaveOldNext->Word;
    }
    if (MaxWordsInCache && ++WordsInCache > MaxWordsInCache) {
	void DumpCache();

	DumpCache(1);
	WordsInCache = 0;
    }
}

void
DumpCache(CallFree)
    int CallFree; /* call efree() if non-zero */
{
    extern int WriteWordChain();

    register int Slot;
    register t_WordPlaceList *WordPlaceList;
    int WordsLeft = WordsInCache;
    int EmptySlots = 0, UsedSlots = 0;
    int Progress = 0;

    if (WordsInCache == 0) return; /* save some work maybe */

    if (AsciiTrace) {
	fprintf(stderr, "Writing%s%d words\n",
			(CallFree) ? " and freeing " : " ", WordsInCache);
    }

    for (Slot = 0; WordsLeft > 0 && Slot < HASHSIZ; Slot++) {

	if (AsciiTrace > 1) {
	    if (Slot >= Progress * (HASHSIZ / 16)) {
		fputc(" 01234567890ABCDEFGHIJKL"[Progress], stderr);
		++Progress;
	    }
	}
	if (SymbolTable[Slot] == (t_WordPlaceList *) 0) {
	    ++EmptySlots;
	    continue;
	} else {
	    char *LastFreed = (char *) 0;

	    ++UsedSlots;
	    WordPlaceList = SymbolTable[Slot];
	    WordsLeft -= WriteWordChain(WordPlaceList);

	    if (CallFree) {
		while (WordPlaceList) {
		    register t_WordPlaceList *SavePointer;

		    if (WordPlaceList->Word &&
					WordPlaceList->Word != LastFreed) {
			efree(WordPlaceList->Word);
			LastFreed = WordPlaceList->Word;
		    }

		    SavePointer = WordPlaceList->Next;
		    efree((char *) WordPlaceList);
		    WordPlaceList = SavePointer;
		}
		SymbolTable[Slot] = (t_WordPlaceList *) 0;
	    }
	}
    }

    if (AsciiTrace) {
	double d = UsedSlots;
	d /= (EmptySlots + UsedSlots);
	d *= 100.0;

	fprintf(stderr, "%4.3f%% cache used -- %d out of (%d <= %d)\n",
			d, UsedSlots, UsedSlots + EmptySlots, HASHSIZ);
#ifdef MALLOCTRACE
	mallocmap();
#endif
    }

    if (WordsInCache != 0 && CallFree) {
	WordsInCache = 0;
    }
}

#endif /*!NEWSYM*/

/*
 * $Log:	wordtable.c,v $
 * Revision 2.11  91/02/20  19:07:37  lee
 * The qxxx fix only worked if ASCIITRACE was defined!
 * 
 * Revision 2.10  90/10/06  00:51:05  lee
 * Prepared for first beta release.
 * 
 * Revision 2.9  90/10/05  23:44:30  lee
 * Major experimentation with new symbol table failed...
 * 
 * Revision 2.8  90/09/26  19:45:02  lee
 * Added call to mallocmap() in ifdef MALLTRACE.
 * 
 * Revision 2.7  90/09/20  18:58:25  lee
 * Added some comments, and deleted a needless test.  Reorderered a loop
 * in the (probably vain) hope of a speed-up in the face of paging...
 * 
 * Revision 2.6  90/09/19  20:25:44  lee
 * Don't index "qxxxxxxxx" words (this is a hook for filters...)
 * 
 * Revision 2.5  90/08/29  21:46:11  lee
 * Alpha release
 * 
 * Revision 2.4  90/08/09  19:17:37  lee
 * BSD lint and Saber
 * 
 * Revision 2.3  90/03/21  17:32:31  lee
 * new hashing function, masses, masses better -- the old one only ever
 * used abuot 6% of the available values!
 * 
 * Revision 2.2  89/10/08  20:47:47  lee
 * Working version of nx-text engine.  Addfile and wordinfo work OK.
 * 
 * Revision 2.1  89/10/02  01:16:22  lee
 * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
 * 
 * Revision 1.3  89/09/17  23:05:15  lee
 * Various fixes; NumberInBlock now a short...
 * 
 * Revision 1.2  89/09/16  21:18:55  lee
 * First demonstratable version.
 * 
 * Revision 1.1  89/09/07  21:06:20  lee
 * Initial revision
 * 
 */
