/* pblock.c -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 */

/* The physical Word Database for NX-Text...
 *
 * Interface by Liam Quin, 1989
 *
 * The main database is a mesh of linked lists, rather like tagliatelli.
 *
 * $Id: pblock.c,v 1.10 90/10/13 03:08:36 lee Rel1-10 $
 *
 * $Log:	pblock.c,v $
 * Revision 1.10  90/10/13  03:08:36  lee
 * deleted an illegal dereference!
 * 
 * Revision 1.9  90/10/06  00:12:17  lee
 * Prepared for first beta release.
 * 
 * Revision 1.8  90/09/29  23:49:05  lee
 * commented out (with #if 0) some code that called WID2WordInfo needlessly.
 * 
 * Revision 1.7  90/08/29  21:47:04  lee
 * Alpha release.
 * 
 * Revision 1.6  90/08/09  19:16:52  lee
 * BSD lint and fixes...
 * 
 * Revision 1.5  90/08/08  22:26:17  lee
 * Many major lint and gcc -Wall fixes...
 * 
 * Revision 1.4  90/03/22  14:26:54  lee
 * Fixed the test for Sorting monotonicity (?)..., which had not been
 * checking that the FIDS were the same before checking the blocks...
 * 
 * Revision 1.3  90/03/21  19:42:22  lee
 * new calls to efree();
 * removed WID from the data blocks.
 * doesn't work yet, though, sorry.
 * 
 * Revision 2.2  89/10/08  20:46:53  lee
 * Working version of nx-text engine.  Addfile and wordinfo work OK.
 * 
 * Revision 2.1  89/10/02  01:15:24  lee
 * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
 * 
 * Revision 1.3  89/09/17  23:03:12  lee
 * Various fixes; NumberInBlock now a short...
 * 
 * Revision 1.2  89/09/16  21:16:30  lee
 * First demonstratable version.
 * 
 * Revision 1.1  89/09/07  21:06:05  lee
 * Initial revision
 * 
 *
 */

#include "globals.h" /* defines and declarations for database filenames */

#include <stdio.h> /* stderr, also for fileinfo.h */
#include <fcntl.h>
#include <malloc.h>
#include <sys/types.h>
#include "fileinfo.h" /* for wordinfo.h */
#include "wordinfo.h"
#include "pblock.h"
#include "numbers.h"
#include "wordrules.h"
#include "emalloc.h"

/* STREQ(a, b): non-zero when the two strings are equal.
 * The first characters are compared inline, so strcmp() is only called
 * when they already match.  Both arguments are evaluated more than once,
 * so they must not have side effects.
 */
#ifndef STREQ
# define STREQ(boy,girl) ((*(boy) == *(girl)) && (!strcmp((boy),(girl))))
#endif

/* new(type): allocate room for one object of the given type with emalloc()
 * and yield the result as a (type *).
 * The cast has to be spelled (type *): a parenthesised type name followed
 * by `*', as in ((type) *), is not a valid cast and does not compile.
 */
#define new(type) ( (type *) emalloc(sizeof(type)) )

extern t_WordInfo *WID2WordInfo();
extern t_WID Word2WID();

/** Unix system calls that need to be declared: **/
extern void exit();
extern int open();
extern int read(), write();

/** C library functions that need to be declared: **/
extern char *memcpy();
extern void perror();
extern void qsort();
extern int strcmp();
extern char *strcpy();
extern int strlen();

/** lqtext library functions that need to be declared: **/
extern int MkWIB();
extern void MkWIBH();
extern int PutWordInfoIntoIndex();
extern void SortWordPlaces();
extern t_WordInfo *MakeWordInfo();
extern void SlayWordInfo();

/** Functions within this file that need to be declared: **/
void DeleteWordPlaces();
static void FlushBlock();

t_WordPlace *GetWordPlaces();
t_pblock *Getpblock();
unsigned short PutWordPlaces();
int _PutByte(), PutLong();
unsigned char _GetByte();
unsigned long GetLong();
unsigned long FindFreeBlock();
void FillWordPlaces();
/** **/

extern char *progname;

/* If you find this macro confusing, see "The C Programming Language",
 * Kernighan & Ritchie, 1st. Edition, for a good introduction to
 * programming in C.
 * :-( :-)
 *
 * PutByte(): while the write position *sp is still inside the current
 * block (*sp - *Blockp < *BlockLength), store Byte at *sp, advance *sp
 * and yield 0.  Otherwise hand all the arguments to _PutByte() and yield
 * its result.
 * Every argument except Byte and WID is the address of the caller's
 * variable; sp, Blockp and BlockLength are evaluated more than once.
 */
#define PutByte(Byte, WID, sp, Blockp, BlockLength, LastStart, NextBlock) \
    ( (*(sp) - (unsigned char *) *(Blockp) >= *(BlockLength)) ? \
       _PutByte(Byte, WID, sp, Blockp, BlockLength, LastStart, NextBlock) : \
      ((*((*(sp))++) = (Byte)), 0))

/* GetByte(): the reading counterpart of PutByte().  While *sp is inside
 * the current block, yield the byte at *sp and advance *sp; otherwise
 * call _GetByte() with the same arguments and yield its result.
 * sp, Blockp and BlockLength are evaluated more than once.
 */
#define GetByte(WID, sp, Blockp, BlockLength, NextBlock) \
    ( (*(sp) - (unsigned char *) *(Blockp) >= *(BlockLength)) ? \
       _GetByte(WID, sp, Blockp, BlockLength, NextBlock) : *((*(sp))++) )

extern long lseek();

static int DataFile = -1;

#ifdef ASCIITRACE
extern int AsciiTrace;
#endif

char *ReadBlock();
void WriteBlock();
unsigned long Putpblock();

/* Layout of the physical index database (OUT OF DATE)
 * =====================================
 *
 * This file is the only interface to the database of FID/Offset pairs.
 *
 * The db is organised in blocks arranged in Tagliatelli format: a linked
 * list of blocks for each WID; there is a list for each WID in the Word
 * Index.  The Word Index contains the block number of the start of the
 * chain.
 * Block 0 is the start of the free list (but is never itself free!)
 *
 * block 0... Free list header; otherwise currently unused.
 * block 1... first data block:
 * +---------------------------
 * | bytes 0...3: Offset of next block in this chain
 * |	   4...7: Number of valid pairs in this block (0-->unused block)
 * |	   8..11: WID to which this block refers (for checking)
 * |	  12..15: Total number of WIDS in the chain
 * |		  (only the first block in each chain has this)
 * | The (FID, Offset) pairs follow, in compressed (Internet-style) format.
 * |
 * block 2... next data block (either the start of a new chain, or a
 * continuation of some other chain.  Or maybe unused, especially if files
 * have been deleted).
 *
 * The block header is described by t_BlockHeader.
 *
 */

#include "blkheader.h"

/* Look up a word in the database...
 * and return a list of all the WordPlaces where it's found
 * BUG: should be called WordInfo2pblock().
 *
 * WordInfo must already be filled in: NumberOfWordPlaces, WordPlacesInHere,
 * WordPlaces, WordPlaceStart, DataBlock, Offset and WID are all read here.
 *
 * Returns a pblock obtained from emalloc() (the caller releases it with
 * efree()), or a null pointer if the word has no WordPlaces at all.
 * When every place was already held in WordInfo->WordPlaces the returned
 * ChainStart is 0; otherwise it is WordInfo->Offset.
 * Exits the program if GetWordPlaces() cannot supply the places.
 */
t_pblock *
Getpblock(WordInfo)
    t_WordInfo *WordInfo;
{
    t_pblock *pblock = 0;
    unsigned long HowManyToGet = 0L;
    t_WordPlace *WordPlaces;
    unsigned long CurrentPair = 0L;

#if 0
    /* This code allows one to call GetPblock with a very minimal
     * wordinfo entry:
     */
    if (WordInfo->Offset == 0L) {
	/* No database entry found yet... */
	if (!WordInfo->WID && WordInfo->Length) {
	    WordInfo->WID = Word2WID(WordInfo->Word, WordInfo->Length);
	}
	if (WordInfo->WID) {
	    WordInfo = WID2WordInfo(WordInfo->WID);
	}
    }
#endif

    if (!WordInfo->NumberOfWordPlaces) {
	return (t_pblock *) 0; /* no matches recorded, so nothing to read */
    }

    HowManyToGet = WordInfo->NumberOfWordPlaces;

    /* One spare t_WordPlace is allocated beyond the number requested. */
    pblock = (t_pblock *) emalloc( sizeof(t_pblock) +
		    (unsigned) (HowManyToGet + 1) * sizeof(t_WordPlace));

    pblock->WID = WordInfo->WID;
    pblock->ChainStart = WordInfo->Offset;
    pblock->NumberOfWordPlaces = WordInfo->NumberOfWordPlaces;

    /* First, the pairs in the WordInfo might suffice.
     * Copy exactly HowManyToGet of them: WordPlacesInHere may be larger
     * than that, and the pblock only has room for the number requested.
     */
    if (WordInfo->WordPlacesInHere >= HowManyToGet) {
	for (; CurrentPair < HowManyToGet; CurrentPair++) {
	    pblock->WordPlaces[CurrentPair] =
					WordInfo->WordPlaces[CurrentPair];
	}
	/* They all fitted in the WordInfo block, so there is no chain */
	pblock->ChainStart = 0L;
	return pblock;
    }

    /* So we need to read the entire list of WordPlaces from the database.
     * Although we may have already done the first few, I'm going to do them
     * all again because that ensures that the last few bytes in the
     * WordInfo data block can get used!
     */
    WordPlaces = GetWordPlaces(
		WordInfo->WID,
		WordInfo->WordPlaceStart,
		(unsigned) WIDBLOCKSIZE -
			    (WordInfo->WordPlaceStart - WordInfo->DataBlock),
		WordInfo->Offset,
		HowManyToGet);

    if (WordPlaces == (t_WordPlace *) 0) {
	fprintf(stderr, "%s: SNAFU: no wordplaces for WID %ld, wanted %lu\n",
			progname, WordInfo->WID, HowManyToGet);
	exit(1);
    }

    /* copy the result into the pblock and drop the temporary array */
    (void) memcpy((char *) pblock->WordPlaces, (char *) WordPlaces,
				(int) (sizeof(t_WordPlace) * HowManyToGet));
    (void) efree((char *) WordPlaces);
    return pblock;
}

/* This is how many WordPlaces one could write into a block in WIDFILE:
 * The Header takes up 1 for the word length, MinWordLength or more for
 * the actual word, and another 1 for the number of places.
 * (32 - 5) / 3 is 27/3 is 9, but this is rather rare in practice!
 * See the comment just before PutPairs().
 * If this is not kept up to date, space may be wasted in the database,
 * or you will slow down lqaddfile.
 *
 * NOTE(review): as parenthesised below, the macro evaluates
 *	WIDBLOCKSIZE - ((MinWordLength + 2) / 3)
 * and not (WIDBLOCKSIZE - (MinWordLength + 2)) / 3 as the arithmetic
 * above suggests -- confirm which one is intended before changing it.
 * NOTE(review): there is no PutPairs() in this file; presumably the
 * comment meant is the one before PutWordPlaces().
 */
#define MaxWordPlacesInAWordBlock ((WIDBLOCKSIZE - (MinWordLength + 2)/3))

/* This should in fact take a (t_WordPlaceList *) or something.
 * It returns the number of words written, for checking in
 * DumpCache (wordtable.c)
 */
int
WriteWordChain(WordPlaceList)
    t_WordPlaceList *WordPlaceList;
{
    extern t_WID GetNextWID();
    extern t_WordInfo *FindWordInfoFromIndex();

    t_WID WID;
    int Length;
    t_pblock *pblock = (t_pblock *) 0;
    int NumberOfWordPlacesToAdd = 0;
    t_WordPlaceList *WP;
    t_WordInfo *IndexEntry = 0;
    char *FirstWord;
    register int i;
    int TotalWordsWritten = 0;

    if (!WordPlaceList) return 0; /* nothing to do */

    /* Sanity check: */
    if (!(WordPlaceList->Word) || !*(WordPlaceList->Word)) {
	fprintf(stderr, "Warning: WriteWordChain() asked to write null word\n");
	return 0;
    }

    Length = strlen(WordPlaceList->Word);

    /* This is undoubtedly a bad way to do things:
     * if IndexEntry is big, we don't want to fetch it when we could
     * simply stuff things on the end!
     */
    if ((WID = Word2WID(WordPlaceList->Word, Length)) != (t_WID) 0) {
	if ((IndexEntry = WID2WordInfo(WID)) != (t_WordInfo *) 0) {
	    pblock = Getpblock(IndexEntry);
	}
    }

    if (!WID) {
	WID = GetNextWID();
    }

#ifdef ASCIITRACE
    if (AsciiTrace >= 3) {
	fprintf(stderr, "Entry %lu for \"%s\"", WID, WordPlaceList->Word);
	if (IndexEntry) {
	    fprintf(stderr, ", had %d pairs\n", IndexEntry->NumberOfWordPlaces);
	} else {
	    fprintf(stderr, " (new entry)\n");
	}
    }
#endif

    /* Count the number of entries we are going to add.
     * There may be several different words in the chain, but they are
     * sorted in word order.
     */
    FirstWord = WordPlaceList->Word;

    for (WP = WordPlaceList; WP; WP = WP->Next) {
	if (!STREQ(WP->Word, FirstWord)) break;
	++NumberOfWordPlacesToAdd;
	if (WP != WordPlaceList) {
	    WP->Word = (char *) 0; /* so we don't free it; see AddWord() */
	}
    }

    if (pblock == (t_pblock *) 0) {
	int oldnp = (IndexEntry) ? IndexEntry->NumberOfWordPlaces : 0;

	pblock = (t_pblock *) emalloc(sizeof(t_pblock) +
		(NumberOfWordPlacesToAdd + oldnp) * sizeof(t_WordPlace));
	pblock->NumberOfWordPlaces = 0;
	if (IndexEntry && IndexEntry->NumberOfWordPlaces) {
	    register t_WordPlace *W, *Q;
	    register unsigned long boy = 0L;
  
	    if (IndexEntry->WordPlacesInHere < IndexEntry->NumberOfWordPlaces) {
		FillWordPlaces(IndexEntry);
	    }

	    for (W = IndexEntry->WordPlaces, Q = pblock->WordPlaces;
				boy < IndexEntry->NumberOfWordPlaces; boy++) {
		*Q++ = *W++;
	    }
	    pblock->NumberOfWordPlaces = IndexEntry->NumberOfWordPlaces;
	}
    } else {
	/* Remove the old information from disk.
	 * This isn't as bad as it sounds, as it will be at the start
	 * of the freelist, so when we write it out again it will be
	 * in the buffer cache...
	 * Although, it would be faster simply to append.
	 */
	if (IndexEntry->Offset) {
	    DeleteWordPlaces(IndexEntry->Offset, IndexEntry->WID);
	}
	/*NOSTRICT*/
	pblock = (t_pblock *) erealloc((char *) pblock, sizeof(t_pblock) +
		(unsigned) (NumberOfWordPlacesToAdd +
			pblock->NumberOfWordPlaces) * sizeof(t_WordPlace));
    }

    pblock->WID = WID;
    pblock->ChainStart = 0L; /* certainly it is now invalid! */
    i = pblock->NumberOfWordPlaces;
    pblock->NumberOfWordPlaces += NumberOfWordPlacesToAdd;

    for (WP = WordPlaceList; WP && NumberOfWordPlacesToAdd--; WP = WP->Next) {
	/* As we ignore the rest of these WordInfo entries, something
	 * has to change here for more efficiency!
	 * Probably we should get passed a linked list of WordPlaces
	 * instead of WordInfo structures.
	 */
	pblock->WordPlaces[i] = WP->WordPlace;
	i++;
    }

    if (WP && STREQ(WP->Word, FirstWord)) {
	fprintf(stderr, "Internal error counting pairs to add\n");
	exit(1);
    }

    if (NumberOfWordPlacesToAdd > 0) {
	fprintf(stderr, "I've got some pairs left over, \"%s\" line %d\n",
			__FILE__, __LINE__ - 1);
	exit(1);
    }

    /* We are going to need a WordInfo here... */
    if (IndexEntry == (t_WordInfo *) 0) {
	IndexEntry = MakeWordInfo(WID, Length, WordPlaceList->Word);
    }
    IndexEntry->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* Now, see if they all fit into the WordInfo itself! */

    pblock->ChainStart = IndexEntry->Offset = 0L;

    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock) {
	(void) MkWIB(IndexEntry, pblock);
    }

    if (IndexEntry->WordPlacesInHere == pblock->NumberOfWordPlaces) { 
	/* no pblock needed! */
	if (PutWordInfoIntoIndex(IndexEntry, (unsigned long) 0L) < 0) {
	    extern int errno;
	    int e = errno;
	    fprintf(stderr, "%s: Couldn't put \"%s\" into the ",
						progname, IndexEntry->Word);
	    errno = e;
	    perror("index");
	    exit(1);
	}
    } else {
	(void) Putpblock(IndexEntry, pblock); /* (this alters *WordInfo) */

	if (PutWordInfoIntoIndex(IndexEntry, pblock->ChainStart) < 0) {
	    extern int errno;
	    int e = errno;
	    fprintf(stderr, "%s: Couldn't add word \"%s\" to the ",
						progname, IndexEntry->Word);
	    errno = e;
	    perror("word index");
	    exit(1);
	}
    }

    if (pblock) {
	efree((char *) pblock);
	pblock = (t_pblock *) 0;
    }

    if (IndexEntry) {
	SlayWordInfo(IndexEntry);
    }

    TotalWordsWritten += NumberOfWordPlacesToAdd;

    /* Now see if there are any more words in the chain: */
    if (WP) {
	TotalWordsWritten += WriteWordChain(WP);
    }

    return TotalWordsWritten;
}

/* Staging buffer for the data block currently being built: _PutByte() and
 * PutLong() point *Blockp at it whenever they start a new block.
 * NOTE(review): the 5 spare bytes are presumably there so that
 * sWriteNumber() in PutLong() can run past the end of the block before
 * the overflow is noticed -- confirm against sWriteNumber().
 */
static unsigned char pblockBuffer[BLOCKSIZE + 5];

/* Putpblock -- store a complete (presumably new) entry on disk.
 *
 * A fresh block is obtained from FindFreeBlock() and recorded both in
 * pblock->ChainStart and in WordInfo->Offset; MkWIBH() then prepares the
 * WordInfo header and PutWordPlaces() writes the places themselves.
 * The value returned is the disk pointer to the start of the chain, as
 * held in WordInfo->Offset when the writing is done.
 */
unsigned long
Putpblock(WordInfo, pblock)
    t_WordInfo *WordInfo;
    t_pblock *pblock;
{
    /* The PairBlock in WordInfo can be thrown away --
     * it only pointed at a static buffer in any case.
     */
    WordInfo->DataBlock = (char *) 0;

    pblock->ChainStart = FindFreeBlock(WordInfo->WID);
    WordInfo->Offset = pblock->ChainStart;

    (void) MkWIBH(WordInfo, pblock);

    PutWordPlaces(pblock->WordPlaces, WordInfo->WID,
	    (unsigned char *) WordInfo->WordPlaceStart,
	    (unsigned)
	    WIDBLOCKSIZE - (WordInfo->WordPlaceStart - WordInfo->DataBlock),
	    WordInfo->Offset, pblock->NumberOfWordPlaces);

    return WordInfo->Offset;
}

/** WordPlaces are now stored as sequences, as follows:
 **  FID*2 -- 1, 2, 3 (usually) or 4 bytes				1-4
 **  (very, very occasionally a variable-size number may be 5 bytes.)
 **   . the bottom bit in the stored number determines whether there
 **     is more than one FID to follow
 **  number of following places (only if prev. bit was 1) -- 1 byte	0-1
 ** For each following entry:-
 **   . for each of the following places:
 **     Block In File (long, 1, 2, 3 or 4 bytes, usually 1)		1-4
 **     Word In Block -- always 1 byte					1-1
 **		the bottom bit of this says if there are flags
 **     Flags -- always 1 byte, if present				0-1
 **	    Stuff Before -- 1 byte					0-1
 **	    (if there are no flags, there's no Stuff Before byte, and
 **     we use the default value of 1)
 **
 ** Hence:	each sub-place takes from 2 to 7 bytes [was: 0 to 4]
 **		each Place sequence takes from 3 [min was 2 with old format]
 **		to (4 + 1) + 255 * (2..7) bytes.
 **		In most (I guess > 7/10) cases, flags will be 0, and
 **		StuffBefore be the default of 1.
 **
 **	I am hoping, of course, that the extra information stored is
 ** worth while!
 **	It might be possible to coalesce WordInBlock and BlockInFile using
 ** delta modulation -- i.e., storing the increment from the previous.  In
 ** this case, a flag bit could mean that those two values each occupy a
 ** nibble in a single byte.  Or, I could use a single byte, like this:
 **	[a b c d e f g h]
 **	a == 1 --> (efgh) is word in block inc., (bcd is block in file inc)
 ** but I need to do some real measurements to figure out how best to save
 ** space.  It really is worth while keeping the format as simple as I can,
 ** as this speeds retrieval.
 **
 **/

/* PutWordPlaces -- write NumberToWrite WordPlaces in the compressed
 * on-disk format.
 *
 * WordPlaces	the places to write; the array is sorted in place, and
 *		WPF_HASSTUFFBEFORE is set in the Flags of every entry
 *		whose StuffBefore is not 1.
 * WID		passed on to the byte-level writers.
 * Block, BlockLength
 *		the first buffer to fill and the room in it.
 * NextOffset	disk offset of the block to continue into when that buffer
 *		is full; if it is 0 nothing beyond the first buffer is
 *		written.
 *
 * Returns NumberToWrite on success.  If the first buffer fills up and
 * there is no block to continue into, returns the index of the place
 * being written at that point.  (The result is an unsigned short, so
 * larger counts are truncated.)
 */
unsigned short
PutWordPlaces(WordPlaces, WID, Block, BlockLength, NextOffset, NumberToWrite)
    t_WordPlace *WordPlaces;
    t_WID WID;
    unsigned char *Block;
    unsigned BlockLength;
    unsigned long NextOffset;
    unsigned long NumberToWrite;
{
    unsigned char *q = Block;
    unsigned long L;
    unsigned long CurrentPlace = 0L;
    unsigned long LastStart = 0L;
    t_FID LastFID = 0;
    unsigned long LastBlock = 0L;

    /** IMPORTANT NOTICE:  keep the definition of MaxWordPlacesInAWordBlock
     ** up to date (it's #defined above).
     **/

    /* sort the pblock to simplify subsequent accesses */
    if (NumberToWrite > 1) {
	SortWordPlaces(NumberToWrite, WordPlaces);
    }

    while (CurrentPlace < NumberToWrite) {
	unsigned short NumberOfRepeats;
	unsigned char U;
	unsigned int WIBAndFlag; /* WordInBlock << 1 | flag, before narrowing */
	t_FID FID = WordPlaces[CurrentPlace].FID;
	unsigned long LastPlace;

	/* Determine the number of Places in the same file;
	 * note that we can write at most 255 in the same place, so
	 * longer stretches are broken up into clumps of 255.
	 * This is a reasonable tradeoff, I think.  The alternative would
	 * be to write NumberOfRepeats as a long, and lose if there were
	 * (say) between 64 (old, 127 new) and 255 of them.  This case only
	 * occurs once in the New Testament anyway, and presumably is
	 * generally quite rare.
	 */
	NumberOfRepeats = 0;
	LastPlace = CurrentPlace;
	while (NumberOfRepeats < 255) {
	    if (LastPlace >= NumberToWrite) {
		break;
	    } else if (WordPlaces[LastPlace].FID != FID) {
		break;
	    }
	    ++NumberOfRepeats;
	    ++LastPlace;
	}

	/* FIDs are stored as the difference from the previous one, shifted
	 * up to make room for the "several places follow" bit.
	 */
	L = (FID - LastFID) << 1;
	LastFID = FID;
	if (NumberOfRepeats > 1) L |= 01L;
	if (PutLong(L, WID, &q, &Block, &BlockLength,
						&NextOffset, &LastStart) < 0) {
	    return CurrentPlace;
	}
	if (L & 01L) {
	    if (PutByte(NumberOfRepeats, WID, &q, &Block, &BlockLength,
						&LastStart, &NextOffset) < 0) {
		return CurrentPlace;
	    }
	}

	LastBlock = 0;

	for (; NumberOfRepeats != 0; --NumberOfRepeats) {
	    if (CurrentPlace >= NumberToWrite) {
		fprintf(stderr,
		"Entry for file %lu (WID %ld) has more matches than expected\n",
								    FID, WID);
		exit(1);
		/* This would represent a rather serious bug, I think! */
	    }
	    /* block number */
	    if (WordPlaces[CurrentPlace].BlockInFile < LastBlock) {
		fprintf(stderr, "Sort WID %ld failed, non-monatonic blocks\n",
									WID);
		exit(1); /* can't cope with this one */
	    } else if (CurrentPlace &&
		    (WordPlaces[CurrentPlace].FID ==
			    WordPlaces[CurrentPlace - 1].FID) &&
		    (WordPlaces[CurrentPlace].BlockInFile == LastBlock) &&
		    (WordPlaces[CurrentPlace].WordInBlock <=
			    WordPlaces[CurrentPlace - 1].WordInBlock)) {
		fprintf(stderr,
		    "Sort WID %ld failed, FID %ld: Blk %lu: WIB %d <= %d\n",
		    WID, FID, LastBlock,
		    (int) WordPlaces[CurrentPlace].WordInBlock,
		    (int) WordPlaces[CurrentPlace - 1].WordInBlock
		);
	    }
	    /* the block number is stored as an increment too */
	    L = WordPlaces[CurrentPlace].BlockInFile - LastBlock;
	    LastBlock += L;

	    if (PutLong(L, WID, &q, &Block, &BlockLength, &NextOffset,
							    &LastStart) < 0) {
		return CurrentPlace;
	    }

	    if (WordPlaces[CurrentPlace].StuffBefore != 1) {
		WordPlaces[CurrentPlace].Flags |= WPF_HASSTUFFBEFORE;
	    }

	    /* The bottom bit of the WordInBlock byte says whether a Flags
	     * byte follows.  It has to be set whenever any flag is set:
	     * the reader only looks for StuffBefore when it has seen a
	     * Flags byte with WPF_HASSTUFFBEFORE in it, so leaving the
	     * Flags byte out would silently turn StuffBefore back into 1.
	     * With no flags at all the byte is omitted.
	     */
	    WIBAndFlag =
		((unsigned int) WordPlaces[CurrentPlace].WordInBlock) << 1;
	    if (WordPlaces[CurrentPlace].Flags != 0) {
		WIBAndFlag |= 01;
	    }
	    /* Check before narrowing to a byte, or the test can never fail */
	    if (WIBAndFlag > 255) {
		fprintf(stderr,
		"WID %lu: WordInBlock (0%o) from FID %lu too big\n",
			WID, WIBAndFlag, FID);
		exit(1);
	    }
	    U = (unsigned char) WIBAndFlag;

	    if (PutByte(U, WID, &q, &Block, &BlockLength, &LastStart,
							&NextOffset) < 0) {
		return CurrentPlace;
	    }
	    if (U & 01) {
		if (PutByte(WordPlaces[CurrentPlace].Flags, WID, &q, &Block,
				&BlockLength, &LastStart, &NextOffset) < 0) {
		    return CurrentPlace;
		}
	    }

	    /* Even if there are flags, there still might not be a separate
	     * entry for the number of preceding skipped bytes.
	     */
	    if ((U & 01) &&
			(WordPlaces[CurrentPlace].Flags & WPF_HASSTUFFBEFORE)) {
		if (PutByte(WordPlaces[CurrentPlace].StuffBefore, WID, &q,
			&Block, &BlockLength, &LastStart, &NextOffset) < 0) {
		    return CurrentPlace;
		}
	    }
	    ++CurrentPlace;
	}
	if (CurrentPlace > LastPlace) {
	    fprintf(stderr, "oops- I went wrong and wrote %lu > %lu\n",
						    CurrentPlace, LastPlace);
	}
    }
    if (LastStart) {
	/* NextStart had better not be non-zero, but FlushBlock will
	 * take care of it (we have wasted a block in that case!).
	 * LastStart is zero if we fitted it all inside the WordInfo
	 * block, although this is currently unlikely as we don't get
	 * called in that case!
	 * Oh, maybe we do.
	 */
	FlushBlock((unsigned char *) Block, &NextOffset, &LastStart, WID);
    }
    return NumberToWrite;
}

/* GetWordPlaces -- decode NumberExpected WordPlaces from the compressed
 * on-disk format; the reverse of PutWordPlaces().
 *
 * Block, BlockLength	the first buffer to read from and its length.
 * NextOffset		disk offset of the block that continues it.
 *
 * Returns an array of NumberExpected places obtained from emalloc(),
 * which the caller must release with efree(), or a null pointer if Block
 * is null.  Inconsistencies found in the data are reported on stderr but
 * do not stop the decoding.
 */
t_WordPlace *
GetWordPlaces(WID, Block, BlockLength, NextOffset, NumberExpected)
    t_WID WID;
    char *Block;
    unsigned BlockLength;
    unsigned long NextOffset;
    unsigned long NumberExpected;
{
    unsigned char *q = (unsigned char *) Block;
    unsigned long L;
    t_WordPlace *Places = (t_WordPlace *) 0;
    unsigned long CurrentPlace = 0L;
    t_FID LastFID = (t_FID) 0;
    unsigned long LastBlock = 0L;

#ifdef ASCIITRACE
    if (AsciiTrace > 10) {
	fprintf(stderr,
		"GetWordPlaces WID %ld Blk %lu len %u next %lu No. %lu\n",
		    WID, (unsigned long) Block, BlockLength,
		    NextOffset, NumberExpected);
    }
#endif

    if (Block == (char *) 0) {
	/* There is nothing to decode from; reading on would dereference
	 * the null pointer, so report it and let the caller decide.
	 */
	fprintf(stderr, "GetWordPlaces Error %lu\n", WID);
	return (t_WordPlace *) 0;
    }

    /*NOSTRICT*/
    Places = (t_WordPlace *) emalloc(sizeof(t_WordPlace) * NumberExpected);

    while (CurrentPlace < NumberExpected) {
	unsigned short NumberOfRepeats;
	unsigned char Uchar;
	t_FID FID;

	/** First get the FID.  The bottom bit of the number stored
	 ** actually determines whether there are multiple Places
	 ** stored here for the same FID.
	 **/
	L = GetLong(WID, &q, &Block, &BlockLength, &NextOffset);
	FID = (L >> 1) + LastFID; /* Get rid of flag bit */
	LastFID = FID;
	NumberOfRepeats = (L & 01L) ?
		GetByte(WID, &q, &Block, &BlockLength, &NextOffset) : 1;

	/* Quick Sanity check */
	switch (NumberOfRepeats) {
	case 0:
	    fprintf(stderr, "Warning: no entries! for FID %lu\n", FID);
	    break;
	case 1:
	    /* a count byte is only written for 2 or more places */
	    if (L & 01L) {
		fprintf(stderr, "Warning: FID %lu repeated 1 times!\n", FID);
	    }
	    break;
	default:
	    break;
	}

	LastBlock = 0L;
	if (CurrentPlace + NumberOfRepeats > NumberExpected) {
	    fprintf(stderr,
		"Entry for file %lu WID %ld has %lu matches, not %lu\n",
		FID, WID, CurrentPlace + NumberOfRepeats + 1, NumberExpected);
	    /* never write beyond the array that was allocated */
	    NumberOfRepeats = NumberExpected - CurrentPlace;
	}
	for (; NumberOfRepeats != 0; --NumberOfRepeats) {
	    Places[CurrentPlace].FID = FID;
	    /* block number, stored as an increment */
	    L = GetLong(WID, &q, &Block, &BlockLength, &NextOffset);
	    LastBlock += L;
	    Places[CurrentPlace].BlockInFile = LastBlock;
	    Uchar = GetByte(WID, &q, &Block, &BlockLength, &NextOffset);
	    Places[CurrentPlace].WordInBlock = (Uchar >> 1);

	    /* Sanity check: */
	    if (CurrentPlace > 0 && Places[CurrentPlace].FID ==
					Places[CurrentPlace - 1].FID) {
		if (Places[CurrentPlace - 1].BlockInFile ==
				    Places[CurrentPlace].BlockInFile) {
		    if ( Places[CurrentPlace - 1].WordInBlock >=
				Places[CurrentPlace].WordInBlock) {
			fprintf(stderr,
    "%s: warning: %luth match for word %ld (FID %ld) WIB goes backwards!\n",
				    progname, CurrentPlace, WID, FID);
		    }
		} else if (Places[CurrentPlace - 1].BlockInFile >
				Places[CurrentPlace].BlockInFile) {
			fprintf(stderr,
	"%s: warning: %luth match for word %ld (FID %ld) BIF goes backwards!\n",
				    progname, CurrentPlace, WID, FID);
		}
	    }
	    /* end of sanity test */

	    if (Uchar & 01) { /* use if, not ?:, for profiler */
		Places[CurrentPlace].Flags =
		    GetByte(WID, &q, &Block, &BlockLength, &NextOffset);
	    } else {
		Places[CurrentPlace].Flags = 0;
	    }

	    /* If there are flags, there still might not be a separate
	     * entry for the number of preceding skipped bytes.
	     */
	    if (Places[CurrentPlace].Flags & WPF_HASSTUFFBEFORE) {
		Places[CurrentPlace].StuffBefore =
		    GetByte(WID, &q, &Block, &BlockLength, &NextOffset);
	    } else {
		Places[CurrentPlace].StuffBefore = 1;
	    }
	    ++CurrentPlace;
	}
    }
    return Places;
}

void
FillWordPlaces(WordInfo)
    t_WordInfo *WordInfo;
{
    WordInfo->WordPlaces = GetWordPlaces(
	    WordInfo->WID,
	    WordInfo->WordPlaceStart,
	    WIDBLOCKSIZE - (WordInfo->WordPlaceStart - WordInfo->DataBlock),
	    WordInfo->Offset,
	    WordInfo->NumberOfWordPlaces
    );
}

/* FlushBlock -- write out the block that is still being built.
 * If there is a block (Block is not null) and it has a disk position
 * (*LastStart is not 0), its header is given the link derived from
 * *NextOffset and it is written at *LastStart.
 * In every case *LastStart and *NextOffset are both 0 on return.
 * WID is not used.
 */
static void
FlushBlock(Block, NextOffset, LastStart, WID)
    char *Block;
    unsigned long *NextOffset, *LastStart;
    t_WID WID;
{
    if (Block != (char *) 0 && *LastStart != 0L) {
	/*NOSTRICT*/
	t_BlockHeader *Header = (t_BlockHeader *) Block;

	Header->NextOffset = ((*NextOffset) / BLOCKSIZE) << 1;
	WriteBlock(*LastStart, Block);
    }

    *NextOffset = 0L;
    *LastStart = 0L;
}

/* This is simply to help keep the source lines from getting too long! */
typedef unsigned char *UCP;

int
_PutByte(Byte, WID, sp, Blockp, BlockLength, LastStart, NextBlock)
    unsigned char Byte;
    t_WID WID;
    unsigned char **sp;
    unsigned char **Blockp;
    unsigned *BlockLength;
    unsigned long *NextBlock;
    unsigned long *LastStart; /* for writing the linked list */
{
    t_BlockHeader *BH;

    if (*sp - (*Blockp) >= (*BlockLength)) {
	if (!*NextBlock && !*LastStart) return -1; /* only do the 1st block */
	if (*NextBlock == (unsigned long) 0) {
	    *NextBlock = FindFreeBlock(WID);
	}
	/* Complete the information in the previous block, if required */
	if (*LastStart) {
	    BH = (t_BlockHeader *) (*Blockp);
	    BH->NextOffset = ((*NextBlock) / BLOCKSIZE) << 1;
	    /* Write the old block */
	    WriteBlock(*LastStart, *Blockp);
	    *LastStart = 0L;
	}
	*LastStart = (*NextBlock);
	*BlockLength = BLOCKSIZE;
	(*NextBlock) = 0L;
	*Blockp = pblockBuffer; /* Use static (to this file) data buffer */
	/*NOSTRICT*/
	BH = (t_BlockHeader *) (*Blockp);
	(*sp) = (UCP) BH->Data;
    }
    **sp = Byte;
    (*sp)++;
    return 0;
}

/* _GetByte -- the out-of-line half of the GetByte() macro.
 *
 * If the read position has reached the end of the current block, the
 * block at *NextBlock is fetched with ReadBlock(); *Blockp, *BlockLength,
 * *NextBlock and *sp are updated to describe it, with *sp at the start of
 * its data area.  The byte at *sp is then returned and *sp advanced.
 *
 * If the block is exhausted and *NextBlock is 0, a message is printed,
 * *Blockp and *sp are set to null and 0 is returned.
 * NOTE(review): after that, any further read through these pointers would
 * dereference null -- confirm that callers stop at that point.
 * Exits the program if ReadBlock() returns null.
 *
 * BlockLength is a pointer to unsigned, as in _PutByte(), PutLong() and
 * GetLong(): that is what every caller passes.
 */
unsigned char
_GetByte(WID, sp, Blockp, BlockLength, NextBlock)
    t_WID WID;
    unsigned char **sp;
    unsigned char **Blockp;
    unsigned *BlockLength;
    unsigned long *NextBlock;
{
    t_BlockHeader *BH;

    if (*sp - (*Blockp) >= (*BlockLength)) {
	if (*NextBlock == (unsigned long) 0) {
	    (*Blockp) = (*sp) = (UCP) 0;
	    fprintf(stderr, "Database Corrupt, Next is zero\n");
	    return 0;
	} else {
	    (*sp) = (*Blockp) = (UCP) ReadBlock(*NextBlock);
	}
	/* Check the new block */
	if ((*Blockp) == (UCP) 0) {
	    fprintf(stderr, "Database corrupt, %lu, sigh.\n", *NextBlock);
	    exit(1);
	}
	/*NOSTRICT*/
	BH = (t_BlockHeader *) (*Blockp);
	/* the stored link is a block number shifted up by one bit */
	*NextBlock = (BH->NextOffset >> 1) * BLOCKSIZE;
	*BlockLength = BLOCKSIZE;
	(*sp) = (UCP) BH->Data;
    }
    return *((*sp)++);
}

/* PutLong -- write a long number in compressed/abbreviated form into a
 * string.  If this moves the string pointer beyond the block, write out
 * the block and start a new one.  In that case, the number written may well
 * span the gap between the blocks.  We use an overflow buffer to copy
 * the bytes (if any) that overflowed into it.
 * Then we write them at the start of the next block.
 *
 * This routine returns -1 and writes a partial number (no allocated block)
 * if *LastBlock and *NextBlock are zero.  This allows PutwOrdPlaces to be
 * called to put the WordPlaces into the WIDFILE block without writing out
 * an entire chain.
 */
int
PutLong(Long, WID, sp, Blockp, BlockLength, NextBlock, LastStart)
    unsigned long Long;
    t_WID WID;
    unsigned char **sp;
    unsigned char **Blockp;
    unsigned *BlockLength;
    unsigned long *NextBlock;
    unsigned long *LastStart; /* for writing the linked list */
{
    t_BlockHeader *BH;
    unsigned char Buffer[sizeof(unsigned long) + 1];
    unsigned char *Bufp = Buffer;
    unsigned char *p;

    sWriteNumber((char **) sp, Long);

    if ((*sp) - (*Blockp) > (*BlockLength)) { /* gone too far! */
	if (!*NextBlock && !*LastStart) return -1;
	/* Save the overflow in Buffer:
	 * the 1st 1 or more characters will fitted into the old block,
	 * but we need them all in a lump for readnumber().
	 * When we write the next block, we need to put the overflow
	 * characters into the start of the next block.
	 */
	for (p = &(*Blockp)[*BlockLength]; p < (*sp); p++) {
	    *Bufp++ = *p;
	}
	if (*NextBlock == (unsigned long) 0) {
	    *NextBlock = FindFreeBlock(WID);
	}
	/* Complete the information in the previous block, if required */
	if (*LastStart) {
	    BH = (t_BlockHeader *) (*Blockp);
	    BH->NextOffset = ((*NextBlock) / BLOCKSIZE) << 1;

	    /* Write the old block */
	    WriteBlock(*LastStart, *Blockp);
	}
	*LastStart = (*NextBlock);
	(*NextBlock) = 0L;
	*Blockp = pblockBuffer;
	BH = (t_BlockHeader *) (*Blockp);
	*BlockLength = BLOCKSIZE;
	(*sp) = (UCP) BH->Data;
	/* Now write the stuff from Buffer into the new block */
	if (Bufp > Buffer) {
	    for (p = Buffer; p < Bufp; p++) {
		*((*sp)++) = (*p);
	    }
	}
    }
    return 0;
}

/* This is the reverse of PutLong.
 * Things are slightly complicated by the need to provide sReadNumber
 * with a contiguous copy of all of the bytes in a number that spanned
 * a gap between data blocks.
 *
 * The number is first read in place with sReadNumber().  If that took
 * *sp beyond the end of the block, the bytes of the number that were in
 * the old block are combined in a local buffer with bytes from the start
 * of the next block's data, and the number is read again from there.
 * On return from that path *Blockp, *BlockLength and *NextBlock describe
 * the new block and *sp points just after the number within it.
 *
 * Returns the number read.  Returns 0 if the read ran beyond the block
 * and *NextBlock is 0; *sp is then left beyond the end of the block.
 * Exits the program if ReadBlock() returns null.
 * WID is not used.
 */
unsigned long
GetLong(WID, sp, Blockp, BlockLength, NextBlock)
    t_WID WID;
    unsigned char **sp;
    unsigned char **Blockp;
    unsigned *BlockLength;
    unsigned long *NextBlock;
{
    unsigned char Buffer[sizeof(unsigned long) + 1];
    long Result;
    t_BlockHeader *BH;
    unsigned char *NumberStart = (*sp); /* where this number begins */
    unsigned char *p;

    Result = sReadNumber(sp);

    /* Now, have we fallen off the end of the block? */
    if ((*sp) - (*Blockp) > (*BlockLength)) {
	unsigned char *bp = Buffer;

	if (*NextBlock == (unsigned long) 0) {
	    /* there is no block to continue into */
	    return 0L;
	}

	/* Copy the first half of the number into the overflow buffer */
	for (p = NumberStart; p < &(*Blockp)[*BlockLength]; p++) {
	    *bp++ = *p;
	}

	/** Now:
	 ** . sp is garbage, as is NumberStart, as they point at the old
	 **   data block
	 ** . Buffer contains the first few bytes of the number
	 ** . we need some more bytes, but don't yet know how many, as
	 **   this depends on the number representation
	 **   NOTE that we must have, however, that we know that there
	 **   are more bytes, so that we know if we need the next block.
	 ** . bp points 1 beyond the end of the 1st half of the number.
	 **/

	(*sp) = *Blockp = (UCP) ReadBlock(*NextBlock);
	/* Check the new block */
	if ((*Blockp) == (UCP) 0) {
	    fprintf(stderr, "Database corrupt, %lu, oh dear!\n", *NextBlock);
	    exit(1);
	}
	BH = (t_BlockHeader *) *Blockp;
	/* the stored link is a block number shifted up by one bit */
	*NextBlock = (BH->NextOffset >> 1) * BLOCKSIZE;
	*BlockLength = BLOCKSIZE;
	(*sp) = (UCP) BH->Data;
	/* Fill up the buffer from the new block */
	for (p = bp; p - Buffer < sizeof(Buffer) - 1; p++) {
	    *p = *(*sp)++;
	}
	/* read the number from the buffer */
	(*sp) = Buffer;
	/* Try that number again... */
	Result = sReadNumber(sp);
	/* Now put sp where it should be.  Part of the buffer was
	 * from the old block...
	 * (*sp) - bp is the number of bytes of the number that came
	 * from the new block.
	 */
	(*sp) = (UCP) BH->Data + ((*sp) - bp);
    }
    return Result;
}

/* WriteBlock -- write BLOCKSIZE bytes from Data into the database file
 * at byte offset Block.
 *
 * The database file (DataBase) is opened, and created if need be, on
 * first use; the descriptor is kept in DataFile.
 * Exits the program if the file cannot be opened or the seek fails.
 * A short or failed write is only reported as a warning on stderr.
 */
void
WriteBlock(Block, Data)
    unsigned long Block;
    char *Data;
{
    extern int errno;

    if (DataFile < 0) {
	if ((DataFile = open(DataBase, O_RDWR|O_CREAT, 0666)) < 0) {
	    /* fprintf() may change errno, so keep it safe for perror() */
	    int e = errno;
	    fprintf(stderr, "Can't open database");
	    errno = e;
	    perror(DataBase);
	    exit(1);
	}
    }

    if (lseek(DataFile, (long) Block, 0) < 0) {
	perror("lseek");
	exit(1);
    }

    if (write(DataFile, Data, BLOCKSIZE) != BLOCKSIZE) {
	int e = errno;
	fprintf(stderr,
	"Warning: %s: E%d -- couldn't write %ld bytes at block %lu: ",
				    DataBase, e, (long) BLOCKSIZE, Block);
	errno = e;
	perror("write");
    }
    return;
}

/* FindFreeBlock -- find a block in the data file that can be used for
 * new data, mark it as in use, and return its byte offset.
 *
 * The search starts at offset 0 and follows the NextOffset links
 * (block number in the upper bits, "free" flag in bit 0).  A block is
 * taken when it could not be read in full, or when its NextOffset has
 * the free bit set; offset 0 itself is never handed out.  When a
 * block is taken from the chain, its predecessor is rewritten so that
 * it points at the taken block's successor.  If the chain ends
 * without a usable block, the current end of the file is used (or
 * BLOCKSIZE if the file is empty).
 *
 * The chosen block is marked as in use by writing a block whose
 * NextOffset is 0 at that offset.
 *
 * The WID argument is not used.
 *
 * The data file is opened on first use; the program exits if that
 * fails.  The results of the lseek(), read() and write() calls below
 * are not checked.
 */
unsigned long
FindFreeBlock(WID)
    t_WID WID;
{
    char Data[BLOCKSIZE];
    t_BlockHeader *HP;
    unsigned long Here = 0L;
    unsigned long There = 0L;

    /* Data is written back to the file even when the read() below
     * fills none or only part of it (for instance on a brand new,
     * empty database).  Clear it first so that whatever happened to
     * be on the stack is never written into the database.
     */
    {
	register char *q;

	for (q = Data; q < &Data[BLOCKSIZE]; q++) {
	    *q = '\0';
	}
    }

    if (DataFile < 0) {
	if ((DataFile = open(DataBase, O_RDWR|O_CREAT, 0666)) < 0) {
	    fprintf(stderr, "Can't open database");
	    perror(DataBase);
	    exit(1);
	}
    }

    do {
	(void) lseek(DataFile, (long) Here, 0);

	if (read(DataFile, Data, BLOCKSIZE) < BLOCKSIZE) {
	    /* Nothing (or not a whole block) there: fall back to
	     * using the block at offset BLOCKSIZE, with offset 0 as
	     * its predecessor.
	     * NOTE(review): this is also reached when the read fails
	     * part way along the chain -- confirm that handing out
	     * BLOCKSIZE is safe in that case.
	     */
	    HP = (t_BlockHeader *) 0;
	    There = 0L;
	    Here = BLOCKSIZE;
	} else {
	    HP = (t_BlockHeader *) Data;
	}

	/* It is free if it has no pairs (or if the block has
	 * never been written to, of course)
	 * Freed blocks are given a NextOffset or'd with 1.
	 */
	if ((!HP || (HP->NextOffset & 01L)) && Here != 0L) {
	    unsigned long NextFree = HP ? HP->NextOffset : 0L;

	    if (NextFree) {
		/* make it suitable for use with lseek() */
		NextFree = (NextFree >> 1) * BLOCKSIZE;
	    }

	    /* Make the previous block in the chain point to the one
	     * after this one in the free list, instead of this one.
	     */

	    if (HP) {
		(void) lseek(DataFile, (long) There, 0);
		(void) read(DataFile, Data, BLOCKSIZE);
	    				/* Well, we read it a second ago!*/
	    }
	    HP = (t_BlockHeader *) Data;
	    HP->NextOffset = (NextFree / BLOCKSIZE) << 1;
	    (void) lseek(DataFile, (long) There, 0);
	    (void) write(DataFile, Data, BLOCKSIZE);

	    goto GotOne;
	}

	/* Not usable: remember it as the predecessor and move on */
	There = Here;
	Here = (HP->NextOffset >> 1) * BLOCKSIZE;

    } while (Here != 0L);

    /* Ran off the end of the chain: use the end of the file instead */
    if ((Here = lseek(DataFile, 0L, 2)) == 0L) {
	Here = BLOCKSIZE;
    }

GotOne:
    /* Mark it in use.  Data holds whichever block was read or
     * modified last; only its NextOffset is reset before it is
     * written at the chosen offset.
     */
    HP->NextOffset = 0L;
    (void) lseek(DataFile, (long) Here, 0);
    (void) write(DataFile, Data, BLOCKSIZE);

#ifdef ASCIITRACE
    if (AsciiTrace > 10) {
	fprintf(stderr, "\tFindFree --> %lu\n", (Here == 0L) ? BLOCKSIZE : Here);
    }
#endif
    return (Here == 0L) ? BLOCKSIZE : Here;
}

/* ReadBlock -- get a single disk block from the database.
 *
 * Offset is the byte offset passed to lseek().
 *
 * Returns a pointer to a static BLOCKSIZE-byte buffer holding the
 * block, or a null pointer if the data file cannot be opened, the
 * seek fails, or fewer than BLOCKSIZE bytes could be read.  The
 * buffer is reused by every call, so the contents are overwritten by
 * the next ReadBlock().
 */
char *
ReadBlock(Offset)
    unsigned long Offset;
{
    static char Buffer0[BLOCKSIZE];

    /* Open the database on first use */
    if (DataFile < 0) {
	if ((DataFile = open(DataBase, O_RDWR|O_CREAT, 0664)) < 0) {
	    return (char *) 0;
	}
    }

#ifdef ASCIITRACE
    if (AsciiTrace > 15) {
	/* Offset is unsigned long, so it must be printed with %lu */
	fprintf(stderr, "===ReadBlock(%lu)\n", Offset);
    }
#endif

    if (lseek(DataFile, (long) Offset, 0) < 0) {
	return (char *) 0;
    }

    /* Now we are in the right place */

    if (read(DataFile, Buffer0, BLOCKSIZE) != BLOCKSIZE) {
	return (char *) 0; /* short read or error */
    }

    return Buffer0;
}

/* WordPlaceCompare -- ordering function for t_WordPlace entries,
 * as used by SortWordPlaces() and qsort().
 *
 * Entries are ordered by FID, then by BlockInFile, then by
 * WordInBlock.  Returns -1, 0 or 1 as *F1 sorts before, equal to or
 * after *F2.
 *
 * The fields are compared directly rather than subtracted: a
 * difference narrowed to int can lose its high bits or change sign
 * when the fields are wider than int or unsigned, which would give
 * the wrong ordering.
 */
int
WordPlaceCompare(F1, F2)
    t_WordPlace *F1, *F2;
{
    if (F1->FID != F2->FID) {
	return (F1->FID < F2->FID) ? -1 : 1;
    }
    if (F1->BlockInFile != F2->BlockInFile) {
	return (F1->BlockInFile < F2->BlockInFile) ? -1 : 1;
    }
    if (F1->WordInBlock != F2->WordInBlock) {
	return (F1->WordInBlock < F2->WordInBlock) ? -1 : 1;
    }
    return 0;
}

/* RemoveFIDFrompblock -- remove every WordPlace that refers to the
 * given FID from pblock->WordPlaces.
 *
 * Matching entries have their FID set to 0, the array is sorted so
 * that those entries come first, and the remaining entries are then
 * moved down to the start of the array.
 *
 * Returns the number of entries removed (0 if none matched, in which
 * case the array is left untouched).
 *
 * pblock->NumberOfWordPlaces is not changed here.
 * NOTE(review): presumably the caller subtracts the return value
 * from it -- confirm.
 */
int
RemoveFIDFrompblock(FID, pblock)
    t_FID FID;
    t_pblock *pblock;
{
    unsigned int ThisPair;
    int TotalWordPlacesToMove = 0;

    /* Mark unwanted pairs as deleted */
    for (ThisPair = 0; ThisPair < pblock->NumberOfWordPlaces; ThisPair++) {
	if (pblock->WordPlaces[ThisPair].FID == FID) {
	    pblock->WordPlaces[ThisPair].FID = 0L;
	    ++TotalWordPlacesToMove;
	} 
    }
    if (!TotalWordPlacesToMove) return 0;

    /* Sorting is by FID first, so this gathers the FID=0 entries
     * at the front (assumes no live FID sorts before 0).
     */
    if (pblock->NumberOfWordPlaces > 1) {
	SortWordPlaces(pblock->NumberOfWordPlaces, pblock->WordPlaces);
    }

    /* Slide the surviving entries down over the FID=0 ones.
     * The number to copy is the number of survivors, not the number
     * deleted; copying forwards is safe as Dest is below Src.
     */
    {
	register t_WordPlace *Dest = pblock->WordPlaces;
	register t_WordPlace *Src =
			    &pblock->WordPlaces[TotalWordPlacesToMove];
	register unsigned long Survivors =
			    (unsigned long) pblock->NumberOfWordPlaces -
			    (unsigned long) TotalWordPlacesToMove;

	while (Survivors > 0) {
	    *Dest++ = *Src++;
	    --Survivors;
	}
    }

    return TotalWordPlacesToMove;
}

/* SortWordPlaces -- put an array of WordPlaces into the order
 * defined by WordPlaceCompare().
 *
 * NumberOfWordPlaces is the number of entries in WordPlaces.
 * Arrays of fewer than two entries are left alone, a pair is
 * swapped by hand if necessary, and anything longer goes to qsort().
 */
void
SortWordPlaces(NumberOfWordPlaces, WordPlaces)
    unsigned long NumberOfWordPlaces;
    t_WordPlace *WordPlaces;
{
    if (NumberOfWordPlaces < 2) {
	return; /* nothing to put in order */
    }

    if (NumberOfWordPlaces == 2) {
	/* too small to be worth a qsort call */
	if (WordPlaceCompare(&WordPlaces[0], &WordPlaces[1]) > 0) {
	    t_WordPlace Saved;

	    Saved = WordPlaces[0]; /* whole-structure assignment */
	    WordPlaces[0] = WordPlaces[1];
	    WordPlaces[1] = Saved;
	}
	return;
    }

    qsort(WordPlaces, (int) NumberOfWordPlaces,
	    sizeof(t_WordPlace), WordPlaceCompare);
}

void
DeleteWordPlaces(FirstBlock, WID)
    unsigned long FirstBlock;
    t_WID WID;
{
    char Data[BLOCKSIZE];
    char BlockZero[BLOCKSIZE];
    t_BlockHeader *H;
    t_BlockHeader *HZERO; /* for the free list*/
    unsigned long ThisBlock;
    unsigned long Here;

    if (!FirstBlock) {
	return;
    }

    if (DataFile < 0) {
	if ((DataFile = open(DataBase, O_RDWR|O_CREAT, 0666)) < 0) {
	    fprintf(stderr, "Can't open database");
	    perror(DataBase);
	    exit(1);
	}
    }

    (void) lseek(DataFile, 0L, 0);

    if (read(DataFile, BlockZero, BLOCKSIZE) <= 0) {
	return; /* nothing to delete */
    }

    /* Block zero is always the head of the free list */
    HZERO = (t_BlockHeader *) BlockZero;

    Here = FirstBlock;

    do {
	if (lseek(DataFile, (long) Here, 0) < 0) {
	    fprintf(stderr, "Aaaargh:");
	    perror("lseek");
	    exit(1);
	}

	if (read(DataFile, Data, BLOCKSIZE) <= 0) {
	    return; /* it's not there to delete */
	}

	H = (t_BlockHeader *) Data;

	ThisBlock = Here; /* so we can write the changed block */
	Here = (H->NextOffset >> 1) * BLOCKSIZE;
	if (Here == 0L) { /* next block... */
	    H->NextOffset = HZERO->NextOffset;
	}
	H->NextOffset |= 01L; /* mark it as free */

	/* Write out the newly freed block, having saved its Next pointer
	 */
	WriteBlock(ThisBlock, Data);
    } while (Here);

    /* Now make the free list point to the start of the new chain;
     * the end of the newly added chain points to the start of the
     * original chain.  This means that the new blocks are still in the
     * Unix buffer cache, so this helps performance.  I hope.
     */
    HZERO->NextOffset = (FirstBlock / BLOCKSIZE) << 1;
    (void) lseek(DataFile, 0L, 0);
    (void) write(DataFile, BlockZero, BLOCKSIZE);

    return;
}

/* Deletepblock -- give all of the blocks belonging to a pblock back
 * to the free list.
 *
 * pblock->ChainStart is the byte offset of the first block of the
 * chain; nothing happens if pblock is null or ChainStart is zero.
 *
 * Every block in the chain gets the "free" bit (bit 0) set in its
 * NextOffset.  The last block of the chain is made to point at the
 * old head of the free list, and block zero is rewritten so that the
 * free list now starts at ChainStart.
 *
 * The data file is opened on first use (exit on failure).  If block
 * zero or any block of the chain cannot be read, the function
 * returns at once without updating block zero.
 */
void
Deletepblock(pblock)
    t_pblock *pblock;
{
    char Data[BLOCKSIZE];
    char BlockZero[BLOCKSIZE];
    t_BlockHeader *H;
    t_BlockHeader *HZERO; /* for the free list*/
    unsigned long ThisBlock;
    unsigned long Here;

    if (!pblock || !pblock->ChainStart) {
	return;
    }

    if (DataFile < 0) {
	if ((DataFile = open(DataBase, O_RDWR|O_CREAT, 0666)) < 0) {
	    fprintf(stderr, "Can't open database");
	    perror(DataBase);
	    exit(1);
	}
    }

    /* The file pointer may be anywhere after earlier reads and
     * writes, so go to the start before reading block zero;
     * otherwise some other block would be taken for the free-list
     * head and later written over block zero.
     */
    (void) lseek(DataFile, 0L, 0);

    if (read(DataFile, BlockZero, BLOCKSIZE) <= 0) {
	return; /* nothing to delete */
    }

    /* Block zero is always the head of the free list */
    HZERO = (t_BlockHeader *) BlockZero;

    Here = pblock->ChainStart;

    do {
	if (lseek(DataFile, (long) Here, 0) < 0) {
	    fprintf(stderr, "Aaaargh:");
	    perror("lseek");
	    exit(1);
	}

	if (read(DataFile, Data, BLOCKSIZE) <= 0) {
	    return; /* it's not there to delete */
	}

	H = (t_BlockHeader *) Data;

	ThisBlock = Here; /* so we can write the changed block later */
	if ((Here = ((H->NextOffset >> 1) * BLOCKSIZE)) == 0L) {
	    /* end of the chain: join on the old free list */
	    H->NextOffset = HZERO->NextOffset;
	}
	H->NextOffset |= 01L; /* mark it as free */

	/* Write out the newly freed block, having saved its Next pointer
	 */
	WriteBlock(ThisBlock, Data);
    } while (Here);

    /* Now make the free list point to the start of the new chain;
     * the end of the newly added chain points to the start of the
     * original chain.  This means that the new blocks are still in the
     * Unix buffer cache, so this helps performance.  I hope.
     */
    HZERO->NextOffset = (pblock->ChainStart / BLOCKSIZE) << 1;
    (void) lseek(DataFile, 0L, 0);
    (void) write(DataFile, BlockZero, BLOCKSIZE);

    return;
}
