
/* SixBit.c -- Copyright 1990 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 *
 * $Id: SixBit.c,v 1.2 90/10/06 00:12:00 lee Rel1-10 $
 */

/* SixBit.c -- handle simple alphanumerical string compression
 * 
 * $Log:	SixBit.c,v $
 * Revision 1.2  90/10/06  00:12:00  lee
 * Prepared for first beta release.
 * 
 * Revision 1.1  90/08/09  19:16:32  lee
 * Initial revision
 * 
 */

#include "globals.h" /* defines and declarations for database filenames */

#ifdef SYSV
extern int _flsbuf();
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "wordrules.h"

/** Unix System Calls that need to be declared **/
extern void exit();
/** Unix/C Library Calls that need to be declared **/
#ifndef tolower
extern int tolower();
#endif
/** liblqtext library calls that need to be declared **/
	/* (none) */
/** functions in this file that need to be declared **/
/** **/

extern int AsciiTrace;

/*
 * A packed ("six-bit") string.
 *
 * String -- the packed bytes.  They may contain embedded zero bytes, so
 *           Length, not a terminating NUL, says where the data ends.
 * Length -- the number of bytes of String that are in use.
 */
typedef struct {
    unsigned char *String;
    unsigned long Length;
} t_SixBitString;

#define COOKIE8 (11 + ('z' - 'a') + 5) /* marks an 8-bit char */

/* Largest number of characters that will be packed or unpacked. */
#ifdef TESTSIX
#define SIXBIT_MAXCHARS 4096
#else
#define SIXBIT_MAXCHARS (MaxWordLength)
#endif

/* Worst-case packed size: every clump of 4 characters takes 3 bytes plus
 * up to 4 cookie bytes, and there is one trailing \0.
 */
#define SIXBIT_MAXBYTES (7 * ((SIXBIT_MAXCHARS + 3) / 4) + 1)

/*
 * The 6-bit code for a (lower case) character:
 *	'0'..'9' -->  1..10	(avoiding 0, which marks the end)
 *	'a'..'z' --> 11..36
 *	'\''     --> 37
 *	'_'      --> 38
 *	'-'      --> 39
 *	' '      --> 40
 *	others   --> COOKIE8 (41); the character itself is stored separately.
 * Explicit ranges are used rather than isalpha(), which is locale
 * dependent and could accept 8-bit letters that do not fit in 6 bits.
 */
static int
SixBitCode(Ch)
    int Ch;
{
    if (Ch >= '0' && Ch <= '9') return Ch - '0' + 1;
    if (Ch >= 'a' && Ch <= 'z') return Ch - 'a' + 11;

    switch (Ch) {
    case '\'': return 11 + ('z' - 'a') + 1;
    case '_':  return 11 + ('z' - 'a') + 2;
    case '-':  return 11 + ('z' - 'a') + 3;
    case ' ':  return 11 + ('z' - 'a') + 4;
    default:   return COOKIE8;
    }
}

/*
 * The character for a 6-bit code other than 0 and COOKIE8 -- the inverse
 * of SixBitCode().  Codes that the encoder never produces are mapped
 * as if they were letters.
 */
static int
SixBitChar(Code)
    int Code;
{
    if (Code >= 1 && Code <= 10) return Code + '0' - 1;

    switch (Code) {
    case 11 + ('z' - 'a') + 1: return '\'';
    case 11 + ('z' - 'a') + 2: return '_';
    case 11 + ('z' - 'a') + 3: return '-';
    case 11 + ('z' - 'a') + 4: return ' ';
    default: /* lower case letters */
	return Code + 'a' - 11;
    }
}

/*
 * String2SixBitString -- pack a NUL-terminated string, 6 bits per character.
 *
 * Upper case letters in String are folded to lower case IN PLACE, so the
 * caller's string is modified.  At most SIXBIT_MAXCHARS characters are
 * packed; anything after that is ignored (and not case-folded).
 *
 * Characters are taken in clumps of 4, each packed into 3 bytes:
 *
 *  c0           c1           c2           c3
 *  ! ! ! ! ! !  ! !@! ! ! !  ! ! ! !@! !  ! ! ! ! ! !   (lsb.....msb)
 *  q0               q1               q2
 *
 * A character with no 6-bit code is packed as COOKIE8, and the character
 * itself (a "cookie") is stored after the 3 bytes of its clump, in order.
 * Such a character therefore costs 6 + 8 = 14 bits.
 *
 * A short final clump with no cookies occupies only as many bytes as it
 * has characters.  A short final clump that has cookies occupies all 3
 * bytes, since its cookies always start at offset 3.
 *
 * Returns a pointer to a static object whose String points to a static
 * buffer; both are overwritten by the next call.  Length is the exact
 * number of bytes used.  The byte after the last one used is always \0.
 */
t_SixBitString *
String2SixBitString(String)
    unsigned char *String;
{
    static t_SixBitString Result;
    static unsigned char Buffer[SIXBIT_MAXBYTES];
    register unsigned char *p = String;
    register unsigned char *q = Buffer;
    unsigned long CharsLeft = SIXBIT_MAXCHARS;

    Result.String = Buffer;
    Buffer[0] = '\0';

    /* BUG: we lose word-processing accents, etc. by folding case. */

    while (p && *p && CharsLeft > 0) {
	unsigned char Codes[4];
	int nCodes = 0;
	int nCookies = 0;

	Codes[0] = Codes[1] = Codes[2] = Codes[3] = 0;
	q[3] = '\0'; /* cookies, if any, overwrite this */

	/* gather up to 4 characters for this clump */
	while (nCodes < 4 && *p && CharsLeft > 0) {
	    if (isupper(*p)) *p = tolower(*p);

	    Codes[nCodes] = SixBitCode(*p);
	    if (Codes[nCodes] == COOKIE8) {
		q[3 + nCookies] = *p;
		nCookies++;
	    }
	    nCodes++;
	    p++;
	    CharsLeft--;
	}

	/* missing characters have code 0, which ends the string */
	q[0] = Codes[0] | ((Codes[1] & 03) << 6);	  /* c0, bot 2 of c1 */
	q[1] = (Codes[1] >> 2) | ((Codes[2] & 017) << 4); /* top 4 c1, bot 4 c2 */
	q[2] = (Codes[2] >> 4) | (Codes[3] << 2);	  /* top 2 c2, all c3 */

	if (nCodes == 4 || nCookies > 0) {
	    q += 3 + nCookies;
	} else {
	    q += nCodes; /* short final clump: the unused bytes are 0 */
	}
	*q = '\0'; /* ensure a trailing \0 */
    }

    Result.Length = (unsigned long) (q - Buffer);
    return &Result;
}

/*
 * SixBitString2String -- unpack a string made by String2SixBitString().
 *
 * March along looking at clumps of 3 bytes, unpacking each clump into
 * 4 characters:
 *
 *  p0               p1               p2
 *  ! ! ! ! ! !  ! !@! ! ! !  ! ! ! !@! !  ! ! ! ! ! !   (lsb.....msb)
 *  c0           c1           c2           c3
 *
 * Characters that don't fit into the 6-bit scheme are marked as cookies,
 * and the corresponding characters follow the 3 bytes of their clump.
 *
 * No byte at or beyond SixBitString->Length is read; missing bytes count
 * as 0.  Unpacking stops at the first 6-bit code of 0, so a Length that
 * is too large is harmless provided the extra bytes begin with a 0.
 * It also stops if a cookie byte is missing, or after SIXBIT_MAXCHARS
 * characters.
 *
 * Returns a pointer to a static NUL-terminated buffer that is overwritten
 * by the next call.
 */
char *
SixBitString2String(SixBitString)
    t_SixBitString *SixBitString;
{
    static unsigned char Buffer[SIXBIT_MAXCHARS + 1];
    register unsigned char *In;
    unsigned char *Out = Buffer;
    unsigned char *OutEnd = Buffer + SIXBIT_MAXCHARS;
    unsigned long Left;
    int Done = 0;

    Buffer[0] = '\0';
    if (!SixBitString || !SixBitString->String) {
	return (char *) Buffer;
    }

    In = SixBitString->String;
    Left = SixBitString->Length;

    while (!Done && Left > 0) {
	unsigned char Codes[4];
	unsigned int b0, b1, b2;
	unsigned long Used = 3; /* bytes of this clump, counting cookies */
	int i;

	b0 = In[0];
	b1 = (Left > 1) ? In[1] : 0;
	b2 = (Left > 2) ? In[2] : 0;

	Codes[0] = b0 & 077;				/* bottom 6 */
	Codes[1] = (b0 >> 6) | ((b1 & 017) << 2);	/* 2, 4 */
	Codes[2] = (b1 >> 4) | ((b2 & 03) << 4);	/* 4, 2 */
	Codes[3] = b2 >> 2;				/* top 6 */

	for (i = 0; i < 4; i++) {
	    int Ch;

	    if (Codes[i] == 0 || Out >= OutEnd) {
		Done = 1; /* end of string, or no more room */
		break;
	    }
	    if (Codes[i] == COOKIE8) {
		if (Used >= Left || In[Used] == '\0') {
		    Done = 1; /* the cookie is missing */
		    break;
		}
		Ch = In[Used++];
	    } else {
		Ch = SixBitChar((int) Codes[i]);
	    }
	    *Out++ = (unsigned char) Ch;
	}

	if (!Done) {
	    /* A whole clump was unpacked, so Left >= 3, and Used
	     * only grew while it was less than Left.
	     */
	    In += Used;
	    Left -= Used;
	}
    }

    *Out = '\0';
    return (char *) Buffer;
}

#ifdef TESTSIX
char *progname= "testsix";

/*
 * Test driver: a filter from standard input to standard output.
 *
 *  -e	encode: each input line is written as one length byte followed
 *	by that many packed bytes.
 *  -d	decode: the reverse; each record is written as one text line.
 *
 * The record length is a single byte, so a line whose packed form is
 * longer than 255 bytes cannot be encoded, and is treated as an error.
 */
int
main(argc, argv)
    int argc;
    char *argv[];
{
    unsigned char Line[4096];
    int Encode = 1;
    t_SixBitString *Sixp;
    t_SixBitString Six;

    if (argc != 2) {
	fprintf(stderr, "bad arg count; usage: %s -[de]\n", progname);
	exit(1); /* argv[1] must not be looked at */
    }
    if (STREQ(argv[1], "-e")) Encode = 1;
    else if (STREQ(argv[1], "-d")) Encode = 0;
    else {
	fprintf(stderr, "usage: %s -[d|e]\n", progname);
	exit(1);
    }
    if (Encode) {
	/* fgets() rather than gets(), which cannot be told how big
	 * Line is.
	 */
	while (fgets((char *) Line, (int) sizeof(Line), stdin) != (char *) 0) {
	    char *Newline = strchr((char *) Line, '\n');
	    unsigned char c;

	    if (Newline) *Newline = '\0';

	    Sixp = String2SixBitString(Line);
	    if (Sixp->Length > 255) {
		fprintf(stderr, "%s: line too long to encode\n", progname);
		exit(1);
	    }
	    c = Sixp->Length;
	    (void) putchar(c);
	    if (Sixp->Length != 0 &&
		fwrite(Sixp->String, 1, Sixp->Length, stdout) != Sixp->Length) {
		fprintf(stderr, "%s: write error\n", progname);
		exit(1);
	    }
	}
    } else { /* decode */
	char *s;
	unsigned char c;

	while (fread(&c, 1, 1, stdin) == 1) {
	    if ((Six.Length = c) != 0L) {
		/* Six.Length is at most 255, so it fits in Line */
		if (fread(Line, 1, Six.Length, stdin) != Six.Length) {
		    fprintf(stderr, "short file\n");
		    exit(1);
		}
		Line[Six.Length] = '\0';
		Six.String = Line;
		s = SixBitString2String(&Six);
		puts(s);
	    } else {
		putchar('\n');
	    }
	}
    }
    return 0;
}

#endif /*TESTSIX*/
