/* comp_h.c */
/* Compression routines */
#include "copyrite.h"
#include "config.h"
#ifdef I_STRING
#include <string.h>
#else
#include <strings.h>
#endif
#ifdef I_STDLIB
#include <stdlib.h>
#endif
#include <ctype.h>
#include "externs.h"
#include "intrface.h"
#include "conf.h"
#include "mushdb.h"
#include "mymalloc.h"
#include "confmagic.h"
#ifdef WIN32
#pragma warning( disable : 4244)	/* NJG: disable warning re conversion */
#endif
/* Talek's rewrite of compress.c, using a Huffman compression routine. */
/* This routine adds some time to the MUSH startup, since it reads a file */
/* in order to auto-tune the compression at each restart. See the */
/* SAMPLE_SIZE define in options.h for ways to trade efficiency for speed */
/* This rewrite was inspired by Javelin's rewrite on a similar vein. */
/* Most of the comments are his. The nasty ones are Talek's. */
/* Since atr_comm_match sticks its grubby little fingers in where it */
/* shouldn't, the first character of each string is not compressed. */
/* Due to idiots compressing and uncompressing strings inappropriately, */
/* I use a special marker ('\1') in second position to designate already */
/* compressed strings.  Say goodbye to compression on short strings.  This */
/* also means that any normal string with that marker will probably cause */
/* core dumps. */
#define TABLE_SIZE	256	/* allow all characters */
#define EOS		0	/* use null code for end of string */
#define CHAR_BITS	8	/* number of bits in char */
#define CHAR_MASK	255	/* mask for just one char */
#define CODE_BITS	25	/* max number of bits in code */
#ifndef SAMPLE_SIZE
#define SAMPLE_SIZE	0
#endif
/* If your system is little-endian *AND* it doesn't care about 
 * alignment (that is, you can assign a long to an unsigned char
 * without aligntment problems), you can #define IM_LITTLE_ENDIAN
 * and gain some performance benefits. Dec systems are little-endian,
 * but care about alignment.
 */
/* The code type */
typedef unsigned long CType;	/* must be at least CODE_BITS+CHAR_BITS-1 */
				/* bits long */
/* A node in the compression tree */
typedef struct cnode {
  struct cnode *left, *right;
  unsigned char c;
} CNode;
static CNode *ctop;
static CType ctable[TABLE_SIZE];
static char ltable[TABLE_SIZE];
static int fix_tree_depth _((CNode *node, int height, int zeros));
static void add_ones _((CNode *node));
static void build_ctable _((CNode *root, CType code, int numbits));
int init_compress _((FILE * f));
/* Compress a string: this is pretty easy. For each char in the string,
 * look up its code in ctable and add it to the compressed string we
 * build, keeping careful track of the number of bits we add.
 * Then stick the EOS character at the end.
 */
unsigned char *
compress(s)
    char const *s;
{
  static unsigned char buf[BUFFER_LEN];
  CType stage;
  int bits = 0;
  unsigned char *p, *b;
  p = (unsigned char *) s;
  b = buf;
  stage = 0;
  /* Actually get around to compressing the data... */
  while (p && *p) {
    /* Put code on stage */
    stage |= ctable[*p] << bits;
    bits += ltable[*p];
    /* Put any full bytes of stage into the compressed string */
#ifdef IM_LITTLE_ENDIAN
    *((CType *) b) = stage;
    b += bits / CHAR_BITS;
    stage = stage >> (CHAR_BITS * (bits / CHAR_BITS));
    bits = bits % CHAR_BITS;
#else
    while (bits >= CHAR_BITS) {
      *b++ = stage & CHAR_MASK;
      stage = stage >> CHAR_BITS;
      bits -= CHAR_BITS;
    }
#endif
    p++;
  }
  /* Put in EOS */
  /* This relies on EOS == 00000000 */
  /* Put the rest of the stage into the compressed string */
#ifdef IM_LITTLE_ENDIAN
  *((CType *) b) = stage;
#else
  bits += ltable[EOS] + CHAR_BITS - 1;
  while (bits >= CHAR_BITS) {
    *b++ = stage & CHAR_MASK;
    stage = stage >> CHAR_BITS;
    bits -= CHAR_BITS;
  }
#endif
  return buf;
}
/* This macro is used for walking the compression tree.
 * It is a macro for efficiency.
 */
#define WALK_TREE(bitpos) \
do { \
  if (*p & (bitpos)) \
    node = node->right; \
  else \
    node = node->left; \
  if (!node->left && !node->right) { \
    /* Got a char */ \
    *b++ = node->c; \
    if (!*p || ((long)(b - buf) >= (long)(sizeof(buf) - 1))) { \
      *b++ = EOS; \
      return buf; \
    } \
    if (node->c == EOS) \
      return buf; \
    node = ctop; \
  } \
} while (0)
/* Uncompression is a snap, too. Go bit by bit, using the
 * bits to traverse the binary tree (0=left, 1=right) until reaching
 * a leaf node, which is the uncompressed character.
 * Stop when the leaf node turns out to be EOS.
 */
char *
uncompress(s)
    unsigned char const *s;
{
  /* to avoid generating memory problems, this function should be
   * used with something of the format
   * char tbuf1[BUFFER_LEN];
   * strcpy(tbuf1, uncompress(a->value));
   * if you are using something of type char *buff, use the
   * safe_uncompress function instead.
   */
  static char buf[BUFFER_LEN];
  unsigned char *p;
  char *b;
  CNode *node;
  buf[0] = '\0';
  if (!s || !*s)
    return buf;
  p = (unsigned char *) s;
  b = buf;
  /* Finally start decompressing the string... */
  node = ctop;
  for (;;) {
    WALK_TREE(1);
    WALK_TREE(2);
    WALK_TREE(4);
    WALK_TREE(8);
    WALK_TREE(16);
    WALK_TREE(32);
    WALK_TREE(64);
    WALK_TREE(128);
    p++;
  }
}
char *
safe_uncompress(s)
    unsigned char const *s;
{
  /* this function should be used when you're doing something like
   * char *attrib = safe_uncompress(a->value);
   * NEVER use it with something like
   * char tbuf1[BUFFER_LEN]; strcpy(tbuf1, safe_uncompress(a->value));
   * or you will create a horrendous memory leak.
   */
  return (char *) strdup((char *) uncompress(s));
}
static int
fix_tree_depth(node, height, zeros)
    CNode *node;
    int height;
    int zeros;
{
  int a, b;
  CNode *temp;
  if (!node)
    return height + (zeros > 2);
  a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
  b = fix_tree_depth(node->right, height + 1, 0);
  if ((a > CODE_BITS) && (b < (a - 1))) {
#ifdef STANDALONE
    printf("Rotate right at depth %d.\n", height);
#endif
    temp = node->right;
    node->right = node->left;
    node->left = node->right->left;
    node->right->left = node->right->right;
    node->right->right = temp;
    a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
    b = fix_tree_depth(node->right, height + 1, 0);
  } else if ((b > CODE_BITS) && (a < (b - 1))) {
#ifdef STANDALONE
    printf("Rotate left at depth %d.\n", height);
#endif
    temp = node->left;
    node->left = node->right;
    node->right = node->left->right;
    node->left->right = node->left->left;
    node->left->left = temp;
    a = fix_tree_depth(node->left, height + 1 + (zeros == 7), (zeros + 1) % 8);
    b = fix_tree_depth(node->right, height + 1, 0);
  }
  return ((a > b) ? a : b);
}
/* Add 1s to the tree, recursively */
static void
add_ones(node)
    CNode *node;
{
  int count;
  count = 0;
  do {
    if (node->right)
      add_ones(node->right);
    if ((count >= 7) || ((count >= 3) && !node->left && !node->right)) {
      ctop = (CNode *) malloc((unsigned) sizeof(CNode));
      if (!ctop) {
	fprintf(stderr,
	     "Cannot allocate memory for compression tree. Aborting.\n");
	exit(1);
      }
      ctop->left = node->left;
      ctop->right = node->right;
      ctop->c = node->c;
      node->left = (CNode *) NULL;
      node->right = ctop;
      node = ctop;
      count = 0;
    }
    node = node->left;
    count++;
  } while (node);
}
/* Build ctable and ltable from the tree, recursively */
static void
build_ctable(root, code, numbits)
    CNode *root;
    CType code;
    int numbits;
{
#ifdef STANDALONE
  int i;
#endif
  if (!root->left && !root->right) {
    ctable[root->c] = code;
    ltable[root->c] = numbits;
#ifdef STANDALONE
    printf(isprint(root->c) ? "Code for '%c':\t" : "Code for %d:\t", root->c);
    for (i = 0; i < numbits; i++)
      printf("%d", (code >> i) & 1);
    printf("\n");
#endif
    if (numbits > CODE_BITS) {
      fprintf(stderr, "Illegal compression code length (%d).  Aborting.\n",
	      numbits);
      exit(1);
    }
  } else {
    if (root->left)
      build_ctable(root->left, code | (0 << numbits), numbits + 1);
    if (root->right)
      build_ctable(root->right, code | (1 << numbits), numbits + 1);
  }
}
/* Initialize the compression tree and table in 5 steps:
 * 1. Initialize arrays and things
 * 2. Read indb (up to SAMPLE_SIZE chars, if defined) and count
 *    the frequency of every character
 * 3. Cheat the relative frequency of some known special chars
 *    and upper-case letters
 * 4. Construct an (un)compression tree based on frequencies
 * 5. Construct a compression table by searching the tree
 */
int
init_compress(f)
    FILE *f;
{
  int total;
  unsigned char c;
  struct {
    long freq;
    CNode *node;
  } table[TABLE_SIZE];
  int indx, count;
  long temp;
  CNode *node;
#ifdef STANDALONE
  printf("init_compress: Part 1\n");
#endif
  /* Part 1: initialize */
  for (total = 0; total < TABLE_SIZE; total++) {
    table[total].freq = 0;
    table[total].node = (CNode *) malloc((unsigned) sizeof(CNode));
    if (!table[total].node) {
      fprintf(stderr,
	      "Cannot allocate memory for compression tree. Aborting.\n");
      exit(1);
    }
    table[total].node->c = total;
    table[total].node->left = (CNode *) NULL;
    table[total].node->right = (CNode *) NULL;
  }
#ifdef STANDALONE
  printf("init_compress: Part 2\n");
#endif
  /* Part 2: count frequencies */
  total = 0;
  while (!feof(f) && (!SAMPLE_SIZE || (total++ < SAMPLE_SIZE))) {
    c = fgetc(f);
    table[c].freq++;
  }
#ifdef STANDALONE
  for (indx = 0; indx < TABLE_SIZE; indx++) {
    printf(isprint(indx) ? "Frequency for '%c': %d\n"
	   : "Frequency for %d: %d\n",
	   (unsigned char) indx, table[indx].freq);
  }
#endif
#ifdef STANDALONE
  printf("init_compress: Part 3\n");
#endif
  /* Part 3: Cheat the frequencies. Because there's a lot of wierd
   * stuff in indb (like ]'s and upper-case letters), we downplay it
   * by cutting frequencies. Actually, we shouldn't need to much.
   */
  /* The ']' character is artificially raised by being the
   * start-of-attribute marker in indb.  Set it back to '[',
   * which it should be balancing...
   */
  table[']'].freq = table['['].freq;
  /* The DEL character is returned once for no apparent reason (I think
   * it is returned at EOF), so remove that one count...
   */
  table[255].freq--;
  /* Newlines really aren't all that common in the attributes, so
   * chop the value substantially.
   */
  table['\n'].freq /= 16;
#ifdef STANDALONE
  printf("init_compress: Part 4(a)\n");
#endif
  /* Part 4(a): Sort the table.  I'm using a stupid insert sort here
   * because this only gets called once, and I don't want to conform
   * to the qsort interface.  NOTE: I don't sort in EOS.
   */
  for (indx = 2; indx < TABLE_SIZE; indx++) {
    for (count = indx;
	 (count > 1) && (table[count - 1].freq < table[count].freq);
	 count--) {
      temp = table[count].freq;
      table[count].freq = table[count - 1].freq;
      table[count - 1].freq = temp;
      node = table[count].node;
      table[count].node = table[count - 1].node;
      table[count - 1].node = node;
    }
  }
#ifdef STANDALONE
  printf("init_compress: Part 4(b)\n");
#endif
  /* Part 4(b): Now we've got a list sorted from most freq (table[0]) to
   * least freq. We build a binary tree by traversing the list, making
   * a subtree out of the two least frequent nodes (creating a parent
   * node whose frequency is the sum of its children's frequency),
   * and reinserting the subtree's parent into the list. We repeat
   * until there's only one node in the list, the root of the tree.
   * NOTE: I'm still not dealing with EOS.
   */
  for (indx = TABLE_SIZE - 1; indx > 0; indx--) {
#ifdef NEVER
    printf("Freq. table:\n");
    for (count = indx; count >= 0; count--)
      printf("%3d: %d\t", table[count].node->c, table[count].freq);
    printf("\n");
#endif
    node = (CNode *) malloc((unsigned) sizeof(CNode));
    if (!node) {
      fprintf(stderr,
	      "Cannot allocate memory for compression tree. Aborting.\n");
      exit(1);
    }
    node->left = table[indx].node;
    node->right = table[indx - 1].node;
    table[indx - 1].freq += table[indx].freq;
    table[indx - 1].node = node;
    for (count = indx - 1;
	 (count > 1) && (table[count - 1].freq <= table[count].freq);
	 count--) {
      temp = table[count].freq;
      table[count].freq = table[count - 1].freq;
      table[count - 1].freq = temp;
      node = table[count].node;
      table[count].node = table[count - 1].node;
      table[count - 1].node = node;
    }
  }
#ifdef NEVER
  build_ctable(table[1].node, 0, 0);
#endif
#ifdef STANDALONE
  printf("init_compress: Part 4(c)\n");
#endif
  /* Part 4(c): If necessary, squash the tree so that it obeys
   * the code length limitations (CODE_BITS et all).  This is
   * done recursively, with rotations.
   */
  (void) fix_tree_depth(table[1].node, 0, 2);
#ifdef STANDALONE
  printf("init_compress: Part 4(d)\n");
#endif
  /* Part 4(d): It is now time to insure that sequences of eight 0s
   * never occur in the output data, because having nulls in the
   * output would royally confuse strcpy et all.
   */
  /* Force a 1 at fifth position on the left edge of tree. (Or terminating
   * 1 for the all 0 code.)
   */
  node = table[1].node;		/* top of tree */
  for (count = 0; node->left && (count < 4); count++)
    node = node->left;
  ctop = (CNode *) malloc((unsigned) sizeof(CNode));
  if (!ctop) {
    fprintf(stderr,
	    "Cannot allocate memory for compression tree. Aborting.\n");
    exit(1);
  }
  ctop->left = node->left;
  ctop->right = node->right;
  ctop->c = node->c;
  node->left = (CNode *) NULL;
  node->right = ctop;
  /* Recursively descend tree adding 1s where needed. */
  add_ones(table[1].node);
#ifdef STANDALONE
  printf("init_compress: Part 4(e)\n");
#endif
  /* Part 4(e): Finally add in EOS as 00000000.
   */
  node = table[1].node;		/* top of tree */
  for (count = 0; count < 8; count++) {
    if (!node->left) {
      ctop = (CNode *) malloc((unsigned) sizeof(CNode));
      if (!ctop) {
	fprintf(stderr,
	     "Cannot allocate memory for compression tree. Aborting.\n");
	exit(1);
      }
      ctop->left = (CNode *) NULL;
      ctop->right = (CNode *) NULL;
      ctop->c = EOS;
      node->left = ctop;
    }
    node = node->left;
  }
#ifdef STANDALONE
  printf("init_compress: Part 5\n");
#endif
  /* Part 5: Now traverse the tree, depth-first, and construct
   * the compression table.
   */
  ctop = table[1].node;
  build_ctable(ctop, 0, 0);
#ifdef STANDALONE
  printf("init_compress: Done\n");
#endif
  /* Whew */
  return 0;
}
#ifdef STANDALONE
void
main(argc, argv)
    int argc;
    char *argv[];
{
  FILE *input;
  unsigned char buffer[BUFFER_LEN];
  unsigned char otherbuf[BUFFER_LEN];
  unsigned char newbuffer[BUFFER_LEN];
  unsigned char *p1, *p2;
  int count;
  if ((input = fopen(argv[1], "rb")) == NULL) {
    printf("Can't open %s.\n", argv[1]);
    exit(1);
  }
  init_compress(input);
  fclose(input);
  do {
    printf("Enter text: ");
    fgets(buffer, 4095, stdin);
    if ((buffer[0] == '\n') || (buffer[0] == '\r') || (buffer[0] == '\0'))
      exit(0);
    printf("Text: %s!\n", buffer);
    printf("Compressing\n");
    strcpy(otherbuf, compress(buffer));
    printf("Compressed: ");
    p1 = otherbuf;
    while (p1 && *p1) {
      for (count = 0; count < 8; count++)
	printf("%d", (*p1 >> count) & 1);
      p1++;
    }
    printf("\n");
    printf("Length: %d, Complength: %d\n", strlen(buffer), strlen(otherbuf));
    printf("Uncompressing\n");
    strcpy(newbuffer, uncompress(otherbuf));
    printf("Text: %s!\n", newbuffer);
    printf("Strcmp(orig,uncomp) = %d\n", strcmp(newbuffer, buffer));
    printf("strlen(orig) = %d, strlen(uncomp) = %d\n", strlen(buffer),
	   strlen(newbuffer));
    p1 = buffer;
    p2 = newbuffer;
/*
 * while (p1 && p2 && *p1 && *p2) {
 * if (*p1 != *p2) printf("Unequal: %d and %d\n",*p1,*p2);
 * else printf("Equal: %c and %c\n",*p1,*p2);
 * p1++; p2++;
 * }
 */
    strcpy(newbuffer, otherbuf);
/*
 * printf("Trying safe.\n");
 * buf = safe_uncompress(newbuffer);
 * printf("Safe uncompress: %s!\n",buf);
 */
  } while (1);
}
#endif