tinymush-2.2.5/
tinymush-2.2.5/conf/
tinymush-2.2.5/radixlib/
tinymush-2.2.5/scripts/
tinymush-2.2.5/src/
tinymush-2.2.5/text/
tinymush-2.2.5/tools/
tinymush-2.2.5/vms/
#!/bin/sh
# Tunables. Both may be overridden from the environment, e.g.
#	MAXLEN=5 AWK=gawk sh thisscript
# MAXLEN is passed as the sole argument to 'a'; it defaults to 4.
MAXLEN=${MAXLEN:-4}
# AWK is the interpreter used to run phase1.awk and phase2.awk. nawk is
# preferred, but many systems no longer ship a command by that name, so
# fall back to plain awk when nawk cannot be found on PATH.
if [ -z "${AWK:-}" ]; then
	if command -v nawk >/dev/null 2>&1; then
		AWK=nawk
	else
		AWK=awk
	fi
fi
#
# 'a' finds all strings of length <= $MAXLEN in the flat MUSH db it reads on
#	standard input. It writes records like "1234 'the co'" on standard
#	output, meaning it saw the string 'the co' 1234 times.
# 'phase1.awk' eliminates all one character strings, and eliminates things
#	which are just prefixes of one another. For example:
#		98671 'th'
#		98658 'the'
#	would indicate that virtually all instances of 'th' are just the 'th'
#	in the word 'the', so we might as well just keep the string 'the'.
#	It produces records like 'a' does, except the count is multiplied
#	by the length of the string, giving a measure of how profitable
#	this string is to compress.
# 'sort' just sorts standard input numerically by the first field, using
#	$TMPDIR as the place to put temporary files. Now we have the
#	common substrings ordered by some number which measures how common
#	they are and how long they are, i.e. how profitable it is to
#	compress them down to one output code.
# 'head' just takes the first 4000 of these strings.
# 'phase2.awk' formats up the output into a proper C structure.
#
# Using a longer MAXLEN increases memory usage of 'a' and run-time more
# or less exponentially. On a Sparc10 with 128M MAXLEN=6 is about as
# big as you want to go. 4 seems to be pretty useful a number, and is
# manageable on a normal workstation sort of thing, if you're patient.
#

# Build compresstab.h: tally substrings with 'a', weight and prune them with
# phase1.awk, keep the 4000 highest-scoring entries, and let phase2.awk
# write out the result.
#
# The sort key is spelled '-k 1,1' (first field only) rather than the
# obsolescent zero-origin '+0 -1' form, and the line count is given as
# 'head -n 4000' rather than 'head -4000': the older spellings are not
# accepted by every current sort/head implementation.
a "$MAXLEN" |
	"$AWK" -f phase1.awk |
	sort -r -n -k 1,1 |
	head -n 4000 |
	"$AWK" -f phase2.awk > compresstab.h