#include "config.h" #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #include "externs.h" #include "db.h" #define WORD_HASH_SIZE (100000) #define SIZE_HASH_SIZE (100000) struct queue_node { struct queue_node *next; struct queue_node *prev; int count; int spcount; int val; int len; char word[32]; }; struct queue_node *head = NULL; struct queue_node *tail = NULL; int total_words = 0; int counted_words = 0; hash_tab wordhash[WORD_HASH_SIZE]; struct queue_node *sizehash[100000]; int notify(int player, const char *msg) { return printf("%s\n", msg); } int string_compare(const char *s1, const char *s2) { return strcmp(s1, s2); } #ifndef MALLOC_PROFILING char * string_dup(const char *s) { char *p = (char *) malloc(strlen(s) + 1); strcpy(p, s); return p; } #endif void queue_remove_node(struct queue_node *node) { if (node->prev) { node->prev->next = node->next; } else { head = node->next; } if (node->next) { node->next->prev = node->prev; } else { tail = node->prev; } } void queue_insert_after(struct queue_node *where, struct queue_node *node) { if (where) { node->next = where->next; where->next = node; } else { node->next = head; head = node; } node->prev = where; if (node->next) { node->next->prev = node; } else { tail = node; } } void queue_promote(struct queue_node *node) { struct queue_node *prev; struct queue_node *p; int oldval; oldval = node->val; node->val = (node->count * (node->len - 2)) + (node->spcount * (node->len - 1)); /* remove node from sizehash table if necessary */ if (oldval && oldval < SIZE_HASH_SIZE) { p = sizehash[oldval]; if (p) { if (p == node) { if (node->prev) { if (node->prev->val == oldval) { sizehash[oldval] = node->prev; } else { sizehash[oldval] = NULL; } } else { sizehash[oldval] = NULL; } } } } if (node->val < SIZE_HASH_SIZE) { int limit; int i; limit = 16; i = node->val; do { prev = sizehash[i]; } while (!prev && ++i < SIZE_HASH_SIZE && --limit); if (i >= SIZE_HASH_SIZE || !limit) { prev = node->prev; while (prev && prev->val < node->val) { prev = prev->prev; } } } else { prev = node->prev; while (prev && prev->val < node->val) { prev = prev->prev; } } if (prev != node->prev) { queue_remove_node(node); queue_insert_after(prev, node); } /* put entry in size hash */ if (node->val < SIZE_HASH_SIZE) { sizehash[node->val] = node; } } void list_top_4k_words(void) { int count = 4096; int lastval; struct queue_node *node; lastval = 0x3fffffff; for (node = head; node && count--; node = node->next) { /* printf("%-32s %5d %5d\n", node->word, node->count, node->spcount); */ printf("%s\n", node->word); fflush(stdout); if (lastval < node->val) abort(); lastval = node->val; } } struct queue_node * queue_add_node(const char *word, int pri) { struct queue_node *nu; hash_data hd; nu = (struct queue_node *) malloc(sizeof(struct queue_node)); if (!nu) abort(); if (!head) { head = nu; nu->prev = NULL; } nu->next = NULL; nu->prev = tail; if (tail) { tail->next = nu; } tail = nu; nu->count = 0; nu->spcount = 0; strcpy(nu->word, word); nu->len = strlen(nu->word); if (nu->word[nu->len - 1] == ' ') { nu->word[nu->len - 1] = '\0'; nu->len--; nu->spcount += pri; } else { nu->count += pri; } nu->val = 0; queue_promote(nu); hd.pval = (void *) nu; (void) add_hash(nu->word, hd, wordhash, WORD_HASH_SIZE); return nu; } void add_to_list(const char *word) { struct queue_node *node; hash_data *exp; char buf[32]; short spcflag = 0; if (strlen(word) < 3) { return; } strcpy(buf, word); if (buf[strlen(buf) - 1] == ' ') { buf[strlen(buf) - 1] = '\0'; spcflag++; } exp = find_hash(buf, wordhash, WORD_HASH_SIZE); if (exp) { node = (struct queue_node *) exp->pval; if (spcflag) { node->spcount++; } else { node->count++; } queue_promote(node); counted_words++; } else { /* printf("%s\n", buf); */ queue_add_node(word, 1); total_words++; counted_words++; } } void remember_word_variants(const char *in) { char word[32]; strcpy(word, in); add_to_list(word); /* t = word + strlen(word); while (t > word) { h = word; while (h < t) { add_to_list(h); h++; } t--; if (*t == ' ') { t--; } *t = '\0'; } */ } void remember_words_in_string(const char *in) { char buf[32]; const char *h; char *o; h = in; while (*h) { o = buf; while (*h && !isalnum(*h)) { h++; } while (*h && isalnum(*h) && (o - buf < 30)) { if (isupper(*h)) { *o++ = tolower(*h++); } else { *o++ = *h++; } } if (*h == ' ') { *o++ = ' '; } *o++ = '\0'; if (strlen(buf) > 3) { remember_word_variants(buf); } } } int main(int argc, char **argv) { char buf[16384]; char *p; queue_add_node("felorin", 99999999); queue_add_node("anthro", 99999999); queue_add_node("phile ", 99999999); queue_add_node("morph", 99999999); queue_add_node("revar", 99999999); queue_add_node("sion ", 99999999); queue_add_node("tion ", 99999999); queue_add_node("post", 99999999); queue_add_node("ing ", 99999999); queue_add_node("ion ", 99999999); queue_add_node("est ", 99999999); queue_add_node("ies ", 99999999); queue_add_node("ism ", 99999999); queue_add_node("ish ", 99999999); queue_add_node("ary ", 99999999); queue_add_node("ous ", 99999999); queue_add_node("dis", 99999999); queue_add_node("non", 99999999); queue_add_node("pre", 99999999); queue_add_node("sub", 99999999); queue_add_node("al ", 99999999); queue_add_node("ic ", 99999999); queue_add_node("ly ", 99999999); queue_add_node("le ", 99999999); queue_add_node("es ", 99999999); queue_add_node("ed ", 99999999); queue_add_node("er ", 99999999); while (!feof(stdin)) { fgets(buf, sizeof(buf), stdin); p = strchr(buf, ':'); if (p) { p = strchr(p + 1, ':'); if (p) { remember_words_in_string(p + 1); } } } list_top_4k_words(); /* printf("%d unique words found.\n", total_words); */ /* printf("%d counted words.\n", counted_words); */ return(0); }