#include <unistd.h>
#include <stdio.h>
#include <string.h>
      	   				   
/* Size in bytes of each chunk obtained by the pool allocator.
 * NOTE(review): 1204 looks like a typo for 1024 -- confirm before changing. */
#define ALLOC_QUANT    (1024*1204)
/* Number of buckets in the main word hash table. */
#define HASH_SIZE      (65537*11)
/* Number of buckets in the half-word hash table used by the -check pass. */
#define HALF_HASH_SIZE (65537*7)
/* Maximum number of characters get_word() stores for one word. */
#define MAX_WORD_LEN   (64)
/* Dimensions of the match table in generate_forms(): suffix groups x forms. */
#define MAX_RULES      (32) 
#define MAX_FORMS      (16) 
/* Number of bytes requested from standard input per read() call. */
#define INPUT_BUF_SIZE (256*1024)
/* A word enters the half-word hash only if half its length is at least this. */
#define MIN_CHECK_LEN  (3) 

/* Boolean emulation through the preprocessor.
 * NOTE(review): this file uses C++ constructs (class, new), where these
 * names are keywords -- confirm the macros are still wanted. */
#define true  1
#define false 0
#define bool  int

/* Tunable thresholds; each one can be overridden from the command line. */
int min_count = 2;              /* -mincount: lowest occurrence count for a word to be emitted */
int max_count = 0x7FFFFFFF;     /* -maxcount: highest occurrence count for a word to be emitted */
int min_match = 2;              /* -match: forms that must be found before a flagged entry is written */
int dispersion =  50;           /* -disp: divisor/multiplier used in the count comparisons */
int check_threshold = 0;        /* -check: non-zero enables the near-duplicate pass */

/*
 * Suffix groups, one NULL-terminated array per affix flag.  The first
 * element of each array is the ending written with the dictionary entry;
 * the remaining elements are the other endings of the same group.
 *
 * NOTE(review): every string literal below is empty in this copy of the
 * file.  The original endings appear to have been lost in an encoding
 * conversion -- TODO restore them from an intact copy.
 */
char* adjective_A[] = {"","","","","","","","","","",NULL};
char* adjective_B[] = {"","","","","","","","","","",NULL};
char* adjective_C[] = {"","","", "", "","","", "", NULL};
char* adjective_D[] = {"","","",   "","",NULL};

char* noun_E[] = {"","","","","","","","",NULL};
char* noun_F[] = {"","","","","","",NULL};

char* verb_G[] = {"","","","","","",NULL};
char* verb_H[] = {"","","","","","","","","","","","","",NULL};
char* verb_I[] = {"","","","","","","","",NULL};
char* verb_J[] = {"","","","","","","","",NULL};
char* verb_K[] = {"","","","","","","","","",NULL};

/* NULL-terminated list of all groups; index n corresponds to flag 'A' + n. */
char** suffixes[] = {
    adjective_A, adjective_B, adjective_C, adjective_D, 
    noun_E, noun_F, 
    verb_G, verb_H, verb_I, verb_J, verb_K,
    NULL
}; 

/*
 * 256-entry translation table selected by -alt (and by default).  It is
 * indexed by an input byte; a zero entry means the byte is not a word
 * character, any other entry is the byte stored in the word.
 *
 * NOTE(review): the character literals in the 0x40-0x7F rows are empty in
 * this copy; their values appear to have been lost in an encoding
 * conversion -- TODO restore them.
 */
unsigned char alt2koi[] = {
/* 0x00-0x3F: non letters */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 0x40-0x7F: english letters; only a few positions are mapped */
0x00, '', '', '', 0x00, '', 0x00, 0x00, 
'', 0x00, 0x00, '', 0x00, '', '', '',
'', 0x00, 0x00, 0x00, '', 0x00, 0x00, 0x00, 
'', '', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, '', '', '', 0x00, '', 0x00, 0x00, 
'', 0x00, 0x00, '', 0x00, '', '', '',
'', 0x00, 0x00, 0x00, '', 0x00, 0x00, 0x00, 
'', '', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 0x80-0xAF: russian letters; rows 0x80-0x8F and 0xA0-0xAF map to the same codes */
0XC1, 0XC2, 0XD7, 0XC7, 0XC4, 0XC5, 0XD6, 0XDA, 
0XC9, 0XCA, 0XCB, 0XCC, 0XCD, 0XCE, 0XCF, 0XD0, 
0XD2, 0XD3, 0XD4, 0XD5, 0XC6, 0XC8, 0XC3, 0XDE, 
0XDB, 0XDD, 0XDF, 0XD9, 0XD8, 0XDC, 0XC0, 0XD1, 
0XC1, 0XC2, 0XD7, 0XC7, 0XC4, 0XC5, 0XD6, 0XDA, 
0XC9, 0XCA, 0XCB, 0XCC, 0XCD, 0XCE, 0XCF, 0XD0, 
/* 0xB0-0xDF: pseudo-graphics, never part of a word */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 0xE0-0xEF: russian letters; same codes as row 0x90-0x9F.  0xF0-0xFF: unmapped */
0XD2, 0XD3, 0XD4, 0XD5, 0XC6, 0XC8, 0XC3, 0XDE, 
0XDB, 0XDD, 0XDF, 0XD9, 0XD8, 0XDC, 0XC0, 0XD1, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 
};

/*
 * 256-entry translation table selected by -koi.  It is indexed by an input
 * byte; a zero entry means the byte is not a word character.  Bytes
 * 0xC0-0xDF map to themselves and bytes 0xE0-0xFF are folded onto
 * 0xC0-0xDF, so both ranges produce the same stored letters.
 *
 * NOTE(review): the character literals in the 0x40-0x7F rows are empty in
 * this copy; their values appear to have been lost in an encoding
 * conversion -- TODO restore them.
 */
unsigned char koi2koi[] = {
/* 0x00-0x3F: non letters */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 0x40-0x7F: english letters; only a few positions are mapped */
0x00, '', '', '', 0x00, '', 0x00, 0x00, 
'', 0x00, 0x00, '', 0x00, '', '', '',
'', 0x00, 0x00, 0x00, '', 0x00, 0x00, 0x00, 
'', '', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, '', '', '', 0x00, '', 0x00, 0x00, 
'', 0x00, 0x00, '', 0x00, '', '', '',
'', 0x00, 0x00, 0x00, '', 0x00, 0x00, 0x00, 
'', '', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 

/* 0x80-0xBF: unmapped */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 0xC0-0xFF: russian letters; the upper 32 bytes repeat the lower 32 codes */
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 
0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 
0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
};

/* Running totals reported on stderr at the end of main(): words read, input
 * bytes consumed, words considered for output, and distinct words stored. */
int n_words, n_chars, n_dictionary_words, n_uniq_words;

/* Active byte translation table (alt2koi or koi2koi), chosen in main(). */
unsigned char* code_table;

/*
 * Bump allocator: hands out pointer-aligned pieces of ALLOC_QUANT-byte
 * chunks obtained with new[].  Nothing is ever released.
 */
class allocator {
    char*  ptr;   // next free byte in the current chunk, NULL before the first one
    size_t used;  // bytes already handed out from the current chunk

  public:
    /*
     * Return `size` bytes, rounded up to a multiple of sizeof(void*).
     * A request larger than a whole chunk gets its own dedicated array
     * instead of overrunning a fresh chunk.
     */
    void* alloc(size_t size) {
        size = (size + sizeof(void*)-1) & ~(sizeof(void*) - 1);
        if (size > ALLOC_QUANT) {
            return new char[size];
        }
        // Comparing against the remaining space avoids overflowing `used`.
        if (ptr == NULL || size > ALLOC_QUANT - used) {
            ptr = new char[ALLOC_QUANT];
            used = 0;
        }
        char* p = ptr;
        ptr += size;
        used += size;
        return p;
    }

    // Start with no chunk; the first alloc() call obtains one.
    allocator() : ptr(NULL), used(ALLOC_QUANT) {
    }
};

/* Input buffer state: next unread byte, one past the last valid byte, and
 * the start of the buffer allocated in main(). */
unsigned char *buf_cur, *buf_end, *buf_ptr;

/*
 * Return the next input byte (0..255) from file descriptor 0, refilling
 * the buffer when it is exhausted.  Returns EOF once read() yields no data
 * or fails.  Every byte handed out is counted in n_chars.
 */
inline int get_char()
{
    if (buf_cur != buf_end) {
        n_chars += 1;
        return *buf_cur++;
    }

    // Buffer drained: pull in the next block.
    int got = read(0, buf_ptr, INPUT_BUF_SIZE);
    if (got <= 0) {
        return EOF;
    }
    buf_end = buf_ptr + got;
    buf_cur = buf_ptr;

    n_chars += 1;
    return *buf_cur++;
}

/*
 * Push the character most recently returned by get_char() back into the
 * input buffer and undo its contribution to n_chars.
 *
 * EOF is ignored.  get_char() neither advances the buffer nor counts a
 * byte when it returns EOF, so pushing EOF back would overwrite the last
 * real byte with 0xFF, make the next get_char() return that bogus byte,
 * and leave n_chars one too low.
 */
inline void unget_char(int ch)
{
    if (ch == EOF) {
        return;
    }
    *--buf_cur = ch;
    n_chars -= 1;
}

/* Per-length counters, indexed by word length in characters:
 * dictionary_statistic counts distinct words created with word::create(),
 * word_statistic counts word occurrences read from the input. */
int dictionary_statistic[MAX_WORD_LEN+1];
int word_statistic[MAX_WORD_LEN+1];

/*
 * Read the next word from the input into buf (at least MAX_WORD_LEN+1
 * bytes) as a NUL-terminated string of translated characters.
 *
 * A word is a run of bytes whose code_table entry is non-zero.  Candidate
 * words are discarded and scanning continues until one contains at least
 * one raw input byte with bit 7 set.  A '-' followed by a line break joins
 * the two halves of a hyphenated word.
 *
 * Returns true when a word was stored, false at end of input.  A word cut
 * short by EOF is returned but is not counted in n_words or word_statistic.
 */
inline bool get_word(char* buf) 
{ 
    int ch;
    int i;
    int mask = 0;   /* OR of the raw bytes accepted so far; never reset between candidates */
    do {
	/* Skip everything that is not a word character. */
	while ((ch = get_char()) != EOF && code_table[ch] == 0);
	for (i = 0; i < MAX_WORD_LEN; i++) { 
	    if (ch == '-') { 
		if ((ch = get_char()) == '\n' || ch == '\r') { 
		    /* Hyphen at end of line: skip whitespace and keep reading the same word. */
		    while((ch=get_char()) == '\t' || ch == ' ' || ch == '\n');
		} else { 
		    /* Hyphen inside a line ends the word; the hyphen itself is dropped. */
		    unget_char(ch);
		    break;
		}
	    }
	    if (ch == EOF) { 
		buf[i] = '\0';
		/* Accept the partial word only if it is non-empty and saw a byte >= 0x80. */
		return i > 0 && (signed char)mask < 0;
	    } else if ((buf[i] = code_table[ch]) == 0) { 
		/* Non-word byte: the zero just stored also terminates buf. */
		break;
	    }
	    mask |= ch;
	    ch = get_char();
	}
	buf[i] = '\0';
	/* Bit 7 of mask is clear when no accepted byte was >= 0x80: try the next candidate. */
    } while ((signed char)mask >= 0);

    n_words += 1;
    word_statistic[i] += 1;
    return true;
}

/*
 * Hash the first `len` characters of `name`.
 *
 * Each character is added after shifting the accumulator left by four
 * bits; whenever the top nibble becomes non-zero it is XOR-ed back in
 * after a 24-bit right shift and then cleared.  A non-positive `len`
 * yields 0.
 */
inline unsigned string_hash_function(const char* name, int len)
{
    unsigned acc = 0;
    for (int left = len; left > 0; left--) {
        acc = (acc << 4) + *name++;
        unsigned top = acc & 0xF0000000;
        if (top != 0) {
            acc ^= top >> 24;
        }
        acc &= ~top;
    }
    return acc;
}

/*
 * Hash a NUL-terminated string.
 *
 * Each character is added after shifting the accumulator left by four
 * bits; whenever the top nibble becomes non-zero it is XOR-ed back in
 * after a 24-bit right shift and then cleared.  The empty string hashes
 * to 0.
 */
inline unsigned string_hash_function(const char* name)
{
    unsigned acc = 0;
    for (; *name != 0; name++) {
        acc = (acc << 4) + *name;
        unsigned top = acc & 0xF0000000;
        if (top != 0) {
            acc ^= top >> 24;
        }
        acc &= ~top;
    }
    return acc;
}

/* Single allocator instance backing every word and half_word object. */
allocator pool;

/*
 * Entry of the main hash table: one distinct word with its occurrence
 * count, chained to the other words in the same bucket.  The text is
 * stored inline after the fixed fields, so instances must be obtained
 * through create(), which requests the extra bytes from the pool.
 */
class word { 
  public: 
    word* next;     // next word in the same bucket
    int   count;    // occurrence count
    char  name[1];  // first byte of the inline, NUL-terminated text

    // Placement form: `len` extra bytes are requested for the text.
    void* operator new (size_t size, size_t len) { 
        return pool.alloc(size+len);
    }

    // Copy the text and link the new entry in front of `chain`.
    word(const char* p, word* chain, int c) : next(chain), count(c) { 
        strcpy(name, p);
    }

    // Allocate a word for `p`, updating the distinct-word statistics.
    static word* create(const char* p, word* chain, int c = 1) { 
        const int text_len = strlen(p);
        n_uniq_words += 1;
        dictionary_statistic[text_len] += 1;
        return new (text_len) word(p, chain, c); 
    }
};


/*
 * Node of the half-word hash table: links a word into the bucket selected
 * by the hash of one part of its text.  Nodes come from the pool.
 */
class half_word { 
  public: 
    half_word* next;  // next node in the same bucket
    word*      w;     // the word this node refers to

    void* operator new(size_t size) {
        return pool.alloc(size);
    }

    // Create a node for `owner` placed in front of `chain`.
    half_word(word* owner, half_word* chain) : next(chain), w(owner) { 
    }
};


/*
 * Write the affix file "russian.aff" into the current directory: a fixed
 * header followed by one "flag X:" section per entry of suffixes[].
 * On failure to open the file an error is reported with perror() and the
 * function returns without writing anything.
 *
 * NOTE(review): the character literals and the bracket expressions in the
 * format strings are empty in this copy; their contents appear to have
 * been lost in an encoding conversion -- TODO restore them.
 */
void generate_aff_file()
{
    FILE* f = fopen("russian.aff", "w");
    if (f == NULL) { 
	perror("opening russian.aff file");
	return;
    }
    fputs("nroffchars      ().\\\\*\n"
	  "texchars        ()\\[]{}<\\>\\\\$*.%\n"
	  "wordchars [\\xc0-\\xdf] [\\xe0-\\xff]\n"
	  "suffixes\n\n", f);

    for (int flag = 0; suffixes[flag] != NULL; flag++) { 
	char buf[256], *bp = buf;
	char** sp = suffixes[flag];
	fprintf(f, "flag %c:\n", 'A' + flag);
	/* Build the rule prefix: each character of the group's first suffix
	 * preceded by a space, then "\t> -<first suffix>,". */
	char* p = *sp;
	while (*p != 0) { 
	    *bp++ = ' ';
	    *bp++ = *p++;
	}
	sprintf(bp, "\t> -%s,", *sp);
	/* One rule (or a pair of rules) for every remaining suffix of the group. */
	while (*++sp != NULL) { 
	    char replace = 0;
	    /* buf[1] is the first character of the group's first suffix. */
	    if (buf[1] != '' && buf[1] != '' && buf[1] != '') { 
		switch(**sp) { 
		  case '':
		    replace = '';
		    break;
		  case '':
		    replace = '';
		    break;
		  case '':
		    replace = '';
		    break;
		}
	    }
	    if (replace) { 
		/* Two variants: the suffix as is, and with its first
		 * character substituted (*sp+1 skips that character). */
		fprintf(f, " [^]%s%s\n", buf, *sp);
		fprintf(f, " [] %s%c%s\n", buf, replace, *sp+1);
	    } else { 
		fprintf(f, "%s%s\n", buf, *sp);
	    }
	}
    }
    fclose(f);
}	

/*
 * Append `suffix` at `root`, which points just past the last character of
 * a word stem (root[-1] must be readable).  When the stem ends in one of
 * seven specific characters and the suffix starts with one of three
 * specific characters, that first suffix character is replaced by an
 * alternative before the rest is copied.
 *
 * NOTE(review): the character literals are empty in this copy; their
 * values appear to have been lost in an encoding conversion -- TODO
 * restore them.
 */
inline void copy_suffix(char* root, char* suffix) 
{
    if (root[-1]=='' || root[-1]=='' || root[-1]=='' || root[-1]=='' || 
	root[-1] == '' || root[-1] == '' || root[-1] == '')
    {
	/* Consume the suffix's first character and write its replacement. */
	if      (*suffix == '') suffix++, *root++ = '';
	else if (*suffix == '') suffix++, *root++ = '';
	else if (*suffix == '') suffix++, *root++ = '';
    }
    strcpy(root, suffix);
}

/*
 * Test whether the word `word_text` of length `word_len` ends with
 * `suffix`, using the same first-character substitution after certain
 * stem endings as copy_suffix().
 *
 * Returns the length of `suffix` on a match and 0 otherwise; a suffix
 * that is not shorter than the word never matches.
 *
 * NOTE(review): the character literals are empty in this copy; their
 * values appear to have been lost in an encoding conversion -- TODO
 * restore them.
 */
inline int find_suffix(char* word_text, int word_len, char* suffix)
{
    int suffix_len = strlen(suffix);
    if (suffix_len >= word_len) return 0;
    char *sp = &word_text[word_len-suffix_len];
    char lrc = sp[-1]; // last root character

    if (lrc == '' || lrc == '' || lrc == '' || lrc == '' || 
	lrc == '' || lrc == '' || lrc == '')
    {
	/* After these stem endings the word must carry the substituted
	 * character; on success skip the first character on both sides. */
	if (*suffix == '') { 
	    if (*sp != '') return 0;
	    else suffix += 1, sp += 1;
	} else if (*suffix == '') { 
	    if (*sp != '') return 0;
	    else suffix += 1, sp += 1;
	} else if (*suffix == '') { 
	    if (*sp != '') return 0;
	    else suffix += 1, sp += 1;
	}	    
    }
    /* Compare the remaining characters up to and including the NUL. */
    while (*sp == *suffix) { 
	if (*sp == 0) return suffix_len;
	sp += 1;
	suffix += 1;
    }
    return 0;
}
    

/*
 * Try to replace a set of inflected forms of `w` by a single flagged
 * dictionary entry.
 *
 * For every suffix group, the first suffix that the word ends with (the
 * word must be at least two characters longer than it) defines a stem.
 * All forms stem+suffix of that group are then looked up in `hash`,
 * counting only entries whose count is at least min_count.  The group
 * with the most hits wins, the earliest group on ties.
 *
 * If the winner has at least min_match hits, the matched entries get
 * count 0, the line "<stem><first suffix of group>/<flag>" is printed and
 * true is returned.  Otherwise nothing changes and false is returned.
 */
bool generate_forms(word** hash, word* w) 
{ 
    const char* text = w->name;
    const int   text_len = strlen(text);
    word* hits[MAX_RULES][MAX_FORMS];
    int   best_group = 0;
    int   best_stem_len = 0;
    int   best_hits = 0;

    for (int group = 0; suffixes[group] != NULL; group++) {
        char** endings = suffixes[group];

        // Locate the first ending of this group carried by the word.
        int stem_len = -1;
        for (int j = 0; endings[j] != NULL; j++) { 
            int ending_len = strlen(endings[j]);
            if (text_len > ending_len+1 
                && memcmp(text+text_len-ending_len, endings[j], ending_len) == 0)
            {
                stem_len = text_len - ending_len;
                break;
            }
        }
        if (stem_len < 0) {
            continue;
        }

        // Build every form of the group and look it up in the table.
        char form[MAX_WORD_LEN*2];
        memcpy(form, text, stem_len);
        int n_hits = 0;
        for (int k = 0; endings[k] != NULL; k++) { 
            copy_suffix(form+stem_len, endings[k]);
            unsigned slot = string_hash_function(form)%HASH_SIZE;
            for (word* cand = hash[slot]; cand != NULL; cand = cand->next) { 
                if (cand->count >= min_count && strcmp(cand->name, form) == 0) { 
                    hits[group][n_hits++] = cand;
                    break;
                }
            }
        }
        if (n_hits > best_hits) { 
            best_hits = n_hits;
            best_stem_len = stem_len;
            best_group = group;
        }
    }

    if (best_hits < min_match) { 
        return false;
    }
    // Zero the counts of the forms now covered by the flagged entry.
    for (int m = best_hits - 1; m >= 0; m--) {
        hits[best_group][m]->count = 0;
    }
    printf("%.*s%s/%c\n", best_stem_len, text, 
           *suffixes[best_group], 'A' + best_group);
    return true;
}
		    
/*
 * Decide whether two NUL-terminated words are equal or differ by exactly
 * one edit: one character inserted, removed or replaced.
 *
 * Returns true for such a pair and false as soon as a second difference
 * is met or the first difference cannot be explained by a single edit.
 */
inline bool match_words(char* w1, char* w2) 
{
    bool edit_used = false;

    for (;;) { 
        // Walk over the common run; reaching both terminators means a match.
        for (; *w1 == *w2; w1++, w2++) {
            if (*w1 == 0) return true;
        }

        // One word ended: the other may have exactly one character left,
        // and only if no edit has been spent yet.
        if (*w1 == 0) {
            return !edit_used && w2[1] == 0;
        }
        if (*w2 == 0) {
            return !edit_used && w1[1] == 0;
        }

        // Explain the mismatch by one edit, or give up.
        if (w1[1] == w2[0]) {           // extra character in w1
            w1 += 1;
        } else if (w1[0] == w2[1]) {    // extra character in w2
            w2 += 1;
        } else if (w1[1] == w2[1]) {    // one character replaced
            w1 += 1;
            w2 += 1;
        } else {
            return false;
        }

        if (edit_used) return false;
        edit_used = true;
    }
}
		
/*
 * Entry point: build a word-frequency table from standard input and write
 * an ispell dictionary to standard output.
 *
 * Steps:
 *   1. Parse the options (an unknown option prints the usage text and
 *      exits with status 1).
 *   2. Fill the hash table, either from raw text through get_word() or,
 *      with -read, from "word count" pairs.
 *   3. With -stat, print the average occurrence per word length.
 *   4. With -check, zero the count of rare words that are within one edit
 *      of a much more frequent word.
 *   5. Print every word that passes the count thresholds, folding groups
 *      of inflected forms into flagged entries via generate_forms().
 *
 * Returns 0 on success and 1 on a usage error.
 */
int main(int argc, char* argv[]) 
{
    word **hash = new word*[HASH_SIZE];
    half_word **half_hash = NULL;
    char buf[MAX_WORD_LEN+1];
    bool wc_flag = false;
    bool read_hash = false;
    bool show_stat = false;
    int  i;
    
    buf_ptr = new unsigned char[INPUT_BUF_SIZE];

    code_table = alt2koi;

    if (argc > 1) {
        for (i = 1; i < argc; i++) { 
            if (strcmp(argv[i], "-alt") == 0) { 
                code_table = alt2koi;
            } else if (strcmp(argv[i], "-koi") == 0) { 
                code_table = koi2koi;
            } else if (strcmp(argv[i], "-aff") == 0) { 
                generate_aff_file();
            } else if (strcmp(argv[i], "-wc") == 0) { 
                wc_flag = true;
            } else if (strcmp(argv[i], "-read") == 0) { 
                read_hash = true;
            } else if (strcmp(argv[i], "-stat") == 0) { 
                show_stat = true;
            } else if (strcmp(argv[i], "-mincount") == 0 && i+1 < argc) { 
                sscanf(argv[++i], "%d", &min_count);
            } else if (strcmp(argv[i], "-maxcount") == 0 && i+1 < argc) { 
                sscanf(argv[++i], "%d", &max_count);
            } else if (strcmp(argv[i], "-match") == 0 && i+1 < argc) { 
                sscanf(argv[++i], "%d", &min_match);
            } else if (strcmp(argv[i], "-disp") == 0 && i+1 < argc) { 
                sscanf(argv[++i], "%d", &dispersion);
            } else if (strcmp(argv[i], "-check") == 0 && i+1 < argc) { 
                sscanf(argv[++i], "%d", &check_threshold);
            } else { 
                fputs("Preparing dictionary for ISPELL\n"
                      "Usage: makedict [options] < input_text > output_dictionary\n"
                      "Options:\n"
                      "\t-alt\t\tconvert input text from ALT to KOI8\n"
                      "\t-koi\t\tassume input text is in KOI8\n"
                      "\t-read\t\tread prepared dictionary\n"
                      "\t-stat\t\toutput word statistics\n"
                      "\t-wc\t\tprint word count\n"
                      "\t-aff\t\tgenerate russian.aff file\n"
                      "\t-\t\tgenerate russian.aff file\n"
                      "\t-maxcount number\tmaximal count of word to be placed in dictionary\n"
                      "\t-mincount number\tminimal count of word to be placed in dictionary\n"
                      "\t-check number\tspecify checking threshold\n"
                      "\t-match number\tminimal number of matches in group\n"
                      "\t-disp number\tword trust coefficient\n",
                      stderr);
                return 1;
            }
        }
    }

    // The output pass divides by dispersion whenever min_count is non-zero;
    // refuse the combination instead of dying on a division by zero.
    if (dispersion == 0 && min_count != 0) { 
        fputs("-disp must be non-zero unless -mincount is 0\n", stderr);
        return 1;
    }

    memset(hash, 0, sizeof(word*)*HASH_SIZE);

    if (check_threshold) { 
        half_hash = new half_word*[HALF_HASH_SIZE];
        memset(half_hash, 0, sizeof(half_word*)*HALF_HASH_SIZE);
    }
    if (read_hash) { 
        int count;
        // Bound the %s conversion to the buffer: an over-long token would
        // otherwise overflow buf and, through its length, the statistics
        // arrays indexed by word length.
        char fmt[32];
        sprintf(fmt, "%%%ds %%d", MAX_WORD_LEN);
        while (scanf(fmt, buf, &count) == 2) { 
            i = string_hash_function(buf) % HASH_SIZE;
            word* w;
            for (w = hash[i]; w != NULL; w = w->next) { 
                if (strcmp(w->name, buf) == 0) {
                    w->count += count;
                    break;
                }
            }
            int len = strlen(buf);
            if (w == NULL) { 
                hash[i] = word::create(buf, hash[i], count);
            }
            word_statistic[len] += count;
            if ((++n_words & 0xFFFF) == 0) { 
                fprintf(stderr, "Proceed %u words\r", n_words);
                fflush(stderr);
            }
        } 
    } else { 
        while (get_word(buf)) { 
            i = string_hash_function(buf) % HASH_SIZE;
            word* w;
            for (w = hash[i]; w != NULL; w = w->next) { 
                if (strcmp(w->name, buf) == 0) {
                    w->count += 1;
                    break;
                }
            }
            if (w == NULL) { 
                hash[i] = word::create(buf, hash[i]);
            }
            if ((n_words & 0xFFFF) == 0) { 
                fprintf(stderr, "Proceed %u chars, %u words\r", 
                        n_chars, n_words);
                fflush(stderr);
            }
        }
    }
    if (show_stat) { 
        fprintf(stderr, "Average word occurrence\n");
        // Words may be exactly MAX_WORD_LEN long, so include that length;
        // skip lengths without dictionary entries to avoid dividing by zero.
        for (i = 1; i <= MAX_WORD_LEN; i++) { 
            if (word_statistic[i] != 0 && dictionary_statistic[i] != 0) { 
                fprintf(stderr, "Word length %d: %d\n", 
                        i, word_statistic[i]/dictionary_statistic[i]); 
            }
        }
    }
    if (check_threshold) { 
        // Index every sufficiently long word by the hash of its first and
        // second half; odd lengths are also split one character later.
        for (i = HASH_SIZE; --i >= 0;) { 
            for (word* w = hash[i]; w != NULL; w = w->next) { 
                int len = strlen(w->name);
                if ((len >> 1) >= MIN_CHECK_LEN) { 
                    half_word** hw;
                    hw = &half_hash[string_hash_function(w->name, len >> 1)
                                   % HALF_HASH_SIZE]; 
                    *hw = new half_word(w, *hw);
                    hw = &half_hash[string_hash_function(w->name + (len >> 1)) 
                                   % HALF_HASH_SIZE];
                    *hw = new half_word(w, *hw);
                    if (len & 1) {
                        len = (len >> 1) + 1;
                        hw = &half_hash[string_hash_function(w->name, len)
                                       % HALF_HASH_SIZE]; 
                        *hw = new half_word(w, *hw);
                        hw = &half_hash[string_hash_function(w->name + len) 
                                       % HALF_HASH_SIZE];
                        *hw = new half_word(w, *hw);
                    }

                }
            }
        }
        // Within each bucket, drop the rare member of any pair of words
        // that match_words() accepts when the other one is at least
        // `dispersion` times more frequent.
        for (i = HALF_HASH_SIZE; --i >= 0;) { 
            for (half_word* h1 = half_hash[i]; h1 != NULL; h1 = h1->next) { 
                for (half_word* h2 = h1->next; h2 != NULL; h2 = h2->next) { 
                    if (h1->w->count <= check_threshold &&
                        h2->w->count > h1->w->count * dispersion) 
                    {
                        if (match_words(h1->w->name, h2->w->name)) { 
                            h1->w->count = 0;
                        }
                    } else if (h2->w->count <= check_threshold &&
                               h1->w->count > h2->w->count * dispersion) 
                    {
                        if (match_words(h1->w->name, h2->w->name)) { 
                            h2->w->count = 0;
                        }
                    }
                }
            }
        }
    }

    if (min_count == 0) { 
        for (i = HASH_SIZE; --i >= 0;) { 
            for (word* w = hash[i]; w != NULL; w = w->next) { 
                if (w->count <= max_count) { 
                    n_dictionary_words += 1;
                    if (!generate_forms(hash, w)) { 
                        if (wc_flag) { 
                            printf("%s %u\n", w->name, w->count);
                        } else { 
                            printf("%s\n", w->name);
                        }
                    }
                }
            }
        }
    } else {
        for (i = HASH_SIZE; --i >= 0;) { 
            for (word* w = hash[i]; w != NULL; w = w->next) { 
                int len = strlen(w->name);
                // The lower bound grows with the average occurrence of
                // words of this length, scaled down by dispersion.
                if (w->count <= max_count && w->count >= min_count + 
                    word_statistic[len]/dictionary_statistic[len]/dispersion) 
                { 
                    n_dictionary_words += 1;
                    if (!generate_forms(hash, w)) { 
                        if (wc_flag) { 
                            printf("%s %u\n", w->name, w->count);
                        } else { 
                            printf("%s\n", w->name);
                        }
                    }
                }
            }
        }
    }
    fprintf(stderr, 
            "\nTotally: %u bytes, %u words, %u unique words, dictionary size %u\n", 
            n_chars, n_words, n_uniq_words, n_dictionary_words);
    return 0;
}





