00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #include <stdlib.h>
00039 #include <stdio.h>
00040 #include <string.h>
00041
00042 #ifdef UNX
00043 #include <unistd.h>
00044 #endif
00045
00046 #define noVERBOSE
00047
00048 #include "hnjalloc.h"
00049 #include "hyphen.h"
00050
/* Return a newly allocated, NUL-terminated copy of the string s.
   The storage comes from hnj_malloc. */
static char *
hnj_strdup (const char *s)
{
  int len = strlen (s);
  char *copy = hnj_malloc (len + 1);

  memcpy (copy, s, len);
  copy[len] = '\0';
  return copy;
}
00063
00064
00065
00066
/* Number of bucket heads held by a HashTab. */
#define HASH_SIZE 31627

typedef struct _HashEntry HashEntry;
typedef struct _HashTab HashTab;

/* One string key with its integer value; next links to another entry. */
struct _HashEntry {
  HashEntry *next;
  char *key;
  int val;
};

/* Fixed-size array of HashEntry list heads. */
struct _HashTab {
  HashEntry *entries[HASH_SIZE];
};
00082
00083
/* Hash the NUL-terminated string s to an unsigned int.
   For each character the accumulator is shifted left four bits and the
   character is added.  If any of the bits selected by 0xf0000000 are then
   set, they are XORed into the accumulator 24 bits lower and cleared from
   the top. */
static unsigned int
hnj_string_hash (const char *s)
{
  unsigned int hash = 0;

  while (*s != '\0') {
    unsigned int top;

    hash = (hash << 4) + *s++;
    top = hash & 0xf0000000;
    if (top != 0) {
      hash ^= top >> 24;
      hash ^= top;
    }
  }
  return hash;
}
00099
00100 static HashTab *
00101 hnj_hash_new (void)
00102 {
00103 HashTab *hashtab;
00104 int i;
00105
00106 hashtab = hnj_malloc (sizeof(HashTab));
00107 for (i = 0; i < HASH_SIZE; i++)
00108 hashtab->entries[i] = NULL;
00109
00110 return hashtab;
00111 }
00112
00113 static void
00114 hnj_hash_free (HashTab *hashtab)
00115 {
00116 int i;
00117 HashEntry *e, *next;
00118
00119 for (i = 0; i < HASH_SIZE; i++)
00120 for (e = hashtab->entries[i]; e; e = next)
00121 {
00122 next = e->next;
00123 hnj_free (e->key);
00124 hnj_free (e);
00125 }
00126
00127 hnj_free (hashtab);
00128 }
00129
00130
00131 static void
00132 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
00133 {
00134 int i;
00135 HashEntry *e;
00136
00137 i = hnj_string_hash (key) % HASH_SIZE;
00138 e = hnj_malloc (sizeof(HashEntry));
00139 e->next = hashtab->entries[i];
00140 e->key = hnj_strdup (key);
00141 e->val = val;
00142 hashtab->entries[i] = e;
00143 }
00144
00145
00146 static int
00147 hnj_hash_lookup (HashTab *hashtab, const char *key)
00148 {
00149 int i;
00150 HashEntry *e;
00151
00152 i = hnj_string_hash (key) % HASH_SIZE;
00153 for (e = hashtab->entries[i]; e; e = e->next)
00154 if (!strcmp (key, e->key))
00155 return e->val;
00156 return -1;
00157 }
00158
00159
00160 static int
00161 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
00162 {
00163 int state_num;
00164
00165 state_num = hnj_hash_lookup (hashtab, string);
00166
00167 if (state_num >= 0)
00168 return state_num;
00169
00170 hnj_hash_insert (hashtab, string, dict->num_states);
00171
00172 if (!(dict->num_states & (dict->num_states - 1)))
00173 {
00174 dict->states = hnj_realloc (dict->states,
00175 (dict->num_states << 1) *
00176 sizeof(HyphenState));
00177 }
00178 dict->states[dict->num_states].match = NULL;
00179 dict->states[dict->num_states].fallback_state = -1;
00180 dict->states[dict->num_states].num_trans = 0;
00181 dict->states[dict->num_states].trans = NULL;
00182 return dict->num_states++;
00183 }
00184
00185
00186
00187 static void
00188 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
00189 {
00190 int num_trans;
00191
00192 num_trans = dict->states[state1].num_trans;
00193 if (num_trans == 0)
00194 {
00195 dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
00196 }
00197 else if (!(num_trans & (num_trans - 1)))
00198 {
00199 dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
00200 (num_trans << 1) *
00201 sizeof(HyphenTrans));
00202 }
00203 dict->states[state1].trans[num_trans].ch = ch;
00204 dict->states[state1].trans[num_trans].new_state = state2;
00205 dict->states[state1].num_trans++;
00206 }
00207
#ifdef VERBOSE
/* Hash table used by get_state_str; set by the loader in VERBOSE builds. */
HashTab *global;

/* Debug helper: scan every bucket of the global table and return the key
   of the first entry whose value equals state, or NULL if there is none. */
static char *
get_state_str (int state)
{
  int bucket;

  for (bucket = 0; bucket < HASH_SIZE; bucket++) {
    HashEntry *node = global->entries[bucket];

    while (node != NULL) {
      if (node->val == state)
        return node->key;
      node = node->next;
    }
  }
  return NULL;
}
#endif
00224
00225 HyphenDict *
00226 hnj_hyphen_load (const char *fn)
00227 {
00228 HyphenDict *dict;
00229 HashTab *hashtab;
00230 FILE *f;
00231 char buf[80];
00232 char word[80];
00233 char pattern[80];
00234 int state_num, last_state;
00235 int i, j;
00236 char ch;
00237 int found;
00238 HashEntry *e;
00239
00240 f = fopen (fn, "r");
00241 if (f == NULL)
00242 return NULL;
00243
00244 hashtab = hnj_hash_new ();
00245 #ifdef VERBOSE
00246 global = hashtab;
00247 #endif
00248 hnj_hash_insert (hashtab, "", 0);
00249
00250 dict = hnj_malloc (sizeof(HyphenDict));
00251 dict->num_states = 1;
00252 dict->states = hnj_malloc (sizeof(HyphenState));
00253 dict->states[0].match = NULL;
00254 dict->states[0].fallback_state = -1;
00255 dict->states[0].num_trans = 0;
00256 dict->states[0].trans = NULL;
00257
00258
00259 for (i=0;i<MAX_NAME;i++) dict->cset[i]= 0;
00260 fgets(dict->cset, sizeof(dict->cset),f);
00261 for (i=0;i<MAX_NAME;i++)
00262 if ((dict->cset[i] == '\r') || (dict->cset[i] == '\n'))
00263 dict->cset[i] = 0;
00264
00265 while (fgets (buf, sizeof(buf), f) != NULL)
00266 {
00267 if (buf[0] != '%')
00268 {
00269 j = 0;
00270 pattern[j] = '0';
00271 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
00272 {
00273 if (buf[i] >= '0' && buf[i] <= '9')
00274 pattern[j] = buf[i];
00275 else
00276 {
00277 word[j] = buf[i];
00278 pattern[++j] = '0';
00279 }
00280 }
00281 word[j] = '\0';
00282 pattern[j + 1] = '\0';
00283
00284
00285 for (i = 0; pattern[i] == '0'; i++);
00286
00287 #ifdef VERBOSE
00288 printf ("word %s pattern %s, j = %d\n", word, pattern + i, j);
00289 #endif
00290 found = hnj_hash_lookup (hashtab, word);
00291 state_num = hnj_get_state (dict, hashtab, word);
00292 dict->states[state_num].match = hnj_strdup (pattern + i);
00293
00294
00295 for (; found < 0 ;j--)
00296 {
00297 last_state = state_num;
00298 ch = word[j - 1];
00299 word[j - 1] = '\0';
00300 found = hnj_hash_lookup (hashtab, word);
00301 state_num = hnj_get_state (dict, hashtab, word);
00302 hnj_add_trans (dict, state_num, last_state, ch);
00303 }
00304 }
00305 }
00306
00307 fclose(f);
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328 for (i = 0; i < HASH_SIZE; i++)
00329 for (e = hashtab->entries[i]; e; e = e->next)
00330 if ( *(e->key) )
00331 {
00332 for (j = 1; 1; j++)
00333 {
00334 state_num = hnj_hash_lookup (hashtab, e->key + j);
00335 if (state_num >= 0)
00336 break;
00337 }
00338
00339 if (e->val)
00340 dict->states[e->val].fallback_state = state_num;
00341 }
00342 #ifdef VERBOSE
00343 for (i = 0; i < HASH_SIZE; i++)
00344 for (e = hashtab->entries[i]; e; e = e->next)
00345 {
00346 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
00347 dict->states[e->val].fallback_state);
00348 for (j = 0; j < dict->states[e->val].num_trans; j++)
00349 printf (" %c->%d\n", dict->states[e->val].trans[j].ch,
00350 dict->states[e->val].trans[j].new_state);
00351 }
00352 #endif
00353
00354 #ifndef VERBOSE
00355 hnj_hash_free (hashtab);
00356 #endif
00357
00358 return dict;
00359 }
00360
00361 void hnj_hyphen_free (HyphenDict *dict)
00362 {
00363 int state_num;
00364 HyphenState *hstate;
00365
00366 for (state_num = 0; state_num < dict->num_states; state_num++)
00367 {
00368 hstate = &dict->states[state_num];
00369 if (hstate->match)
00370 hnj_free (hstate->match);
00371 if (hstate->trans)
00372 hnj_free (hstate->trans);
00373 }
00374
00375 hnj_free (dict->states);
00376
00377 hnj_free (dict);
00378 }
00379
00380 #define MAX_WORD 256
00381
00382 int hnj_hyphen_hyphenate (HyphenDict *dict,
00383 const char *word, int word_size,
00384 char *hyphens)
00385 {
00386 char prep_word_buf[MAX_WORD];
00387 char *prep_word;
00388 int i, j, k;
00389 int state;
00390 char ch;
00391 HyphenState *hstate;
00392 char *match;
00393 int offset;
00394
00395 if (word_size + 3 < MAX_WORD)
00396 prep_word = prep_word_buf;
00397 else
00398 prep_word = hnj_malloc (word_size + 3);
00399
00400 j = 0;
00401 prep_word[j++] = '.';
00402
00403 for (i = 0; i < word_size; i++)
00404 prep_word[j++] = word[i];
00405
00406 for (i = 0; i < j; i++)
00407 hyphens[i] = '0';
00408
00409 prep_word[j++] = '.';
00410
00411 prep_word[j] = '\0';
00412 #ifdef VERBOSE
00413 printf ("prep_word = %s\n", prep_word);
00414 #endif
00415
00416
00417 state = 0;
00418 for (i = 0; i < j; i++)
00419 {
00420 ch = prep_word[i];
00421 for (;;)
00422 {
00423
00424 if (state == -1) {
00425
00426
00427 state = 0;
00428 goto try_next_letter;
00429 }
00430
00431 #ifdef VERBOSE
00432 char *state_str;
00433 state_str = get_state_str (state);
00434
00435 for (k = 0; k < i - strlen (state_str); k++)
00436 putchar (' ');
00437 printf ("%s", state_str);
00438 #endif
00439
00440 hstate = &dict->states[state];
00441 for (k = 0; k < hstate->num_trans; k++)
00442 if (hstate->trans[k].ch == ch)
00443 {
00444 state = hstate->trans[k].new_state;
00445 goto found_state;
00446 }
00447 state = hstate->fallback_state;
00448 #ifdef VERBOSE
00449 printf (" falling back, fallback_state %d\n", state);
00450 #endif
00451 }
00452 found_state:
00453 #ifdef VERBOSE
00454 printf ("found state %d\n",state);
00455 #endif
00456
00457
00458
00459 match = dict->states[state].match;
00460 if (match)
00461 {
00462 offset = i + 1 - strlen (match);
00463 #ifdef VERBOSE
00464 for (k = 0; k < offset; k++)
00465 putchar (' ');
00466 printf ("%s\n", match);
00467 #endif
00468
00469
00470 for (k = 0; match[k] && offset+k < word_size+1 ; k++)
00471 if (hyphens[offset + k] < match[k])
00472 hyphens[offset + k] = match[k];
00473 }
00474
00475
00476
00477
00478 try_next_letter: ;
00479
00480 }
00481 #ifdef VERBOSE
00482 for (i = 0; i < j; i++)
00483 putchar (hyphens[i]);
00484 putchar ('\n');
00485 #endif
00486
00487 for (i = 0; i < j - 4; i++)
00488 #if 0
00489 if (hyphens[i + 1] & 1)
00490 hyphens[i] = '-';
00491 #else
00492 hyphens[i] = hyphens[i + 1];
00493 #endif
00494 hyphens[0] = '0';
00495 for (; i < word_size; i++)
00496 hyphens[i] = '0';
00497 hyphens[word_size] = '\0';
00498
00499 if (prep_word != prep_word_buf)
00500 hnj_free (prep_word);
00501 return 0;
00502 }