KPlato: Fichier source de F:/KPlato/koffice/libs/kotext/kohyphen/hyphen.c

00001 /* LibHnj is dual licensed under LGPL and MPL. Boilerplate for both
00002  * licenses follows.
00003  */
00004 
00005 /* LibHnj - a library for high quality hyphenation and justification
00006  * Copyright (C) 1998 Raph Levien, 
00007  *           (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), 
00008  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
00009  *
00010  * This library is free software; you can redistribute it and/or
00011  * modify it under the terms of the GNU Library General Public
00012  * License as published by the Free Software Foundation; either
00013  * version 2 of the License, or (at your option) any later version.
00014  *
00015  * This library is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018  * Library General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU Library General Public
00021  * License along with this library; if not, write to the 
00022  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 
00023  * Boston, MA  02110-1301  USA.
00024 */
00025 
00026 /*
00027  * The contents of this file are subject to the Mozilla Public License
00028  * Version 1.0 (the "MPL"); you may not use this file except in
00029  * compliance with the MPL.  You may obtain a copy of the MPL at
00030  * http://www.mozilla.org/MPL/
00031  *
00032  * Software distributed under the MPL is distributed on an "AS IS" basis,
00033  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
00034  * for the specific language governing rights and limitations under the
00035  * MPL.
00036  *
00037  */
00038 #include <stdlib.h> /* for NULL, malloc */
00039 #include <stdio.h>  /* for fprintf */
00040 #include <string.h> /* for strdup */
00041 
00042 #ifdef UNX
00043 #include <unistd.h> /* for exit */
00044 #endif
00045 
00046 #define noVERBOSE
00047 
00048 #include "hnjalloc.h"
00049 #include "hyphen.h"
00050 
00051 static char *
00052 hnj_strdup (const char *s)
00053 {
00054   char *new;
00055   int l;
00056 
00057   l = strlen (s);
00058   new = hnj_malloc (l + 1);
00059   memcpy (new, s, l);
00060   new[l] = 0;
00061   return new;
00062 }
00063 
00064 /* a little bit of a hash table implementation. This simply maps strings
00065    to state numbers */
00066 
00067 typedef struct _HashTab HashTab;
00068 typedef struct _HashEntry HashEntry;
00069 
00070 /* A cheap, but effective, hack. */
00071 #define HASH_SIZE 31627
00072 
00073 struct _HashTab {
00074   HashEntry *entries[HASH_SIZE];
00075 };
00076 
00077 struct _HashEntry {
00078   HashEntry *next;
00079   char *key;
00080   int val;
00081 };
00082 
00083 /* a char* hash function from ASU - adapted from Gtk+ */
00084 static unsigned int
00085 hnj_string_hash (const char *s)
00086 {
00087   const char *p;
00088   unsigned int h=0, g;
00089 
00090   for(p = s; *p != '\0'; p += 1) {
00091     h = ( h << 4 ) + *p;
00092     if ( ( g = h & 0xf0000000 ) ) {
00093       h = h ^ (g >> 24);
00094       h = h ^ g;
00095     }
00096   }
00097   return h /* % M */;
00098 }
00099 
00100 static HashTab *
00101 hnj_hash_new (void)
00102 {
00103   HashTab *hashtab;
00104   int i;
00105 
00106   hashtab = hnj_malloc (sizeof(HashTab));
00107   for (i = 0; i < HASH_SIZE; i++)
00108     hashtab->entries[i] = NULL;
00109 
00110   return hashtab;
00111 }
00112 
00113 static void
00114 hnj_hash_free (HashTab *hashtab)
00115 {
00116   int i;
00117   HashEntry *e, *next;
00118 
00119   for (i = 0; i < HASH_SIZE; i++)
00120     for (e = hashtab->entries[i]; e; e = next)
00121       {
00122         next = e->next;
00123         hnj_free (e->key);
00124         hnj_free (e);
00125       }
00126 
00127   hnj_free (hashtab);
00128 }
00129 
00130 /* assumes that key is not already present! */
00131 static void
00132 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
00133 {
00134   int i;
00135   HashEntry *e;
00136 
00137   i = hnj_string_hash (key) % HASH_SIZE;
00138   e = hnj_malloc (sizeof(HashEntry));
00139   e->next = hashtab->entries[i];
00140   e->key = hnj_strdup (key);
00141   e->val = val;
00142   hashtab->entries[i] = e;
00143 }
00144 
00145 /* return val if found, otherwise -1 */
00146 static int
00147 hnj_hash_lookup (HashTab *hashtab, const char *key)
00148 {
00149   int i;
00150   HashEntry *e;
00151 
00152   i = hnj_string_hash (key) % HASH_SIZE;
00153   for (e = hashtab->entries[i]; e; e = e->next)
00154     if (!strcmp (key, e->key))
00155       return e->val;
00156   return -1;
00157 }
00158 
00159 /* Get the state number, allocating a new state if necessary. */
00160 static int
00161 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
00162 {
00163   int state_num;
00164 
00165   state_num = hnj_hash_lookup (hashtab, string);
00166 
00167   if (state_num >= 0)
00168     return state_num;
00169 
00170   hnj_hash_insert (hashtab, string, dict->num_states);
00171   /* predicate is true if dict->num_states is a power of two */
00172   if (!(dict->num_states & (dict->num_states - 1)))
00173     {
00174       dict->states = hnj_realloc (dict->states,
00175                                   (dict->num_states << 1) *
00176                                   sizeof(HyphenState));
00177     }
00178   dict->states[dict->num_states].match = NULL;
00179   dict->states[dict->num_states].fallback_state = -1;
00180   dict->states[dict->num_states].num_trans = 0;
00181   dict->states[dict->num_states].trans = NULL;
00182   return dict->num_states++;
00183 }
00184 
00185 /* add a transition from state1 to state2 through ch - assumes that the
00186    transition does not already exist */
00187 static void
00188 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
00189 {
00190   int num_trans;
00191 
00192   num_trans = dict->states[state1].num_trans;
00193   if (num_trans == 0)
00194     {
00195       dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
00196     }
00197   else if (!(num_trans & (num_trans - 1)))
00198     {
00199       dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
00200                                                 (num_trans << 1) *
00201                                                 sizeof(HyphenTrans));
00202     }
00203   dict->states[state1].trans[num_trans].ch = ch;
00204   dict->states[state1].trans[num_trans].new_state = state2;
00205   dict->states[state1].num_trans++;
00206 }
00207 
00208 #ifdef VERBOSE
00209 HashTab *global;
00210 
00211 static char *
00212 get_state_str (int state)
00213 {
00214   int i;
00215   HashEntry *e;
00216 
00217   for (i = 0; i < HASH_SIZE; i++)
00218     for (e = global->entries[i]; e; e = e->next)
00219       if (e->val == state)
00220         return e->key;
00221   return NULL;
00222 }
00223 #endif
00224 
00225 HyphenDict *
00226 hnj_hyphen_load (const char *fn)
00227 {
00228   HyphenDict *dict;
00229   HashTab *hashtab;
00230   FILE *f;
00231   char buf[80];
00232   char word[80];
00233   char pattern[80];
00234   int state_num, last_state;
00235   int i, j;
00236   char ch;
00237   int found;
00238   HashEntry *e;
00239 
00240   f = fopen (fn, "r");
00241   if (f == NULL)
00242     return NULL;
00243 
00244   hashtab = hnj_hash_new ();
00245 #ifdef VERBOSE
00246   global = hashtab;
00247 #endif
00248   hnj_hash_insert (hashtab, "", 0);
00249 
00250   dict = hnj_malloc (sizeof(HyphenDict));
00251   dict->num_states = 1;
00252   dict->states = hnj_malloc (sizeof(HyphenState));
00253   dict->states[0].match = NULL;
00254   dict->states[0].fallback_state = -1;
00255   dict->states[0].num_trans = 0;
00256   dict->states[0].trans = NULL;
00257 
00258   /* read in character set info */
00259   for (i=0;i<MAX_NAME;i++) dict->cset[i]= 0;
00260   fgets(dict->cset,  sizeof(dict->cset),f);
00261   for (i=0;i<MAX_NAME;i++)
00262     if ((dict->cset[i] == '\r') || (dict->cset[i] == '\n'))
00263         dict->cset[i] = 0;
00264 
00265   while (fgets (buf, sizeof(buf), f) != NULL)
00266     {
00267       if (buf[0] != '%')
00268         {
00269           j = 0;
00270           pattern[j] = '0';
00271           for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
00272             {
00273               if (buf[i] >= '0' && buf[i] <= '9')
00274                 pattern[j] = buf[i];
00275               else
00276                 {
00277                   word[j] = buf[i];
00278                   pattern[++j] = '0';
00279                 }
00280             }
00281           word[j] = '\0';
00282           pattern[j + 1] = '\0';
00283 
00284           /* Optimize away leading zeroes */
00285           for (i = 0; pattern[i] == '0'; i++);
00286 
00287 #ifdef VERBOSE
00288           printf ("word %s pattern %s, j = %d\n", word, pattern + i, j);
00289 #endif
00290           found = hnj_hash_lookup (hashtab, word);
00291           state_num = hnj_get_state (dict, hashtab, word);
00292           dict->states[state_num].match = hnj_strdup (pattern + i);
00293 
00294           /* now, put in the prefix transitions */
00295           for (; found < 0 ;j--)
00296             {
00297               last_state = state_num;
00298               ch = word[j - 1];
00299               word[j - 1] = '\0';
00300               found = hnj_hash_lookup (hashtab, word);
00301               state_num = hnj_get_state (dict, hashtab, word);
00302               hnj_add_trans (dict, state_num, last_state, ch);
00303             }
00304         }
00305     }
00306 
00307   fclose(f);
00308 
00309   /* Could do unioning of matches here (instead of the preprocessor script).
00310      If we did, the pseudocode would look something like this:
00311 
00312      foreach state in the hash table
00313         foreach i = [1..length(state) - 1]
00314            state to check is substr (state, i)
00315            look it up
00316            if found, and if there is a match, union the match in.
00317 
00318      It's also possible to avoid the quadratic blowup by doing the
00319      search in order of increasing state string sizes - then you
00320      can break the loop after finding the first match.
00321 
00322      This step should be optional in any case - if there is a
00323      preprocessed rule table, it's always faster to use that.
00324 
00325 */
00326 
00327   /* put in the fallback states */
00328   for (i = 0; i < HASH_SIZE; i++)
00329     for (e = hashtab->entries[i]; e; e = e->next)
00330       if ( *(e->key) ) /* not for the empty key entry */
00331       {
00332         for (j = 1; 1; j++)
00333           {
00334             state_num = hnj_hash_lookup (hashtab, e->key + j);
00335             if (state_num >= 0)
00336               break;
00337           }
00338         /*KBH: FIXME state 0 fallback_state should always be -1? */
00339         if (e->val) 
00340           dict->states[e->val].fallback_state = state_num;
00341       }
00342 #ifdef VERBOSE
00343   for (i = 0; i < HASH_SIZE; i++)
00344     for (e = hashtab->entries[i]; e; e = e->next)
00345       {
00346         printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
00347                 dict->states[e->val].fallback_state);
00348         for (j = 0; j < dict->states[e->val].num_trans; j++)
00349           printf (" %c->%d\n", dict->states[e->val].trans[j].ch,
00350                   dict->states[e->val].trans[j].new_state);
00351       }
00352 #endif
00353 
00354 #ifndef VERBOSE
00355   hnj_hash_free (hashtab);
00356 #endif
00357 
00358   return dict;
00359 }
00360 
00361 void hnj_hyphen_free (HyphenDict *dict)
00362 {
00363   int state_num;
00364   HyphenState *hstate;
00365 
00366   for (state_num = 0; state_num < dict->num_states; state_num++)
00367     {
00368       hstate = &dict->states[state_num];
00369       if (hstate->match)
00370         hnj_free (hstate->match);
00371       if (hstate->trans)
00372         hnj_free (hstate->trans);
00373     }
00374 
00375   hnj_free (dict->states);
00376 
00377   hnj_free (dict);
00378 }
00379 
00380 #define MAX_WORD 256
00381 
00382 int hnj_hyphen_hyphenate (HyphenDict *dict,
00383                            const char *word, int word_size,
00384                            char *hyphens)
00385 {
00386   char prep_word_buf[MAX_WORD];
00387   char *prep_word;
00388   int i, j, k;
00389   int state;
00390   char ch;
00391   HyphenState *hstate;
00392   char *match;
00393   int offset;
00394 
00395   if (word_size + 3 < MAX_WORD)
00396     prep_word = prep_word_buf;
00397   else
00398     prep_word = hnj_malloc (word_size + 3);
00399 
00400   j = 0;
00401   prep_word[j++] = '.';
00402   
00403   for (i = 0; i < word_size; i++)
00404       prep_word[j++] = word[i];
00405       
00406   for (i = 0; i < j; i++)                                                       
00407     hyphens[i] = '0';    
00408   
00409   prep_word[j++] = '.';
00410 
00411   prep_word[j] = '\0';
00412 #ifdef VERBOSE
00413   printf ("prep_word = %s\n", prep_word);
00414 #endif
00415 
00416   /* now, run the finite state machine */
00417   state = 0;
00418   for (i = 0; i < j; i++)
00419     {
00420       ch = prep_word[i];
00421       for (;;)
00422         {
00423 
00424           if (state == -1) {
00425                   /*return 1;*/
00426                   /*KBH: FIXME shouldn't this be as follows?*/
00427             state = 0;
00428             goto try_next_letter;
00429           }          
00430 
00431 #ifdef VERBOSE
00432           char *state_str;
00433           state_str = get_state_str (state);
00434 
00435           for (k = 0; k < i - strlen (state_str); k++)
00436             putchar (' ');
00437           printf ("%s", state_str);
00438 #endif
00439 
00440           hstate = &dict->states[state];
00441           for (k = 0; k < hstate->num_trans; k++)
00442             if (hstate->trans[k].ch == ch)
00443               {
00444                 state = hstate->trans[k].new_state;
00445                 goto found_state;
00446               }
00447           state = hstate->fallback_state;
00448 #ifdef VERBOSE
00449           printf (" falling back, fallback_state %d\n", state);
00450 #endif
00451         }
00452     found_state:
00453 #ifdef VERBOSE
00454       printf ("found state %d\n",state);
00455 #endif
00456       /* Additional optimization is possible here - especially,
00457          elimination of trailing zeroes from the match. Leading zeroes
00458          have already been optimized. */
00459       match = dict->states[state].match;
00460       if (match)
00461         {
00462           offset = i + 1 - strlen (match);
00463 #ifdef VERBOSE
00464           for (k = 0; k < offset; k++)
00465             putchar (' ');
00466           printf ("%s\n", match);
00467 #endif
00468           /* This is a linear search because I tried a binary search and
00469              found it to be just a teeny bit slower. */
00470           for (k = 0; match[k] && offset+k < word_size+1 ; k++)
00471             if (hyphens[offset + k] < match[k])
00472               hyphens[offset + k] = match[k];
00473         }
00474 
00475       /* KBH: we need this to make sure we keep looking in a word
00476                  for patterns even if the current character is not known in state 0
00477                  since patterns for hyphenation may occur anywhere in the word*/
00478       try_next_letter: ;
00479 
00480     }
00481 #ifdef VERBOSE
00482   for (i = 0; i < j; i++)
00483     putchar (hyphens[i]);
00484   putchar ('\n');
00485 #endif
00486 
00487   for (i = 0; i < j - 4; i++)
00488 #if 0
00489     if (hyphens[i + 1] & 1)
00490       hyphens[i] = '-';
00491 #else
00492     hyphens[i] = hyphens[i + 1];
00493 #endif
00494   hyphens[0] = '0';
00495   for (; i < word_size; i++)
00496     hyphens[i] = '0';
00497   hyphens[word_size] = '\0';
00498 
00499   if (prep_word != prep_word_buf)
00500     hnj_free (prep_word);
00501   return 0;    
00502 }