Back to index

scribus-ng  1.3.4.dfsg+svn20071115
hyphen.c
Go to the documentation of this file.
00001 /* LibHnj is dual licensed under LGPL and MPL. Boilerplate for both
00002  * licenses follows.
00003  */
00004 
00005 /* LibHnj - a library for high quality hyphenation and justification
00006  * Copyright (C) 1998 Raph Levien, 
00007  *          (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), 
00008  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
00009  *
00010  * This library is free software; you can redistribute it and/or
00011  * modify it under the terms of the GNU Library General Public
00012  * License as published by the Free Software Foundation; either
00013  * version 2 of the License, or (at your option) any later version.
00014  *
00015  * This library is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018  * Library General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU Library General Public
00021  * License along with this library; if not, write to the 
00022  * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 
00023  * Boston, MA  02111-1307  USA.
00024 */
00025 
00026 /*
00027  * The contents of this file are subject to the Mozilla Public License
00028  * Version 1.0 (the "MPL"); you may not use this file except in
00029  * compliance with the MPL.  You may obtain a copy of the MPL at
00030  * http://www.mozilla.org/MPL/
00031  *
00032  * Software distributed under the MPL is distributed on an "AS IS" basis,
00033  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
00034  * for the specific language governing rights and limitations under the
00035  * MPL.
00036  *
00037  */
00038 #include <stdlib.h> /* for NULL, malloc */
00039 #include <stdio.h>  /* for fprintf */
00040 #include <string.h> /* for strdup */
00041 
00042 #ifdef UNX
00043 #include <unistd.h> /* for exit */
00044 #endif
00045 
00046 #define noVERBOSE
00047 
00048 #include "hnjalloc.h"
00049 #include "hyphen.h"
00050 
00051 static char *
00052 hnj_strdup (const char *s)
00053 {
00054   char *new;
00055   int l;
00056 
00057   l = strlen (s);
00058   new = hnj_malloc (l + 1);
00059   memcpy (new, s, l);
00060   new[l] = 0;
00061   return new;
00062 }
00063 
00064 /* a little bit of a hash table implementation. This simply maps strings
00065    to state numbers */
00066 
00067 typedef struct _HashTab HashTab;
00068 typedef struct _HashEntry HashEntry;
00069 
00070 /* A cheap, but effective, hack. */
00071 #define HASH_SIZE 31627
00072 
00073 struct _HashTab {
00074   HashEntry *entries[HASH_SIZE];
00075 };
00076 
00077 struct _HashEntry {
00078   HashEntry *next;
00079   char *key;
00080   int val;
00081 };
00082 
00083 /* a char* hash function from ASU - adapted from Gtk+ */
00084 static unsigned int
00085 hnj_string_hash (const char *s)
00086 {
00087   const char *p;
00088   unsigned int h=0, g;
00089 
00090   for(p = s; *p != '\0'; p += 1) {
00091     h = ( h << 4 ) + *p;
00092     if ( ( g = h & 0xf0000000 ) ) {
00093       h = h ^ (g >> 24);
00094       h = h ^ g;
00095     }
00096   }
00097   return h /* % M */;
00098 }
00099 
00100 static HashTab *
00101 hnj_hash_new (void)
00102 {
00103   HashTab *hashtab;
00104   int i;
00105 
00106   hashtab = hnj_malloc (sizeof(HashTab));
00107   for (i = 0; i < HASH_SIZE; i++)
00108     hashtab->entries[i] = NULL;
00109 
00110   return hashtab;
00111 }
00112 
00113 static void
00114 hnj_hash_free (HashTab *hashtab)
00115 {
00116   int i;
00117   HashEntry *e, *next;
00118 
00119   for (i = 0; i < HASH_SIZE; i++)
00120     for (e = hashtab->entries[i]; e; e = next)
00121       {
00122        next = e->next;
00123        hnj_free (e->key);
00124        hnj_free (e);
00125       }
00126 
00127   hnj_free (hashtab);
00128 }
00129 
00130 /* assumes that key is not already present! */
00131 static void
00132 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
00133 {
00134   int i;
00135   HashEntry *e;
00136 
00137   i = hnj_string_hash (key) % HASH_SIZE;
00138   e = hnj_malloc (sizeof(HashEntry));
00139   e->next = hashtab->entries[i];
00140   e->key = hnj_strdup (key);
00141   e->val = val;
00142   hashtab->entries[i] = e;
00143 }
00144 
00145 /* return val if found, otherwise -1 */
00146 static int
00147 hnj_hash_lookup (HashTab *hashtab, const char *key)
00148 {
00149   int i;
00150   HashEntry *e;
00151 
00152   i = hnj_string_hash (key) % HASH_SIZE;
00153   for (e = hashtab->entries[i]; e; e = e->next)
00154     if (!strcmp (key, e->key))
00155       return e->val;
00156   return -1;
00157 }
00158 
00159 /* Get the state number, allocating a new state if necessary. */
00160 static int
00161 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
00162 {
00163   int state_num;
00164 
00165   state_num = hnj_hash_lookup (hashtab, string);
00166 
00167   if (state_num >= 0)
00168     return state_num;
00169 
00170   hnj_hash_insert (hashtab, string, dict->num_states);
00171   /* predicate is true if dict->num_states is a power of two */
00172   if (!(dict->num_states & (dict->num_states - 1)))
00173     {
00174       dict->states = hnj_realloc (dict->states,
00175                               (dict->num_states << 1) *
00176                               sizeof(HyphenState));
00177     }
00178   dict->states[dict->num_states].match = NULL;
00179   dict->states[dict->num_states].fallback_state = -1;
00180   dict->states[dict->num_states].num_trans = 0;
00181   dict->states[dict->num_states].trans = NULL;
00182   return dict->num_states++;
00183 }
00184 
00185 /* add a transition from state1 to state2 through ch - assumes that the
00186    transition does not already exist */
00187 static void
00188 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
00189 {
00190   int num_trans;
00191 
00192   num_trans = dict->states[state1].num_trans;
00193   if (num_trans == 0)
00194     {
00195       dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
00196     }
00197   else if (!(num_trans & (num_trans - 1)))
00198     {
00199       dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
00200                                           (num_trans << 1) *
00201                                           sizeof(HyphenTrans));
00202     }
00203   dict->states[state1].trans[num_trans].ch = ch;
00204   dict->states[state1].trans[num_trans].new_state = state2;
00205   dict->states[state1].num_trans++;
00206 }
00207 
00208 #ifdef VERBOSE
00209 HashTab *global;
00210 
00211 static char *
00212 get_state_str (int state)
00213 {
00214   int i;
00215   HashEntry *e;
00216 
00217   for (i = 0; i < HASH_SIZE; i++)
00218     for (e = global->entries[i]; e; e = e->next)
00219       if (e->val == state)
00220        return e->key;
00221   return NULL;
00222 }
00223 #endif
00224 
00225 HyphenDict *
00226 hnj_hyphen_load (const char *fn)
00227 {
00228   HyphenDict *dict;
00229   HashTab *hashtab;
00230   FILE *f;
00231   char buf[80];
00232   char word[80];
00233   char pattern[80];
00234   int state_num, last_state;
00235   int i, j;
00236   char ch;
00237   int found;
00238   HashEntry *e;
00239 
00240   f = fopen (fn, "r");
00241   if (f == NULL)
00242     return NULL;
00243 
00244   hashtab = hnj_hash_new ();
00245 #ifdef VERBOSE
00246   global = hashtab;
00247 #endif
00248   hnj_hash_insert (hashtab, "", 0);
00249 
00250   dict = hnj_malloc (sizeof(HyphenDict));
00251   dict->num_states = 1;
00252   dict->states = hnj_malloc (sizeof(HyphenState));
00253   dict->states[0].match = NULL;
00254   dict->states[0].fallback_state = -1;
00255   dict->states[0].num_trans = 0;
00256   dict->states[0].trans = NULL;
00257 
00258   /* read in character set info */
00259   for (i=0;i<MAX_NAME;i++) dict->cset[i]= 0;
00260   fgets(dict->cset,  sizeof(dict->cset),f);
00261   for (i=0;i<MAX_NAME;i++)
00262     if ((dict->cset[i] == '\r') || (dict->cset[i] == '\n'))
00263         dict->cset[i] = 0;
00264 
00265   while (fgets (buf, sizeof(buf), f) != NULL)
00266     {
00267       if (buf[0] != '%')
00268        {
00269          j = 0;
00270          pattern[j] = '0';
00271          for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
00272            {
00273              if (buf[i] >= '0' && buf[i] <= '9')
00274               pattern[j] = buf[i];
00275              else
00276               {
00277                 word[j] = buf[i];
00278                 pattern[++j] = '0';
00279               }
00280            }
00281          word[j] = '\0';
00282          pattern[j + 1] = '\0';
00283 
00284          /* Optimize away leading zeroes */
00285          for (i = 0; pattern[i] == '0'; i++);
00286 
00287 #ifdef VERBOSE
00288          printf ("word %s pattern %s, j = %d\n", word, pattern + i, j);
00289 #endif
00290          found = hnj_hash_lookup (hashtab, word);
00291 
00292          state_num = hnj_get_state (dict, hashtab, word);
00293          dict->states[state_num].match = hnj_strdup (pattern + i);
00294 
00295          /* now, put in the prefix transitions */
00296          for (; found < 0 ;j--)
00297            {
00298              last_state = state_num;
00299              ch = word[j - 1];
00300              word[j - 1] = '\0';
00301              found = hnj_hash_lookup (hashtab, word);
00302              state_num = hnj_get_state (dict, hashtab, word);
00303              hnj_add_trans (dict, state_num, last_state, ch);
00304            }
00305        }
00306     }
00307 
00308   /* Could do unioning of matches here (instead of the preprocessor script).
00309      If we did, the pseudocode would look something like this:
00310 
00311      foreach state in the hash table
00312         foreach i = [1..length(state) - 1]
00313            state to check is substr (state, i)
00314            look it up
00315            if found, and if there is a match, union the match in.
00316 
00317      It's also possible to avoid the quadratic blowup by doing the
00318      search in order of increasing state string sizes - then you
00319      can break the loop after finding the first match.
00320 
00321      This step should be optional in any case - if there is a
00322      preprocessed rule table, it's always faster to use that.
00323 
00324 */
00325 
00326   /* put in the fallback states */
00327   for (i = 0; i < HASH_SIZE; i++)
00328     for (e = hashtab->entries[i]; e; e = e->next)
00329       {
00330 /*     for (j = 1; 1; j++) */
00331        for (j = 1; *(e->key); j++)
00332          {
00333            state_num = hnj_hash_lookup (hashtab, e->key + j);
00334            if (state_num >= 0)
00335              break;
00336          }
00337         /* KBH: FIXME state 0 fallback_state should always be -1? */
00338        if (e->val) 
00339          dict->states[e->val].fallback_state = state_num;
00340       }
00341 #ifdef VERBOSE
00342   for (i = 0; i < HASH_SIZE; i++)
00343     for (e = hashtab->entries[i]; e; e = e->next)
00344       {
00345        printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
00346               dict->states[e->val].fallback_state);
00347        for (j = 0; j < dict->states[e->val].num_trans; j++)
00348          printf (" %c->%d\n", dict->states[e->val].trans[j].ch,
00349                 dict->states[e->val].trans[j].new_state);
00350       }
00351 #endif
00352 
00353 #ifndef VERBOSE
00354   hnj_hash_free (hashtab);
00355 #endif
00356   fclose(f);
00357   return dict;
00358 }
00359 
00360 void hnj_hyphen_free (HyphenDict *dict)
00361 {
00362   int state_num;
00363   HyphenState *hstate;
00364 
00365   for (state_num = 0; state_num < dict->num_states; state_num++)
00366     {
00367       hstate = &dict->states[state_num];
00368       if (hstate->match)
00369        hnj_free (hstate->match);
00370       if (hstate->trans)
00371        hnj_free (hstate->trans);
00372     }
00373 
00374   hnj_free (dict->states);
00375 
00376   hnj_free (dict);
00377 }
00378 
00379 #define MAX_WORD 256
00380 
00381 int hnj_hyphen_hyphenate (HyphenDict *dict,
00382                         const char *word, int word_size,
00383                         char *hyphens)
00384 {
00385   char prep_word_buf[MAX_WORD];
00386   char *prep_word;
00387   int i, j, k;
00388   int state;
00389   char ch;
00390   HyphenState *hstate;
00391   char *match;
00392   int offset;
00393 
00394   if (word_size + 3 < MAX_WORD)
00395     prep_word = prep_word_buf;
00396   else
00397     prep_word = hnj_malloc (word_size + 3);
00398 
00399   j = 0;
00400   prep_word[j++] = '.';
00401   
00402   for (i = 0; i < word_size; i++)
00403       prep_word[j++] = word[i];
00404       
00405   for (i = 0; i < j; i++)                                                       
00406     hyphens[i] = '0';    
00407   
00408   prep_word[j++] = '.';
00409 
00410   prep_word[j] = '\0';
00411 #ifdef VERBOSE
00412   printf ("prep_word = %s\n", prep_word);
00413 #endif
00414 
00415   /* now, run the finite state machine */
00416   state = 0;
00417   for (i = 0; i < j; i++)
00418     {
00419       ch = prep_word[i];
00420       for (;;)
00421        {
00422 
00423          if (state == -1) {
00424             /* return 1;
00425             KBH: FIXME shouldn't this be as follows? */
00426             state = 0;
00427             goto try_next_letter;
00428           }          
00429 
00430 #ifdef VERBOSE
00431          char *state_str;
00432          state_str = get_state_str (state);
00433 
00434          for (k = 0; k < i - strlen (state_str); k++)
00435            putchar (' ');
00436          printf ("%s", state_str);
00437 #endif
00438 
00439          hstate = &dict->states[state];
00440          for (k = 0; k < hstate->num_trans; k++)
00441            if (hstate->trans[k].ch == ch)
00442              {
00443               state = hstate->trans[k].new_state;
00444               goto found_state;
00445              }
00446          state = hstate->fallback_state;
00447 #ifdef VERBOSE
00448          printf (" falling back, fallback_state %d\n", state);
00449 #endif
00450        }
00451     found_state:
00452 #ifdef VERBOSE
00453       printf ("found state %d\n",state);
00454 #endif
00455       /* Additional optimization is possible here - especially,
00456         elimination of trailing zeroes from the match. Leading zeroes
00457         have already been optimized. */
00458       match = dict->states[state].match;
00459       if (match)
00460        {
00461          offset = i + 1 - strlen (match);
00462 #ifdef VERBOSE
00463          for (k = 0; k < offset; k++)
00464            putchar (' ');
00465          printf ("%s\n", match);
00466 #endif
00467          /* This is a linear search because I tried a binary search and
00468             found it to be just a teeny bit slower. */
00469          for (k = 0; match[k]; k++)
00470            if (hyphens[offset + k] < match[k])
00471              hyphens[offset + k] = match[k];
00472        }
00473 
00474       /* KBH: we need this to make sure we keep looking in a word
00475        for patterns even if the current character is not known in state 0
00476        since patterns for hyphenation may occur anywhere in the word */
00477       try_next_letter: ;
00478 
00479     }
00480 #ifdef VERBOSE
00481   for (i = 0; i < j; i++)
00482     putchar (hyphens[i]);
00483   putchar ('\n');
00484 #endif
00485 
00486   for (i = 0; i < j - 4; i++)
00487 #if 0
00488     if (hyphens[i + 1] & 1)
00489       hyphens[i] = '-';
00490 #else
00491     hyphens[i] = hyphens[i + 1];
00492 #endif
00493   hyphens[0] = '0';
00494   for (; i < word_size; i++)
00495     hyphens[i] = '0';
00496   hyphens[word_size] = '\0';
00497 
00498   if (prep_word != prep_word_buf)
00499     hnj_free (prep_word);
00500   return 0;    
00501 }