Back to index

texmacs  1.0.7.15
hyphenate.cpp
Go to the documentation of this file.
00001 
00002 /******************************************************************************
00003 * MODULE     : hyphenate.cpp
00004 * DESCRIPTION: hyphenation by Liang's algorithm
00005 * COPYRIGHT  : (C) 1999  Joris van der Hoeven
00006 *******************************************************************************
00007 * This software falls under the GNU general public license version 3 or later.
00008 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
00009 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
00010 ******************************************************************************/
00011 
00012 #include "file.hpp"
00013 #include "hyphenate.hpp"
00014 #include "analyze.hpp"
00015 
00016 #include <stdio.h>
00017 #include <stdlib.h>
00018 #include <string.h>
00019 
// Integer alias; SI is not referenced in the visible part of this file.
00020 typedef int SI;
// Exclusive upper bound on the length of the substrings that get_hyphens
// looks up in the pattern table (it tries lengths 1 .. MAX_SEARCH-1).
00021 #define MAX_SEARCH 10
// Not referenced in the visible part of this file.
00022 #define MAX_BUFFER_SIZE 256
00023 
00024 /*
00025 static bool
00026 my_strncmp (char* s1, char* s2, int len) {
00027   int i;
00028   for (i=0; i<len; i++) if (s1[i]!=s2[i]) return false;
00029   return true;
00030 }
00031 */
00032 
00033 static string
00034 unpattern (string s) {
00035   int i, n= N(s);
00036   string r;
00037   for (i=0; i<n; ) {
00038     while ((i<n) && (s[i]>='0') && (s[i]<='9')) i++;
00039     if (i<n) r << s[i++];
00040   }
00041   return r;
00042 }
00043 
00044 static string
00045 hyphen_normalize (string s) {
00046   int i;
00047   string r (0);
00048   for (i=0; i<N(s); i++)
00049     if ((i+3<N(s)) && (s[i]=='^') && (s[i+1]=='^')) {
00050       r << from_hexadecimal (s (i+2, i+4));
00051       i+=3;
00052     }
00053     else r << s[i];
00054   return r;
00055 }
00056 
00057 void
00058 load_hyphen_tables (string file_name,
00059                     hashmap<string,string>& patterns,
00060                     hashmap<string,string>& hyphenations) {
00061   string s;
00062   file_name= string ("hyphen.") * file_name;
00063   load_string (url ("$TEXMACS_PATH/langs/natural/hyphen", file_name), s, true);
00064   if (DEBUG_VERBOSE) cout << "TeXmacs] Loading " << file_name << "\n";
00065 
00066   hashmap<string,string> H ("?");
00067   bool pattern_flag=false;
00068   bool hyphenation_flag=false;
00069   int i=0, n= N(s);
00070   while (i<n) {
00071     string buffer;
00072     while ((i<n) && (s[i]!=' ') && (s[i]!='\t') && (s[i]!='\n')) {
00073       if (s[i] != '%') buffer << s[i++];
00074       else while ((i<n) && (s[i]!='\n')) i++;
00075     }
00076     if (i<n) i++;
00077     if (buffer == "}") {
00078       pattern_flag=false;
00079       hyphenation_flag=false;
00080     }
00081     if (pattern_flag && i != 0) {
00082       string norm= hyphen_normalize (buffer);
00083       patterns (unpattern (norm))= norm;
00084       //cout << unpattern (norm) << " ==> " << norm << "\n";
00085     }
00086     if (hyphenation_flag && i != 0 && N(buffer) != 0) {
00087       string word= replace (buffer, "-", "");
00088       hyphenations (word)= buffer;
00089       //cout << word << " --> " << buffer << "\n";
00090     }
00091     if (buffer == "\\patterns{") pattern_flag=true;
00092     if (buffer == "\\hyphenation{") hyphenation_flag=true;
00093   }
00094 }
00095 
00096 static string
00097 lower_case (string s) {
00098   int i;
00099   string r (N(s));
00100   for (i=0; i<N(s); i++) {
00101     if ((s[i]>='A') && (s[i]<='Z'))
00102       r[i]= (char) (((int) s[i])+ ((int) 'a')- ((int) 'A'));
00103     else r[i]=s[i];
00104   }
00105   return r;
00106 }
00107 
00108 array<int>
00109 get_hyphens (string s,
00110              hashmap<string,string> patterns,
00111              hashmap<string,string> hyphenations) {
00112   ASSERT (N(s) != 0, "hyphenation of empty string");
00113 
00114   if (hyphenations->contains (s)) {
00115     string h= hyphenations [s];
00116     array<int> penalty (N(s)-1);
00117     int i=0, j=0;
00118     while (h[j] == '-') j++;
00119     i++; j++;
00120     while (i < N(s)) {
00121       penalty[i-1]= HYPH_INVALID;
00122       while (j < N(h) && h[j] == '-') {
00123         penalty[i-1]= HYPH_STD;
00124         j++;
00125       }
00126       i++; j++;
00127     }
00128     //cout << s << " --> " << penalty << "\n";
00129     return penalty;
00130   }
00131   else {
00132     s= "." * lower_case (s) * ".";
00133     // cout << s << "\n";
00134     int i, j, k, m, len;
00135     array<int> T (N(s)+1);
00136     for (i=0; i<N(s)+1; i++) T[i]=0;
00137     for (len=1; len < MAX_SEARCH; len++)
00138       for (i=0; i<N(s)-len; i++) {
00139         string r= patterns [s (i, i+len)];
00140         if (!(r == "?")) {
00141           // cout << "  " << s (i, i+len) << " => " << r << "\n";
00142           for (j=0, k=0; j<=len; j++, k++) {
00143             if ((k<N(r)) && (r[k]>='0') && (r[k]<='9')) {
00144               m=((int) r[k])-((int) '0');
00145               k++;
00146             }
00147             else m=0;
00148             if (m>T[i+j]) T[i+j]=m;
00149           }
00150         }
00151       }
00152 
00153     array<int> penalty (N(s)-3);
00154     for (i=2; i<N(s)-1; i++)
00155       penalty [i-2]= (((T[i]&1)==1)? HYPH_STD: HYPH_INVALID);
00156     if (N(penalty)>0) penalty[0] = penalty[N(penalty)-1] = HYPH_INVALID;
00157     if (N(penalty)>1) penalty[1] = penalty[N(penalty)-2] = HYPH_INVALID;
00158     if (N(penalty)>2) penalty[N(penalty)-3] = HYPH_INVALID;
00159     // cout << s << " --> " << penalty << "\n";
00160     return penalty;
00161   }
00162 }
00163 
00164 void
00165 std_hyphenate (string s, int after, string& left, string& right, int penalty) {
00166   left = s (0, after+1);
00167   right= s (after+1, N(s));
00168   if (penalty >= HYPH_INVALID) left << string ("\\");
00169   else left << string ("-");
00170 }