Back to index

texmacs  1.0.7.15
text_language.cpp
Go to the documentation of this file.
00001 
00002 /******************************************************************************
00003 * MODULE     : text_language.cpp
00004 * DESCRIPTION: natural textual languages
00005 * COPYRIGHT  : (C) 1999  Joris van der Hoeven
00006 *******************************************************************************
00007 * This software falls under the GNU general public license version 3 or later.
00008 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
00009 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
00010 ******************************************************************************/
00011 
00012 #if defined(_WIN32) || defined(__WIN32__)
00013 #include <locale.h>
00014 #endif
00015 
00016 #include "analyze.hpp"
00017 #include "hyphenate.hpp"
00018 #include "impl_language.hpp"
00019 #include "sys_utils.hpp"
00020 
00021 #ifdef QTTEXMACS
00022 #include "Qt/qt_utilities.hpp"
00023 #endif
00024 
00025 /******************************************************************************
00026 * Western text languages
00027 ******************************************************************************/
00028 
00029 struct text_language_rep: language_rep {
00030   hashmap<string,string> patterns;
00031   hashmap<string,string> hyphenations;
00032 
00033   text_language_rep (string lan_name, string hyph_name);
00034   text_property advance (tree t, int& pos);
00035   array<int> get_hyphens (string s);
00036   void hyphenate (string s, int after, string& left, string& right);
00037 };
00038 
00039 text_language_rep::text_language_rep (string lan_name, string hyph_name):
00040   language_rep (lan_name), patterns ("?"), hyphenations ("?") {
00041     load_hyphen_tables (hyph_name, patterns, hyphenations); }
00042 
00043 text_property
00044 text_language_rep::advance (tree t, int& pos) {
00045   string s= t->label;
00046   if (pos == N(s)) return &tp_normal_rep;
00047 
00048   if (s[pos]==' ') {
00049     pos++;
00050     // while ((pos<N(s)) && (s[pos]==' ')) pos++;
00051     if ((pos == N(s)) || (!is_punctuation (s[pos])))
00052       return &tp_space_rep;
00053     return &tp_blank_rep;
00054   }
00055   
00056   if (is_punctuation (s[pos])) {
00057     while ((pos<N(s)) && is_punctuation (s[pos])) pos++;
00058     if ((pos==N(s)) || (s[pos]!=' ')) return &tp_normal_rep;
00059     switch (s[pos-1]) {
00060     case ',': case ':': case ';': case '`': case '\'':
00061       return &tp_space_rep;
00062     case '.': case '!': case '?':
00063       return &tp_period_rep;
00064     }
00065     return &tp_space_rep;
00066   }
00067 
00068   if (s[pos]=='-') {
00069     pos++;
00070     while ((pos<N(s)) && (s[pos]=='-')) pos++;
00071     return &tp_hyph_rep;
00072   }
00073 
00074   if (is_iso_alpha (s[pos])) {
00075     while ((pos<N(s)) && is_iso_alpha (s[pos])) pos++;
00076     return &tp_normal_rep;
00077   }
00078 
00079   if (is_numeric (s[pos])) { // can not be a '.'
00080     while ((pos<N(s)) && is_numeric (s[pos])) pos++;
00081     while (s[pos-1]=='.') pos--;
00082     return &tp_normal_rep;
00083   }
00084 
00085   if (s[pos]=='<') {
00086     while ((pos<N(s)) && (s[pos]!='>')) pos++;
00087     if (pos<N(s)) pos++;
00088     return &tp_normal_rep;
00089   }
00090 
00091   pos++;
00092   return &tp_normal_rep;
00093 }
00094 
00095 array<int>
00096 text_language_rep::get_hyphens (string s) {
00097   return ::get_hyphens (s, patterns, hyphenations);
00098 }
00099 
00100 void
00101 text_language_rep::hyphenate (
00102   string s, int after, string& left, string& right)
00103 {
00104   array<int> penalty= get_hyphens (s);
00105   std_hyphenate (s, after, left, right, penalty[after]);
00106 }
00107 
00108 /******************************************************************************
00109 * Oriental languages
00110 ******************************************************************************/
00111 
00112 struct oriental_language_rep: language_rep {
00113   oriental_language_rep (string lan_name);
00114   text_property advance (tree t, int& pos);
00115   array<int> get_hyphens (string s);
00116   void hyphenate (string s, int after, string& left, string& right);
00117 };
00118 
00119 oriental_language_rep::oriental_language_rep (string lan_name):
00120   language_rep (lan_name) {}
00121 
00122 text_property
00123 oriental_language_rep::advance (tree t, int& pos) {
00124   string s= t->label;
00125   if (pos == N(s)) return &tp_normal_rep;
00126 
00127   if (s[pos]==' ') {
00128     pos++;
00129     if ((pos == N(s)) || (!is_punctuation (s[pos])))
00130       return &tp_space_rep;
00131     return &tp_blank_rep;
00132   }
00133 
00134   int begin= pos;
00135   while (pos<N(s) && s[pos] != ' ') {
00136     int start= pos;
00137     tm_char_forwards (s, pos);
00138     string c= s (start, pos);
00139     if (starts (c, "<#300") && N(c) == 7) {
00140       if (start > begin) pos= start;
00141       break;
00142     }
00143   }
00144   return &tp_normal_rep;
00145 }
00146 
00147 array<int>
00148 oriental_language_rep::get_hyphens (string s) {
00149   int i, n= N(s);
00150   array<int> T (n-1);
00151   for (i=0; i<n-1; i++)
00152     T[i]= HYPH_INVALID;
00153   for (i=0, tm_char_forwards (s, i); i<n; tm_char_forwards (s, i))
00154     if (s[i] == '<')
00155       T[i-1]= 0;
00156   return T;
00157 }
00158 
00159 void
00160 oriental_language_rep::hyphenate (
00161   string s, int after, string& left, string& right)
00162 {
00163   left = s (0, after+1);
00164   right= s (after+1, N(s));
00165 }
00166 
00167 /******************************************************************************
00168 * Locales
00169 ******************************************************************************/
00170 
00171 string
00172 windows_locale_to_language (string s) {
00173   if (s == "Bulgarian_Bulgaria.1251") return "bulgarian";
00174   if (s == "Chinese_People's Republic of China.936")
00175     return "chinese"; // for windows xp
00176   if (s == "Chinese (Simplified)_People's Republic of China.936")
00177     return "chinese"; // for windows 7
00178   if (s == "Chinese_Taiwan.950")
00179     return "taiwanese"; // for windows xp
00180   if (s == "Chinese (Traditional)_Taiwan.950")
00181     return "taiwanese"; // for windows 7
00182   if (s == "Czech_Czech Republic.1250") return "czech";
00183   if (s == "Danish_Denmark.1252") return "danish";
00184   if (s == "Dutch_Netherlands.1252") return "dutch";
00185   if (s == "English_United States.1252") return "english";
00186   if (s == "English_United Kingdom.1252") return "british";
00187   if (s == "Finnish_Finland.1252") return "finnish";
00188   if (s == "French_France.1252") return "french";
00189   if (s == "German_Germany.1252") return "german";
00190   if (s == "Hungarian_Hungary.1250") return "hungarian";
00191   if (s == "Italian_Italy.1252") return "italian";
00192   if (s == "Japanese_Japan.932") return "japanese";
00193   if (s == "Korean_Korea.949") return "korean";
00194   if (s == "Polish_Poland.1250") return "polish";
00195   if (s == "Portuguese_Portugal.1252") return "portuguese";
00196   if (s == "Romanian_Romania.1250") return "romanian";
00197   if (s == "Russian_Russia.1251") return "russian";
00198   if (s == "Slovenian_Slovenia.1250") return "slovene";
00199   if (s == "Spanish_Spain.1252") return "spanish";
00200   if (s == "Swedish_Sweden.1252") return "swedish";
00201   if (s == "Ukrainian_Ukraine.1251") return "ukrainian";
00202   return "english";
00203 }
00204 
00205 string
00206 locale_to_language (string s) {
00207   if (N(s) > 5) s= s (0, 5);
00208   if (s == "en_GB") return "british";
00209   if (s == "zh_TW") return "taiwanese";
00210   if (N(s) > 2) s= s (0, 2);
00211   if (s == "bg") return "bulgarian";
00212   if (s == "zh") return "chinese";
00213   if (s == "cs") return "czech";
00214   if (s == "da") return "danish";
00215   if (s == "nl") return "dutch";
00216   if (s == "en") return "english";
00217   if (s == "fi") return "finnish";
00218   if (s == "fr") return "french";
00219   if (s == "de") return "german";
00220   if (s == "hu") return "hungarian";
00221   if (s == "it") return "italian";
00222   if (s == "ja") return "japanese";
00223   if (s == "ko") return "korean";
00224   if (s == "pl") return "polish";
00225   if (s == "pt") return "portuguese";
00226   if (s == "ro") return "romanian";
00227   if (s == "ru") return "russian";
00228   if (s == "sl") return "slovene";
00229   if (s == "es") return "spanish";
00230   if (s == "sv") return "swedish";
00231   if (s == "uk") return "ukrainian";
00232   return "english";
00233 }
00234 
00235 string
00236 language_to_locale (string s) {
00237   if (s == "american") return "en_US";
00238   if (s == "british") return "en_GB";
00239   if (s == "bulgarian") return "bg_BG";
00240   if (s == "chinese") return "zh_CN";
00241   if (s == "czech") return "cs_CZ";
00242   if (s == "danish") return "da_DK";
00243   if (s == "dutch") return "nl_NL";
00244   if (s == "english") return "en_US";
00245   if (s == "finnish") return "fi_FI";
00246   if (s == "french") return "fr_FR";
00247   if (s == "german") return "de_DE";
00248   if (s == "hungarian") return "hu_HU";
00249   if (s == "italian") return "it_IT";
00250   if (s == "japanese") return "ja_JP";
00251   if (s == "korean") return "ko_KR";
00252   if (s == "polish") return "pl_PL";
00253   if (s == "portuguese") return "pt_PT";
00254   if (s == "romanian") return "ro_RO";
00255   if (s == "russian") return "ru_RU";
00256   if (s == "slovene") return "sl_SI";
00257   if (s == "spanish") return "es_ES";
00258   if (s == "swedish") return "sv_SV";
00259   if (s == "taiwanese") return "zh_TW";
00260   if (s == "ukrainian") return "uk_UA";
00261   return "en_US";
00262 }
00263 
00264 string
00265 get_locale_language () {
00266 #if defined(_WIN32) || defined(__WIN32__)
00267   return windows_locale_to_language (setlocale (LC_ALL, ""));
00268 #else
00269   string env_lan= get_env ("LC_ALL");
00270   if (env_lan != "") return locale_to_language (env_lan);
00271   env_lan= get_env ("LC_MESSAGES");
00272   if (env_lan != "") return locale_to_language (env_lan);
00273   env_lan= get_env ("LANG");
00274   if (env_lan != "") return locale_to_language (env_lan);
00275   env_lan= get_env ("GDM_LANG");
00276   if (env_lan != "") return locale_to_language (env_lan);
00277   return "english";
00278 #endif
00279 }
00280 
00281 /******************************************************************************
00282 * Getting a formatted date
00283 ******************************************************************************/
00284 
00285 #ifdef QTTEXMACS
00286 string
00287 get_date (string lan, string fm) {
00288   return qt_get_date(lan, fm);
00289 }
00290 #else
00291 
00292 static bool
00293 invalid_format (string s) {
00294   if (N(s) == 0) return true;
00295   for (int i=0; i<N(s); i++)
00296     if (!(is_alpha (s[i]) || is_numeric (s[i]) ||
00297          s[i] == ' ' || s[i] == '%' || s[i] == '.' || s[i] == ',' ||
00298          s[i] == '+' || s[i] == '-' || s[i] == ':'))
00299       return true;
00300   return false;
00301 }
00302 
00303 static string
00304 simplify_date (string s) {
00305   int i, n=N(s);
00306   string r;
00307   for (i=0; i<n; i++)
00308     if ((s[i]!='0') || ((N(r)>0) && is_digit(r[N(r)-1]))) r << s[i];
00309   return r;
00310 }
00311 
00312 string
00313 get_date (string lan, string fm) {
00314 //#if defined(__MINGW__) || defined(__MINGW32__) || defined(OS_WIN32)
00315 //  return win32::get_date(lan, fm);
00316   if (invalid_format (fm)) {
00317     if ((lan == "british") || (lan == "english") || (lan == "american"))
00318       fm= "%B %d, %Y";
00319     else if (lan == "german")
00320       fm= "%d. %B %Y";
00321     else if (lan == "chinese" || lan == "japanese" ||
00322             lan == "korean" || lan == "taiwanese")
00323       {
00324        string y= simplify_date (var_eval_system ("date +\"%Y\""));
00325        string m= simplify_date (var_eval_system ("date +\"%m\""));
00326        string d= simplify_date (var_eval_system ("date +\"%d\""));
00327        if (lan == "japanese")
00328          return y * "<#5e74>" * m * "<#6708>" * d * "<#65e5>";
00329        if (lan == "korean")
00330          return y * "<#b144> " * m * "<#c6d4> " * d * "<#c77c>";
00331        return y * "," * m * "," * d;
00332       }
00333     else fm= "%d %B %Y";
00334   }
00335   lan= language_to_locale (lan);
00336   string lvar= "LC_TIME";
00337   if (get_env (lvar) == "") lvar= "LC_ALL";
00338   if (get_env (lvar) == "") lvar= "LANG";
00339   string old= get_env (lvar);
00340   set_env (lvar, lan);
00341   string date= simplify_date (var_eval_system ("date +\"" * fm * "\""));
00342   if ((lan == "cz_CZ") || (lan == "hu_HU") || (lan == "pl_PL"))
00343     date= il2_to_cork (date);
00344   // if (lan == "ru_RU") date= iso_to_koi8 (date);
00345   set_env (lvar, old);
00346   return date;
00347 }
00348 #endif
00349 
00350 /******************************************************************************
00351 * Main interface
00352 ******************************************************************************/
00353 
00354 typedef const char* const_char_ptr;
00355 
00356 static language
00357 make_text_language (string s, string h) {
00358   return tm_new<text_language_rep> (s, h);
00359 }
00360 
00361 language
00362 text_language (string s) {
00363   if (language::instances -> contains (s)) return language (s);
00364   if (s == "american") return make_text_language (s, "us");
00365   if (s == "british") return make_text_language (s, "ukenglish");
00366   if (s == "bulgarian") return make_text_language (s, "bulgarian");
00367   if (s == "chinese") return tm_new<oriental_language_rep> (s);
00368   if (s == "czech") return make_text_language (s, "czech");
00369   if (s == "danish") return make_text_language (s, "danish");
00370   if (s == "dutch") return make_text_language (s, "dutch");
00371   if (s == "english") return make_text_language (s, "us");
00372   if (s == "finnish") return make_text_language (s, "finnish");
00373   if (s == "french") return make_text_language (s, "french");
00374   if (s == "german") return make_text_language (s, "german");
00375   if (s == "hungarian") return make_text_language (s, "hungarian");
00376   if (s == "italian") return make_text_language (s, "italian");
00377   if (s == "japanese") return tm_new<oriental_language_rep> (s);
00378   if (s == "korean") return tm_new<oriental_language_rep> (s);
00379   if (s == "polish") return make_text_language (s, "polish");
00380   if (s == "portuguese") return make_text_language (s, "portuguese");
00381   if (s == "romanian") return make_text_language (s, "romanian");
00382   if (s == "russian") return make_text_language (s, "russian");
00383   if (s == "slovene") return make_text_language (s, "slovene");
00384   if (s == "spanish") return make_text_language (s, "spanish");
00385   if (s == "swedish") return make_text_language (s, "swedish");
00386   if (s == "taiwanese") return tm_new<oriental_language_rep> (s);
00387   if (s == "ukrainian") return make_text_language (s, "ukrainian");
00388   if (s == "verbatim") return tm_new<verb_language_rep> ("verbatim");
00389   cerr << "\nThe language was " << s << "\n";
00390   FAILED ("unknown language");
00391   return tm_new<verb_language_rep> ("verbatim");
00392 }