Back to index

texmacs  1.0.7.15
converter.cpp
Go to the documentation of this file.
00001 
00002 /******************************************************************************
00003 * MODULE     : converter.cpp
00004 * DESCRIPTION: Applies dictionaries to strings in an efficient manner.
00005 * COPYRIGHT  : (C) 2002  Felix Breuer
00006 *******************************************************************************
00007 * This software falls under the GNU general public license version 3 or later.
00008 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
00009 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
00010 ******************************************************************************/
00011 
00012 #include "converter.hpp"
00013 #include "convert.hpp"
00014 #ifdef USE_ICONV
00015 #include <iconv.h>
00016 #endif
00017 #include <errno.h>
00018 
00019 RESOURCE_CODE (converter);
00020 
00021 /******************************************************************************
00022 * converter methods
00023 ******************************************************************************/
00024 
00025 void
00026 operator << (converter c, string str) {
00027   int index = 0;
00028   while (index < N(str))
00029     c->match(str, index);
00030 }
00031 
00032 string
00033 apply (converter c, string str) {
00034   c->output = string();
00035   c << str;
00036   return flush(c);
00037 }
00038 
00039 string
00040 flush (converter c) {
00041   string result = c->output;
00042   c->output = string();
00043   return result;
00044 }
00045 
00046 /******************************************************************************
00047 * method for loading converters
00048 ******************************************************************************/
00049  
00050 converter
00051 load_converter (string from, string to) {
00052   string name= from * "-" * to;
00053   if (converter::instances -> contains (name))
00054     return converter (name);
00055   converter conv = tm_new<converter_rep> (from, to);
00056   return conv;
00057 }
00058 
00059 /******************************************************************************
00060 * converter_rep methods
00061 ******************************************************************************/
00062 
00063 inline bool
00064 converter_rep::has_value(hashtree<char,string> node) {
00065   return node->label != nil_string;
00066 }
00067 
00068 inline void
00069 converter_rep::match (string& str, int& index) {
00070   int forward = index;
00071   int last_match = -1;
00072   string value("");
00073   bool done = false;
00074   hashtree<char,string> node = ht;
00075   //cout << "[";
00076   while (!done && forward < N(str)) {
00077     if (node->contains (str[forward])) {
00078       node = node(str[forward]);
00079       //printf("->%x",str[forward]);
00080       if (has_value(node)) {
00081         last_match = forward;
00082         value = node->label;
00083       }
00084       forward++;
00085     }
00086     else done = true;
00087   }
00088   if (last_match==-1) {
00089     if (copy_unmatched)
00090       output << string(str[index]);
00091     index++;
00092   }
00093   else {
00094     //printf(":");for(int i = 0; i < N(value);i++) printf("%x ",value[i]);
00095     output << value;
00096     index = last_match + 1;
00097   }
00098   //cout << "]";
00099 }
00100 
00101 void
00102 converter_rep::load () {
00103   // to handle each case individually seems unelegant, but there is simply more
00104   // to be done here than just loading a file.
00105   // cout << "TeXmacs] load converter " << from << " -> " << to << "\n";
00106   if ( from=="Cork" && to=="UTF-8" ) {
00107     hashtree<char,string> dic;
00108     hashtree_from_dictionary (dic,"corktounicode", BIT2BIT, UTF8, false);
00109     hashtree_from_dictionary (dic,"cork-unicode-oneway", BIT2BIT, UTF8, false);
00110     hashtree_from_dictionary (dic,"tmuniversaltounicode", BIT2BIT, UTF8, false);
00111     hashtree_from_dictionary (dic,"symbol-unicode-oneway", BIT2BIT, UTF8, false);
00112     hashtree_from_dictionary (dic,"symbol-unicode-math", BIT2BIT, UTF8, false);
00113     ht = dic;
00114   }
00115   else if ( from=="UTF-8" && to=="Cork") {
00116     hashtree<char,string> dic;
00117     hashtree_from_dictionary (dic,"corktounicode", UTF8, BIT2BIT, true);
00118     hashtree_from_dictionary (dic,"unicode-cork-oneway", UTF8, BIT2BIT, false);
00119     hashtree_from_dictionary (dic,"tmuniversaltounicode", UTF8, BIT2BIT, true);
00120     hashtree_from_dictionary (dic,"unicode-symbol-oneway", UTF8, BIT2BIT, true);
00121     ht = dic;
00122   }
00123   else if ( from=="UTF-8" && to=="HTML") {
00124     hashtree<char,string> dic;
00125     hashtree_from_dictionary (dic, "HTMLlat1"   , CHAR_ENTITY, ENTITY_NAME, true);
00126     hashtree_from_dictionary (dic, "HTMLspecial", CHAR_ENTITY, ENTITY_NAME, true);
00127     hashtree_from_dictionary (dic, "HTMLsymbol" , CHAR_ENTITY, ENTITY_NAME, true);
00128     ht = dic;
00129   } else if ( from=="T2A" && to=="UTF-8" ) {
00130     hashtree<char,string> dic;
00131     hashtree_from_dictionary (dic,"corktounicode", BIT2BIT, UTF8, false);
00132     hashtree_from_dictionary (dic,"cork-unicode-oneway", BIT2BIT, UTF8, false);
00133     hashtree_from_dictionary (dic,"tmuniversaltounicode", BIT2BIT, UTF8, false);
00134     hashtree_from_dictionary (dic,"symbol-unicode-oneway", BIT2BIT, UTF8, false);
00135     hashtree_from_dictionary (dic,"symbol-unicode-math", BIT2BIT, UTF8, false);
00136     hashtree_from_dictionary (dic,"t2atounicode", BIT2BIT, UTF8, false);
00137     ht = dic;
00138   }  
00139 }
00140 
00141 /******************************************************************************
00142 * convenience functions
00143 ******************************************************************************/
00144 
00145 bool
00146 check_encoding (string input, string encoding) {
00147   if (encoding == "Cork") return true;
00148   else return check_using_iconv (input, encoding);
00149 }
00150 
00151 string 
00152 convert (string input, string from, string to) {
00153   if (from == "Cork")
00154     return convert_from_cork (input, to);
00155   else if (to == "Cork")
00156     return convert_to_cork (input,from);
00157   else
00158     return convert_using_iconv (input, from, to);
00159 }
00160 
00161 string 
00162 convert_to_cork (string input, string from) {
00163   string str;
00164   if (from != "UTF-8")
00165     str = convert_using_iconv (input, from, "UTF-8");
00166   return utf8_to_cork (str);
00167 }
00168 
00169 string 
00170 convert_from_cork (string input, string to) {
00171   string str = cork_to_utf8 (input);
00172   if (to != "UTF-8")
00173     str = convert_using_iconv (str, "UTF-8", to);
00174   return str;
00175 }
00176 
00177 string
00178 utf8_to_cork (string input) {
00179   converter conv= load_converter ("UTF-8", "Cork");
00180   int start, i, n= N(input);
00181   string output;
00182   for (i=0; i<n; ) {
00183     start= i;
00184     unsigned int code= decode_from_utf8 (input, i);
00185     string s= input (start, i);
00186     string r= apply (conv, s);
00187     if (r == s && code >= 256)
00188       r= "<#" * as_hexadecimal (code) * ">";
00189     output << r;
00190   }
00191   return output;
00192 }
00193 
00194 string
00195 cork_to_utf8 (string input) {
00196   converter conv= load_converter ("Cork", "UTF-8");
00197   int start= 0, i, n= N(input);
00198   string r;
00199   for (i=0; i<n; i++)
00200     if (input[i] == '<' && i+1<n && input[i+1] == '#') {
00201       r << apply (conv, input (start, i));
00202       start= i= i+2;
00203       while (i<n && input[i] != '>') i++;
00204       r << encode_as_utf8 (from_hexadecimal (input (start, i)));
00205       start= i+1;
00206     }
00207   r << apply (conv, input (start, n));
00208   return r;
00209 }
00210 
00211 string
00212 t2a_to_utf8 (string input) {
00213   converter conv= load_converter ("T2A", "UTF-8");
00214   int start= 0, i, n= N(input);
00215   string r;
00216   for (i=0; i<n; i++)
00217     if (input[i] == '<' && i+1<n && input[i+1] == '#') {
00218       r << apply (conv, input (start, i));
00219       start= i= i+2;
00220       while (i<n && input[i] != '>') i++;
00221       r << encode_as_utf8 (from_hexadecimal (input (start, i)));
00222       start= i+1;
00223     }
00224   r << apply (conv, input (start, n));
00225   return r;
00226 }
00227 
00228 string
00229 utf8_to_html (string input) {
00230   converter conv = load_converter ("UTF-8", "HTML");
00231   string s = apply (conv, input);
00232   return utf8_to_hex_entities(s);
00233 }
00234 
00235 #ifdef USE_ICONV
00236 
// Scope guard for arrays: an auto_array<T> takes a T* and passes it to
// tm_delete_array when the enclosing block is left, so no explicit cleanup
// is needed on any exit path. The object converts implicitly to T*.
// NOTE: no copy protection is implemented; a copy would release the same
// array a second time, so instances should stay local and uncopied.
template<class T> class auto_array {
  T* ptr;
public:
  auto_array (T* x) : ptr (x) {}
  ~auto_array () { tm_delete_array (ptr); }
  operator T* () const { return ptr; }
};
00246 
00247 class iconv_converter {
00248   string from;
00249   string to;
00250   iconv_t cd;
00251   bool show_errors;
00252   bool successful;
00253 public:
00254   iconv_converter (string from, string to, bool errors=true);
00255   ~iconv_converter ();
00256   inline bool is_valid () { return cd != (iconv_t)-1; }
00257   inline bool is_successful () { return successful; }
00258   friend string apply (iconv_converter &conv, string input);
00259 };
00260 
00261 iconv_converter::iconv_converter (string from2, string to2, bool errors):
00262   from (from2), to (to2), show_errors (errors), successful (false)
00263 {
00264   auto_array<char> from_cp = as_charp (from);
00265   auto_array<char> to_cp = as_charp (to);
00266   cd = iconv_open (to_cp, from_cp);
00267   if (!is_valid() && show_errors)
00268     system_error ("Initialization of iconv from " * from *
00269                 " to " * to * " failed!");
00270   successful= true;
00271 }
00272 
00273 iconv_converter::~iconv_converter () {
00274   iconv_close(cd);
00275 }
00276 
// Adapted from the standard C++ library (TeXmacs itself does _not_ use std).
// Calls iconv_func with the input buffer pointer cast to whatever type T the
// function declares for its second parameter; T is deduced from the function
// pointer. NOTE(review): presumably this copes with platforms that declare
// iconv's input buffer as `const char**` instead of `char**` -- confirm.
template<typename T>
inline size_t
iconv_adaptor(size_t(*iconv_func)(iconv_t, T, size_t *, char**, size_t*),
              iconv_t cd, char **inbuf, size_t *inbytesleft,
              char **outbuf, size_t *outbytesleft) {
  T typed_inbuf= static_cast<T> (static_cast<void*> (inbuf));
  return iconv_func (cd, typed_inbuf, inbytesleft, outbuf, outbytesleft);
}
00286 
00287 string apply (iconv_converter &conv, string input) {
00288   if (! conv.is_valid()) {
00289     conv.successful= false;
00290     return "";
00291   }
00292   string result;
00293   auto_array<char> in_cp= as_charp(input);
00294   char* in_cursor= in_cp;
00295   size_t in_left= N(input);
00296   double expansion= 1.1;
00297   size_t out_counter= 0;
00298   while (in_left > 0) {
00299     size_t out_left= max(int(in_left * expansion), 1024);
00300     auto_array<char> out_cp= tm_new_array<char> (out_left);
00301     char* out_cursor= out_cp;
00302     size_t r = iconv_adaptor(iconv, conv.cd,
00303                           &in_cursor, &in_left, &out_cursor, &out_left);
00304     if(r == (size_t)-1 && errno != E2BIG) {
00305       if (conv.show_errors) {
00306        cerr << "\nConverting from " << conv.from << " to " << conv.to << "\n";
00307        system_error ("String conversion using iconv failed!");
00308       }
00309       conv.successful= false;
00310       return "";
00311     }
00312     size_t used_out= out_cursor - out_cp;
00313     result << string(out_cp, used_out);
00314     out_counter += used_out;
00315     expansion= max((double) out_counter / (in_cursor - in_cp), 1.0) + 0.1;
00316   }
00317   conv.successful= true;
00318   return result;
00319 }
00320 
00321 #endif // defined USE_ICONV
00322 
00323 bool check_using_iconv (string input, string encoding) {
00324 #ifdef USE_ICONV
00325   iconv_converter conv (encoding, encoding, false);
00326   apply (conv, input);
00327   return conv.is_successful();
00328 #else
00329   (void) input;
00330   (void) encoding;
00331   FAILED ("iconv not enabled");
00332   return false;
00333 #endif
00334 }
00335 
00336 string
00337 convert_using_iconv (string input, string from, string to) {
00338 #ifdef USE_ICONV
00339   iconv_converter conv (from, to, true);
00340   return apply (conv, input);
00341 #else
00342   (void) input;
00343   (void) from;
00344   (void) to;
00345   FAILED ("iconv not enabled");
00346   return "";
00347 #endif
00348 }
00349 
00350 /******************************************************************************
00351 * Functions for hashtree handling
00352 ******************************************************************************/
00353 
00354 void
00355 put_prefix_code (string key, string value, hashtree<char,string> tree) {
00356   if (DEBUG_STD) {
00357     hashtree<char,string> ht= find_node (key,tree);
00358     if (ht->label != "")
00359       cout << "overwriting: " << ht->label << " with " << value << '\n';
00360   }
00361   find_node (key,tree)->set_label(value);
00362 }
00363 
00364 hashtree<char,string>
00365 find_node (string key, hashtree<char,string> ht) {
00366   int i;
00367   for(i = 0; i < N(key); i++)
00368     ht = ht(key[i]);
00369   return ht;
00370 }
00371 
00372 void
00373 hashtree_from_dictionary (
00374   hashtree<char,string> dic, string file_name, escape_type key_escape,
00375   escape_type val_escape, bool reverse)
00376 {
00377   system_info ("Loading",file_name);
00378   string key_string, val_string, file;
00379   file_name = file_name * ".scm";
00380   if (load_string (url ("$TEXMACS_PATH/langs/encoding", file_name), file, false)) {
00381     system_error ("Couldn't open encoding dictionary", file_name);
00382     return;
00383   }
00384   tree t = block_to_scheme_tree (file);
00385   if (!is_tuple (t)) {
00386     system_error ("Malformed encoding dictionary", file_name);
00387     return;
00388   }
00389   for (int i=0; i<N(t); i++) {
00390     if (is_func (t[i], TUPLE, 2) &&
00391         is_atomic (t[i][0]) && is_atomic (t[i][1]))
00392       {
00393         //cout << N(pairs[i]) << "\n" << as_string(pairs[i]) << "\n";
00394         reverse ? key_string = t[i][1]->label : key_string = t[i][0]->label;
00395         reverse ? val_string = t[i][0]->label : val_string = t[i][1]->label;
00396         if (is_quoted (key_string)) key_string = scm_unquote (key_string);
00397         if (is_quoted (val_string)) val_string = scm_unquote (val_string);
00398         //cout << "key: " << key_string << " val: " << val_string << "\n";
00399         if (key_escape == BIT2BIT)
00400           key_string = convert_escapes (key_string, false);
00401         else if (key_escape == UTF8)
00402           key_string = convert_escapes (key_string, true);
00403        else if (key_escape == CHAR_ENTITY)
00404          key_string = convert_char_entities (key_string);
00405         if (val_escape == BIT2BIT)
00406           val_string = convert_escapes (val_string, false);
00407         else if (val_escape == UTF8)
00408           val_string = convert_escapes (val_string, true);
00409        else if (val_escape == ENTITY_NAME)
00410          val_string = "&" * val_string * ";";
00411         //cout << "key: " << key_string << " val: " << val_string << "\n";
00412         put_prefix_code(key_string,val_string,dic);        
00413       }
00414   }
00415 }
00416 
/***************************************************************************
* Functions for UTF-8 handling
* These are helper functions that convert escape strings such as "#23F7"
* and HTML/XML character entities to and from UTF-8 byte sequences.
***************************************************************************/
00422 
// Tells whether c is an ASCII hexadecimal digit: 0-9, A-F or a-f.
bool is_hex_digit (char c) {
  if (c >= '0' && c <= '9') return true;
  if (c >= 'A' && c <= 'F') return true;
  return c >= 'a' && c <= 'f';
}
00429 
// Returns the numeric value (0..15) of an ASCII hexadecimal digit, accepting
// both upper and lower case letters. Any other character yields 0.
int hex_digit_to_int(unsigned char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  return 0;
}
00440 
00441 string
00442 convert_escapes (string in, bool utf8) {
00443   // cout << "converting " << in ;
00444   string result;
00445   int i = 0;
00446   while (i < N(in)) {
00447     if (in[i]!='#') result << in[i++];
00448     else {
00449       i++;
00450       unsigned int num = 0;
00451       while (i < N(in) && is_hex_digit(in[i]))
00452         num = 0x10 * num + hex_digit_to_int((unsigned char) in[i++]);
00453       //cout << " to num "; printf("%x",num); cout << " then to ";
00454       if (utf8) result << encode_as_utf8 (num);
00455       else result << string((char)num);
00456     }
00457   }
00458   //for(int i = 0; i < N(result);i++)
00459   //  printf("%x ", (unsigned char)result[i]); printf("\n");
00460   return result;
00461 }
00462 
00463 string
00464 convert_char_entities (string s) {
00465   int i, n=N(s);
00466   string r;
00467   for (i=0; i<n; /* noop */) {
00468     if (s[i] == '&' && i+1<n && s[i+1] == '#') {
00469       i += 2;
00470       bool okay= false;
00471       string rr= convert_char_entity(s, i, okay);
00472       if (okay) r << rr;
00473       else { r << "&#"; continue; }
00474     }
00475     else r << s[i++];
00476   }
00477   return r;
00478 }
00479 
00480 static unsigned int
00481 as_unsigned_int (string s) {
00482   int i=0, n=N(s);
00483   unsigned int val=0;
00484   if (n==0) return 0;
00485   while (i<n) {
00486     if (s[i]<'0') break;
00487     if (s[i]>'9') break;
00488     val *= 10;
00489     val += (int) (s[i]-'0');
00490     i++;
00491   }
00492   return val;
00493 }
00494 
00495 string
00496 convert_char_entity (string s, int& start, bool& success) {
00497   // start: position in s after the character entity marker "&#".
00498   success = false;
00499   int i= start;
00500   int n= N(s);
00501   unsigned int num= 0;
00502   if (i >= n) return "";
00503   else if (s[i] == 'x' || s[i] == 'X') {
00504     i++;
00505     // int j=i;
00506     while (i<n && is_hex_digit (s[i])) {
00507       success = true;
00508       num = 0x10 * num + hex_digit_to_int(s[i]);
00509       i++;
00510     }
00511     // if (success) cout << "hex-ent: " << s(j,i) ;
00512   }
00513   else {
00514     int j=i;
00515     while (i<n && is_digit (s[i])) {
00516       success = true;
00517       i++;
00518     }
00519     // if (success) cout << "dec-ent: " << s(j,i) ;
00520     num = as_unsigned_int (s(j,i));
00521   }
00522   if (success) {
00523     if (i<n && s[i]==';') i++;
00524     start= i;
00525     // cout << " --> (" << num << ") " << encode_as_utf8 (num) << '\n' ;
00526     return encode_as_utf8(num);
00527   }
00528   else return "";
00529 }
00530 
00531 string
00532 encode_as_utf8 (unsigned int code) {
00533   if (/* 0x0 <= code && */ code <= 0x7F) {
00534     // 0x0ddddddd
00535     return string((char) code);
00536   }
00537   else if (0x80 <= code  && code <= 0x7FF) {
00538     // 0x110ddddd 0x10dddddd
00539     string str(2);
00540     str[0] = ((code >> 6) & 0x1F) | 0xC0;
00541     str[1] = (code & 0x3F) | 0x80;
00542     return str;
00543   } 
00544   else if (0x800 <= code && code <= 0xFFFF) {
00545     // 0x1110dddd 0x10dddddd 0x10dddddd
00546     string str(3);
00547     str[0] = ((code >> 12) & 0x0F) | 0xE0;
00548     str[1] = ((code >> 6) & 0x3F) | 0x80;
00549     str[2] = (code & 0x3F) | 0x80;
00550     return str;
00551   }
00552   else if (0x10000 <= code && code <= 0x1FFFFF) {
00553     // 0x11110uuu 0x10zzzzzz 0x10yyyyyy 0x10xxxxxx
00554     string str(4);
00555     str[0] = ((code >> 18) & 0x07) | 0xF0;
00556     str[1] = ((code >> 12) & 0x3F) | 0x80;
00557     str[2] = ((code >> 6) & 0x3F) | 0x80;
00558     str[3] = (code & 0x3F) | 0x80;
00559     return str;
00560   }
00561   else return "";
00562 }
00563 
00564 unsigned int
00565 decode_from_utf8 (string s, int& i) {
00566   unsigned char c = s[i];
00567   if ((0x80 & c) == 0) {
00568     // 0x0ddddddd
00569     i++;
00570     return (unsigned int) c;
00571   }
00572   unsigned int code;
00573   int trail;
00574   if ((0xE0 & c) == 0xC0) {
00575     // 0x110ddddd 0x10dddddd
00576     trail = 1;
00577     code = c & 0x1F;
00578   }
00579   else if ((0xF0 & c) == 0xE0) {
00580     // 0x1110dddd 0x10dddddd 0x10dddddd
00581     trail = 2;
00582     code = c & 0x0F;
00583   }
00584   else if ((0xF8 & c) == 0xF0) {
00585     // 0x11110dddd 0x10dddddd 0x10dddddd 0x10dddddd
00586     trail = 3;
00587     code = c & 0x07;
00588   }
00589   else {
00590     // failsafe
00591     //cout << "failsafe: " << c << " (" << (unsigned int)(c) << ")\n";
00592     i++;
00593     return (unsigned int) c;
00594   }
00595   for (; trail > 0; trail--) {
00596     i++;
00597     if (i >= N(s)) i= N(s)-1;
00598     c = s[i];
00599     code = (code << 6) | (c & 0x3F);
00600   }
00601   i++;
00602   return code;
00603 }
00604 
00605 string
00606 utf8_to_hex_entities (string s) {
00607   string result;
00608   int i, n= N(s);
00609   for (i=0; i<n; ) {
00610     unsigned char c = s[i];
00611     if ((0x80 & c) == 0 || ((0xF8 & c) == 0xF8)) {
00612       result << c;
00613       i++;
00614     }
00615     else {
00616       unsigned int code= decode_from_utf8 (s, i);
00617       string hex= as_hexadecimal (code);
00618       while (N(hex) < 4) hex = "0" * hex;
00619       //cout << "entity: " << hex << " (" << code << ")\n";
00620       result << "&#x" << hex << ";";
00621     }
00622   }
00623   return result;
00624 
00625   /*
00626   string result;
00627   const int n = N(s);
00628   int i;
00629   for (i=0; i<n; i++) {
00630     unsigned char c = s[i];
00631     if ((0x80 & c) == 0) {
00632       // 0x0ddddddd
00633       //cout << "ASCII: " << c << '\n';
00634       result << c;
00635       continue;
00636     }
00637     unsigned int code;
00638     int trail;
00639     if ((0xE0 & c) == 0xC0) {
00640       // 0x110ddddd 0x10dddddd
00641       trail = 1;
00642       code = c & 0x1F;
00643     }
00644     else if ((0xF0 & c) == 0xE0) {
00645       // 0x1110dddd 0x10dddddd 0x10dddddd
00646       trail = 2;
00647       code = c & 0x0F;
00648     }
00649     else if ((0xF8 & c) == 0xF0) {
00650       // 0x11110dddd 0x10dddddd 0x10dddddd 0x10dddddd
00651       trail = 3;
00652       code = c & 0x07;
00653     }
00654     else {
00655       // failsafe
00656       //cout << "failsafe: " << c << " (" << (unsigned int)(c) << ")\n";
00657       result << c;
00658       continue;
00659     }
00660     for (; trail > 0; trail--) {
00661       // Garbage in, garbage out. Do not resync when input is bad.
00662       i++;
00663       c = s[i];
00664       code = (code << 6) | (c & 0x3F);
00665     }
00666     string hex= as_hexadecimal (code);
00667     while (N(hex) < 4) hex = "0" * hex;
00668     //cout << "entity: " << hex << " (" << code << ")\n";
00669     result << "&#x" << hex << ";";
00670   }
00671   return result;
00672   */
00673 }