Back to index

texmacs  1.0.7.15
converter.hpp
Go to the documentation of this file.
00001 
00002 /******************************************************************************
00003 * MODULE     : converter.hpp
00004 * DESCRIPTION: Applies dictionaries to strings in an efficient manner.
00005 * COPYRIGHT  : (C) 2002  Felix Breuer
00006 *******************************************************************************
00007 * This software falls under the GNU general public license version 3 or later.
00008 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
00009 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
00010 ******************************************************************************/
00011 
00012 #ifndef CONVERTER_H
00013 #define CONVERTER_H
00014 #include "resource.hpp"
00015 #include "hashtree.hpp"
00016 #include "file.hpp"
00017 
00018 enum escape_type { NOESCAPES, BIT2BIT, UTF8, ENTITY_NAME, CHAR_ENTITY };  // escaping modes; used below as the key_escape/val_escape arguments of hashtree_from_dictionary
00019 
00020 RESOURCE(converter);  // macro not defined in this header (presumably from resource.hpp); assumed to declare the converter handle type used below -- TODO confirm
00021 
00022 /******************************************************************************
00023 * The converter class applies a dictionary to a given string.
00024 * It does so by iterating over the string, finding the longest matching key
00025 * in the dictionary and replacing the matched substring with its translation.
00026 * The dictionary is held in a hashtree<char,string> (member ht).
00027 ******************************************************************************/
00028 
00029 class converter_rep: public rep<converter> {
00030   hashtree<char,string> ht;  // the dictionary tree; presumably filled by load() -- confirm in the implementation
00031   string output, nil_string, from, to;  // from/to are copies of the constructor arguments; output and nil_string start default-constructed
00032   bool copy_unmatched;  // initialised to true by the constructor; presumably makes unmatched input pass through unchanged -- confirm
00033   void match (string& str, int& index);  // defined elsewhere; both arguments are passed by non-const reference
00034   void load ();  // defined elsewhere; called once from the constructor
00035 
00036 public:
00037   inline converter_rep(string from2, string to2) : 
00038     rep<converter>(from2*"-"*to2), ht(), output(), 
00039     nil_string(), from(from2), to(to2), copy_unmatched(true) { load(); }  // passes from2*"-"*to2 to the rep<converter> base (presumably the resource name), then calls load()
00040   // declared inline, but its definition is not part of this header
00041   inline bool has_value(hashtree<char,string> node);
00042   // the converter class and the free functions declared below are granted access to the private members
00043   friend class converter;
00044   friend string flush (converter c);
00045   friend string apply (converter c, string str);
00046   friend void operator << (converter c, string str);
00047 };
00048  
00049 /******************************************************************************
00050 * functions that operate on converters
00051 ******************************************************************************/
00052   
00053 // Takes a string str and returns its translation. Strings already contained
00054 // in the converter are lost in this process. The converter is empty when this
00055 // function returns.
00056 string apply (converter c, string str);
00057   
00058 // Takes a string and writes it into the converter. Thus multiple strings can
00059 // be translated and concatenated into one big string.
00060 void operator << (converter c, string str);
00061   
00062 // Concatenates and returns the contents of the converter c. The converter is
00063 // empty when this function returns.
00064 string flush (converter c);  
00065 
00066 /**************************************************************************
00067 * convenience functions
00068 **************************************************************************/
00069 
00070 // check_encoding checks that the input string is valid according to the specified encoding.
00071 // convert converts the input string from one encoding (from) to the other (to).
00072 // Recognized encodings are "Cork", "UTF-8" and all those recognized by iconv.
00073 // NOTE(review): the remaining functions are undocumented here; their names suggest fixed-direction and iconv-backed variants of the two above -- confirm in the implementation.
00074 bool check_encoding (string input, string encoding);
00075 string convert (string input, string from, string to);
00076 string convert_to_cork (string input, string from); 
00077 string convert_from_cork (string input, string to); 
00078 string utf8_to_cork (string input); 
00079 string cork_to_utf8 (string input); 
00080 string utf8_to_html (string input);
00081 string t2a_to_utf8 (string input);
00082 bool check_using_iconv (string input, string encoding);
00083 string convert_using_iconv (string input, string from, string to); 
00084 
00085 /**************************************************************************
00086 * Functions for hashtree handling
00087 **************************************************************************/
00088 
00089 // find_node("test",tree) means tree('t')('e')('s')('t'), i.e. the node reached by following the characters of key one at a time.
00090 // Might modify the hashtree! (presumably by creating nodes that do not exist yet -- confirm)
00091 hashtree<char,string> find_node (string key, hashtree<char,string> ht);
00092 
00093 // Finds the node for key in ht and assigns it the given value.
00094 void put_prefix_code (string key, string value, hashtree<char,string> ht);
00095 
00096 // Reads a dictionary from the file file_name (presumably into dic -- confirm). Defaults: key_escape=BIT2BIT,
00097 // val_escape=UTF8, reverse=false (reverse presumably swaps the roles of keys and values -- confirm).
00098 void hashtree_from_dictionary (
00099   hashtree<char,string> dic, string file_name, escape_type key_escape=BIT2BIT,
00100   escape_type val_escape=UTF8, bool reverse=false);
00101 
00102 /***************************************************************************
00103 * Functions for UTF-8 handling
00104 * These functions are helper functions to convert escape strings a la "#23F7"
00105 * and HTML/XML character entities to and from UTF-8 byte sequences.
00106 ***************************************************************************/
00107 
00108 bool is_hex_digit (char c);
00109 int hex_digit_to_int (unsigned char c);
00110 string encode_as_utf8 (unsigned int code);
00111 unsigned int decode_from_utf8 (string s, int& i);  // i is passed by non-const reference; presumably advanced past the decoded sequence -- confirm
00112 string convert_escapes (string in, bool utf8);
00113 string convert_char_entities (string s);
00114 string convert_char_entity (string s, int& start, bool& success);  // start and success are passed by non-const reference, so the callee can update them
00115 string utf8_to_hex_entities (string s);
00116 
00117 #endif // CONVERTER_H