00001
00002 #ifndef MiX_EncodingResolver_h_
00003 #define MiX_EncodingResolver_h_
00004
00005 #include "classes.h"
00006 #include "XMLString.h"
00007 #include "Tokenizer.h"
00008 #include "XMLToken.h"
00009 namespace MiX {
/**
 * Character-traits policy usable with any character-like type.
 *
 * Exposes the same member names as std::char_traits (assign, eq, lt,
 * compare, length, find, move, copy, eof, ...) but is implemented purely in
 * terms of charT's own ==, < and assignment, so it can be instantiated for
 * unit types that have no standard specialisation.
 *
 * Note: int_type is plain int, so to_int_type() narrows when charT is wider
 * than int.
 */
template <class charT>
class generic_char_traits {
public:
  typedef charT        char_type;
  typedef const charT& const_char_ref;
  typedef const charT* const_char_ptr;
  typedef charT*       char_ptr;

  typedef int    int_type;
  typedef size_t size_type;

  typedef size_t pos_type;
  typedef size_t off_type;
  typedef char   state_type;

  /// Store @p src into @p dst.
  static void assign(char_type& dst, const char_type& src) { dst = src; }

  /// True when both characters compare equal with ==.
  static bool eq(const_char_ref lhs, const_char_ref rhs) { return lhs == rhs; }

  /// True when @p lhs orders before @p rhs with <.
  static bool lt(const_char_ref lhs, const_char_ref rhs) { return lhs < rhs; }

  /// Compare the first @p n characters; returns -1, 0 or 1.
  static int compare(const_char_ptr s1, const_char_ptr s2, size_type n) {
    for (size_type idx = 0; idx != n; ++idx) {
      if (eq(s1[idx], s2[idx])) continue;
      // First mismatch decides the result.
      if (lt(s1[idx], s2[idx])) return -1;
      return 1;
    }
    return 0;
  }

  /// Number of characters before the first value-initialised charT.
  static size_t length(const_char_ptr s) {
    const char_type terminator = char_type();
    size_type count = 0;
    while (!eq(s[count], terminator)) {
      ++count;
    }
    return count;
  }

  /// Pointer to the first of @p n characters equal to @p c, or 0 if none.
  static const_char_ptr find(const_char_ptr s, size_type n, const_char_ref c) {
    while (n > 0) {
      if (eq(*s, c)) return s;
      ++s;
      --n;
    }
    return 0;
  }

  /// Copy @p n characters between possibly overlapping ranges; returns @p s1.
  static char_ptr move(char_ptr s1, const_char_ptr s2, size_type n) {
    if (n == 0) return s1;
    return static_cast<char_ptr>(memmove(s1, s2, n * sizeof(char_type)));
  }

  /// Copy @p n characters between non-overlapping ranges; returns @p s1.
  static char_ptr copy(char_ptr s1, const_char_ptr s2, size_type n) {
    if (n == 0) return s1;
    return static_cast<char_ptr>(memcpy(s1, s2, n * sizeof(char_type)));
  }

  /// Fill @p n characters starting at @p s with @p c; returns @p s.
  static char_ptr assign(char_ptr s, size_type n, char_type c) {
    char_ptr out = s;
    for (size_type remaining = n; remaining > 0; --remaining, ++out) {
      *out = c;
    }
    return s;
  }

  /// Returns @p c unless it equals eof(), in which case 0 is returned.
  static int_type not_eof(const int_type& c) {
    if (eq_int_type(c, eof())) return static_cast<int_type>(0);
    return c;
  }

  /// Convert an int_type value to char_type.
  static char_type to_char_type(const int_type& c) {
    return static_cast<char_type>(c);
  }

  /// Convert a char_type value to int_type.
  static int_type to_int_type(const char_type& c) {
    return static_cast<int_type>(c);
  }

  /// True when both int_type values are equal.
  static bool eq_int_type(const int_type& lhs, const int_type& rhs) {
    return lhs == rhs;
  }

  /// End-of-file marker: -1.
  static int_type eof() {
    return static_cast<int_type>(-1);
  }
};
00073
/**
 * Result of sniffing a raw byte buffer: how the bytes group into code units.
 *
 * Plain aggregate; member order is part of its interface.
 */
struct CharsetInfo {
  int  width;  ///< Bytes per code unit (1, 2 or 4).
  bool le;     ///< Byte-order flag. As consumed by createBuffer(), true means
               ///< the first byte of a unit is the most significant one
               ///< (despite what the name suggests).
  int  len;    ///< Number of code units in the buffer.
};
00079
00081 inline CharsetInfo expectCharset( const char* data, int len ) {
00082 int zero_count = 0;
00083 const char* p = data;
00084 CharsetInfo ret;
00085 for( int i=0; i<len ; i++,p++ ) if( *p==0 ) zero_count++;
00086 if( zero_count==0 ) {
00087 ret.width = 1;
00088 ret.len = len;
00089 } else if( zero_count<len/2+1 ) {
00090
00091 ret.width = 2;
00092 ret.len = len/2;
00093
00094
00095 for( int i=0 ; i<len-1 ; i+=2, p+=2 ) {
00096 if( data[0]==0 ) {
00097 ret.le = true;
00098 break;
00099 }
00100 if( data[1]==0 ) {
00101 ret.le = false;
00102 break;
00103 }
00104 }
00105 } else {
00106
00107 ret.width = 4;
00108 ret.len = len/4;
00109 for( int i=0 ; i<len-1 ; i+=2, p+=2 ) {
00110 if( data[0]==0 ) {
00111 ret.le = true;
00112 break;
00113 }
00114 if( data[3]==0 ) {
00115 ret.le = false;
00116 break;
00117 }
00118 }
00119 }
00120 return ret;
00121 }
00122
00123 template <class T>
00124 inline T* createBuffer( CharsetInfo info, const char* data ) {
00125 T* ret = new T[info.len+1];
00126 T* cur = ret;
00127 for( int i=0 ; i<info.len ; data+=info.width, ++i ) {
00128 *cur = 0;
00129 if( info.le ) {
00130 for( int j=0 ; j<info.width ; ++j ) {
00131 *cur *= 256;
00132 *cur += static_cast<T>(data[j]);
00133 }
00134 } else {
00135 for( int j=info.width-1 ; j>=0 ; --j ) {
00136 *cur *= 256;
00137 *cur += static_cast<T>(data[j]);
00138 }
00139 }
00140 cur++;
00141 }
00142 *cur = T();
00143 return ret;
00144 }
00145
00146
00147
00148 inline std::string getEncoding( const char* data, int len ) {
00149 typedef unsigned long bigchar_t;
00150 typedef XMLString<bigchar_t, generic_char_traits<bigchar_t> > bigstr_t;
00151 bigchar_t str_enc[] = {
00152 (bigchar_t)'e', (bigchar_t)'n', (bigchar_t)'c', (bigchar_t)'o',
00153 (bigchar_t)'d', (bigchar_t)'i', (bigchar_t)'n', (bigchar_t)'g',
00154 (bigchar_t) 0
00155 };
00156 CharsetInfo info = expectCharset( data, len );
00157 MiX::Tokenizer<bigchar_t, generic_char_traits<bigchar_t> > tokenizer;
00158 unsigned long *buf = createBuffer<unsigned long>( info, data );
00159 bigstr_t bigret;
00160 tokenizer.injectString( buf );
00161 XMLToken<bigchar_t, generic_char_traits<bigchar_t> > tok;
00162 do {
00163 tokenizer.ejectToken( tok );
00164 if( tok.getType()==Token_text ) {
00165 bigstr_t str = tok.getData();
00166 if( str==str_enc ) {
00167 tokenizer.ejectToken( tok );
00168 if( tok.getType()==Token_eq ) {
00169 tokenizer.ejectToken( tok );
00170 bigret.clear();
00171 if( tok.getType()==Token_quote || tok.getType()==Token_dblquote ) {
00172 tokenizer.ejectToken( tok );
00173 while( tok.getType()!=Token_dblquote &&
00174 tok.getType()!=Token_quote ) {
00175 bigret += tok.getData();
00176 tokenizer.ejectToken( tok );
00177 }
00178 break;
00179 } else tokenizer.pushToken( tok );
00180 } else tokenizer.pushToken( tok );
00181 }
00182 }
00183 } while( tok.getType()!=Token_null );
00184 std::string ret;
00185 bigstr_t::iterator it = bigret.begin();
00186 bigstr_t::iterator last = bigret.end();
00187 for( ; it!=last ; ++it ) {
00188 ret += (char)(*it);
00189 }
00190 delete[] buf;
00191 return ret;
00192 }
00193 }
00194
00195 #endif