Index: kdelibs/khtml/misc/decoder.cpp =================================================================== RCS file: /home/kde/kdelibs/khtml/misc/decoder.cpp,v retrieving revision 1.50 retrieving revision 1.57 diff -u -r1.50 -r1.57 --- kdelibs/khtml/misc/decoder.cpp 29 Jul 2001 16:26:38 -0000 1.50 +++ kdelibs/khtml/misc/decoder.cpp 14 May 2002 00:37:15 -0000 1.57 @@ -21,15 +21,18 @@ //---------------------------------------------------------------------------- // // KDE HTML Widget -- decoder for input stream -// $Id: decoder.cpp,v 1.50 2001/07/29 16:26:38 mueller Exp $ +// $Id: decoder.cpp,v 1.57 2002/05/14 00:37:15 mueller Exp $ #undef DECODE_DEBUG //#define DECODE_DEBUG +#include + #include "decoder.h" using namespace khtml; #include "htmlhashes.h" + #include #include @@ -38,6 +41,230 @@ #include #include +#include + +class KanjiCode /* byte-level guesser for the Japanese encoding of a raw string, see judge() */ +{ +public: + enum Type {ASCII, JIS, EUC, SJIS, UNICODE, UTF8 }; /* judge() only ever yields ASCII, JIS, EUC or SJIS */ + static enum Type judge(const char *str); + static const int ESC; + static const int _SS2_; /* 0x8e; judge() does not reference it and spells 0x8e literally */ + static const unsigned char kanji_map_sjis[]; + static int ISkanji(int code) /* nonzero when code is below 0x100 and its table entry has bit 1 set (0x81-0x9f, 0xe0-0xfc) */ + { + if (code >= 0x100) + return 0; + return (kanji_map_sjis[code & 0xff] & 1); + } + + static int ISkana(int code) /* nonzero when code is below 0x100 and its table entry has bit 2 set (0xa1-0xdf) */ + { + if (code >= 0x100) + return 0; + return (kanji_map_sjis[code & 0xff] & 2); + } + +}; + +const int KanjiCode::ESC = 0x1b; +const int KanjiCode::_SS2_ = 0x8e; + +const unsigned char KanjiCode::kanji_map_sjis[] = /* 256 entries indexed by byte value: 1 for 0x81-0x9f and 0xe0-0xfc, 2 for 0xa1-0xdf, 0 elsewhere */ +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 +}; + +/* + * EUC-JP is + * [0xa1 - 0xfe][0xa1 - 0xfe] + * 0x8e[0xa1 - 0xfe](SS2) + * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3) + * + * Shift_Jis is + * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc] + * + * Shift_Jis Hankaku Kana is + * [0xa1 - 0xdf] + */ + +/* + * KanjiCode::judge() is based on judge_jcode() from jvim + * http://hp.vector.co.jp/authors/VA003457/vim/ + * It guesses the encoding of the NUL-terminated byte string str (the length comes from strlen()) and returns ASCII, JIS, EUC or SJIS, never UNICODE or UTF8. ESC $ B, ESC ( B, ESC $ @ and ESC ( J return JIS at once; ESC ( I and ESC ) I set JIS and keep scanning. Byte pairs that the checks below accept for only one of Shift_JIS or EUC-JP return that encoding at once. Otherwise kana and kudokuten hits are tallied in sjis and euc, and if code is still ASCII after the scan the larger tally wins (ASCII stays on a tie). NOTE(review): strlen() requires a terminating NUL - confirm every caller passes a terminated buffer. + * Special Thanks to Kenichi Tsuchida + */ + +/* + * Maybe we should use QTextCodec::heuristicContentMatch(), + * but it fails detection, so it is not useful. + */ + +enum KanjiCode::Type KanjiCode::judge(const char *str) +{ + enum Type code; + int i; + int bfr = FALSE; /* Kana Moji: set by the final else of the loop when the byte at i decided nothing by itself, so the next byte is judged together with it */ + int bfk = 0; /* EUC Kana: number of 0x8e + [0xa1-0xdf] pairs seen since bfk was last reset */ + int sjis = 0; /* running score in favour of SJIS */ + int euc = 0; /* running score in favour of EUC */ + + const unsigned char *ptr = (const unsigned char *) str; + int size = strlen(str); + + code = ASCII; + + i = 0; + while (i < size) { + if (ptr[i] == ESC && (size - i >= 3)) { + if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B') + || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) { + code = JIS; + goto breakBreak; + } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@') + || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) { + code = JIS; + goto breakBreak; + } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') { + code = JIS; + i += 3; + } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') { + code = JIS; + i += 3; + } else { + i++; + } + bfr = FALSE; + bfk = 0; + } else { + if (ptr[i] < 0x20) { + bfr = FALSE; + bfk = 0; + /* byte below 0x20 reached: score the two bytes before it as kudokuten (presumably punctuation marks) or hiragana 
*/ + if ((i >= 2) && (ptr[i - 2] == 0x81) + && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) { + code = SJIS; + sjis += 100; /* kudokuten */ + } else if ((i >= 2) && (ptr[i - 2] == 0xa1) + && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) { + code = EUC; + euc += 100; /* kudokuten */ + } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) { + sjis += 40; /* hiragana */ + } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) { + euc += 40; /* hiragana */ + } + } else { + /* check hiragana or katakana using ptr[i] and the byte after it */ + if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) { + sjis++; /* hiragana */ + } else if ((size - i > 1) && (ptr[i] == 0x83) + && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) { + sjis++; /* katakana */ + } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) { + euc++; /* hiragana */ + } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) { + euc++; /* katakana */ + } + if (bfr) { /* the previous byte set bfr: judge the pair ptr[i - 1], ptr[i] */ + if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) { + code = SJIS; + goto breakBreak; + } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) { + code = SJIS; + goto breakBreak; + } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) { + code = EUC; + goto breakBreak; + } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) { + code = EUC; + goto breakBreak; + } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) { + code = SJIS; + goto breakBreak; + } else if (ptr[i] <= 0x7f) { + code = SJIS; + goto breakBreak; + } else { + if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) { + euc++; /* sjis hankaku kana kigo */ + } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) { + ; /* sjis hankaku kana */ + } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) { + euc++; + } else if (0x8e == ptr[i]) { + euc++; + 
} else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) { /* NOTE(review): unreachable, ptr[i] <= 0x7f already returned SJIS a few branches up */ + sjis++; + } + bfr = FALSE; + bfk = 0; + } + } else if (0x8e == ptr[i]) { + if (size - i <= 1) { + ; + } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) { + /* EUC KANA or SJIS KANJI: ambiguous, so only the second such pair since the last reset adds 100 to euc; the pair is skipped as a unit */ + if (bfk == 1) { + euc += 100; + } + bfk++; + i++; + } else { + /* SJIS only */ + code = SJIS; + goto breakBreak; + } + } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) { + /* SJIS only */ + code = SJIS; + if ((size - i >= 1) /* NOTE(review): always true inside the loop since i < size, so ptr[i + 1] can be the terminating NUL; > 1 may have been intended - confirm */ + && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e) + || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) { + goto breakBreak; + } + } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) { + /* EUC only */ + code = EUC; + if ((size - i >= 1) /* NOTE(review): same always-true guard as in the SJIS branch above */ + && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) { + goto breakBreak; + } + } else if (ptr[i] <= 0x7f) { + ; + } else { + bfr = TRUE; + bfk = 0; + } + } + i++; + } + } + if (code == ASCII) { /* nothing decisive was seen: fall back to the tallies, leaving ASCII on a tie */ + if (sjis > euc) { + code = SJIS; + } else if (sjis < euc) { + code = EUC; + } + } +breakBreak: + return (code); +} Decoder::Decoder() { @@ -117,14 +344,12 @@ m_decoder = m_codec->makeDecoder(); } else { - if(m_codec->mibEnum() != 1000) // utf16 - { - // ### hack for a bug in QTextCodec. It cut's the input stream - // in case there are \0 in it. ZDNET has them inside... :-( + if(m_codec->mibEnum() != 1000) { // utf16 + // replace '\0' by spaces, for buggy pages char *d = const_cast(data); int i = len - 1; while(i >= 0) { - if(*(d+i) == 0) *(d+i) = ' '; + if(d[i] == 0) d[i] = ' '; i--; } } @@ -220,7 +445,7 @@ default: body = true; #ifdef DECODE_DEBUG - kdDebug( 6005 ) << "Decoder: no charset found, using latin1. Id=" << id << endl; + kdDebug( 6005 ) << "Decoder: no charset found. 
Id=" << id << endl; #endif goto found; } @@ -233,6 +458,32 @@ } found: + if (!haveEncoding && KGlobal::locale()->languageList()[0] == "ja") { +#ifdef DECODE_DEBUG + kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl; +#endif + switch ( KanjiCode::judge( data ) ) { + case KanjiCode::JIS: + enc = "jis7"; + break; + case KanjiCode::EUC: + enc = "eucjp"; + break; + case KanjiCode::SJIS: + enc = "sjis"; + break; + default: + enc = NULL; + break; + } +#ifdef DECODE_DEBUG + kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc << endl; +#endif + if (!enc.isEmpty()) { + setEncoding(enc, true); + } + } + // if we still haven't found an encoding latin1 will be used... // this is according to HTML4.0 specs if (!m_codec) @@ -270,7 +521,7 @@ // the hell knows, why the output does sometimes have a QChar::null at // the end... if(out[out.length()-1] == QChar::null) - out.truncate(out.length() - 1); + assert(0); return out; }