Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features.

BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdepim@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da
author: toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> 2009-11-25 17:56:58 +0000
committer: toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> 2009-11-25 17:56:58 +0000
commit: 460c52653ab0dcca6f19a4f492ed2c5e4e963ab0 (patch)
tree: 67208f7c145782a7e90b123b982ca78d88cc2c87 /kmail/encodingdetector.cpp
download: tdepim-460c52653ab0dcca6f19a4f492ed2c5e4e963ab0.tar.gz
tdepim-460c52653ab0dcca6f19a4f492ed2c5e4e963ab0.zip
1 files changed, 1377 insertions, 0 deletions
diff --git a/kmail/encodingdetector.cpp b/kmail/encodingdetector.cpp
new file mode 100644
index 00000000..e5881d6f
--- /dev/null
+++ b/kmail/encodingdetector.cpp
@@ -0,0 +1,1377 @@
+/*
+    This file was taken from the KDE 4.x libraries and backported to Qt 3.
+
+    Copyright (C) 1999 Lars Knoll (knoll@kde.org)
+    Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
+    Copyright (C) 2003 Apple Computer, Inc.
+    Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public License
+    along with this library; see the file COPYING.LIB.  If not, write to
+    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+    Boston, MA 02110-1301, USA.
+*/
+//----------------------------------------------------------------------------
+//
+// decoder for input stream
+
+#include "encodingdetector.h"
+
+#undef DECODE_DEBUG
+//#define DECODE_DEBUG
+
+#define MAX_BUFFER 16*1024
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "encodingdetector_ja_p.h"
+
+#include <qregexp.h>
+#include <qtextcodec.h>
+
+#include <kglobal.h>
+#include <kcharsets.h>
+#include <kdebug.h>
+#include <klocale.h>
+
+#include <ctype.h>
+
+// The following table was taken from libpango 1.19.3 and slightly modified.
+// Multiple scripts per language were removed and the entries were reordered so
+// that simple substring matching will work. For example, bam was put before ba
+// so that the first match will be likely the right match. Otherwise "ba" would
+// match "bam" but we would have to search on to find "bam" which is what we want.
+// The original file is called pango-script-lang-table.h
+
+/* pango-script-lang-table.h:
+ * 
+ * Generated by gen-script-for-lang-new.c
+ * Date: 2007-10-26
+ * Source: fontconfig-2.4.91
+ * 
+ * Do not edit. // I did. Sue me ;)
+ */
+// One row of the language -> script table below: a language code (at most
+// five characters plus the terminating NUL) and the single detection script
+// associated with it.
+struct _PangoScriptForLang {
+  const char lang[6];
+  EncodingDetector::AutoDetectScript scripts[1];
+};
+typedef struct _PangoScriptForLang PangoScriptForLang;
+
+//Unfortunately EncodingDetector does not know all scripts that Pango knows.
+//Also, using EncodingDetector::CentralEuropean for the appropriate countries
+//might give better results in some cases.
+//One especially important (many speakers/literates) omission is the lack of
+//Indian scripts.
+
+// Pango script names used by the table below that have no matching
+// EncodingDetector script: they all expand to EncodingDetector::None.
+#define PANGO_SCRIPT_ARMENIAN EncodingDetector::None
+#define PANGO_SCRIPT_BENGALI EncodingDetector::None
+#define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None
+#define PANGO_SCRIPT_CHEROKEE EncodingDetector::None
+#define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None
+#define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None
+#define PANGO_SCRIPT_GUJARATI EncodingDetector::None
+#define PANGO_SCRIPT_GURMUKHI EncodingDetector::None
+#define PANGO_SCRIPT_KANNADA EncodingDetector::None
+#define PANGO_SCRIPT_KHMER EncodingDetector::None
+#define PANGO_SCRIPT_LAO EncodingDetector::None
+#define PANGO_SCRIPT_MALAYALAM EncodingDetector::None
+#define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None
+#define PANGO_SCRIPT_MYANMAR EncodingDetector::None
+#define PANGO_SCRIPT_ORIYA EncodingDetector::None
+#define PANGO_SCRIPT_SINHALA EncodingDetector::None
+#define PANGO_SCRIPT_SYRIAC EncodingDetector::None
+#define PANGO_SCRIPT_TAGALOG EncodingDetector::None
+#define PANGO_SCRIPT_TAMIL EncodingDetector::None
+#define PANGO_SCRIPT_TIBETAN EncodingDetector::None
+#define PANGO_SCRIPT_TELUGU EncodingDetector::None
+
+//Instead of changing the table even more...
+// Pango script names that do have an EncodingDetector counterpart are mapped
+// onto it here, so the table rows can keep their original Pango spelling.
+#define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic
+#define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic
+#define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope
+#define PANGO_SCRIPT_GREEK EncodingDetector::Greek
+#define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew
+#define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean
+#define PANGO_SCRIPT_THAI EncodingDetector::Thai
+
+
+// Language code -> detection script table, terminated by an entry whose
+// language string is empty.
+//
+// ORDER MATTERS: as described in the comment above the Pango header note, the
+// table is meant for a first-match prefix search, so whenever one code is a
+// prefix of another the longer code must come first ("bam" before "ba",
+// "ast" before "as", "kok" before "ko", ...). Keep that invariant when adding
+// rows. (The lookup code itself is not part of this table; the ordering
+// follows the rule stated in that comment.)
+static const PangoScriptForLang pango_script_for_lang[] = {
+  { "aa",    { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "ab",    { PANGO_SCRIPT_CYRILLIC/*90*/ } },
+  { "af",    { PANGO_SCRIPT_LATIN/*69*/ } },
+  { "am",    { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
+  { "ar",    { PANGO_SCRIPT_ARABIC/*125*/ } },
+  { "ast",   { PANGO_SCRIPT_LATIN/*66*/ } },
+  { "as",    { PANGO_SCRIPT_BENGALI/*89*/ } },
+  { "ava",   { PANGO_SCRIPT_CYRILLIC/*67*/ } },
+  { "ay",    { PANGO_SCRIPT_LATIN/*60*/ } },
+  { "az-ir", { PANGO_SCRIPT_ARABIC/*129*/ } },
+  { "az",    { PANGO_SCRIPT_CYRILLIC/*80*/ } }, //, PANGO_SCRIPT_LATIN/*68*/ } },
+  { "bam",   { PANGO_SCRIPT_LATIN/*60*/ } },
+  { "ba",    { PANGO_SCRIPT_CYRILLIC/*82*/ } },
+  { "be",    { PANGO_SCRIPT_CYRILLIC/*68*/ } },
+  { "bg",    { PANGO_SCRIPT_CYRILLIC/*60*/ } },
+  { "bho",   { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
+  { "bh",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
+  { "bin",   { PANGO_SCRIPT_LATIN/*76*/ } },
+  { "bi",    { PANGO_SCRIPT_LATIN/*58*/ } },
+  { "bn",    { PANGO_SCRIPT_BENGALI/*89*/ } },
+  { "bo",    { PANGO_SCRIPT_TIBETAN/*95*/ } },
+  { "br",    { PANGO_SCRIPT_LATIN/*64*/ } },
+  { "bs",    { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "bua",   { PANGO_SCRIPT_CYRILLIC/*70*/ } },
+  { "ca",    { PANGO_SCRIPT_LATIN/*74*/ } },
+  { "ce",    { PANGO_SCRIPT_CYRILLIC/*67*/ } },
+  { "chm",   { PANGO_SCRIPT_CYRILLIC/*76*/ } },
+  { "chr",   { PANGO_SCRIPT_CHEROKEE/*85*/ } },
+  { "ch",    { PANGO_SCRIPT_LATIN/*58*/ } },
+  { "co",    { PANGO_SCRIPT_LATIN/*84*/ } },
+  { "cs",    { PANGO_SCRIPT_LATIN/*82*/ } },
+  { "cu",    { PANGO_SCRIPT_CYRILLIC/*103*/ } },
+  { "cv",    { PANGO_SCRIPT_CYRILLIC/*72*/ } }, //, PANGO_SCRIPT_LATIN/*2*/ } },
+  { "cy",    { PANGO_SCRIPT_LATIN/*78*/ } },
+  { "da",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "de",    { PANGO_SCRIPT_LATIN/*59*/ } },
+  { "dz",    { PANGO_SCRIPT_TIBETAN/*95*/ } },
+  { "el",    { PANGO_SCRIPT_GREEK/*69*/ } },
+  { "en",    { PANGO_SCRIPT_LATIN/*72*/ } },
+  { "eo",    { PANGO_SCRIPT_LATIN/*64*/ } },
+  { "es",    { PANGO_SCRIPT_LATIN/*66*/ } },
+//  { "et",    { PANGO_SCRIPT_LATIN/*64*/ } },
+  { "et",    { EncodingDetector::Baltic } },
+  { "eu",    { PANGO_SCRIPT_LATIN/*56*/ } },
+  { "fa",    { PANGO_SCRIPT_ARABIC/*129*/ } },
+  { "fi",    { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "fj",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "fo",    { PANGO_SCRIPT_LATIN/*68*/ } },
+  { "fr",    { PANGO_SCRIPT_LATIN/*84*/ } },
+  { "ful",   { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "fur",   { PANGO_SCRIPT_LATIN/*66*/ } },
+  { "fy",    { PANGO_SCRIPT_LATIN/*75*/ } },
+  { "ga",    { PANGO_SCRIPT_LATIN/*80*/ } },
+  { "gd",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "gez",   { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
+  { "gl",    { PANGO_SCRIPT_LATIN/*66*/ } },
+  { "gn",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "gu",    { PANGO_SCRIPT_GUJARATI/*78*/ } },
+  { "gv",    { PANGO_SCRIPT_LATIN/*54*/ } },
+  { "haw",   { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "ha",    { PANGO_SCRIPT_LATIN/*60*/ } },
+  { "he",    { PANGO_SCRIPT_HEBREW/*27*/ } },
+  { "hi",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
+  { "ho",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "hr",    { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "hu",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "hy",    { PANGO_SCRIPT_ARMENIAN/*77*/ } },
+  { "ia",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "ibo",   { PANGO_SCRIPT_LATIN/*58*/ } },
+  { "id",    { PANGO_SCRIPT_LATIN/*54*/ } },
+  { "ie",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "ik",    { PANGO_SCRIPT_CYRILLIC/*68*/ } },
+  { "io",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "is",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "it",    { PANGO_SCRIPT_LATIN/*72*/ } },
+  { "iu",    { PANGO_SCRIPT_CANADIAN_ABORIGINAL/*161*/ } },
+//  { "ja",    { PANGO_SCRIPT_HAN/*6356*/, PANGO_SCRIPT_KATAKANA/*88*/, PANGO_SCRIPT_HIRAGANA/*85*/ } },
+  { "ja",    { EncodingDetector::Japanese } },
+  { "kaa",   { PANGO_SCRIPT_CYRILLIC/*78*/ } },
+  { "ka",    { PANGO_SCRIPT_GEORGIAN/*33*/ } },
+  { "ki",    { PANGO_SCRIPT_LATIN/*56*/ } },
+  { "kk",    { PANGO_SCRIPT_CYRILLIC/*77*/ } },
+  { "kl",    { PANGO_SCRIPT_LATIN/*81*/ } },
+  { "km",    { PANGO_SCRIPT_KHMER/*70*/ } },
+  { "kn",    { PANGO_SCRIPT_KANNADA/*80*/ } },
+  { "kok",   { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
+//  { "ko",    { PANGO_SCRIPT_HANGUL/*2443*/ } },
+  { "ko",    { EncodingDetector::Korean } },
+  { "ks",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
+  { "ku-ir", { PANGO_SCRIPT_ARABIC/*32*/ } },
+  { "kum",   { PANGO_SCRIPT_CYRILLIC/*66*/ } },
+  { "ku",    { PANGO_SCRIPT_CYRILLIC/*60*/ } }, //, PANGO_SCRIPT_LATIN/*4*/ } },
+  { "kv",    { PANGO_SCRIPT_CYRILLIC/*70*/ } },
+  { "kw",    { PANGO_SCRIPT_LATIN/*64*/ } },
+  { "ky",    { PANGO_SCRIPT_CYRILLIC/*70*/ } },
+  { "la",    { PANGO_SCRIPT_LATIN/*68*/ } },
+  { "lb",    { PANGO_SCRIPT_LATIN/*75*/ } },
+  { "lez",   { PANGO_SCRIPT_CYRILLIC/*67*/ } },
+  { "ln",    { PANGO_SCRIPT_LATIN/*78*/ } },
+  { "lo",    { PANGO_SCRIPT_LAO/*65*/ } },
+//  { "lt",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "lt",    { EncodingDetector::Baltic } },
+//  { "lv",    { PANGO_SCRIPT_LATIN/*78*/ } },
+  { "lv",    { EncodingDetector::Baltic } },
+  { "mg",    { PANGO_SCRIPT_LATIN/*56*/ } },
+  { "mh",    { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "mi",    { PANGO_SCRIPT_LATIN/*64*/ } },
+  { "mk",    { PANGO_SCRIPT_CYRILLIC/*42*/ } },
+  { "ml",    { PANGO_SCRIPT_MALAYALAM/*78*/ } },
+  { "mn",    { PANGO_SCRIPT_MONGOLIAN/*130*/ } },
+  { "mo",    { PANGO_SCRIPT_CYRILLIC/*66*/ } }, //, PANGO_SCRIPT_LATIN/*62*/ } },
+  { "mr",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
+  { "mt",    { PANGO_SCRIPT_LATIN/*72*/ } },
+  { "my",    { PANGO_SCRIPT_MYANMAR/*48*/ } },
+  { "nb",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "nds",   { PANGO_SCRIPT_LATIN/*59*/ } },
+  { "ne",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
+  { "nl",    { PANGO_SCRIPT_LATIN/*82*/ } },
+  { "nn",    { PANGO_SCRIPT_LATIN/*76*/ } },
+  { "no",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "nr",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "nso",   { PANGO_SCRIPT_LATIN/*58*/ } },
+  { "ny",    { PANGO_SCRIPT_LATIN/*54*/ } },
+  { "oc",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "om",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "or",    { PANGO_SCRIPT_ORIYA/*79*/ } },
+  { "os",    { PANGO_SCRIPT_CYRILLIC/*66*/ } },
+  { "pa",    { PANGO_SCRIPT_GURMUKHI/*63*/ } },
+  { "pl",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "ps-af", { PANGO_SCRIPT_ARABIC/*49*/ } },
+  { "ps-pk", { PANGO_SCRIPT_ARABIC/*49*/ } },
+  { "pt",    { PANGO_SCRIPT_LATIN/*82*/ } },
+  { "rm",    { PANGO_SCRIPT_LATIN/*66*/ } },
+  { "ro",    { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "ru",    { PANGO_SCRIPT_CYRILLIC/*66*/ } },
+  { "sah",   { PANGO_SCRIPT_CYRILLIC/*76*/ } },
+  { "sa",    { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
+  { "sco",   { PANGO_SCRIPT_LATIN/*56*/ } },
+  { "sel",   { PANGO_SCRIPT_CYRILLIC/*66*/ } },
+  { "se",    { PANGO_SCRIPT_LATIN/*66*/ } },
+  { "sh",    { PANGO_SCRIPT_CYRILLIC/*76*/ } },
+  { "si",    { PANGO_SCRIPT_SINHALA/*77*/ } },
+  { "sk",    { PANGO_SCRIPT_LATIN/*86*/ } },
+  { "sl",    { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "sma",   { PANGO_SCRIPT_LATIN/*60*/ } },
+  { "smj",   { PANGO_SCRIPT_LATIN/*60*/ } },
+  { "smn",   { PANGO_SCRIPT_LATIN/*68*/ } },
+  { "sms",   { PANGO_SCRIPT_LATIN/*80*/ } },
+  { "sm",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "so",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "sq",    { PANGO_SCRIPT_LATIN/*56*/ } },
+  { "sr",    { PANGO_SCRIPT_CYRILLIC/*76*/ } },
+  { "ss",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "st",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "sv",    { PANGO_SCRIPT_LATIN/*68*/ } },
+  { "sw",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "syr",   { PANGO_SCRIPT_SYRIAC/*45*/ } },
+  { "ta",    { PANGO_SCRIPT_TAMIL/*48*/ } },
+  { "te",    { PANGO_SCRIPT_TELUGU/*80*/ } },
+  { "tg",    { PANGO_SCRIPT_CYRILLIC/*78*/ } },
+  { "th",    { PANGO_SCRIPT_THAI/*86*/ } },
+  { "ti-er", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
+  { "ti-et", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
+  { "tig",   { PANGO_SCRIPT_ETHIOPIC/*221*/ } },
+  { "tk",    { PANGO_SCRIPT_CYRILLIC/*74*/ } },
+  { "tl",    { PANGO_SCRIPT_TAGALOG/*19*/ } },
+  { "tn",    { PANGO_SCRIPT_LATIN/*58*/ } },
+  { "to",    { PANGO_SCRIPT_LATIN/*52*/ } },
+//  { "tr",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "tr",    { EncodingDetector::Turkish } },
+  { "ts",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "tt",    { PANGO_SCRIPT_CYRILLIC/*76*/ } },
+  { "tw",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "tyv",   { PANGO_SCRIPT_CYRILLIC/*70*/ } },
+  { "ug",    { PANGO_SCRIPT_ARABIC/*125*/ } },
+  { "uk",    { PANGO_SCRIPT_CYRILLIC/*72*/ } },
+  { "ur",    { PANGO_SCRIPT_ARABIC/*145*/ } },
+  { "uz",    { PANGO_SCRIPT_CYRILLIC/*68*/ } },
+  { "ven",   { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "vi",    { PANGO_SCRIPT_LATIN/*186*/ } },
+  { "vot",   { PANGO_SCRIPT_LATIN/*62*/ } },
+  { "vo",    { PANGO_SCRIPT_LATIN/*54*/ } },
+  { "wa",    { PANGO_SCRIPT_LATIN/*70*/ } },
+  { "wen",   { PANGO_SCRIPT_LATIN/*76*/ } },
+  { "wo",    { PANGO_SCRIPT_LATIN/*66*/ } },
+  { "xh",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "yap",   { PANGO_SCRIPT_LATIN/*58*/ } },
+  { "yi",    { PANGO_SCRIPT_HEBREW/*27*/ } },
+  { "yo",    { PANGO_SCRIPT_LATIN/*114*/ } },
+//  { "zh-cn", { PANGO_SCRIPT_HAN/*6763*/ } },
+  { "zh-cn", { EncodingDetector::ChineseSimplified } },
+//  { "zh-hk", { PANGO_SCRIPT_HAN/*2213*/ } },
+  { "zh-hk", { EncodingDetector::ChineseTraditional } },
+//  { "zh-mo", { PANGO_SCRIPT_HAN/*2213*/ } },
+  { "zh-mo", { EncodingDetector::ChineseTraditional } },
+//  { "zh-sg", { PANGO_SCRIPT_HAN/*6763*/ } },
+  { "zh-sg", { EncodingDetector::ChineseSimplified } },
+//  { "zh-tw", { PANGO_SCRIPT_HAN/*13063*/ } },
+  { "zh-tw", { EncodingDetector::ChineseTraditional } },
+  { "zu",    { PANGO_SCRIPT_LATIN/*52*/ } },
+  { "\x00",    { EncodingDetector::None } }      //end mark
+};
+
+// MIB numbers of the codecs this file treats specially; they are compared
+// against QTextCodec::mibEnum(). Listed in ascending numeric order.
+enum MIB
+{
+    MibLatin1  = 4,
+    Mib8859_8  = 85,
+    MibUtf8    = 106,
+    MibUcs2    = 1000,
+    MibUtf16BE = 1013,
+    MibUtf16LE = 1014,
+    MibUtf16   = 1015
+};
+
+// Tells whether the codec is one of the 16-bit Unicode codecs known to this
+// file (UCS-2 or any of the UTF-16 variants), judged by its MIB number.
+// The codec pointer must not be null.
+static bool is16Bit(QTextCodec* codec)
+{
+    const int mib = codec->mibEnum();
+    return mib == MibUcs2
+        || mib == MibUtf16
+        || mib == MibUtf16BE
+        || mib == MibUtf16LE;
+}
+
+// Private state of EncodingDetector, reached through its d pointer.
+//
+// Ownership: m_decoder is created from m_codec by both constructors and is
+// deleted by the destructor. The codec pointers are never deleted here.
+// Because of that owned pointer the class is made non-copyable (see the
+// private section): an implicit copy would delete the same decoder twice.
+class EncodingDetectorPrivate
+{
+public:
+    QTextCodec *m_codec;            // codec currently in effect
+    QTextDecoder *m_decoder; // utf16
+    QTextCodec *m_defaultCodec;     // codec the detector was constructed with
+    QCString  m_storeDecoderName;   // backing storage for the pointer returned by encoding()
+
+    EncodingDetector::EncodingChoiceSource m_source;
+    EncodingDetector::AutoDetectScript m_autoDetectLanguage;
+
+    bool m_visualRTL : 1;           // set by setEncoding() for visually ordered Hebrew
+    bool m_seenBody : 1;
+    bool m_writtingHappened : 1;
+    bool m_analyzeCalled : 1; //for decode()
+    int m_multiByte;                // UTF-8 continuation bytes still expected by errorsIfUtf8()
+
+    QCString m_bufferForDefferedEncDetection;
+
+    // Starts out with the Latin-1 codec, marked as the default encoding, and
+    // semiautomatic script detection.
+    EncodingDetectorPrivate()
+            : m_codec(QTextCodec::codecForMib(MibLatin1))
+            , m_decoder(m_codec->makeDecoder())
+            , m_defaultCodec(m_codec)
+            , m_source(EncodingDetector::DefaultEncoding)
+            , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection)
+            , m_visualRTL(false)
+            , m_seenBody(false)
+            , m_writtingHappened(false)
+            , m_analyzeCalled(false)
+            , m_multiByte(0)
+    {
+    }
+
+    // Starts out with the given codec (which must not be null, it is
+    // dereferenced to build the decoder), choice source and script.
+    EncodingDetectorPrivate(QTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
+            : m_codec(codec)
+            , m_decoder(m_codec->makeDecoder())
+            , m_defaultCodec(m_codec)
+            , m_source(source)
+            , m_autoDetectLanguage(script)
+            , m_visualRTL(false)
+            , m_seenBody(false)
+            , m_writtingHappened(false)
+            , m_analyzeCalled(false)
+            , m_multiByte(0)
+    {
+    }
+
+    ~EncodingDetectorPrivate()
+    {
+        delete m_decoder;
+    }
+
+private:
+    // Copying is disabled: declared but intentionally never defined.
+    EncodingDetectorPrivate(const EncodingDetectorPrivate&);
+    EncodingDetectorPrivate& operator=(const EncodingDetectorPrivate&);
+};
+
+
+// Chooses between the two 8-bit Arabic encodings handled here.
+// The first byte that falls into one of the ranges listed below decides for
+// "cp1256"; if no such byte occurs in the buffer, "iso-8859-6" is returned.
+static QCString automaticDetectionForArabic( const unsigned char* ptr, int size )
+{
+    for ( int pos = 0; pos < size; ++pos ) {
+        const unsigned char c = ptr[ pos ];
+        const bool pointsToCp1256 =
+               ( c >= 0x80 && c <= 0x9F )
+            || ( c >= 0xA1 && c <= 0xA3 )
+            || ( c >= 0xA5 && c <= 0xAB )
+            || ( c >= 0xAE && c <= 0xBA )
+            || ( c >= 0xBC && c <= 0xBE )
+            || c == 0xC0
+            || ( c >= 0xDB && c <= 0xDF )
+            || c >= 0xF3;
+        if ( pointsToCp1256 )
+            return "cp1256";
+    }
+
+    return "iso-8859-6";
+}
+
+// Chooses between "cp1257" and "iso-8859-13" for Baltic text.
+// Bytes are examined in order: a byte in 0x80..0x9E decides for "cp1257",
+// whereas 0xA1 or 0xA5 stops the scan in favour of "iso-8859-13", which is
+// also the answer when neither kind of byte is found.
+static QCString automaticDetectionForBaltic( const unsigned char* ptr, int size )
+{
+    for ( int pos = 0; pos < size; ++pos ) {
+        const unsigned char c = ptr[ pos ];
+
+        if ( c >= 0x80 && c <= 0x9E )
+            return "cp1257";
+
+        if ( c == 0xA1 || c == 0xA5 )
+            break;
+    }
+
+    return "iso-8859-13";
+}
+
+// Guesses the encoding of Central European text.
+//
+// Returns
+//   "ibm852"     as soon as one of the bytes 0x81, 0x83, 0x90, 0x98 is seen;
+//   "cp1250"     if any other byte in 0x80..0x9F was seen;
+//   "iso-8859-2" if none of those occurred but one of the bytes
+//                0xA5 0xAE 0xBE 0xC3 0xD0 0xE3 0xF0 did;
+//   "iso-8859-3" otherwise (no indicative byte at all).
+//
+// The whole buffer is always scanned, because a later byte may still reveal
+// ibm852. The former "i + 1 > size" early-return tests could never be true
+// inside a loop bounded by "i < size" and have been dropped; the results are
+// unchanged.
+static QCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
+{
+    QCString charset;
+    for ( int i = 0; i < size; ++i ) {
+        const unsigned char c = ptr[ i ];
+
+        if ( c >= 0x80 && c <= 0x9F ) {
+            if ( c == 0x81 || c == 0x83 || c == 0x90 || c == 0x98 )
+                return "ibm852";
+
+            // cp1250 takes precedence over an earlier iso-8859-2 guess.
+            charset = "cp1250";
+        }
+        else if ( c == 0xA5 || c == 0xAE || c == 0xBE || c == 0xC3 || c == 0xD0 || c == 0xE3 || c == 0xF0 ) {
+            // Only a tentative guess; never overrides cp1250.
+            if ( charset.isNull() )
+                charset = "iso-8859-2";
+        }
+    }
+
+    if ( charset.isNull() )
+        charset = "iso-8859-3";
+
+    return charset;
+}
+
+// Guesses the encoding of Cyrillic text by counting characteristic bytes.
+//
+// Returns "" when fewer than 8 bytes fall into the counted ranges, "UTF-8"
+// when the bytes 0xd0/0xd1 make up more than a third of the counted bytes,
+// "ibm866" when bytes in 0xa0..0xaf outnumber the other two ranges together,
+// and otherwise "cp1251" or "koi8-u", whichever collects the higher score.
+// The score compares how often the byte values of a few frequent letters
+// (o, a, i, s and the pair "st", small and capital) occur under each of the
+// two encodings; if the scores differ by less than 10, the plain byte-range
+// counts decide instead.
+//
+// NOTE(review): the scan starts at index 1 (it looks back at ptr[i-1]), so the
+// first byte of the buffer is never counted.
+static QCString automaticDetectionForCyrillic( const unsigned char* ptr, int size)
+{
+#ifdef DECODE_DEBUG
+        kWarning() << "EncodingDetector: Cyr heuristics";
+#endif
+
+//     if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
+//         return "utf8";
+    int utf8_mark=0;
+    int koi_score=0;
+    int cp1251_score=0;
+
+    int koi_st=0;
+    int cp1251_st=0;
+
+//     int koi_na=0;
+//     int cp1251_na=0;
+
+    int koi_o_capital=0;
+    int koi_o=0;
+    int cp1251_o_capital=0;
+    int cp1251_o=0;
+
+    int koi_a_capital=0;
+    int koi_a=0;
+    int cp1251_a_capital=0;
+    int cp1251_a=0;
+
+    int koi_s_capital=0;
+    int koi_s=0;
+    int cp1251_s_capital=0;
+    int cp1251_s=0;
+
+    int koi_i_capital=0;
+    int koi_i=0;
+    int cp1251_i_capital=0;
+    int cp1251_i=0;
+
+    int cp1251_small_range=0;
+    int koi_small_range=0;
+    int ibm866_small_range=0;
+
+    // Stop early once 1000 bytes of the two upper ranges have been counted.
+    int i;
+    for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
+    {
+        // 0xe0..0xff: counted as the cp1251 small-letter range
+        if (ptr[i]>0xdf)
+        {
+            ++cp1251_small_range;
+
+            if (ptr[i]==0xee)//small o
+                ++cp1251_o;
+            else if (ptr[i]==0xe0)//small a
+                ++cp1251_a;
+            else if (ptr[i]==0xe8)//small i
+                ++cp1251_i;
+            else if (ptr[i]==0xf1)//small s
+                ++cp1251_s;
+            else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st
+                ++cp1251_st;
+
+            // the same byte range is counted as capital letters for koi8
+            else if (ptr[i]==0xef)
+                ++koi_o_capital;
+            else if (ptr[i]==0xe1)
+                ++koi_a_capital;
+            else if (ptr[i]==0xe9)
+                ++koi_i_capital;
+            else if (ptr[i]==0xf3)
+                ++koi_s_capital;
+
+        }
+        // 0xc0..0xdf: counted as the koi8 small-letter range
+        else if (ptr[i]>0xbf)
+        {
+            ++koi_small_range;
+
+            if (ptr[i]==0xd0||ptr[i]==0xd1)//counted as a hint for UTF-8
+                ++utf8_mark;
+            else if (ptr[i]==0xcf)//small o
+                ++koi_o;
+            else if (ptr[i]==0xc1)//small a
+                ++koi_a;
+            else if (ptr[i]==0xc9)//small i
+                ++koi_i;
+            else if (ptr[i]==0xd3)//small s
+                ++koi_s;
+            else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st
+                ++koi_st;
+
+            // the same byte range is counted as capital letters for cp1251
+            else if (ptr[i]==0xce)
+                ++cp1251_o_capital;
+            else if (ptr[i]==0xc0)
+                ++cp1251_a_capital;
+            else if (ptr[i]==0xc8)
+                ++cp1251_i_capital;
+            // NOTE(review): unreachable - 0xd1 is already consumed by the
+            // utf8_mark test at the top of this chain, so cp1251_s_capital
+            // always stays 0.
+            else if (ptr[i]==0xd1)
+                ++cp1251_s_capital;
+        }
+        else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letters is 60%
+            ++ibm866_small_range;
+
+    }
+
+    //cannot decide?
+    if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
+    {
+        return "";
+    }
+
+    if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
+    {
+#ifdef DECODE_DEBUG
+        kWarning() << "Cyr Enc Detection: UTF8";
+#endif
+        return "UTF-8";
+    }
+
+    if (ibm866_small_range>cp1251_small_range+koi_small_range)
+        return "ibm866";
+
+//     QCString koi_string = "koi8-u";
+//     QCString cp1251_string = "cp1251";
+
+    // "st" pair seen (more than once) under one interpretation only
+    if (cp1251_st==0 && koi_st>1)
+        koi_score+=10;
+    else if (koi_st==0 && cp1251_st>1)
+        cp1251_score+=10;
+
+    // seen under both: reward a clear majority (integer division)
+    if (cp1251_st && koi_st)
+    {
+        if (cp1251_st/koi_st>2)
+            cp1251_score+=20;
+        else if (koi_st/cp1251_st>2)
+            koi_score+=20;
+    }
+
+    // Small letters: 10 points per letter to the encoding under which it is
+    // more frequent; ties with at least one occurrence go to koi8.
+    if (cp1251_a>koi_a)
+        cp1251_score+=10;
+    else if (cp1251_a || koi_a)
+        koi_score+=10;
+
+    if (cp1251_o>koi_o)
+        cp1251_score+=10;
+    else if (cp1251_o || koi_o)
+        koi_score+=10;
+
+    if (cp1251_i>koi_i)
+        cp1251_score+=10;
+    else if (cp1251_i || koi_i)
+        koi_score+=10;
+
+    if (cp1251_s>koi_s)
+        cp1251_score+=10;
+    else if (cp1251_s || koi_s)
+        koi_score+=10;
+
+    // Capital letters: same scheme, 9 points each.
+    if (cp1251_a_capital>koi_a_capital)
+        cp1251_score+=9;
+    else if (cp1251_a_capital || koi_a_capital)
+        koi_score+=9;
+
+    if (cp1251_o_capital>koi_o_capital)
+        cp1251_score+=9;
+    else if (cp1251_o_capital || koi_o_capital)
+        koi_score+=9;
+
+    if (cp1251_i_capital>koi_i_capital)
+        cp1251_score+=9;
+    else if (cp1251_i_capital || koi_i_capital)
+        koi_score+=9;
+
+    if (cp1251_s_capital>koi_s_capital)
+        cp1251_score+=9;
+    else if (cp1251_s_capital || koi_s_capital)
+        koi_score+=9;
+#ifdef DECODE_DEBUG
+    kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
+#endif
+    if (abs(koi_score-cp1251_score)<10)
+    {
+        //fallback...
+        // scores too close: let the raw byte-range counts decide
+        cp1251_score=cp1251_small_range;
+        koi_score=koi_small_range;
+    }
+    // a tie goes to koi8-u
+    if (cp1251_score>koi_score)
+        return "cp1251";
+    else
+        return "koi8-u";
+
+
+//     if (cp1251_score>koi_score)
+//         setEncoding("cp1251",AutoDetectedEncoding);
+//     else
+//         setEncoding("koi8-u",AutoDetectedEncoding);
+//     return true;
+
+}
+
+// Chooses between "cp1253" and "iso-8859-7" for Greek text: the first byte
+// matching one of the values below decides for "cp1253"; without any such
+// byte the answer is "iso-8859-7".
+static QCString automaticDetectionForGreek( const unsigned char* ptr, int size )
+{
+    for ( int pos = 0; pos < size; ++pos ) {
+        const unsigned char c = ptr[ pos ];
+        const bool pointsToCp1253 =
+               c == 0x80
+            || ( c >= 0x82 && c <= 0x87 )
+            || c == 0x89
+            || c == 0x8B
+            || ( c >= 0x91 && c <= 0x97 )
+            || c == 0x99
+            || c == 0x9B
+            || c == 0xA4
+            || c == 0xA5
+            || c == 0xAE;
+        if ( pointsToCp1253 )
+            return "cp1253";
+    }
+
+    return "iso-8859-7";
+}
+
+// Chooses between "cp1255" and "iso-8859-8-i" for Hebrew text.
+// Bytes are examined in order: a byte matching one of the values below
+// decides for "cp1255", whereas 0xDF stops the scan in favour of
+// "iso-8859-8-i", which is also returned when no deciding byte is found.
+static QCString automaticDetectionForHebrew( const unsigned char* ptr, int size )
+{
+    for ( int pos = 0; pos < size; ++pos ) {
+        const unsigned char c = ptr[ pos ];
+        const bool pointsToCp1255 =
+               c == 0x80
+            || ( c >= 0x82 && c <= 0x89 )
+            || c == 0x8B
+            || ( c >= 0x91 && c <= 0x99 )
+            || c == 0x9B
+            || c == 0xA1
+            || ( c >= 0xBF && c <= 0xC9 )
+            || ( c >= 0xCB && c <= 0xD8 );
+        if ( pointsToCp1255 )
+            return "cp1255";
+
+        if ( c == 0xDF )
+            break;
+    }
+
+    return "iso-8859-8-i";
+}
+
+// Delegates to JapaneseCode::guess_jp() and translates its verdict into a
+// codec name: "jis7", "eucjp", "sjis" or "utf8". Any other verdict yields an
+// empty name.
+static QCString automaticDetectionForJapanese( const unsigned char* ptr, int size )
+{
+    JapaneseCode guesser;
+    const char* name = "";
+
+    switch ( guesser.guess_jp( (const char*)ptr, size ) ) {
+    case JapaneseCode::JIS:
+        name = "jis7";
+        break;
+    case JapaneseCode::EUC:
+        name = "eucjp";
+        break;
+    case JapaneseCode::SJIS:
+        name = "sjis";
+        break;
+    case JapaneseCode::UTF8:
+        name = "utf8";
+        break;
+    default:
+        break;
+    }
+
+    return name;
+}
+
+// Chooses between "cp1254" and "iso-8859-9" for Turkish text: the first byte
+// matching one of the values below decides for "cp1254"; without any such
+// byte the answer is "iso-8859-9".
+static QCString automaticDetectionForTurkish( const unsigned char* ptr, int size )
+{
+    for ( int pos = 0; pos < size; ++pos ) {
+        const unsigned char c = ptr[ pos ];
+        const bool pointsToCp1254 =
+               c == 0x80
+            || ( c >= 0x82 && c <= 0x8C )
+            || ( c >= 0x91 && c <= 0x9C )
+            || c == 0x9F;
+        if ( pointsToCp1254 )
+            return "cp1254";
+    }
+
+    return "iso-8859-9";
+}
+
+// Guesses the encoding of Western European text.
+//
+// Returns
+//   "UTF-8"       as soon as a byte in 0xC2..0xEF is directly followed by a
+//                 byte in 0x80..0xBF (looks like a UTF-8 multi-byte sequence);
+//   "cp1252"      as soon as a byte in 0x80..0x9F is seen (that range holds
+//                 printable characters in cp1252 but not in ISO 8859-15);
+//   "iso-8859-15" if only other non-ASCII bytes were seen;
+//   ""            if the buffer contains 7-bit ASCII only (nothing to decide).
+//
+// Fixes relative to the previous version: the cp1252 test compared against
+// the impossible range 0x78..0x9 and therefore never fired, and the
+// "non-ASCII" threshold was 0x79, which counted the ASCII characters
+// 'z', '{', '|', '}', '~' and DEL as evidence for ISO 8859-15.
+static QCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
+{
+    bool sawNonAscii = false;
+    for (int i=0; i<size; ++i)
+    {
+        const unsigned char c = ptr[i];
+        if (c <= 0x7f)
+            continue;
+
+        sawNonAscii = true;
+
+        // lead byte followed by a continuation byte (bounds-checked)
+        if ( c>0xc1 && c<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
+            return "UTF-8";
+
+        if (c <= 0x9f)
+            return "cp1252";
+    }
+
+    if (sawNonAscii)
+        return "iso-8859-15";
+
+    return "";
+}
+
+// Other browsers allow comments in the head section, so we need to also.
+// It's important not to look for tags inside the comments.
+//
+// Advances ptr past a comment body. On entry ptr points just behind the
+// comment opener; on return it points just behind the terminating "-->" (or
+// the tolerated "--!>"), just behind the '>' of the degenerate "<!-->", or at
+// pEnd when the comment is not terminated inside [ptr, pEnd).
+// No byte at or beyond pEnd is ever read.
+static void skipComment(const char *&ptr, const char *pEnd)
+{
+    const char *p = ptr;
+    // Nothing to look at: leave ptr where it is.
+    if (p >= pEnd)
+        return;
+
+    // Allow <!-->; other browsers do.
+    if (*p=='>')
+    {
+        p++;
+    }
+    else
+    {
+        while (p < pEnd)
+        {
+            if (*p=='-')
+            {
+                const long remaining = pEnd - p;
+                // This is the real end of comment, "-->".
+                if (remaining >= 3 && p[1]=='-' && p[2]=='>')
+                {
+                    p += 3;
+                    break;
+                }
+                // This is the incorrect end of comment that other browsers allow, "--!>".
+                if (remaining >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>')
+                {
+                    p += 4;
+                    break;
+                }
+            }
+            p++;
+        }
+    }
+    ptr=p;
+}
+
+// Locates the value of an encoding="..." (or encoding='...') attribute in str.
+//
+// Returns the index of the first character of the quoted value and stores the
+// value's length in encodingLength. Returns -1, leaving encodingLength
+// untouched, if the word "encoding" is missing, is not followed by '=' and a
+// quote character, or the closing quote is missing. Characters that compare
+// less than or equal to ' ' are skipped around the '='.
+static int findXMLEncoding(const QCString &str, int &encodingLength)
+{
+    const int total = str.length();
+
+    int cur = str.find("encoding");
+    if (cur == -1)
+        return -1;
+    cur += 8;   // step over the word "encoding"
+
+    // Blanks and stray control characters before the equals sign.
+    for (; cur<total && str[cur]<=' '; ++cur)
+        ;
+
+    // There has to be an equals sign next.
+    if (cur>=total || str[cur] != '=')
+        return -1;
+    ++cur;
+
+    // Blanks and stray control characters after the equals sign.
+    for (; cur<total && str[cur]<=' '; ++cur)
+        ;
+
+    if (cur >= total)
+        return -1;
+
+    // The value must be opened by a single or a double quote.
+    const char quote = str[cur];
+    if (quote != '"' && quote != '\'')
+        return -1;
+    const int valueStart = cur + 1;
+
+    // Search for the matching closing quote.
+    int valueEnd = valueStart;
+    while (valueEnd<total && str[valueEnd]!=quote)
+        ++valueEnd;
+
+    if (valueEnd>=total)
+        return -1;
+
+    encodingLength = valueEnd-valueStart;
+    return valueStart;
+}
+
+
+// Checks a chunk of data for byte sequences that cannot be UTF-8.
+//
+// Returns false immediately (meaning "no errors") unless the current codec is
+// UTF-8. Otherwise returns true at the first byte that is neither ASCII, a
+// valid lead byte of a 2-, 3- or 4-byte sequence, nor an expected
+// continuation byte. The number of continuation bytes still outstanding is
+// kept in d->m_multiByte, so a sequence may straddle two calls.
+bool EncodingDetector::errorsIfUtf8 (const char* data, int length)
+{
+    if (d->m_codec->mibEnum()!=MibUtf8)
+        return false; //means no errors
+
+    for (int pos=0; pos<length; ++pos)
+    {
+        const unsigned char byte = data[pos];
+
+        if (d->m_multiByte>0)
+        {
+            // inside a sequence: only 10xxxxxx is acceptable
+            if ((byte & 0xC0) != 0x80)
+            {
+#ifdef DECODE_DEBUG
+                kWarning() << "EncDetector: Broken UTF8";
+#endif
+                return true;
+            }
+            --(d->m_multiByte);
+        }
+        else if ((byte & 0x80) == 0x00)
+        {
+            // 0xxxxxxx => plain single-byte character, nothing to track
+        }
+        else if ((byte & 0xE0) == 0xC0)
+        {
+            // 110xxxxx => one continuation byte follows
+            d->m_multiByte = 1;
+        }
+        else if ((byte & 0xF0) == 0xE0)
+        {
+            // 1110xxxx => two continuation bytes follow
+            d->m_multiByte = 2;
+        }
+        else if ((byte & 0xF8) == 0xF0)
+        {
+            // 11110xxx => three continuation bytes follow
+            d->m_multiByte = 3;
+        }
+        else
+        {
+#ifdef DECODE_DEBUG
+            kWarning() << "EncDetector:_Broken UTF8";
+#endif
+            return true;
+        }
+    }
+    return false;
+}
+
+// Creates a detector with default-constructed private state.
+EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate)
+{
+}
+
+// Creates a detector whose private state starts out with the given codec,
+// encoding choice source and auto-detection script.
+EncodingDetector::EncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
+    d(new EncodingDetectorPrivate(codec,source,script))
+{
+}
+
+// Releases the private state object allocated by the constructors.
+EncodingDetector::~EncodingDetector()
+{
+    delete d;
+}
+
+// Stores the script to be used for automatic detection.
+void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
+{
+    d->m_autoDetectLanguage=lang;
+}
+// Returns the script currently stored for automatic detection.
+EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const
+{
+    return d->m_autoDetectLanguage;
+}
+
+// Returns the recorded source of the current encoding choice.
+EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const
+{
+    return d->m_source;
+}
+
+// Returns the name of the current codec. The name is first copied into
+// d->m_storeDecoderName and the returned pointer refers to that copy, so it
+// is overwritten by the next call and must not outlive the detector.
+const char* EncodingDetector::encoding() const
+{
+    d->m_storeDecoderName = d->m_codec->name();
+    return d->m_storeDecoderName.data();
+}
+
+// Returns the visual right-to-left flag, which setEncoding() raises for
+// visually ordered Hebrew (ISO 8859-8) input.
+bool EncodingDetector::visuallyOrdered() const
+{
+    return d->m_visualRTL;
+}
+
+// const QTextCodec* EncodingDetector::codec() const
+// {
+//     return d->m_codec;
+// }
+
+// Returns the decoder belonging to the current codec. The detector keeps
+// ownership; setEncoding() deletes and replaces this decoder when the codec
+// changes, which invalidates previously returned pointers.
+QTextDecoder* EncodingDetector::decoder()
+{
+    return d->m_decoder;
+}
+
+// Switches the detector to the encoding called _encoding and records `type`
+// as the source of that choice.
+//
+// Returns true if the requested encoding is in effect afterwards (including
+// the case that the current codec already has the same MIB, in which case
+// nothing is changed). Returns false, leaving the detector untouched, if
+//   - the name is empty and `type` is not DefaultEncoding,
+//   - KCharsets does not know the name,
+//   - a meta tag or XML header asks for a 16-bit codec, or
+//   - no usable codec object could be obtained.
+// An empty name together with DefaultEncoding selects the default codec.
+// For ISO 8859-8 the "iso8859-8-i" codec is used instead, and the visual
+// right-to-left flag is raised unless the name explicitly denotes logical
+// ordering. Changing the codec deletes and re-creates the decoder.
+bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
+{
+    QTextCodec *codec;
+    QCString enc(_encoding);
+    if(/*enc.isNull() || */enc.isEmpty())
+    {
+        if (type==DefaultEncoding)
+            codec=d->m_defaultCodec;
+        else
+            return false;
+    }
+    else
+    {
+        //QString->QTextCodec
+
+        enc = enc.lower();
+         // hebrew visually ordered
+        if(enc=="visual")
+            enc="iso8859-8";
+        bool b;
+        codec = KGlobal::charsets()->codecForName(enc, b);
+        if (!b)
+            return false;
+    }
+
+    // Never continue with a null codec: it would be dereferenced below.
+    if (!codec)
+        return false;
+
+    if (d->m_codec->mibEnum()==codec->mibEnum())
+        return true;
+
+    if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
+    {
+        //Sometimes the codec specified is absurd, i.e. UTF-16 despite
+        //us decoding a meta tag as ASCII. In that case, ignore it.
+        return false;
+    }
+
+    if (codec->mibEnum() == Mib8859_8)
+    {
+        //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
+        QTextCodec *logicalCodec = QTextCodec::codecForName("iso8859-8-i");
+        // codecForName() yields 0 for an unknown name; bail out before any
+        // state has been modified instead of storing a null codec.
+        if (!logicalCodec)
+            return false;
+        codec = logicalCodec;
+
+        // visually ordered unless one of the following
+        if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
+            d->m_visualRTL = true;
+    }
+
+    d->m_codec = codec;
+    d->m_source = type;
+    delete d->m_decoder;
+    d->m_decoder = d->m_codec->makeDecoder();
+#ifdef DECODE_DEBUG
+    kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name();
+#endif
+    return true;
+}
+
+// Convenience overload: runs the pointer/length analyze() on the contents
+// of the byte array and returns its result.
+bool EncodingDetector::analyze(const QByteArray &data)
+{
+    const char *bytes = data.data();
+    const int byteCount = data.size();
+    return analyze( bytes, byteCount );
+}
+
+// Examines len bytes at data and selects a codec and decoder for them.
+//
+// Steps, in order:
+//  1. If at least 10 bytes are available and the encoding was not chosen by
+//     the user (or the chosen codec is 16-bit, so its byte order still has to
+//     be found), look for a UTF-16 / UTF-8 BOM, or for the alternating NUL
+//     pattern of BOM-less UCS-2, in the first 10 bytes. On success the source
+//     becomes BOM and true is returned.
+//  2. If the encoding was chosen by the user, stop here and return true; the
+//     default encoding is substituted only when errorsIfUtf8() reports errors.
+//  3. Empty input selects the default encoding and returns false.
+//  4. Otherwise the heuristic for the configured script is run and the result
+//     of setEncoding() is returned; scripts without a heuristic fall back to
+//     the default encoding and return true.
+bool EncodingDetector::analyze(const char *data, int len)
+{
+    // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
+    // maximumBOMLength = 10
+    // Even if the user has chosen utf16 we still need to auto-detect the endianness
+    if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
+    {
+        const uchar *bytes = (const uchar *)data;
+        const uchar c1 = bytes[0];
+        const uchar c2 = bytes[1];
+        const uchar c3 = bytes[2];
+
+        // Check for the BOM
+        const char *autoDetectedEncoding = 0;
+        if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
+        {
+            autoDetectedEncoding = "ISO-10646-UCS-2";
+        }
+        else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
+        {
+            autoDetectedEncoding = "UTF-8";
+        }
+        else if (c1 == 0x00 || c2 == 0x00)
+        {
+            // No BOM, but a NUL among the first two bytes: accept UCS-2 only if
+            // the first ten bytes strictly alternate between NUL and non-NUL.
+            int nonNulFirst = 0;   // non-NUL count at offsets 0, 2, 4, 6, 8
+            int nonNulSecond = 0;  // non-NUL count at offsets 1, 3, 5, 7, 9
+            for (int i = 0; i < 10; i += 2)
+            {
+                nonNulFirst  += (bytes[i] != 0);
+                nonNulSecond += (bytes[i + 1] != 0);
+            }
+            if ((nonNulSecond == 0 && nonNulFirst == 5) || (nonNulSecond == 5 && nonNulFirst == 0))
+                autoDetectedEncoding = "ISO-10646-UCS-2";
+        }
+
+        // If we found a BOM, use the encoding it implies.
+        if (autoDetectedEncoding != 0)
+        {
+            QTextCodec *bomCodec = QTextCodec::codecForName(autoDetectedEncoding);
+            // codecForName() returns 0 when the codec is unavailable. Only switch
+            // when it exists; otherwise leave the state untouched and continue
+            // with the remaining detection steps instead of dereferencing null.
+            if (bomCodec)
+            {
+                d->m_source = BOM;
+                d->m_codec = bomCodec;
+                //enc = d->m_codec->name();
+                delete d->m_decoder;
+                d->m_decoder = d->m_codec->makeDecoder();
+#ifdef DECODE_DEBUG
+                kWarning() << "Detection by BOM";
+#endif
+                if (is16Bit(d->m_codec) && c2 == 0x00)
+                {
+                    // utf16LE, we need to put the decoder in LE mode
+                    char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
+                    d->m_decoder->toUnicode(reverseUtf16, 2);
+                }
+                return true;
+            }
+        }
+    }
+
+    //exit from routine in case it was called to only detect byte order for utf-16
+    if (d->m_source==UserChosenEncoding)
+    {
+#ifdef DECODE_DEBUG
+        kWarning() << "EncodingDetector: UserChosenEncoding exit ";
+#endif
+
+        if (errorsIfUtf8(data, len))
+            setEncoding("",DefaultEncoding);
+        return true;
+    }
+#if 0  //This is for plaintext, so don't try to parse HTML headers -- ahartmetz
+    if (!d->m_seenBody)
+    {
+        // we still don't have an encoding, and are in the head
+        // the following tags are allowed in <head>:
+        // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
+        const char *ptr = data;
+        const char *pEnd = data+len;
+
+        while(ptr != pEnd)
+        {
+            if(*ptr!='<')
+            {
+                ++ptr;
+                continue;
+            }
+            ++ptr;
+            // Handle comments.
+            if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
+            {
+                ptr += 3;
+                skipComment(ptr, pEnd);
+                continue;
+            }
+
+            // Handle XML header, which can have encoding in it.
+            if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
+            {
+                const char *end = ptr;
+                while (*end != '>' && end < pEnd)
+                    end++;
+                if (*end == '\0' || end == pEnd)
+                    break;
+                QCString str(ptr, end - ptr + 1);
+                int length;
+                int pos = findXMLEncoding(str, length);
+                // also handles the case when specified encoding aint correct
+                if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
+                {
+                    return true;
+                }
+            }
+
+            //look for <meta>, stop if we reach <body>
+            while (
+                        !((*ptr >= 'a') && (*ptr <= 'z') ||
+                        (*ptr >= 'A') && (*ptr <= 'Z'))
+                        && ptr < pEnd
+                )
+                ++ptr;
+
+            char tmp[5];
+            int length=0;
+            const char* max=ptr+4;
+            if (pEnd<max)
+                max=pEnd;
+            while (
+                        ((*ptr >= 'a') && (*ptr <= 'z') ||
+                        (*ptr >= 'A') && (*ptr <= 'Z') ||
+                        (*ptr >= '0') && (*ptr <= '9'))
+                        && ptr < max
+                )
+            {
+                tmp[length] = tolower( *ptr );
+                ++ptr;
+                ++length;
+            }
+            tmp[length] = 0;
+            if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
+            {
+                // found a meta tag...
+                const char* end = ptr;
+                while(*end != '>' && *end != '\0' && end<pEnd)
+                    end++;
+                //if ( *end == '\0' ) break;
+                QCString str( ptr, (end-ptr)+1);
+                str = str.lower();
+                int pos=0;
+                        //if( (pos = str.find("http-equiv", pos)) == -1) break;
+                        //if( (pos = str.find("content-type", pos)) == -1) break;
+                if( (pos = str.find("charset")) == -1)
+                    continue;
+                pos+=6;
+                // skip to '='
+                if( (pos = str.find('=', pos)) == -1)
+                    continue;
+
+                // skip whitespace before encoding itself
+                while (pos < (int)str.length() && str[pos] <= ' ')
+                    ++pos;
+                if ( pos == (int)str.length())
+                    continue;
+
+                int endpos = pos;
+                while( endpos < str.length() &&
+                        (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
+                                    && str[endpos] != ';' && str[endpos] != '>') )
+                    ++endpos;
+    #ifdef DECODE_DEBUG
+                kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
+    #endif
+                if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
+                    return true;
+            }
+            else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
+            {
+                d->m_seenBody=true;
+                break;
+            }
+        }
+    }
+
+    if (d->m_source==EncodingFromHTTPHeader)
+        return true;
+#endif
+    //if (len<20)     //make a guess even if the file is short -- ahartmetz
+    if (len < 1)
+    {
+        setEncoding("",DefaultEncoding);
+        return false;
+    }
+#ifdef DECODE_DEBUG
+    // Report the given length; the buffer is not required to be NUL-terminated,
+    // so strlen() must not be used on it.
+    kDebug( 6005 ) << "EncodingDetector: using heuristics (" << len << ")";
+#endif
+
+    const unsigned char *udata = (const unsigned char*) data;
+    switch ( d->m_autoDetectLanguage )
+    {
+        case EncodingDetector::Arabic:
+            return setEncoding(automaticDetectionForArabic( udata, len ), AutoDetectedEncoding);
+        case EncodingDetector::Baltic:
+            return setEncoding(automaticDetectionForBaltic( udata, len ), AutoDetectedEncoding);
+        case EncodingDetector::CentralEuropean:
+            return setEncoding(automaticDetectionForCentralEuropean( udata, len ), AutoDetectedEncoding);
+        case EncodingDetector::Cyrillic:
+            return setEncoding(automaticDetectionForCyrillic( udata, len ), AutoDetectedEncoding);
+        case EncodingDetector::Greek:
+            return setEncoding(automaticDetectionForGreek( udata, len ), AutoDetectedEncoding);
+        case EncodingDetector::Hebrew:
+            return setEncoding(automaticDetectionForHebrew( udata, len ), AutoDetectedEncoding);
+        case EncodingDetector::Japanese:
+            return setEncoding(automaticDetectionForJapanese( udata, len ), AutoDetectedEncoding);
+        case EncodingDetector::Turkish:
+            return setEncoding(automaticDetectionForTurkish( udata, len ), AutoDetectedEncoding);
+        case EncodingDetector::WesternEuropean:
+            if (setEncoding(automaticDetectionForWesternEuropean( udata, len ), AutoDetectedEncoding))
+                return true;
+            if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml
+                return setEncoding("iso-8859-15",AutoDetectedEncoding);
+            //use default provided by eg katepart
+            return setEncoding("",DefaultEncoding);
+        case EncodingDetector::SemiautomaticDetection:
+        case EncodingDetector::ChineseSimplified:
+        case EncodingDetector::ChineseTraditional:
+        case EncodingDetector::Korean:
+        case EncodingDetector::Thai:
+        case EncodingDetector::Unicode:
+        case EncodingDetector::NorthernSaami:
+        case EncodingDetector::SouthEasternEurope:
+        case EncodingDetector::None:
+            // huh. somethings broken in this code ### FIXME
+            //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
+            break;
+    }
+
+    // No heuristic exists for the configured script: use the default encoding.
+    setEncoding("",DefaultEncoding);
+    return true;
+}
+
+
+// Maps a translated script name (as produced by nameForScript()) back to its
+// AutoDetectScript value. An empty or unrecognised name yields None.
+// The i18n() calls stay literal so that message extraction keeps working.
+EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const QString& lang)
+{
+    if (lang.isEmpty()) {
+        return EncodingDetector::None;
+    }
+
+    if (i18n("@item Text character set", "Unicode") == lang) {
+        return EncodingDetector::Unicode;
+    }
+    if (i18n("@item Text character set", "Cyrillic") == lang) {
+        return EncodingDetector::Cyrillic;
+    }
+    if (i18n("@item Text character set", "Western European") == lang) {
+        return EncodingDetector::WesternEuropean;
+    }
+    if (i18n("@item Text character set", "Central European") == lang) {
+        return EncodingDetector::CentralEuropean;
+    }
+    if (i18n("@item Text character set", "Greek") == lang) {
+        return EncodingDetector::Greek;
+    }
+    if (i18n("@item Text character set", "Hebrew") == lang) {
+        return EncodingDetector::Hebrew;
+    }
+    if (i18n("@item Text character set", "Turkish") == lang) {
+        return EncodingDetector::Turkish;
+    }
+    if (i18n("@item Text character set", "Japanese") == lang) {
+        return EncodingDetector::Japanese;
+    }
+    if (i18n("@item Text character set", "Baltic") == lang) {
+        return EncodingDetector::Baltic;
+    }
+    if (i18n("@item Text character set", "Arabic") == lang) {
+        return EncodingDetector::Arabic;
+    }
+
+    return EncodingDetector::None;
+}
+
+// Reports whether the given script is advertised as supporting automatic
+// detection. Every script not listed below yields false.
+// NOTE(review): ChineseTraditional, ChineseSimplified and Unicode are listed
+// here although analyze() has no dedicated heuristic branch for them -
+// confirm whether that is intended.
+bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
+{
+    switch (script)
+    {
+        case EncodingDetector::Arabic:
+        case EncodingDetector::Baltic:
+        case EncodingDetector::CentralEuropean:
+        case EncodingDetector::Cyrillic:
+        case EncodingDetector::Greek:
+        case EncodingDetector::Hebrew:
+        case EncodingDetector::Japanese:
+        case EncodingDetector::Turkish:
+        case EncodingDetector::WesternEuropean:
+        case EncodingDetector::ChineseTraditional:
+        case EncodingDetector::ChineseSimplified:
+        case EncodingDetector::Unicode:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Returns the translated, user-visible name of a script. Scripts without a
+// name here (everything handled by the default label) yield a null QString.
+QString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
+{
+    switch (script)
+    {
+        case EncodingDetector::Arabic:
+            return i18n("@item Text character set", "Arabic");
+
+        case EncodingDetector::Baltic:
+            return i18n("@item Text character set", "Baltic");
+
+        case EncodingDetector::CentralEuropean:
+            return i18n("@item Text character set", "Central European");
+
+        case EncodingDetector::Cyrillic:
+            return i18n("@item Text character set", "Cyrillic");
+
+        case EncodingDetector::Greek:
+            return i18n("@item Text character set", "Greek");
+
+        case EncodingDetector::Hebrew:
+            return i18n("@item Text character set", "Hebrew");
+
+        case EncodingDetector::Japanese:
+            return i18n("@item Text character set", "Japanese");
+
+        case EncodingDetector::Turkish:
+            return i18n("@item Text character set", "Turkish");
+
+        case EncodingDetector::WesternEuropean:
+            return i18n("@item Text character set", "Western European");
+
+        case EncodingDetector::ChineseTraditional:
+            return i18n("@item Text character set", "Chinese Traditional");
+
+        case EncodingDetector::ChineseSimplified:
+            return i18n("@item Text character set", "Chinese Simplified");
+
+        case EncodingDetector::Korean:
+            return i18n("@item Text character set", "Korean");
+
+        case EncodingDetector::Thai:
+            return i18n("@item Text character set", "Thai");
+
+        case EncodingDetector::Unicode:
+            return i18n("@item Text character set", "Unicode");
+
+        //case EncodingDetector::SemiautomaticDetection:
+        default:
+            return QString();
+    }
+}
+
+// Looks up the script for a language/locale code: walks pango_script_for_lang
+// in order and returns the first script of the first entry whose lang string
+// is a prefix of lc.
+// The walk deliberately also tests the entry whose lang pointer is null: as
+// the original author notes, startsWith() accepts an empty prefix, so that
+// entry matches and its first script is returned (presumably this is the
+// table's terminating sentinel - the table is not visible here, verify).
+EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const QString &lc)
+{
+  // It might make sense to do something special if the locale ends with
+  // ".UTF-8" or "@utf8"
+  const char *prefix = pango_script_for_lang[0].lang;
+  int index = 0;
+  // There is obvious optimization potential...
+  while ( prefix ) {
+     prefix = pango_script_for_lang[index].lang;
+     const QString candidate = QString::fromAscii( prefix );
+     if ( lc.startsWith( candidate ) )
+       return pango_script_for_lang[index].scripts[0];
+     ++index;
+  }
+  return None;
+}
+
+#undef DECODE_DEBUG
+
author	toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>	2009-11-25 17:56:58 +0000
committer	toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>	2009-11-25 17:56:58 +0000
commit	460c52653ab0dcca6f19a4f492ed2c5e4e963ab0 (patch)
tree	67208f7c145782a7e90b123b982ca78d88cc2c87 /kmail/encodingdetector.cpp
download	tdepim-460c52653ab0dcca6f19a4f492ed2c5e4e963ab0.tar.gz tdepim-460c52653ab0dcca6f19a4f492ed2c5e4e963ab0.zip