From bd0f3345a938b35ce6a12f6150373b0955b8dd12 Mon Sep 17 00:00:00 2001 From: Timothy Pearson Date: Sun, 10 Jul 2011 15:24:15 -0500 Subject: Add Qt3 development HEAD version --- src/codecs/qgb18030codec.cpp | 9381 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 9381 insertions(+) create mode 100644 src/codecs/qgb18030codec.cpp (limited to 'src/codecs/qgb18030codec.cpp') diff --git a/src/codecs/qgb18030codec.cpp b/src/codecs/qgb18030codec.cpp new file mode 100644 index 0000000..fe7b1e5 --- /dev/null +++ b/src/codecs/qgb18030codec.cpp @@ -0,0 +1,9381 @@ +/**************************************************************************** +** Implementation of QGb18030Codec template/macro class +** +** Copyright (C) 1992-2008 Trolltech ASA. All rights reserved. +** +** This file is part of the tools module of the Qt GUI Toolkit. +** +** This file may be used under the terms of the GNU General +** Public License versions 2.0 or 3.0 as published by the Free +** Software Foundation and appearing in the files LICENSE.GPL2 +** and LICENSE.GPL3 included in the packaging of this file. +** Alternatively you may (at your option) use any later version +** of the GNU General Public License if such license has been +** publicly approved by Trolltech ASA (or its successors, if any) +** and the KDE Free Qt Foundation. +** +** Please review the following information to ensure GNU General +** Public Licensing requirements will be met: +** http://trolltech.com/products/qt/licenses/licensing/opensource/. +** If you are unsure which license is appropriate for your use, please +** review the following information: +** http://trolltech.com/products/qt/licenses/licensing/licensingoverview +** or contact the sales department at sales@trolltech.com. +** +** This file may be used under the terms of the Q Public License as +** defined by Trolltech ASA and appearing in the file LICENSE.QPL +** included in the packaging of this file. 
Licensees holding valid Qt +** Commercial licenses may use this file in accordance with the Qt +** Commercial License Agreement provided with the Software. +** +** This file is provided "AS IS" with NO WARRANTY OF ANY KIND, +** INCLUDING THE WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR +** A PARTICULAR PURPOSE. Trolltech reserves all rights not granted +** herein. +** +**********************************************************************/ + +/*! \class QGb18030Codec qgb18030codec.h + \reentrant + \ingroup i18n + + \brief The QGb18030Codec class provides conversion to and from the Chinese + GB18030/GBK/GB2312 encoding. + + \omit Last updated: September, 3, 2002 \endomit + + GBK, formally the Chinese Internal Code Specification, is a commonly + used extension of GB 2312-80. Microsoft Windows uses it under the + name codepage 936. + + GBK has been superseded by the new Chinese national standard + GB 18030-2000, which added a 4-byte encoding while remaining + compatible with GB2312 and GBK. The new GB 18030-2000 may be described + as a special encoding of Unicode 3.x and ISO-10646-1. + + Special thanks to charset gurus Markus Scherer (IBM), + Dirk Meyer (Adobe Systems) and Ken Lunde (Adobe Systems) for publishing + an excellent GB 18030-2000 summary and specification on the Internet. + Some must-read documents are: + \list + \i + \l{ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf} + \i \l{http://oss.software.ibm.com/cvs/icu/~checkout~/charset/source/gb18030/gb18030.html} + \i + \l{http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml} + \endlist + + The GBK codec was contributed to Qt by + Justin Yu \ and + Sean Chen \. They may also be reached at + Yu Mingjian \, \ + Chen Xiangyang \ + + The GB18030 codec Qt functions were contributed to Qt by + James Su \, \ + who pioneered much of GB18030 development on GNU/Linux systems. 
+ + The GB18030 codec was contributed to Qt by + Anthony Fok \, \ + using a Perl script to generate C++ tables from gb-18030-2000.xml + while merging contributions from James Su, Justin Yu and Sean Chen. + A copy of the source Perl script is available at: + + \l{http://people.debian.org/~foka/gb18030/gen-qgb18030codec.pl} + + The copyright notice for their code follows: + + \legalese + + Copyright (C) 2000 TurboLinux, Inc. Written by Justin Yu and Sean Chen. + Copyright (C) 2001, 2002 Turbolinux, Inc. Written by James Su. + Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd. Written by Anthony Fok. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + \list 1 + \i Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + \i Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + \endlist + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. 
+*/ + +#include "qgb18030codec.h" + +#if (QT_VERSION-0 >= 0x040000) +#error "move obsolete header into the src/compat directory" +#endif + +#ifndef QT_NO_BIG_CODECS + +#define InRange(c, lower, upper) (((c) >= (lower)) && ((c) <= (upper))) +#define IsLatin(c) ((c) <= 0x7F) +#define IsByteInGb2312(c) (InRange((c), 0xA1, 0xFE)) +#define Is1stByte(c) (InRange((c), 0x81, 0xFE)) +#define Is2ndByteIn2Bytes(c) (InRange((c), 0x40, 0xFE) && (c) != 0x7F) +#define Is2ndByteIn4Bytes(c) (InRange((c), 0x30, 0x39)) +#define Is2ndByte(c) (Is2ndByteIn2Bytes(c) || Is2ndByteIn4Bytes(c)) +#define Is3rdByte(c) (InRange((c), 0x81, 0xFE)) +#define Is4thByte(c) (InRange((c), 0x30, 0x39)) + +#define QValidChar(u) ((u) ? QChar((ushort)(u)) : QChar::replacement) + +/* User-defined areas: UDA 1: 0xAAA1 - 0xAFFE (564/0) + UDA 2: 0xF8A1 - 0xFEFE (658/0) + UDA 3: 0xA140 - 0xA7A0 (672/0) */ +#define IsUDA1(a, b) (InRange((a), 0xAA, 0xAF) && InRange((b), 0xA1, 0xFE)) +#define IsUDA2(a, b) (InRange((a), 0xF8, 0xFE) && InRange((b), 0xA1, 0xFE)) +#define IsUDA3(a, b) (InRange((a), 0xA1, 0xA7) && InRange((b), 0x40, 0xA0) && ((b) != 0x7F)) + +/* indexTbl_t: one index entry per 256-value page; the converters in this file select the entry with (value >> 8). The low byte of the value is compared against the window [tblBegin, tblEnd]: inside the window the converters read the lookup table at (value - tblOffset); outside it they compute (algOffset + low byte) instead of using the table. */ +typedef struct { + Q_UINT8 tblBegin; + Q_UINT8 tblEnd; + Q_UINT16 tblOffset; + Q_UINT16 algOffset; +} indexTbl_t; + +static uint qt_Gb18030ToUnicode(const uchar *gbstr, int& len); +static int qt_UnicodeToGb18030(uint unicode, uchar *gbchar); +int qt_UnicodeToGbk(uint unicode, uchar *gbchar); + +/*! \internal */ +QGb18030Codec::QGb18030Codec() +{ +} + +/*! \reimp */ +const char* QGb18030Codec::name() const +{ + //qDebug("QGb18030Codec::name() = \"GB18030\""); + return "GB18030"; +} + +/*! \reimp */ +int QGb18030Codec::mibEnum() const +{ + return 114; +} + +/*! 
\reimp */ +QCString QGb18030Codec::fromUnicode(const QString& uc, int& lenInOut) const +{ + int l = QMIN((int)uc.length(),(lenInOut<0)?(int)uc.length():lenInOut); + int rlen = l*4+1; + QCString rstr(rlen); + uchar* cursor = (uchar*)rstr.data(); + + //qDebug("QGb18030Codec::fromUnicode(const QString& uc, int& lenInOut = %d)", lenInOut); + for (int i=0; i= 0xdc00) + *cursor++ = '?'; + else { + unsigned short low = uc[i+1].unicode(); + if (low >= 0xdc00 && low <= 0xdfff) { + // valid surrogate pair + ++i; + uint u = (high-0xd800)*0x400+(low-0xdc00)+0x10000; + len = qt_UnicodeToGb18030(u, buf); + if (len >= 2) { + for (int j=0; j= 2 ) { + for (int j=0; j= 0xA1) && (buf[1] >= 0xA1) ) { + *cursor++ = buf[0]; + *cursor++ = buf[1]; + } else { + // Error + *cursor++ = '?'; // unknown char + } + } + + lenInOut = cursor - (uchar*)rstr.data(); + rstr.truncate(lenInOut); + return rstr; +} + +/*! \reimp */ +QString QGb2312Codec::toUnicode(const char* chars, int len) const +{ + QString result; + int clen; + + //qDebug("QGb2312Codec::toUnicode(const char* chars, int len = %d)", len); + for (int i=0; i= 2 ) { + uchar second = gbstr[1]; + + if ( Is2ndByteIn2Bytes(second) ) { + len = 2; + + if (IsUDA1(first, second)) + uni = 0xE000 + (first - 0xAA) * 94 + (second - 0xA1); + else if (IsUDA2(first, second)) + uni = 0xE234 + (first - 0xF8) * 94 + (second - 0xA1); + else if (IsUDA3(first, second)) + uni = 0xE4C6 + (first - 0xA1) * 96 + (second - 0x40) + - ((second >= 0x80) ? 1 : 0); + else { + // Use the mapping table + uint i; + + i = (first - 0x81) * 190 + (second - 0x40) + - ((second >= 0x80) ? 
1 : 0); + + if (InRange(first, 0xA1, 0xA7)) + i -= (first - 0xA0) * 96; + if (first > 0xA7) + i -= 672; + if (InRange(first, 0xAA, 0xAF)) + i -= (first - 0xAA) * 94; + if (first > 0xAF) + i -= 564; + if (first >= 0xF8) + i -= (first - 0xF8) * 94; + + uni = (uint)gb18030_2byte_to_ucs[i]; + } + } + else if ( Is2ndByteIn4Bytes(second) && len >= 4 ) { + uchar third = gbstr[2], + fourth = gbstr[3]; + + if ( Is3rdByte(third) && Is4thByte(fourth) ) { + // Valid 4-byte GB18030, whether defined or not + uint gb4lin; + indexTbl_t g2u; + + gb4lin = (first - 0x81) * 12600 + (second - 0x30) * 1260 + + (third - 0x81) * 10 + (fourth - 0x30); + + len = 4; + if ( gb4lin <= 0x99FB ) { + /* GB+81308130 - GB+8431A439 */ + g2u = gb18030_to_ucs_index[gb4lin >> 8]; + + if ((Q_UINT8)(gb4lin & 0xFF) >= g2u.tblBegin && + (Q_UINT8)(gb4lin & 0xFF) <= g2u.tblEnd) { + + uni = (uint)gb18030_4byte_to_ucs[gb4lin - g2u.tblOffset]; + } + else { + uni = g2u.algOffset + (gb4lin & 0xFF); + } + } else if (InRange(gb4lin, 0x2E248, 0x12E247)) { + /* GB+90308130 - GB+E3329A35 */ + uni = gb4lin - 0x1E248; + } else { + /* undefined or reserved area */ + len = 1; + uni = QChar::replacement.unicode(); + } + } + else { + len = 1; + uni = QChar::replacement.unicode(); + } + } + else { + len = 1; + uni = QChar::replacement.unicode(); + } + } + else { + len = 1; + uni = QChar::replacement.unicode(); + } + return uni; +} + + +/* qt_UnicodeToGb18030: encodes the code point uni as GB18030 into gbchar and returns the number of bytes written. Values <= 0x7F are copied unchanged (returns 1). Values up to 0xD7FF and in 0xE766..0xFFFF go through ucs_to_gb18030_index: a table entry above 0x8000 is a 2-byte code, any other table entry is a packed 4-byte code that is unpacked straight into gbchar (returns 4), and values outside the page's table window are derived from algOffset and passed through gb4lin_to_gb. 0xE000..0xE765 is mapped arithmetically onto the three 2-byte user-defined areas. 0x10000..0x10FFFF is mapped via gb4lin_to_gb(0x1E248 + uni). The surrogate range and anything above 0x10FFFF return 0 with gbchar[0] set to 0. The caller must supply room for 4 bytes, since gbchar[3] may be written. */ +int qt_UnicodeToGb18030(uint uni, uchar *gbchar) { + /* Returns the bytesize of the GB18030 character. 
*/ + uint gb, gb4lin; + indexTbl_t u2g; + + if ( IsLatin(uni) ) { + *gbchar = (uchar)uni; + return 1; + } + else if (uni <= 0xD7FF || InRange(uni, 0xE766, 0xFFFF)) { + u2g = ucs_to_gb18030_index[uni >> 8]; + + if ((Q_UINT8)(uni & 0xFF) >= u2g.tblBegin && (Q_UINT8)(uni & 0xFF) <= u2g.tblEnd) { + // Use mapping table (2-byte or 4-byte GB18030) + uint tblEntry; + + tblEntry = ucs_to_gb18030[uni - u2g.tblOffset]; + + if (tblEntry > 0x8000) { + // 2-byte GB18030 + gb = tblEntry; + } + else { + // 4-byte GB18030 stored in a special compact format + uchar a, b; + a = 0x81; + b = 0x30 + (tblEntry >> 11); + if (tblEntry >= 0x7000) { + a += 3; + b -= 14; + } else if (tblEntry >= 0x6000) { + a += 2; + b -= 6; + } else if (tblEntry >= 0x3000) { + a += 1; + b -= 6; + } else if (b >= 0x31) { + b += 5; + } + gbchar[0] = a; + gbchar[1] = b; + gbchar[2] = 0x81 + ( (tblEntry >> 4) & 0x7F ); + gbchar[3] = 0x30 + (tblEntry & 0xF); + return 4; + } + } + else { + // 4-byte GB18030 calculated algorithmically + gb4lin = u2g.algOffset + (uni & 0xFF); + // Yikes, my index table could not cover all the bases... + if (InRange(uni, 0x49B8, 0x49FF)) + gb4lin -= 11; + gb = gb4lin_to_gb(gb4lin); + } + } + else if (InRange(uni, 0xE000, 0xE765)) { + // User-defined areas in GB18030 (2-byte) + if (uni <= 0xE233) + gb = 0xAAA1 + (((uni - 0xE000) / 94) << 8) + (uni - 0xE000) % 94; + else if (uni <= 0xE4C5) + gb = 0xF8A1 + (((uni - 0xE234) / 94) << 8) + (uni - 0xE234) % 94; + else { + gb = 0xA140 + (((uni - 0xE4C6) / 96) << 8) + (uni - 0xE4C6) % 96; + // Skip the gap at 0x7F + if ((gb & 0xFF) >= 0x7F) + gb++; + } + } + else if (InRange(uni, 0x10000, 0x10FFFF)) { + // Qt 3.x does not support beyond BMP yet, but what the heck... 
+ // (U+10000 = GB+90308130) to (U+10FFFF = GB+E3329A35) + gb = gb4lin_to_gb(0x1E248 + uni); + } + else { + // Surrogate area and other undefined/reserved areas (discard) + *gbchar = 0; + return 0; + } + + if (gb <= 0xFFFF) { + gbchar[0] = (uchar)((gb >> 8) & 0xFF); + gbchar[1] = (uchar)(gb & 0xFF); + return 2; + } else { + gbchar[0] = (uchar)((gb >> 24) & 0xFF); + gbchar[1] = (uchar)((gb >> 16) & 0xFF); + gbchar[2] = (uchar)((gb >> 8) & 0xFF); + gbchar[3] = (uchar)(gb & 0xFF); + return 4; + } +} + + +/* qt_UnicodeToGbk: encodes uni as a 1- or 2-byte sequence in gbchar and returns the number of bytes written. Returns 1 for values <= 0x7F (copied unchanged). Returns 2 when the ucs_to_gb18030 table entry for uni is above 0x8000 (stored high byte first), or when uni lies in 0xE000..0xE765, which is mapped arithmetically onto the three user-defined areas. Returns 0 with gbchar[0] set to 0 in every other case: table entries in the packed 4-byte format, values outside the page's table window, the surrogate range, and anything above 0xFFFF. At most gbchar[0] and gbchar[1] are written. */ +int qt_UnicodeToGbk(uint uni, uchar *gbchar) { + /* Returns the bytesize of the GBK character. */ + /* Intended for improving performance of GB2312 and GBK functions. */ + uint gb; + indexTbl_t u2g; + + if ( IsLatin(uni) ) { + *gbchar = (uchar)uni; + return 1; + } + else if (uni <= 0xD7FF || InRange(uni, 0xE766, 0xFFFF)) { + u2g = ucs_to_gb18030_index[uni >> 8]; + + if ( (Q_UINT8)(uni & 0xFF) >= u2g.tblBegin && (Q_UINT8)(uni & 0xFF) <= u2g.tblEnd ) { + // Use mapping table (2-byte GBK or 4-byte GB18030) + uint tblEntry; + + tblEntry = ucs_to_gb18030[uni - u2g.tblOffset]; + + if (tblEntry > 0x8000) { + // GBK + gb = tblEntry; + } + else { + // 4-byte GB18030 stored in a special compact format (discard) + *gbchar = 0; + return 0; + } + } + else { + // 4-byte GB18030 calculated algorithmically (discard) + *gbchar = 0; + return 0; + } + } + else if (InRange(uni, 0xE000, 0xE765)) { + // User-defined areas in GB18030 (2-byte) + if (uni <= 0xE233) + gb = 0xAAA1 + (((uni - 0xE000) / 94) << 8) + (uni - 0xE000) % 94; + else if (uni <= 0xE4C5) + gb = 0xF8A1 + (((uni - 0xE234) / 94) << 8) + (uni - 0xE234) % 94; + else { + gb = 0xA140 + (((uni - 0xE4C6) / 96) << 8) + (uni - 0xE4C6) % 96; + // Skip the gap at 0x7F + if ((gb & 0xFF) >= 0x7F) + gb++; + } + } + else { + // Surrogate area and other undefined/reserved areas (discard) + *gbchar = 0; + return 0; + } + + gbchar[0] = (uchar)((gb >> 8) & 0xFF); + gbchar[1] = (uchar)(gb & 0xFF); + return 2; +} + +#endif + -- cgit v1.2.3