konversation/src/unicode.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145

/*
  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
*/

/*
  The Original Code is mozilla.org code.
  See http://lxr.mozilla.org/mozilla/source/modules/rdf/src/utils.c#540

  Copyright (C) 1998 Netscape Communications Corporation
  Copyright (C) 2005 Ismail Donmez <ismail@kde.org>
*/

/* Masks that select the N most significant bits of a byte. */
#define kLeft1BitMask  0x80
#define kLeft2BitsMask 0xC0
#define kLeft3BitsMask 0xE0
#define kLeft4BitsMask 0xF0
#define kLeft5BitsMask 0xF8
#define kLeft6BitsMask 0xFC
#define kLeft7BitsMask 0xFE

/* Bit pattern carried in the top bits of a byte, by role in a sequence.
   ("Trial" in the names below is a historical misspelling of "trail",
   i.e. a continuation byte; the identifiers are kept unchanged.) */
#define k2BytesLeadByte kLeft2BitsMask   /* 110xxxxx */
#define k3BytesLeadByte kLeft3BitsMask   /* 1110xxxx */
#define k4BytesLeadByte kLeft4BitsMask   /* 11110xxx */
#define k5BytesLeadByte kLeft5BitsMask   /* 111110xx */
#define k6BytesLeadByte kLeft6BitsMask   /* 1111110x */
#define kTrialByte      kLeft1BitMask    /* 10xxxxxx */

/* Byte classification: each predicate masks off the marker bits of (c)
   and compares them with the pattern expected for that kind of byte. */
#define UTF8_1Byte(c)          ( ((c) & kLeft1BitMask)  == 0 )
#define UTF8_2Bytes(c)         ( ((c) & kLeft3BitsMask) == k2BytesLeadByte )
#define UTF8_3Bytes(c)         ( ((c) & kLeft4BitsMask) == k3BytesLeadByte )
#define UTF8_4Bytes(c)         ( ((c) & kLeft5BitsMask) == k4BytesLeadByte )
#define UTF8_5Bytes(c)         ( ((c) & kLeft6BitsMask) == k5BytesLeadByte )
#define UTF8_6Bytes(c)         ( ((c) & kLeft7BitsMask) == k6BytesLeadByte )
#define UTF8_ValidTrialByte(c) ( ((c) & kLeft2BitsMask) == kTrialByte )


bool isUtf8(const TQCString& text)
{
    int i;
    int j;
    int clen = 0;
    int len = text.length();

    JapaneseCode* jc = new JapaneseCode();

    JapaneseCode::Type result = jc->guess_jp(text, len);

    switch(result)
    {
        case JapaneseCode::SJIS:
        case JapaneseCode::JIS:
            delete jc;
            return false;
        default:
            delete jc;
            break;
    }

    for(i=0; i < len; i += clen)
    {
        if(UTF8_1Byte(text[i]))
        {
            clen = 1;
        }
        else if(UTF8_2Bytes(text[i]))
        {
            clen = 2;

            /* No enough trail bytes */
            if( (i + clen) > len)
                return false;

            /* 0000 0000 - 0000 007F : should encode in less bytes */
            if(0 ==  (text[i] & 0x1E ))
                return false;
        }
        else if(UTF8_3Bytes(text[i]))
        {
            clen = 3;

            /* No enough trail bytes */
            if( (i + clen) > len)
                return false;

            /* a single Surrogate should not show in 3 bytes UTF8, instead, the pair should be intepreted
               as one single UCS4 char and encoded UTF8 in 4 bytes */
            if((TQChar(0xED) == text[i] ) && (0xA0 == (text[i+1] & 0xA0 ) ))
                return false;

            /* 0000 0000 - 0000 07FF : should encode in less bytes */
            if((0 ==  (text[i] & 0x0F )) && (0 ==  (text[i+1] & 0x20 ) ))
                return false;
        }
        else if(UTF8_4Bytes(text[i]))
        {
            clen = 4;

            /* No enough trail bytes */
            if( (i + clen) > len)
                return false;

            /* 0000 0000 - 0000 FFFF : should encode in less bytes */
            if((0 ==  (text[i] & 0x07 )) && (0 ==  (text[i+1] & 0x30 )) )
                return false;
        }
        else if(UTF8_5Bytes(text[i]))
        {
            clen = 5;

            /* No enough trail bytes */
            if( (i + clen) > len)
                return false;

            /* 0000 0000 - 001F FFFF : should encode in less bytes */
            if((0 ==  (text[i] & 0x03 )) && (0 ==  (text[i+1] & 0x38 )) )
                return false;
        }
        else if(UTF8_6Bytes(text[i]))
        {
            clen = 6;

            /* No enough trail bytes */
            if( (i + clen) > len)
                return false;

            /* 0000 0000 - 03FF FFFF : should encode in less bytes */
            if((0 ==  (text[i] & 0x01 )) && (0 ==  (text[i+1] & 0x3E )) )
                return false;
        }
        else
        {
            return false;
        }

        for(j = 1; j<clen ;++j)
        {
            if(! UTF8_ValidTrialByte(text[i+j]))  /* Trail bytes invalid */
                return false;
        }
    }
    return true;
}