/**
 * @file unicode.cpp
 * Detects, reads and writes characters in the proper format.
 *
 * @author  Ben Gardner
 * @license GPL v2+
 */

#include "unicode.h"


using namespace std;


// NOTE(review): the container element types used throughout this file
// (vector<unsigned char>, deque<int>, static_cast<unsigned char>) were not
// present in the text under review and have been reconstructed from usage:
// input bytes are iterated as `unsigned char` and compared against values
// such as 0xef, and decoded characters are handled as `int`.
// Confirm that they match the declarations in unicode.h.


/**
 * Count the bytes that have the high bit set and the bytes that are zero.
 *
 * @param data           bytes to inspect
 * @param non_ascii_cnt  receives the number of bytes with bit 7 set
 * @param zero_cnt       receives the number of bytes equal to 0
 *
 * @return true if both counts are zero (every byte is in the range 1-127)
 */
static bool is_ascii(const vector<unsigned char> &data, size_t &non_ascii_cnt, size_t &zero_cnt);


/**
 * Convert the array of bytes into an array of ints, one int per byte.
 *
 * @return always true
 */
static bool decode_bytes(const vector<unsigned char> &in_data, deque<int> &out_data);


/**
 * Decode UTF-8 sequences from in_data and put the chars in out_data.
 * If there are any decoding errors, then return false.
 */
static bool decode_utf8(const vector<unsigned char> &in_data, deque<int> &out_data);


/**
 * Extract 2 bytes from the stream and increment idx by 2
 *
 * @param in_data  byte vector with input data
 * @param idx      index points to working position in vector
 * @param be       true: first byte is the high byte (big endian),
 *                 false: first byte is the low byte (little endian)
 *
 * @return the 16-bit value, or -1 if fewer than 2 bytes remain at idx
 */
static int get_word(const vector<unsigned char> &in_data, size_t &idx, bool be);


/**
 * Decode a UTF-16 sequence.
 * Sets enc based on the BOM.
 * If the first two bytes are not a BOM, the byte order is guessed from the
 * position of zero bytes in the first three words; if no guess can be made,
 * enc is set to char_encoding_e::e_ASCII and false is returned.
 */
static bool decode_utf16(const vector<unsigned char> &in_data, deque<int> &out_data, char_encoding_e &enc);


/**
 * Looks for the BOM of UTF-16 BE/LE and UTF-8.
 * If found, set enc and return true.
 * Sets enc to char_encoding_e::e_ASCII and returns false if not found.
 */
static bool decode_bom(const vector<unsigned char> &in_data, char_encoding_e &enc);


//! Write for ASCII and BYTE encoding
static void write_byte(int ch);


//! Writes a single character to a file using UTF-8 encoding
static void write_utf8(int ch);


//! Writes a single character using UTF-16 encoding; be selects big endian
static void write_utf16(int ch, bool be);


static bool is_ascii(const vector<unsigned char> &data, size_t &non_ascii_cnt, size_t &zero_cnt)
{
   non_ascii_cnt = 0;
   zero_cnt      = 0;

   for (unsigned char value : data)
   {
      if (value & 0x80)
      {
         non_ascii_cnt++;
      }

      if (!value)
      {
         zero_cnt++;
      }
   }

   return((non_ascii_cnt + zero_cnt) == 0);
}


static bool decode_bytes(const vector<unsigned char> &in_data, deque<int> &out_data)
{
   // each byte is widened to one int; any previous content is replaced
   out_data.assign(in_data.begin(), in_data.end());
   return(true);
}


/**
 * Append the UTF-8 style encoding of ch to res.
 * Negative values store nothing. Values up to 0x7fffffff are encoded using
 * 1 to 6 bytes (the 5- and 6-byte forms are used for values >= 0x200000).
 *
 * @param ch   code to encode
 * @param res  vector that receives the encoded bytes (appended)
 */
void encode_utf8(int ch, vector<unsigned char> &res)
{
   if (ch < 0)
   {
      // illegal code - do not store
   }
   else if (ch < 0x80)
   {
      // 0xxxxxxx
      res.push_back(ch);
   }
   else if (ch < 0x0800)
   {
      // 110xxxxx 10xxxxxx
      res.push_back(0xC0 | (ch >> 6));
      res.push_back(0x80 | (ch & 0x3f));
   }
   else if (ch < 0x10000)
   {
      // 1110xxxx 10xxxxxx 10xxxxxx
      res.push_back(0xE0 | (ch >> 12));
      res.push_back(0x80 | ((ch >> 6) & 0x3f));
      res.push_back(0x80 | (ch & 0x3f));
   }
   else if (ch < 0x200000)
   {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      res.push_back(0xF0 | (ch >> 18));
      res.push_back(0x80 | ((ch >> 12) & 0x3f));
      res.push_back(0x80 | ((ch >> 6) & 0x3f));
      res.push_back(0x80 | (ch & 0x3f));
   }
   else if (ch < 0x4000000)
   {
      // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      res.push_back(0xF8 | (ch >> 24));
      res.push_back(0x80 | ((ch >> 18) & 0x3f));
      res.push_back(0x80 | ((ch >> 12) & 0x3f));
      res.push_back(0x80 | ((ch >> 6) & 0x3f));
      res.push_back(0x80 | (ch & 0x3f));
   }
   else // (ch <= 0x7fffffff)
   {
      // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      res.push_back(0xFC | (ch >> 30));
      res.push_back(0x80 | ((ch >> 24) & 0x3f));
      res.push_back(0x80 | ((ch >> 18) & 0x3f));
      res.push_back(0x80 | ((ch >> 12) & 0x3f));
      res.push_back(0x80 | ((ch >> 6) & 0x3f));
      res.push_back(0x80 | (ch & 0x3f));
   }
} // encode_utf8


static bool decode_utf8(const vector<unsigned char> &in_data, deque<int> &out_data)
{
   size_t idx = 0;
   int    cnt;

   out_data.clear();

   // check for UTF-8 BOM silliness and skip
   if (in_data.size() >= 3)
   {
      if (  (in_data[0] == 0xef)
         && (in_data[1] == 0xbb)
         && (in_data[2] == 0xbf))
      {
         idx = 3;  // skip it
      }
   }

   while (idx < in_data.size())
   {
      int ch = in_data[idx++];

      // The lead byte selects the number of continuation bytes (cnt) and
      // contributes its low bits to the value.
      if (ch < 0x80)                   // 1-byte sequence
      {
         out_data.push_back(ch);
         continue;
      }
      else if ((ch & 0xE0) == 0xC0)    // 2-byte sequence
      {
         ch &= 0x1F;
         cnt = 1;
      }
      else if ((ch & 0xF0) == 0xE0)    // 3-byte sequence
      {
         ch &= 0x0F;
         cnt = 2;
      }
      else if ((ch & 0xF8) == 0xF0)    // 4-byte sequence
      {
         ch &= 0x07;
         cnt = 3;
      }
      else if ((ch & 0xFC) == 0xF8)    // 5-byte sequence
      {
         ch &= 0x03;
         cnt = 4;
      }
      else if ((ch & 0xFE) == 0xFC)    // 6-byte sequence
      {
         ch &= 0x01;
         cnt = 5;
      }
      else
      {
         // invalid UTF-8 sequence
         return(false);
      }

      // Consume the continuation bytes. cnt ends at -1 only if all of them
      // were read; it stays >= 0 if the input ran out first.
      while (  cnt-- > 0
            && idx < in_data.size())
      {
         int tmp = in_data[idx++];

         if ((tmp & 0xC0) != 0x80)
         {
            // invalid UTF-8 sequence
            return(false);
         }
         ch = (ch << 6) | (tmp & 0x3f);
      }

      if (cnt >= 0)
      {
         // short UTF-8 sequence
         return(false);
      }
      out_data.push_back(ch);
   }
   return(true);
} // decode_utf8


static int get_word(const vector<unsigned char> &in_data, size_t &idx, bool be)
{
   int ch;

   if ((idx + 2) > in_data.size())
   {
      // not enough data left for a full word
      ch = -1;
   }
   else if (be)
   {
      ch = (in_data[idx] << 8) | in_data[idx + 1];
   }
   else
   {
      ch = in_data[idx] | (in_data[idx + 1] << 8);
   }
   // idx is advanced even when no word could be read
   idx += 2;
   return(ch);
}


static bool decode_utf16(const vector<unsigned char> &in_data, deque<int> &out_data, char_encoding_e &enc)
{
   out_data.clear();

   if (in_data.size() & 1)
   {
      // can't have an odd length
      return(false);
   }

   if (in_data.size() < 2)
   {
      // we require the BOM or at least 1 char
      return(false);
   }
   // start after the BOM; reset to 0 below if there is no BOM
   size_t idx = 2;

   if (  (in_data[0] == 0xfe)
      && (in_data[1] == 0xff))
   {
      enc = char_encoding_e::e_UTF16_BE;
   }
   else if (  (in_data[0] == 0xff)
           && (in_data[1] == 0xfe))
   {
      enc = char_encoding_e::e_UTF16_LE;
   }
   else
   {
      /*
       * If we have a few words, we can take a guess, assuming the first few
       * chars are ASCII
       */
      enc = char_encoding_e::e_ASCII;
      idx = 0;

      if (in_data.size() >= 6)
      {
         if (  (in_data[0] == 0)
            && (in_data[2] == 0)
            && (in_data[4] == 0))
         {
            enc = char_encoding_e::e_UTF16_BE;
         }
         else if (  (in_data[1] == 0)
                 && (in_data[3] == 0)
                 && (in_data[5] == 0))
         {
            enc = char_encoding_e::e_UTF16_LE;
         }
      }

      if (enc == char_encoding_e::e_ASCII)
      {
         return(false);
      }
   }
   bool be = (enc == char_encoding_e::e_UTF16_BE);

   while (idx < in_data.size())
   {
      int ch = get_word(in_data, idx, be);

      if ((ch & 0xfc00) == 0xd800)
      {
         // high surrogate: must be followed by a low surrogate (0xDC00-0xDFFF)
         ch  &= 0x3ff;
         ch <<= 10;
         int tmp = get_word(in_data, idx, be);

         if ((tmp & 0xfc00) != 0xdc00)
         {
            return(false);
         }
         ch |= (tmp & 0x3ff);
         ch += 0x10000;
         out_data.push_back(ch);
      }
      else if (  (  ch >= 0
                 && ch < 0xD800)
              || ch >= 0xE000)
      {
         out_data.push_back(ch);
      }
      else
      {
         // invalid character
         return(false);
      }
   }
   return(true);
} // decode_utf16


static bool decode_bom(const vector<unsigned char> &in_data, char_encoding_e &enc)
{
   enc = char_encoding_e::e_ASCII;

   if (in_data.size() >= 2)
   {
      if (  (in_data[0] == 0xfe)
         && (in_data[1] == 0xff))
      {
         enc = char_encoding_e::e_UTF16_BE;
         return(true);
      }

      if (  (in_data[0] == 0xff)
         && (in_data[1] == 0xfe))
      {
         enc = char_encoding_e::e_UTF16_LE;
         return(true);
      }

      if (  (in_data.size() >= 3)
         && (in_data[0] == 0xef)
         && (in_data[1] == 0xbb)
         && (in_data[2] == 0xbf))
      {
         enc = char_encoding_e::e_UTF8;
         return(true);
      }
   }
   return(false);
}


/**
 * Determine the encoding of in_data and decode it into out_data.
 *
 * If a BOM is present, the data is decoded according to the BOM and the
 * result of that decoder is returned. Otherwise the data is tried, in order,
 * as ASCII, UTF-16 (only if the share of zero bytes suggests it), UTF-8 and
 * finally as raw bytes.
 *
 * @param in_data   bytes to decode
 * @param out_data  receives the decoded characters
 * @param enc       receives the detected encoding
 * @param has_bom   receives true if a BOM was found, false otherwise
 *
 * @return false only if a BOM was found and decoding per that BOM failed
 */
bool decode_unicode(const vector<unsigned char> &in_data, deque<int> &out_data, char_encoding_e &enc, bool &has_bom)
{
   // check for a BOM
   if (decode_bom(in_data, enc))
   {
      has_bom = true;

      if (enc == char_encoding_e::e_UTF8)
      {
         return(decode_utf8(in_data, out_data));
      }
      return(decode_utf16(in_data, out_data, enc));
   }
   has_bom = false;

   // Check for simple ASCII
   size_t non_ascii_cnt;
   size_t zero_cnt;

   if (is_ascii(in_data, non_ascii_cnt, zero_cnt))
   {
      enc = char_encoding_e::e_ASCII;
      return(decode_bytes(in_data, out_data));
   }

   // There are a lot of 0's in UTF-16 (~50%)
   if (  (zero_cnt > (in_data.size() / 4))
      && (zero_cnt <= (in_data.size() / 2)))
   {
      // likely is UTF-16
      if (decode_utf16(in_data, out_data, enc))
      {
         return(true);
      }
   }

   if (decode_utf8(in_data, out_data))
   {
      enc = char_encoding_e::e_UTF8;
      return(true);
   }
   // it is an unrecognized byte sequence
   enc = char_encoding_e::e_BYTE;
   return(decode_bytes(in_data, out_data));
} // decode_unicode


static void write_byte(int ch)
{
   // only values that fit in one byte (0-255) are written
   if ((ch & 0xff) == ch)
   {
      if (cpd.fout)
      {
         fputc(ch, cpd.fout);
      }

      if (cpd.bout)
      {
         // NOTE(review): cast target type reconstructed, see note at top of file
         cpd.bout->push_back(static_cast<unsigned char>(ch));
      }
   }
   else
   {
      // illegal code - do not store
   }
}


static void write_utf8(int ch)
{
   vector<unsigned char> vv;

   // encode_utf8() produces at most 6 bytes
   vv.reserve(6);

   encode_utf8(ch, vv);

   for (unsigned char char_val : vv)
   {
      write_byte(char_val);
   }
}


static void write_utf16(int ch, bool be)
{
   // U+0000 to U+D7FF and U+E000 to U+FFFF
   if (  (  ch >= 0
         && ch < 0xD800)
      || (  ch >= 0xE000
         && ch < 0x10000))
   {
      if (be)
      {
         write_byte(ch >> 8);
         write_byte(ch & 0xff);
      }
      else
      {
         write_byte(ch & 0xff);
         write_byte(ch >> 8);
      }
   }
   else if (  ch >= 0x10000
           && ch < 0x110000)
   {
      // U+10000 to U+10FFFF: split into a surrogate pair w1 (high), w2 (low)
      int v1 = ch - 0x10000;
      int w1 = 0xD800 + (v1 >> 10);
      int w2 = 0xDC00 + (v1 & 0x3ff);

      if (be)
      {
         write_byte(w1 >> 8);
         write_byte(w1 & 0xff);
         write_byte(w2 >> 8);
         write_byte(w2 & 0xff);
      }
      else
      {
         write_byte(w1 & 0xff);
         write_byte(w1 >> 8);
         write_byte(w2 & 0xff);
         write_byte(w2 >> 8);
      }
   }
   else
   {
      // illegal code - do not store
   }
} // write_utf16


/**
 * Write the byte order mark that corresponds to cpd.enc.
 * Nothing is written for encodings other than UTF-8 and UTF-16 LE/BE.
 */
void write_bom(void)
{
   switch (cpd.enc)
   {
   case char_encoding_e::e_UTF8:
      write_byte(0xef);
      write_byte(0xbb);
      write_byte(0xbf);
      break;

   case char_encoding_e::e_UTF16_LE:
      write_utf16(0xfeff, false);
      break;

   case char_encoding_e::e_UTF16_BE:
      write_utf16(0xfeff, true);
      break;

   default:
      // char_encoding_e::e_ASCII
      // char_encoding_e::e_BYTE
      // do nothing
      // Coveralls will complain
      break;
   }
}


/**
 * Write a single character using the encoding selected by cpd.enc.
 * Negative values are ignored.
 *
 * @param ch  character to write
 */
void write_char(int ch)
{
   if (ch >= 0)
   {
      switch (cpd.enc)
      {
      case char_encoding_e::e_BYTE:
         // only the low 8 bits are written
         write_byte(ch & 0xff);
         break;

      case char_encoding_e::e_ASCII:
      default:
         // write_byte() drops values that do not fit in one byte
         write_byte(ch);
         break;

      case char_encoding_e::e_UTF8:
         write_utf8(ch);
         break;

      case char_encoding_e::e_UTF16_LE:
         write_utf16(ch, false);
         break;

      case char_encoding_e::e_UTF16_BE:
         write_utf16(ch, true);
         break;
      }
   }
}


/**
 * Write every character of text, in order, via write_char().
 *
 * @param text  text to write
 */
void write_string(const unc_text &text)
{
   for (size_t idx = 0; idx < text.size(); idx++)
   {
      write_char(text[idx]);
   }
}