#ifndef __STDUTF8_H #define __STDUTF8_H #include #define DSTRING_INCR 8 #include "../utils/types.h" #include #include #include #include namespace utf8 { #define MAXL(ll) \ ( (ll) < (u32)Utf8::STR_MAXLEN ? (ll) : (u32)Utf8::STR_MAXLEN ) const u32 UNI_REPLACEMENT_CHAR = (u32)0x0000FFFD; // Maximum valid value for a Unicode code point const u32 CODE_POINT_MAX = 0x0010ffffu; class utf8_iterator; /* UTF8 code values. Study: http://unicode.org http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 http://www.utf-8.com/ U-00000000 – U-0000007F: 0xxxxxxx (one byte, a normal ascii character 0x0 - 0x7F) U-00000080 – U-000007FF: 110xxxxx 10xxxxxx (2 bytes UTF8 character) U-00000800 – U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx (3 bytes UTF8 character) U-00010000 – U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (4 bytes UTF8 character) U-00200000 – U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (5 bytes UTF8 character) U-04000000 – U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (6 bytes UTF8 character) */ class Utf8 { friend class utf8_iterator; private: u32 alloc_length; u32 data_length; byte *data; void init0() { data = 0; data_length = alloc_length = 0; } void internal_add_char(char _c) { this->data[data_length] = _c; data_length++; } void internal_delete_data() { if (data) delete [] data; init0(); } void internal_copy_from_raw_data(const char *_rawdata, u32 _len) { if (!_rawdata) { init0(); } else { _len = MAXL(_len); this->allocate(_len); memcpy(data, _rawdata, _len); data_length = _len; } } public: // Maximum (iostream) input string length enum {STR_MAXLEN = 524288}; enum {INPUT_MAXLEN = 262144}; friend std::fstream & operator>>(std::fstream & _in, Utf8 &_str); friend std::ostream & operator<<(std::ostream & out, Utf8 &_str); // friend std::fstream & operator<<(std::fstream & out, Utf8 &_str); Utf8() { init0(); } Utf8(const Utf8 &_other) { // Convert from const Utf8 *p = const_cast(&_other); init0(); *this = *p; } Utf8(const std::string &_str) { init0(); internal_copy_from_raw_data((char*)_str.c_str(), _str.length()); } Utf8(const char *_text) { init0(); internal_copy_from_raw_data(_text, std::strlen(_text)); } Utf8(u32 _utf8_charval, u32 _count) { init0(); // copy_from_raw_data(_text, cstrlen(_text)); } // ------------------------------------------------- Utf8 &operator = (const Utf8 &_str) { Utf8 *p = const_cast(&_str); internal_delete_data(); internal_copy_from_raw_data(p->raw_data(), p->raw_length()); return *this; } Utf8 &operator = (const char *_text) { internal_delete_data(); internal_copy_from_raw_data(_text, std::strlen(_text)); return *this; } Utf8 &operator = (const std::string &_str) { *this = _str.c_str(); return *this; } ~Utf8() { if (alloc_length) delete data; } u32 append(std::string _str) { return this->append((char*)_str.c_str(), (u32)_str.length()); } u32 append(char *_text, u32 _len) { this->allocate(_len); for (u32 i=0; i< _len; i++) { this->internal_add_char(_text[i]); } return data_length; } u32 append(u32 _uchar) { return 0; } u32 length() { u32 i; u32 j; u32 n; i = 0; n = 0; while (i < data_length) { j = get_sequence_length(&data[i]); if (j > 0) i += j; else i += 1; n++; } // j += getCharLength(&data[j]); // std::cout << "j=" << j << std::endl; // while (i < data_length) //{ // j = getCharLength(&data[i]); //} return n; } u32 raw_length() { return this->data_length; } bool isBOM() { return true; } void allocate(u32 _length) { this->newSize(_length + data_length); } void newSize(u32 _length) { u32 siz; try { // Expand? if (_length > alloc_length) { byte *p; // Calculate new size siz = (((_length - 1)/ DSTRING_INCR)* DSTRING_INCR) + DSTRING_INCR; p = new byte[siz * sizeof(byte)]; // Enough memory? if (!p) throw "UTF8::allocate: Cannot allocate memory"; if (data) { memmove(p, data, data_length *sizeof(byte)); delete [] data; } data = p; // Set new slots to NULL memset(data + data_length, '\0', (siz - data_length)*sizeof(byte)); alloc_length = siz; } } catch ( char *e) { std::cerr << e << std::endl; throw; } } // --------------------------------------------------------- byte *convert_to_utf8_format(u32 _uchar_value, u8 &_len) { // Convert unicode value (_uchar_value) to UTF8 internal character representation. /* ftp://www.unicode.org/Public/PROGRAMS/CVTUTF/ http://www.ietf.org/rfc/rfc3629.txt --------------------+--------------------------------------------- 0000 0000-0000 007F | 0xxxxxxx 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ // Convert to UTF8 internal format static byte buf[10]; static u32 i, j; // 11000000, 11100000, 11110111 byte length_mark[] = {0x0, 0x0, 0xC0, 0xE0, 0xF0}; // Is ASCII(US) character 0x00 - 0x7F? Return it as is. if (_uchar_value < 128 /* 0x7F hex */) { _len = 1; buf[0] = static_cast(_uchar_value); return buf; } // Invalid numbers 0xD800 - 0xDFFF if (_uchar_value > 0xD7FF && _uchar_value < 0xE000) { _len = 0; return buf; } // 2 bytes character in range 0x80 - 0x7FF if (_uchar_value < 0x800) { _len = 2; } // 3 bytes character in range 0x800 - 0xFFFF else if (_uchar_value < 0x10000) { _len = 3; } // 4 bytes character in range 0x1000 - 0x10FFFF else { _len = 4; } // Set the code points. 10xxxxxx bytes. for (i=1; i < _len; i++) { // Shift 6 bits to right. Take the first as it is. if (i > 1) _uchar_value = _uchar_value >> 6; // Buffer index j = _len - i; // Take the lsb byte buf[j] = static_cast(_uchar_value); // Set 10xx xxxx marker bits. Clear the 2 msb bits first. buf[j] = buf[j] & 0x3F; // Set the msb bit 10xx xxxx buf[j] = buf[j] | 0x80; } // Put remaining bits into first byte. _uchar_value = _uchar_value >> 6; buf[0] = static_cast(_uchar_value); // Add the length mark into first byte buf[0] = buf[0] & (~length_mark[_len]); buf[0] = buf[0] | length_mark[_len]; buf[_len] = '\0'; return buf; /* TEST: Character value Answer in bytes: report(0x0041); // 41 E2 89 A2 CE 91 2E report(0x2262); report(0x0391); report(0x002E); report(0xD55C); // ED 95 9C EA B5 AD EC 96 B4 report(0xAD6D); report(0xC5B4); report(0x65E5); // E6 97 A5 E6 9C AC E8 AA 9E report(0x672C); report(0x8A9E); */ } // ------------------------------------------------------- u32 convert_utf8_to_value(byte *buf, u8 _len /* = 0 */) { // Convert from internal format to 32 bits number static u32 char_val; static u32 temp_val; static u8 i, j; static byte length_bits[] = {0x0, 0x0, 0x1F, 0x0F, 0x07}; // Is the length given? if (_len == 0) { // Get the length _len = get_sequence_length(buf); // An error? if (_len == 0) return 0; } /* report(0x0041); // 41 E2 89 A2 CE 91 2E report(0x2262); report(0x0391); report(0x002E); */ // One byte only, range 0x0 - 0x7F if (_len == 1) { return static_cast(buf[0]); } char_val = 0; for (i=_len-1, j=0; i>0; i--, j++) { // Nullify 2 msb bits of the byte (10xx xxxx). temp_val = static_cast(buf[i] & 0x3F); char_val = char_val | (temp_val << (j*6)); } // Add the first byte to the char_value. // Remove the length marker bits first. temp_val = static_cast(buf[0]); temp_val = temp_val & length_bits[_len]; char_val = char_val | (temp_val << ((_len - 1)*6)); return char_val; } // ------------------------------------------------------ u8 get_sequence_length(byte *_uch) { // Return the length of a UTF8 character. The length will be a number between 1 - 6. // Return 0 if the character sequence has invalid UTF8 format. // http://www.eskimo.com/~scs/cclass/int/sx4ab.html // Is it a normal ascii character, one byte long, between 0x0 ---> 0x7F. // Is the MSB bit 0? (0xxxxxxx) // Test agianst 1000 0000 if ((*_uch & 0x80) == 0) return 1; // Are the 3 MSB bits 110? A 2 bytes UTF8 character. // 110XXXXX XXXXXXXX if ((*_uch & 0xE0) == 0xC0) { // Rest of the bytes must be 10xxxxxx if ((_uch[1] & 0xC0) == 0x80) return 2; else return 0; // error } // Are the 4 MSB bits 1110? A 3 bytes UTF8 character. // 1110XXXX XXXXXXXX if ((*_uch & 0xF0) == 0xE0) { // Rest of the bytes must be 10xxxxxx if ((_uch[1] & 0xC0) == 0x80 && (_uch[2] & 0xC0) == 0x80) return 3; else return 0; // error } // Are the 5 MSB bits 11110xxx? A 4 bytes UTF8 character. // 11110XXX XXXXXXXX if ((*_uch & 0xF8) == 0xF0) { // Rest of the bytes must be 10xxxxxx if ((_uch[1] & 0xC0) == 0x80 && (_uch[2] & 0xC0) == 0x80 && (_uch[3] & 0xC0) == 0x80) return 4; else return 0; // error } // Are the 6 MSB bits 111110xx? A 5 bytes UTF8 character. // 111110XX XXXXXXXX if ((*_uch & 0xFC) == 0xF8) { // Rest of the bytes must be 10xxxxxx if ((_uch[1] & 0xC0) == 0x80 && (_uch[2] & 0xC0) == 0x80 && (_uch[3] & 0xC0) == 0x80 && (_uch[4] & 0xC0) == 0x80) return 5; else return 0; // error } // 01xxxxxx is an error ! return 0; } const char *raw_data() { return (const char*)data; } // ---------------------------------------------- u8 get_sequence_length_fast(byte _char) { if (_char < static_cast(0x80)) return 1; else if ((_char >> 5) == 0x6) return 2; else if ((_char >> 4) == 0xe) return 3; else if ((_char >> 3) == 0x1e) return 4; else return 0; } bool is_valid_code_point(byte *_cp) { return get_sequence_length(_cp) > 0; // && (*_cp != 0xfffe) && (*_cp != 0xffff); // return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff); } byte *begin() { return data; } byte *end() { byte *cp = prev_code_point(data + data_length); return cp; } byte *next_code_point(byte *cp, u32 _count = 1) { if (cp < data) return this->begin(); else if (cp >= data + data_length) return this->end(); byte *p = cp; byte *last = this->end(); while (_count && p < last) { p++; // Simple acsii character (<= 0x7F) ? if (*p < static_cast(0x80)) { _count--; } // 11xx xxxx ? Is a code point (1.st byte)? else if ((*p & 0xC0) == 0xC0) { _count--; } } return p; } byte *prev_code_point(byte *cp, u32 _count = 1) { byte *p; if (cp <= data) return this->begin(); if (cp >= data + data_length) p = data + data_length; else p = cp; while (_count && p > data) { p--; // std::cout << "p=" << std::hex << (short)*p << std::endl; // Simple acsii character (<= 0x7F) ? if (*p < static_cast(0x80)) { _count--; continue; } // 11xx xxxx ? Is a code point (1.st byte)? else if ((*p & 0xC0) == 0xC0) { _count--; continue; } else { // It is 10xx xxxx } } return p; } byte *get_next_valid_code_point(byte *_ch) { byte *p = _ch + 1; while (p < data + data_length) { // Simple acsii character (<= 0x7F) ? if (*p < static_cast(0x80)) { return p; } // 11xx xxxx ? Is a code point (1.st byte)? else if ((*p & 0xC0) == 0xC0) { return p; } p++; } return p; } }; std::fstream &operator>>(std::fstream & _in, Utf8 &_str) { char cbuf[Utf8::INPUT_MAXLEN]; _in.width(Utf8::INPUT_MAXLEN); _in >> cbuf; // _str = cbuf; return _in; } std::ostream & operator<<(std::ostream &out, Utf8 &_str) { const uint8_t bom[] = {0xef, 0xbb, 0xbf}; // out << bom[0] << bom[1] << bom[2]; for (u32 i = 0; i < _str.raw_length(); i++) { out << _str.data[i]; } return out; } u8 length_from_utf8_value(u32 _ch_value) { // Figure out how many bytes the result will require. if (_ch_value < (u32)0x80) { return 1; } else if (_ch_value < (u32)0x800) { return 2; } else if (_ch_value < (u32)0x10000) { return 3; } else if (_ch_value < (u32)0x110000) { return 4; } else { // bytesToWrite = 3; // ch = UNI_REPLACEMENT_CHAR; return 0; } } /*std::fstream & operator<<(std::fstream &out, Utf8 &_str) { for (u32 i = 0; i < _str.raw_length(); i++) { // out << (const char*)data[i]; } return out; } */ // static int utf8ToUtf16(); // static int utf16ToUtf8(); class utf8_iterator { private: byte *cp; // code point Utf8 *ustr; public: utf8_iterator(Utf8 &_str) { ustr = &_str; cp = 0; } byte *next(u32 _count = 1) { cp = ustr->next_code_point(cp, _count); return cp; } byte *prev(u32 _count = 1) { cp = ustr->prev_code_point(cp, _count); return cp; } byte *operator * () { return cp; } byte *operator ++() { return this->next(); } }; } #endif