OpenMW
|
#ifndef MISC_UTF8ITER_HPP
#define MISC_UTF8ITER_HPP

#include <stdint.h> // uint32_t
#include <utility>  // std::pair, std::make_pair

// Utf8Stream itself no longer needs Boost. The include is retained (when the
// header is available) only so that translation units which relied on this
// file pulling in Boost.Tuple keep compiling.
#if defined(__has_include)
#  if __has_include(<boost/tuple/tuple.hpp>)
#    include <boost/tuple/tuple.hpp>
#  endif
#else
#  include <boost/tuple/tuple.hpp>
#endif

/// Forward-only cursor that decodes UTF-8 from a byte range [begin, end).
///
/// The stream does not own or copy the bytes it reads; the caller keeps the
/// underlying buffer alive and unmodified for as long as the stream is used.
///
/// Malformed input is reported by returning sBadChar() instead of a code
/// point. Only the structure of a sequence is checked (lead byte, number of
/// continuation bytes, continuation marks); overlong forms and out-of-range
/// values are passed through as decoded.
class Utf8Stream
{
public:

    typedef uint32_t UnicodeChar;
    typedef unsigned char const * Point;

    //static const unicode_char sBadChar = 0xFFFFFFFF; gcc can't handle this
    /// Sentinel returned in place of a code point when decoding fails.
    static UnicodeChar sBadChar () { return UnicodeChar (0xFFFFFFFF); }

    /// Creates a stream over the bytes in [begin, end).
    Utf8Stream (Point begin, Point end) :
        cur (begin), nxt (begin), end (end), val (Utf8Stream::sBadChar())
    {
    }

    /// Creates a stream over the bytes in [range.first, range.second).
    Utf8Stream (std::pair <Point, Point> range) :
        cur (range.first), nxt (range.first), end (range.second), val (Utf8Stream::sBadChar())
    {
    }

    /// Returns true once every byte of the range has been consumed.
    bool eof () const
    {
        return cur == end;
    }

    /// Returns the position of the first byte that has not been consumed yet.
    Point current () const
    {
        return cur;
    }

    /// Returns the next code point without consuming it.
    /// Yields sBadChar() for a malformed sequence or when the stream is at eof.
    UnicodeChar peek ()
    {
        // cur == nxt means nothing has been decoded for the current position.
        if (cur == nxt)
            next ();
        return val;
    }

    /// Returns the next code point and advances past it.
    /// Yields sBadChar() for a malformed sequence or when the stream is at eof;
    /// at eof the position does not move.
    UnicodeChar consume ()
    {
        if (cur == nxt)
            next ();
        cur = nxt;
        return val;
    }

    /// Decodes one code point starting at cur, never reading at or past end.
    ///
    /// Returns the decoded value (or sBadChar() on failure) together with the
    /// position at which decoding should resume. On failure that position is
    /// just past the lead byte, or at the first byte that was not a valid
    /// continuation byte. For an empty range (cur == end) the result is
    /// sBadChar() and cur unchanged.
    static std::pair <UnicodeChar, Point> decode (Point cur, Point end)
    {
        // Nothing left to read: dereferencing cur would be out of bounds.
        if (cur >= end)
            return std::make_pair (sBadChar(), cur);

        // Single-byte (ASCII) fast path.
        if ((*cur & 0x80) == 0)
        {
            UnicodeChar chr = *cur++;

            return std::make_pair (chr, cur);
        }

        const std::pair <int, UnicodeChar> lead = octet_count (*cur++);
        const int octets = lead.first;
        UnicodeChar chr = lead.second;

        // octet_count reports 6 when the byte is not a valid lead byte.
        if (octets > 5)
            return std::make_pair (sBadChar(), cur);

        // Compare distances rather than forming cur + octets, which could
        // point beyond the end of the buffer.
        if (end - cur < octets)
            return std::make_pair (sBadChar(), cur);

        Point eoc = cur + octets;

        while (cur != eoc)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return std::make_pair (sBadChar(), cur);

            chr = (chr << 6) | UnicodeChar ((*cur++) & 0x3F);
        }

        return std::make_pair (chr, cur);
    }

private:

    /// Classifies a lead byte.
    ///
    /// Returns the number of continuation bytes that must follow (1 to 5)
    /// paired with the payload bits carried by the lead byte itself. A count
    /// of 6 means no multi-byte lead pattern matched.
    static std::pair <int, UnicodeChar> octet_count (unsigned char octet)
    {
        int octets;

        unsigned char mark = 0xC0; // 110xxxxx
        unsigned char mask = 0xE0;

        for (octets = 1; octets <= 5; ++octets)
        {
            if ((octet & mask) == mark)
                break;

            // Extend both patterns by one leading 1 bit: 1110xxxx, 11110xxx, ...
            mark = (mark >> 1) | 0x80;
            mask = (mask >> 1) | 0x80;
        }

        return std::make_pair (octets, octet & ~mask);
    }

    /// Decodes the code point at nxt into val and moves nxt past it.
    void next ()
    {
        const std::pair <UnicodeChar, Point> decoded = decode (nxt, end);
        val = decoded.first;
        nxt = decoded.second;
    }

    Point cur;       // first byte not yet consumed
    Point nxt;       // resume position after the code point cached in val
    Point end;       // one past the last byte of the range
    UnicodeChar val; // code point decoded for the span [cur, nxt)
};

#endif