OpenMW
components/misc/utf8stream.hpp
Go to the documentation of this file.
00001 #ifndef MISC_UTF8ITER_HPP
00002 #define MISC_UTF8ITER_HPP
00003 
#include <stdint.h>

#include <utility>

#include <boost/tuple/tuple.hpp>
00005 
/// \brief Forward-only UTF-8 decoder over a half-open byte range [begin, end).
///
/// The stream stores only pointers into the caller's buffer; it does not copy
/// or own the bytes, so the buffer must stay valid while the stream is used.
///
/// Decoding is lazy: a character is decoded the first time it is peeked or
/// consumed and cached until the stream advances past it.
///
/// \note The decoder accepts lead bytes announcing up to five continuation
///       bytes (the historical 5- and 6-byte forms) and does not reject
///       overlong encodings or surrogate code points. It only validates the
///       lead byte, the continuation marks and the available length.
class Utf8Stream
{
public:

    /// A decoded code point (or sBadChar() on failure).
    typedef uint32_t UnicodeChar;
    /// A position inside the byte range being decoded.
    typedef unsigned char const * Point;

    //static const unicode_char sBadChar = 0xFFFFFFFF; gcc can't handle this
    /// Sentinel returned when no valid character could be decoded.
    static UnicodeChar sBadChar () { return UnicodeChar (0xFFFFFFFF); }

    /// Creates a stream over [begin, end).
    Utf8Stream (Point begin, Point end) :
        cur (begin), nxt (begin), end (end), val(Utf8Stream::sBadChar())
    {
    }

    /// Creates a stream over [range.first, range.second).
    Utf8Stream (std::pair <Point, Point> range) :
        cur (range.first), nxt (range.first), end (range.second), val(Utf8Stream::sBadChar())
    {
    }

    /// \return true once the current position has reached the end of the range.
    bool eof () const
    {
        return cur == end;
    }

    /// \return the position of the next character that has not been consumed.
    Point current () const
    {
        return cur;
    }

    /// Decodes the character at the current position without advancing.
    /// \return the code point, or sBadChar() if the bytes are malformed or
    ///         the stream is exhausted.
    UnicodeChar peek ()
    {
        // cur == nxt means nothing is cached for the current position yet.
        if (cur == nxt)
            next ();
        return val;
    }

    /// Decodes the character at the current position and advances past it.
    /// \return the code point, or sBadChar() if the bytes are malformed or
    ///         the stream is exhausted (in which case the position stays at
    ///         the end).
    UnicodeChar consume ()
    {
        if (cur == nxt)
            next ();
        cur = nxt;
        return val;
    }

    /// Decodes a single character starting at \a cur.
    ///
    /// \param cur first byte to decode
    /// \param end one past the last readable byte
    /// \return the decoded code point and the position to resume from. On
    ///         failure the first member is sBadChar(): for an empty range the
    ///         position is returned unchanged; for an invalid lead byte or a
    ///         truncated sequence it points just past the lead byte; for a
    ///         bad continuation byte it points at that byte.
    static std::pair <UnicodeChar, Point> decode (Point cur, Point end)
    {
        // Nothing left to read: fail without dereferencing cur, which would
        // otherwise be a read past the end of the buffer.
        if (cur == end)
            return std::make_pair (sBadChar(), cur);

        // Single-byte (ASCII) fast path.
        if ((*cur & 0x80) == 0)
        {
            UnicodeChar chr = *cur++;

            return std::make_pair (chr, cur);
        }

        std::pair <int, UnicodeChar> const lead = octet_count (*cur++);

        int const octets = lead.first;   // continuation bytes still expected
        UnicodeChar chr = lead.second;   // payload bits taken from the lead byte

        // octet_count reports 6 when the byte is not a valid lead byte.
        if (octets > 5)
            return std::make_pair (sBadChar(), cur);

        // Compare distances instead of forming cur + octets first: a pointer
        // beyond one-past-the-end must never be computed.
        if (end - cur < octets)
            return std::make_pair (sBadChar(), cur);

        Point const eoc = cur + octets;

        while (cur != eoc)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return std::make_pair (sBadChar(), cur);

            chr = (chr << 6) | UnicodeChar ((*cur++) & 0x3F);
        }

        return std::make_pair (chr, cur);
    }

private:

    /// Classifies a lead byte of a multi-byte sequence.
    ///
    /// \return the number of continuation bytes that must follow (1 to 5)
    ///         together with the payload bits stored in the lead byte. If
    ///         \a octet is not a valid lead byte the count is 6 and the
    ///         payload is 0.
    static std::pair <int, UnicodeChar> octet_count (unsigned char octet)
    {
        int octets;

        // Start with the 2-byte pattern 110xxxxx and widen the prefix by one
        // bit per iteration (1110xxxx, 11110xxx, ...).
        unsigned char mark = 0xC0;
        unsigned char mask = 0xE0;

        for (octets = 1; octets <= 5; ++octets)
        {
            if ((octet & mask) == mark)
                break;

            mark = static_cast <unsigned char> ((mark >> 1) | 0x80);
            mask = static_cast <unsigned char> ((mask >> 1) | 0x80);
        }

        // Bits outside the prefix mask are the payload of the lead byte.
        unsigned char const payload =
            static_cast <unsigned char> (octet & static_cast <unsigned char> (~mask));

        return std::pair <int, UnicodeChar> (octets, UnicodeChar (payload));
    }

    /// Decodes the character at nxt, caching its value and the position
    /// following it.
    void next ()
    {
        std::pair <UnicodeChar, Point> const decoded = decode (nxt, end);

        val = decoded.first;
        nxt = decoded.second;
    }

    Point cur;       // position of the next unconsumed character
    Point nxt;       // position after the cached character (== cur if none cached)
    Point end;       // one past the last readable byte
    UnicodeChar val; // cached result of the last decode
};
00115 
00116 #endif