Boost.Locale
boost/locale/utf.hpp
00001 //
00002 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
00003 //
00004 //  Distributed under the Boost Software License, Version 1.0. (See
00005 //  accompanying file LICENSE_1_0.txt or copy at
00006 //  http://www.boost.org/LICENSE_1_0.txt)
00007 //
00008 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
00009 #define BOOST_LOCALE_UTF_HPP_INCLUDED
00010 
00011 #include <boost/cstdint.hpp>
00012 
00013 namespace boost {
00014 namespace locale {
00020 namespace utf {
// Branch-prediction hints used by the decoders in this header.
// With GCC-compatible compilers they expand to __builtin_expect;
// with any other compiler they simply evaluate to their argument.
#if defined(__GNUC__)
#   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
#   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
#else
#   define BOOST_LOCALE_LIKELY(x)   (x)
#   define BOOST_LOCALE_UNLIKELY(x) (x)
#endif

/// Type used to carry a single code point value.
typedef uint32_t code_point;

/// Sentinel returned by decode() when the input is rejected.
static const code_point illegal = 0xFFFFFFFFu;

/// Sentinel returned by decode() when the input range ends
/// before a complete sequence has been read.
static const code_point incomplete = 0xFFFFFFFEu;
00045 
00049     inline bool is_valid_codepoint(code_point v)
00050     {
00051         if(v>0x10FFFF)
00052             return false;
00053         if(0xD800 <=v && v<= 0xDFFF) // surragates
00054             return false;
00055         return true;
00056     }
00057 
00058     #ifdef BOOST_LOCALE_DOXYGEN
00059 
00060 
00061 
00062     template<typename CharType,int size=sizeof(CharType)>
00063     struct utf_traits {
00067         typedef CharType char_type;
00082         template<typename Iterator>
00083         static code_point decode(Iterator &p,Iterator e);
00084 
00092         static const int max_width;
00099         static int width(code_point value);
00100 
00106         static int trail_length(char_type c);
00110         static bool is_trail(char_type c);
00114         static bool is_lead(char_type c);
00115 
00126         template<typename Iterator>
00127         static Iterator encode(code_point value,Iterator out);
00133         template<typename Iterator>
00134         static code_point decode_valid(Iterator &p);
00135     };
00136     
00137     #else
00138 
// Primary template: declared but never defined. Only the partial
// specializations for code units of size 1, 2 and 4 provide a body.
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits;
00141 
00142     template<typename CharType>
00143     struct utf_traits<CharType,1> {
00144 
00145         typedef CharType char_type;
00146         
00147         static int trail_length(char_type ci) 
00148         {
00149             unsigned char c = ci;
00150             if(c < 128)
00151                 return 0;
00152             if(BOOST_LOCALE_UNLIKELY(c < 194))
00153                 return -1;
00154             if(c < 224)
00155                 return 1;
00156             if(c < 240)
00157                 return 2;
00158             if(BOOST_LOCALE_LIKELY(c <=244))
00159                 return 3;
00160             return -1;
00161         }
00162         
00163         static const int max_width = 4;
00164 
00165         static int width(code_point value)
00166         {
00167             if(value <=0x7F) {
00168                 return 1;
00169             }
00170             else if(value <=0x7FF) {
00171                 return 2;
00172             }
00173             else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
00174                 return 3;
00175             }
00176             else {
00177                 return 4;
00178             }
00179         }
00180 
00181         static bool is_trail(char_type ci)
00182         {
00183             unsigned char c=ci;
00184             return (c & 0xC0)==0x80;
00185         }
00186 
00187         static bool is_lead(char_type ci)
00188         {
00189             return !is_trail(ci);
00190         }
00191         
00192         template<typename Iterator>
00193         static code_point decode(Iterator &p,Iterator e)
00194         {
00195             if(BOOST_LOCALE_UNLIKELY(p==e))
00196                 return incomplete;
00197 
00198             unsigned char lead = *p++;
00199 
00200             // First byte is fully validated here
00201             int trail_size = trail_length(lead);
00202 
00203             if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
00204                 return illegal;
00205 
00206             //
00207             // Ok as only ASCII may be of size = 0
00208             // also optimize for ASCII text
00209             //
00210             if(trail_size == 0)
00211                 return lead;
00212             
00213             code_point c = lead & ((1<<(6-trail_size))-1);
00214 
00215             // Read the rest
00216             unsigned char tmp;
00217             switch(trail_size) {
00218             case 3:
00219                 if(BOOST_LOCALE_UNLIKELY(p==e))
00220                     return incomplete;
00221                 tmp = *p++;
00222                 if (!is_trail(tmp))
00223                     return illegal;
00224                 c = (c << 6) | ( tmp & 0x3F);
00225             case 2:
00226                 if(BOOST_LOCALE_UNLIKELY(p==e))
00227                     return incomplete;
00228                 tmp = *p++;
00229                 if (!is_trail(tmp))
00230                     return illegal;
00231                 c = (c << 6) | ( tmp & 0x3F);
00232             case 1:
00233                 if(BOOST_LOCALE_UNLIKELY(p==e))
00234                     return incomplete;
00235                 tmp = *p++;
00236                 if (!is_trail(tmp))
00237                     return illegal;
00238                 c = (c << 6) | ( tmp & 0x3F);
00239             }
00240 
00241             // Check code point validity: no surrogates and
00242             // valid range
00243             if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
00244                 return illegal;
00245 
00246             // make sure it is the most compact representation
00247             if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
00248                 return illegal;
00249 
00250             return c;
00251 
00252         }
00253         
00254         template<typename Iterator>
00255         static code_point decode_valid(Iterator &p)
00256         {
00257             unsigned char lead = *p++;
00258             if(lead < 192)
00259                 return lead;
00260 
00261             int trail_size;
00262 
00263             if(lead < 224)
00264                 trail_size = 1;
00265             else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
00266                 trail_size = 2;
00267             else
00268                 trail_size = 3;
00269             
00270             code_point c = lead & ((1<<(6-trail_size))-1);
00271 
00272             switch(trail_size) {
00273             case 3:
00274                 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
00275             case 2:
00276                 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
00277             case 1:
00278                 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
00279             }
00280 
00281             return c;
00282         }
00283 
00284 
00285 
00286         template<typename Iterator>
00287         static Iterator encode(code_point value,Iterator out)
00288         {
00289             if(value <= 0x7F) {
00290                 *out++ = static_cast<char_type>(value);
00291             }
00292             else if(value <= 0x7FF) {
00293                 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
00294                 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
00295             }
00296             else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
00297                 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
00298                 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
00299                 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
00300             }
00301             else {
00302                 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
00303                 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
00304                 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
00305                 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
00306             }
00307             return out;
00308         }
00309     }; // utf8
00310 
00311     template<typename CharType>
00312     struct utf_traits<CharType,2> {
00313         typedef CharType char_type;
00314 
00315         // See RFC 2781
00316         static bool is_first_surrogate(uint16_t x)
00317         {
00318             return 0xD800 <=x && x<= 0xDBFF;
00319         }
00320         static bool is_second_surrogate(uint16_t x)
00321         {
00322             return 0xDC00 <=x && x<= 0xDFFF;
00323         }
00324         static code_point combine_surrogate(uint16_t w1,uint16_t w2)
00325         {
00326             return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
00327         }
00328         static int trail_length(char_type c)
00329         {
00330             if(is_first_surrogate(c))
00331                 return 1;
00332             if(is_second_surrogate(c))
00333                 return -1;
00334             return 0;
00335         }
00339         static bool is_trail(char_type c)
00340         {
00341             return is_second_surrogate(c);
00342         }
00346         static bool is_lead(char_type c)
00347         {
00348             return !is_second_surrogate(c);
00349         }
00350 
00351         template<typename It>
00352         static code_point decode(It &current,It last)
00353         {
00354             if(BOOST_LOCALE_UNLIKELY(current == last))
00355                 return incomplete;
00356             uint16_t w1=*current++;
00357             if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
00358                 return w1;
00359             }
00360             if(w1 > 0xDBFF)
00361                 return illegal;
00362             if(current==last)
00363                 return incomplete;
00364             uint16_t w2=*current++;
00365             if(w2 < 0xDC00 || 0xDFFF < w2)
00366                 return illegal;
00367             return combine_surrogate(w1,w2);
00368         }
00369         template<typename It>
00370         static code_point decode_valid(It &current)
00371         {
00372             uint16_t w1=*current++;
00373             if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
00374                 return w1;
00375             }
00376             uint16_t w2=*current++;
00377             return combine_surrogate(w1,w2);
00378         }
00379 
00380         static const int max_width = 2;
00381         static int width(code_point u)
00382         {
00383             return u>=0x10000 ? 2 : 1;
00384         }
00385         template<typename It>
00386         static It encode(code_point u,It out)
00387         {
00388             if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
00389                 *out++ = static_cast<char_type>(u);
00390             }
00391             else {
00392                 u -= 0x10000;
00393                 *out++ = static_cast<char_type>(0xD800 | (u>>10));
00394                 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
00395             }
00396             return out;
00397         }
00398     }; // utf16;
00399 
00400         
00401     template<typename CharType>
00402     struct utf_traits<CharType,4> {
00403         typedef CharType char_type;
00404         static int trail_length(char_type c)
00405         {
00406             if(is_valid_codepoint(c))
00407                 return 0;
00408             return -1;
00409         }
00410         static bool is_trail(char_type /*c*/)
00411         {
00412             return false;
00413         }
00414         static bool is_lead(char_type /*c*/)
00415         {
00416             return true;
00417         }
00418 
00419         template<typename It>
00420         static code_point decode_valid(It &current)
00421         {
00422             return *current++;
00423         }
00424 
00425         template<typename It>
00426         static code_point decode(It &current,It last)
00427         {
00428             if(BOOST_LOCALE_UNLIKELY(current == last))
00429                 return boost::locale::utf::incomplete;
00430             code_point c=*current++;
00431             if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
00432                 return boost::locale::utf::illegal;
00433             return c;
00434         }
00435         static const int max_width = 1;
00436         static int width(code_point /*u*/)
00437         {
00438             return 1;
00439         }
00440         template<typename It>
00441         static It encode(code_point u,It out)
00442         {
00443             *out++ = static_cast<char_type>(u);
00444             return out;
00445         }
00446 
00447     }; // utf32
00448 
00449     #endif
00450 
00451 
00452 } // utf
00453 } // locale
00454 } // boost
00455 
00456 
00457 #endif
00458 
00459 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
00460