Boost.Locale
utf.hpp
1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0.
5 // https://www.boost.org/LICENSE_1_0.txt
6 
7 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
8 #define BOOST_LOCALE_UTF_HPP_INCLUDED
9 
10 #include <boost/locale/config.hpp>
11 #include <boost/cstdint.hpp>
12 
13 namespace boost { namespace locale {
17  namespace utf {
/// The integral type that can hold a Unicode code point.
typedef uint32_t code_point;

/// Special constant that defines illegal code point.
/// Its value lies above 0x10FFFF, so it can never be confused with decoded text.
constexpr code_point illegal = 0xFFFFFFFFu;
/// Special constant that defines incomplete code point.
/// Its value lies above 0x10FFFF, so it can never be confused with decoded text.
constexpr code_point incomplete = 0xFFFFFFFEu;

/// Checks if \a v is a valid code point.
///
/// \param v value to test
/// \return true when \a v is at most 0x10FFFF and is not a surrogate
///         (0xD800..0xDFFF); false otherwise. Consequently both \c illegal
///         and \c incomplete are reported as invalid.
inline bool is_valid_codepoint(code_point v)
{
    if(v > 0x10FFFF)
        return false;
    if(0xD800 <= v && v <= 0xDFFF) // surrogates
        return false;
    return true;
}
35 
36 #ifdef BOOST_LOCALE_DOXYGEN
37 
/// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
///
/// This declaration is only seen when BOOST_LOCALE_DOXYGEN is defined; the
/// definitions actually compiled are the specializations for code units of
/// size 1, 2 and 4 in the other branch of the conditional.
39  template<typename CharType, int size = sizeof(CharType)>
40  struct utf_traits {
/// The type of the character.
42  typedef CharType char_type;
43 
/// Read one code point from the range [p, e).
///
/// Returns the decoded code point, \c incomplete when the range ends before
/// the sequence is complete, or \c illegal when the sequence is invalid.
/// \a p is advanced past the code units that were read.
56  template<typename Iterator>
57  static code_point decode(Iterator& p, Iterator e);
58 
/// Maximal number of code units a single code point can occupy
/// (4, 2 and 1 in the specializations for unit sizes 1, 2 and 4).
64  static constexpr int max_width;
65 
/// Number of code units needed to encode \a value.
/// The specializations do not check that \a value is a valid code point.
70  static int width(code_point value);
71 
/// Number of trail code units expected after lead unit \a c,
/// or -1 when \a c is not a valid lead unit.
75  static int trail_length(char_type c);
/// Returns true if c is trail code unit, always false for UTF-32.
77  static bool is_trail(char_type c);
/// Returns true if c is lead code unit, always true for UTF-32.
79  static bool is_lead(char_type c);
80 
/// Write the encoding of \a value through \a out and return the advanced
/// iterator. The specializations perform no validation of \a value.
89  template<typename Iterator>
90  static Iterator encode(code_point value, Iterator out);
91 
/// Decode one code point starting at \a p without any end-of-range or
/// validity checks; the input must already be known to be well formed.
95  template<typename Iterator>
96  static code_point decode_valid(Iterator& p);
97  };
98 
99 #else
100 
/// Primary template: declared but not defined here. The second parameter
/// defaults to sizeof(CharType) and selects one of the partial
/// specializations (size 1, 2 or 4) that follow.
101  template<typename CharType, int size = sizeof(CharType)>
102  struct utf_traits;
103 
104  template<typename CharType>
105  struct utf_traits<CharType, 1> {
106  typedef CharType char_type;
107 
108  static int trail_length(char_type ci)
109  {
110  unsigned char c = ci;
111  if(c < 128)
112  return 0;
113  if(BOOST_UNLIKELY(c < 194))
114  return -1;
115  if(c < 224)
116  return 1;
117  if(c < 240)
118  return 2;
119  if(BOOST_LIKELY(c <= 244))
120  return 3;
121  return -1;
122  }
123 
124  static constexpr int max_width = 4;
125 
126  static int width(code_point value)
127  {
128  if(value <= 0x7F) {
129  return 1;
130  } else if(value <= 0x7FF) {
131  return 2;
132  } else if(BOOST_LIKELY(value <= 0xFFFF)) {
133  return 3;
134  } else {
135  return 4;
136  }
137  }
138 
139  static bool is_trail(char_type ci)
140  {
141  unsigned char c = ci;
142  return (c & 0xC0) == 0x80;
143  }
144 
145  static bool is_lead(char_type ci) { return !is_trail(ci); }
146 
147  template<typename Iterator>
148  static code_point decode(Iterator& p, Iterator e)
149  {
150  if(BOOST_UNLIKELY(p == e))
151  return incomplete;
152 
153  unsigned char lead = *p++;
154 
155  // First byte is fully validated here
156  int trail_size = trail_length(lead);
157 
158  if(BOOST_UNLIKELY(trail_size < 0))
159  return illegal;
160 
161  // Ok as only ASCII may be of size = 0
162  // also optimize for ASCII text
163  if(trail_size == 0)
164  return lead;
165 
166  code_point c = lead & ((1 << (6 - trail_size)) - 1);
167 
168  // Read the rest
169  unsigned char tmp;
170  switch(trail_size) {
171  case 3:
172  if(BOOST_UNLIKELY(p == e))
173  return incomplete;
174  tmp = *p++;
175  if(!is_trail(tmp))
176  return illegal;
177  c = (c << 6) | (tmp & 0x3F);
178  BOOST_FALLTHROUGH;
179  case 2:
180  if(BOOST_UNLIKELY(p == e))
181  return incomplete;
182  tmp = *p++;
183  if(!is_trail(tmp))
184  return illegal;
185  c = (c << 6) | (tmp & 0x3F);
186  BOOST_FALLTHROUGH;
187  case 1:
188  if(BOOST_UNLIKELY(p == e))
189  return incomplete;
190  tmp = *p++;
191  if(!is_trail(tmp))
192  return illegal;
193  c = (c << 6) | (tmp & 0x3F);
194  }
195 
196  // Check code point validity: no surrogates and
197  // valid range
198  if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
199  return illegal;
200 
201  // make sure it is the most compact representation
202  if(BOOST_UNLIKELY(width(c) != trail_size + 1))
203  return illegal;
204 
205  return c;
206  }
207 
208  template<typename Iterator>
209  static code_point decode_valid(Iterator& p)
210  {
211  unsigned char lead = *p++;
212  if(lead < 192)
213  return lead;
214 
215  int trail_size;
216 
217  if(lead < 224)
218  trail_size = 1;
219  else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
220  trail_size = 2;
221  else
222  trail_size = 3;
223 
224  code_point c = lead & ((1 << (6 - trail_size)) - 1);
225 
226  switch(trail_size) {
227  case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
228  case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
229  case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
230  }
231 
232  return c;
233  }
234 
235  template<typename Iterator>
236  static Iterator encode(code_point value, Iterator out)
237  {
238  if(value <= 0x7F) {
239  *out++ = static_cast<char_type>(value);
240  } else if(value <= 0x7FF) {
241  *out++ = static_cast<char_type>((value >> 6) | 0xC0);
242  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
243  } else if(BOOST_LIKELY(value <= 0xFFFF)) {
244  *out++ = static_cast<char_type>((value >> 12) | 0xE0);
245  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
246  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
247  } else {
248  *out++ = static_cast<char_type>((value >> 18) | 0xF0);
249  *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
250  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
251  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
252  }
253  return out;
254  }
255  }; // utf8
256 
257  template<typename CharType>
258  struct utf_traits<CharType, 2> {
259  typedef CharType char_type;
260 
261  // See RFC 2781
262  static bool is_first_surrogate(uint16_t x) { return 0xD800 <= x && x <= 0xDBFF; }
263  static bool is_second_surrogate(uint16_t x) { return 0xDC00 <= x && x <= 0xDFFF; }
264  static code_point combine_surrogate(uint16_t w1, uint16_t w2)
265  {
266  return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
267  }
268  static int trail_length(char_type c)
269  {
270  if(is_first_surrogate(c))
271  return 1;
272  if(is_second_surrogate(c))
273  return -1;
274  return 0;
275  }
276 
278  static bool is_trail(char_type c) { return is_second_surrogate(c); }
280  static bool is_lead(char_type c) { return !is_second_surrogate(c); }
281 
282  template<typename It>
283  static code_point decode(It& current, It last)
284  {
285  if(BOOST_UNLIKELY(current == last))
286  return incomplete;
287  uint16_t w1 = *current++;
288  if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
289  return w1;
290  }
291  if(w1 > 0xDBFF)
292  return illegal;
293  if(current == last)
294  return incomplete;
295  uint16_t w2 = *current++;
296  if(w2 < 0xDC00 || 0xDFFF < w2)
297  return illegal;
298  return combine_surrogate(w1, w2);
299  }
300  template<typename It>
301  static code_point decode_valid(It& current)
302  {
303  uint16_t w1 = *current++;
304  if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
305  return w1;
306  }
307  uint16_t w2 = *current++;
308  return combine_surrogate(w1, w2);
309  }
310 
311  static constexpr int max_width = 2;
312  static int width(code_point u) { return u >= 0x10000 ? 2 : 1; }
313  template<typename It>
314  static It encode(code_point u, It out)
315  {
316  if(BOOST_LIKELY(u <= 0xFFFF)) {
317  *out++ = static_cast<char_type>(u);
318  } else {
319  u -= 0x10000;
320  *out++ = static_cast<char_type>(0xD800 | (u >> 10));
321  *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
322  }
323  return out;
324  }
325  }; // utf16;
326 
327  template<typename CharType>
328  struct utf_traits<CharType, 4> {
329  typedef CharType char_type;
330  static int trail_length(char_type c)
331  {
332  if(is_valid_codepoint(c))
333  return 0;
334  return -1;
335  }
336  static bool is_trail(char_type /*c*/) { return false; }
337  static bool is_lead(char_type /*c*/) { return true; }
338 
339  template<typename It>
340  static code_point decode_valid(It& current)
341  {
342  return *current++;
343  }
344 
345  template<typename It>
346  static code_point decode(It& current, It last)
347  {
348  if(BOOST_UNLIKELY(current == last))
350  code_point c = *current++;
351  if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
353  return c;
354  }
355  static constexpr int max_width = 1;
356  static int width(code_point /*u*/) { return 1; }
357  template<typename It>
358  static It encode(code_point u, It out)
359  {
360  *out++ = static_cast<char_type>(u);
361  return out;
362  }
363 
364  }; // utf32
365 
366 #endif
367 
368  } // namespace utf
369 }} // namespace boost::locale
370 
371 #endif
static code_point decode(Iterator &p, Iterator e)
bool is_valid_codepoint(code_point v)
The function checks if v is a valid code point.
Definition: utf.hpp:27
static Iterator encode(code_point value, Iterator out)
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition: utf.hpp:19
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:40
static int width(code_point value)
CharType char_type
The type of the character.
Definition: utf.hpp:42
static int trail_length(char_type c)
static bool is_lead(char_type c)
Returns true if c is lead code unit, always true for UTF-32.
static code_point decode_valid(Iterator &p)
static bool is_trail(char_type c)
Returns true if c is trail code unit, always false for UTF-32.
constexpr code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:22
static constexpr int max_width
Definition: utf.hpp:64
constexpr code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:24