Boost C++ Libraries

"...one of the most highly regarded and expertly designed C++ library projects in the world." — Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

This is the documentation for a snapshot of the master branch, built from commit 8864445b72.
Boost.Nowide
utf.hpp
1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 // Copyright (c) 2020 Alexander Grund
4 //
5 // Distributed under the Boost Software License, Version 1.0.
6 // https://www.boost.org/LICENSE_1_0.txt
7 
8 #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
9 #define BOOST_NOWIDE_UTF_HPP_INCLUDED
10 
11 #include <boost/nowide/config.hpp>
12 #include <cstdint>
13 
14 namespace boost {
15 namespace nowide {
22  namespace utf {
23 
/// \brief The integral type that can hold a Unicode code point
using code_point = uint32_t;

/// \brief Special constant that defines illegal code point
///
/// Returned by the decode functions when the input is not a well-formed sequence.
/// The value lies above 0x10FFFF, so it can never be mistaken for a decoded code point.
static constexpr code_point illegal = 0xFFFFFFFFu;

/// \brief Special constant that defines incomplete code point
///
/// Returned by the decode functions when the input range ends before a sequence is complete.
/// The value lies above 0x10FFFF, so it can never be mistaken for a decoded code point.
static constexpr code_point incomplete = 0xFFFFFFFEu;
38 
43  {
44  if(v > 0x10FFFF)
45  return false;
46  if(0xD800 <= v && v <= 0xDFFF) // surrogates
47  return false;
48  return true;
49  }
50 
51 #ifdef BOOST_NOWIDE_DOXYGEN
52  template<typename CharType, int size = sizeof(CharType)>
56  struct utf_traits
57  {
61  using char_type = CharType;
76  template<typename Iterator>
77  static code_point decode(Iterator& p, Iterator e);
78 
86  static const int max_width;
93  static int width(code_point value);
94 
100  static int trail_length(char_type c);
104  static bool is_trail(char_type c);
108  static bool is_lead(char_type c);
109 
120  template<typename Iterator>
121  static Iterator encode(code_point value, Iterator out);
127  template<typename Iterator>
128  static code_point decode_valid(Iterator& p);
129  };
130 
131 #else
132 
/// Primary template: declared only. The implementation is chosen by specializing on \a size,
/// which defaults to the size in bytes of one code unit of type \a CharType.
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits;
135 
136  template<typename CharType>
137  struct utf_traits<CharType, 1>
138  {
139  using char_type = CharType;
140 
141  static int trail_length(char_type ci)
142  {
143  unsigned char c = ci;
144  if(c < 128)
145  return 0;
146  if(BOOST_UNLIKELY(c < 194))
147  return -1;
148  if(c < 224)
149  return 1;
150  if(c < 240)
151  return 2;
152  if(BOOST_LIKELY(c <= 244))
153  return 3;
154  return -1;
155  }
156 
157  static const int max_width = 4;
158 
159  static int width(code_point value)
160  {
161  if(value <= 0x7F)
162  {
163  return 1;
164  } else if(value <= 0x7FF)
165  {
166  return 2;
167  } else if(BOOST_LIKELY(value <= 0xFFFF))
168  {
169  return 3;
170  } else
171  {
172  return 4;
173  }
174  }
175 
176  static bool is_trail(char_type ci)
177  {
178  unsigned char c = ci;
179  return (c & 0xC0) == 0x80;
180  }
181 
182  static bool is_lead(char_type ci)
183  {
184  return !is_trail(ci);
185  }
186 
187  template<typename Iterator>
188  static code_point decode(Iterator& p, Iterator e)
189  {
190  if(BOOST_UNLIKELY(p == e))
191  return incomplete;
192 
193  unsigned char lead = *p++;
194 
195  // First byte is fully validated here
196  int trail_size = trail_length(lead);
197 
198  if(BOOST_UNLIKELY(trail_size < 0))
199  return illegal;
200 
201  // OK as only ASCII may be of size = 0
202  // also optimize for ASCII text
203  if(trail_size == 0)
204  return lead;
205 
206  code_point c = lead & ((1 << (6 - trail_size)) - 1);
207 
208  // Read the rest
209  unsigned char tmp;
210  switch(trail_size)
211  {
212  case 3:
213  if(BOOST_UNLIKELY(p == e))
214  return incomplete;
215  tmp = *p++;
216  if(!is_trail(tmp))
217  return illegal;
218  c = (c << 6) | (tmp & 0x3F);
219  BOOST_NOWIDE_FALLTHROUGH;
220  case 2:
221  if(BOOST_UNLIKELY(p == e))
222  return incomplete;
223  tmp = *p++;
224  if(!is_trail(tmp))
225  return illegal;
226  c = (c << 6) | (tmp & 0x3F);
227  BOOST_NOWIDE_FALLTHROUGH;
228  case 1:
229  if(BOOST_UNLIKELY(p == e))
230  return incomplete;
231  tmp = *p++;
232  if(!is_trail(tmp))
233  return illegal;
234  c = (c << 6) | (tmp & 0x3F);
235  }
236 
237  // Check code point validity:
238  // - no surrogates and valid range
239  // - most compact representation
240  if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1))
241  {
242  p -= trail_size;
243  return illegal;
244  }
245 
246  return c;
247  }
248 
249  template<typename Iterator>
250  static code_point decode_valid(Iterator& p)
251  {
252  unsigned char lead = *p++;
253  if(lead < 192)
254  return lead;
255 
256  int trail_size;
257 
258  if(lead < 224)
259  trail_size = 1;
260  else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
261  trail_size = 2;
262  else
263  trail_size = 3;
264 
265  code_point c = lead & ((1 << (6 - trail_size)) - 1);
266 
267  switch(trail_size)
268  {
269  case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
270  case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
271  case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
272  }
273 
274  return c;
275  }
276 
277  template<typename Iterator>
278  static Iterator encode(code_point value, Iterator out)
279  {
280  if(value <= 0x7F)
281  {
282  *out++ = static_cast<char_type>(value);
283  } else if(value <= 0x7FF)
284  {
285  *out++ = static_cast<char_type>((value >> 6) | 0xC0);
286  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
287  } else if(BOOST_LIKELY(value <= 0xFFFF))
288  {
289  *out++ = static_cast<char_type>((value >> 12) | 0xE0);
290  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
291  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
292  } else
293  {
294  *out++ = static_cast<char_type>((value >> 18) | 0xF0);
295  *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
296  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
297  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
298  }
299  return out;
300  }
301  }; // utf8
302 
303  template<typename CharType>
304  struct utf_traits<CharType, 2>
305  {
306  using char_type = CharType;
307 
308  // See RFC 2781
309  static bool is_single_codepoint(uint16_t x)
310  {
311  // Ranges [U+0000, 0+D7FF], [U+E000, U+FFFF] are numerically equal in UTF-16
312  return x <= 0xD7FF || x >= 0xE000;
313  }
314  static bool is_first_surrogate(uint16_t x)
315  {
316  // Range [U+D800, 0+DBFF]: High surrogate
317  return 0xD800 <= x && x <= 0xDBFF;
318  }
319  static bool is_second_surrogate(uint16_t x)
320  {
321  // Range [U+DC00, 0+DFFF]: Low surrogate
322  return 0xDC00 <= x && x <= 0xDFFF;
323  }
324  static code_point combine_surrogate(uint16_t w1, uint16_t w2)
325  {
326  return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
327  }
328  static int trail_length(char_type c)
329  {
330  if(is_first_surrogate(c))
331  return 1;
332  if(is_second_surrogate(c))
333  return -1;
334  return 0;
335  }
337  static bool is_trail(char_type c)
338  {
339  return is_second_surrogate(c);
340  }
342  static bool is_lead(char_type c)
343  {
344  return !is_second_surrogate(c);
345  }
346 
347  template<typename It>
348  static code_point decode(It& current, It last)
349  {
350  if(BOOST_UNLIKELY(current == last))
351  return incomplete;
352  uint16_t w1 = *current++;
353  if(BOOST_LIKELY(is_single_codepoint(w1)))
354  {
355  return w1;
356  }
357  // Now it's either a high or a low surrogate, the latter is invalid
358  if(w1 >= 0xDC00)
359  return illegal;
360  if(current == last)
361  return incomplete;
362  uint16_t w2 = *current++;
363  if(!is_second_surrogate(w2))
364  return illegal;
365  return combine_surrogate(w1, w2);
366  }
367  template<typename It>
368  static code_point decode_valid(It& current)
369  {
370  uint16_t w1 = *current++;
371  if(BOOST_LIKELY(is_single_codepoint(w1)))
372  {
373  return w1;
374  }
375  uint16_t w2 = *current++;
376  return combine_surrogate(w1, w2);
377  }
378 
379  static const int max_width = 2;
380  static int width(code_point u) // LCOV_EXCL_LINE
381  {
382  return u >= 0x10000 ? 2 : 1;
383  }
384  template<typename It>
385  static It encode(code_point u, It out)
386  {
387  if(BOOST_LIKELY(u <= 0xFFFF))
388  {
389  *out++ = static_cast<char_type>(u);
390  } else
391  {
392  u -= 0x10000;
393  *out++ = static_cast<char_type>(0xD800 | (u >> 10));
394  *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
395  }
396  return out;
397  }
398  }; // utf16;
399 
400  template<typename CharType>
401  struct utf_traits<CharType, 4>
402  {
403  using char_type = CharType;
404  static int trail_length(char_type c)
405  {
406  if(is_valid_codepoint(c))
407  return 0;
408  return -1;
409  }
410  static bool is_trail(char_type /*c*/)
411  {
412  return false;
413  }
414  static bool is_lead(char_type /*c*/)
415  {
416  return true;
417  }
418 
419  template<typename It>
420  static code_point decode_valid(It& current)
421  {
422  return *current++;
423  }
424 
425  template<typename It>
426  static code_point decode(It& current, It last)
427  {
428  if(BOOST_UNLIKELY(current == last))
429  return incomplete;
430  code_point c = *current++;
431  if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
432  return illegal;
433  return c;
434  }
435  static const int max_width = 1;
436  static int width(code_point /*u*/)
437  {
438  return 1;
439  }
440  template<typename It>
441  static It encode(code_point u, It out)
442  {
443  *out++ = static_cast<char_type>(u);
444  return out;
445  }
446  }; // utf32
447 
448 #endif
449 
450  } // namespace utf
451 } // namespace nowide
452 } // namespace boost
453 
454 #endif
static const int max_width
Definition: utf.hpp:86
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:56
static bool is_trail(char_type c)
Namespace that holds basic operations on UTF encoded sequences.
Definition: convert.hpp:19
static Iterator encode(code_point value, Iterator out)
static bool is_lead(char_type c)
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:32
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition: utf.hpp:27
CharType char_type
Definition: utf.hpp:61
static code_point decode_valid(Iterator &p)
static int trail_length(char_type c)
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:37
static int width(code_point value)
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:42
static code_point decode(Iterator &p, Iterator e)