Boost C++ Libraries

"...one of the most highly regarded and expertly designed C++ library projects in the world." — Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

This is the documentation for a snapshot of the master branch, built from commit e83975fcee.
Boost.Nowide
utf.hpp
//
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
// Copyright (c) 2020 Alexander Grund
//
// Distributed under the Boost Software License, Version 1.0.
// https://www.boost.org/LICENSE_1_0.txt
7
8#ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
9#define BOOST_NOWIDE_UTF_HPP_INCLUDED
10
12#include <cstdint>
13
14namespace boost {
15namespace nowide {
22 namespace utf {
23
/// The integral type that can hold a Unicode code point.
///
/// Spelled std::uint32_t because <cstdint> only guarantees the name in namespace std.
using code_point = std::uint32_t;

/// Special constant that denotes an illegal code point.
///
/// Returned by the decode functions when the input is not a valid sequence.
/// The value lies above 0x10FFFF, so it can never be equal to a valid code point.
constexpr code_point illegal = 0xFFFFFFFFu;

/// Special constant that denotes an incomplete code point.
///
/// Returned by the decode functions when the input range ends before the
/// sequence is complete. The value lies above 0x10FFFF, so it can never be
/// equal to a valid code point.
constexpr code_point incomplete = 0xFFFFFFFEu;
38
43 {
44 if(v > 0x10FFFF)
45 return false;
46 if(0xD800 <= v && v <= 0xDFFF) // surrogates
47 return false;
48 return true;
49 }
50
51#ifdef BOOST_NOWIDE_DOXYGEN
55 template<typename CharType, int size = sizeof(CharType)>
57 {
61 using char_type = CharType;
76 template<typename Iterator>
77 static code_point decode(Iterator& p, Iterator e);
78
86 static const int max_width;
93 static int width(code_point value);
94
100 static int trail_length(char_type c);
104 static bool is_trail(char_type c);
108 static bool is_lead(char_type c);
109
120 template<typename Iterator>
121 static Iterator encode(code_point value, Iterator out);
127 template<typename Iterator>
128 static code_point decode_valid(Iterator& p);
129 };
130
131#else
132
/// Primary template: declared only. The implementations are the partial
/// specializations selected by \a size (defaulting to sizeof(CharType)).
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits;
135
136 template<typename CharType>
137 struct utf_traits<CharType, 1>
138 {
139 using char_type = CharType;
140
141 static int trail_length(char_type ci)
142 {
143 unsigned char c = ci;
144 if(c < 128)
145 return 0;
146 if(BOOST_UNLIKELY(c < 194))
147 return -1;
148 if(c < 224)
149 return 1;
150 if(c < 240)
151 return 2;
152 if(BOOST_LIKELY(c <= 244))
153 return 3;
154 return -1;
155 }
156
157 static const int max_width = 4;
158
159 static int width(code_point value)
160 {
161 if(value <= 0x7F)
162 {
163 return 1;
164 } else if(value <= 0x7FF)
165 {
166 return 2;
167 } else if(BOOST_LIKELY(value <= 0xFFFF))
168 {
169 return 3;
170 } else
171 {
172 return 4;
173 }
174 }
175
176 static bool is_trail(char_type ci)
177 {
178 unsigned char c = ci;
179 return (c & 0xC0) == 0x80;
180 }
181
182 static bool is_lead(char_type ci)
183 {
184 return !is_trail(ci);
185 }
186
187 template<typename Iterator>
188 static code_point decode(Iterator& p, Iterator e)
189 {
190 if(BOOST_UNLIKELY(p == e))
191 return incomplete;
192
193 unsigned char lead = *p++;
194
195 // First byte is fully validated here
196 int trail_size = trail_length(lead);
197
198 if(BOOST_UNLIKELY(trail_size < 0))
199 return illegal;
200
201 // OK as only ASCII may be of size = 0
202 // also optimize for ASCII text
203 if(trail_size == 0)
204 return lead;
205
206 code_point c = lead & ((1 << (6 - trail_size)) - 1);
207
208 // Read the rest
209 unsigned char tmp;
210 switch(trail_size)
211 {
212 case 3:
213 if(BOOST_UNLIKELY(p == e))
214 return incomplete;
215 tmp = *p++;
216 if(!is_trail(tmp))
217 return illegal;
218 c = (c << 6) | (tmp & 0x3F);
219 BOOST_NOWIDE_FALLTHROUGH;
220 case 2:
221 if(BOOST_UNLIKELY(p == e))
222 return incomplete;
223 tmp = *p++;
224 if(!is_trail(tmp))
225 return illegal;
226 c = (c << 6) | (tmp & 0x3F);
227 BOOST_NOWIDE_FALLTHROUGH;
228 case 1:
229 if(BOOST_UNLIKELY(p == e))
230 return incomplete;
231 tmp = *p++;
232 if(!is_trail(tmp))
233 return illegal;
234 c = (c << 6) | (tmp & 0x3F);
235 }
236
237 // Check code point validity:
238 // - no surrogates and valid range
239 // - most compact representation
240 if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1))
241 {
242 p -= trail_size;
243 return illegal;
244 }
245
246 return c;
247 }
248
249 template<typename Iterator>
250 static code_point decode_valid(Iterator& p)
251 {
252 unsigned char lead = *p++;
253 if(lead < 192)
254 return lead;
255
256 int trail_size;
257
258 if(lead < 224)
259 trail_size = 1;
260 else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
261 trail_size = 2;
262 else
263 trail_size = 3;
264
265 code_point c = lead & ((1 << (6 - trail_size)) - 1);
266
267 switch(trail_size)
268 {
269 case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
270 case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
271 case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
272 }
273
274 return c;
275 }
276
277 template<typename Iterator>
278 static Iterator encode(code_point value, Iterator out)
279 {
280 if(value <= 0x7F)
281 {
282 *out++ = static_cast<char_type>(value);
283 } else if(value <= 0x7FF)
284 {
285 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
286 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
287 } else if(BOOST_LIKELY(value <= 0xFFFF))
288 {
289 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
290 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
291 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
292 } else
293 {
294 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
295 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
296 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
297 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
298 }
299 return out;
300 }
301 }; // utf8
302
303 template<typename CharType>
304 struct utf_traits<CharType, 2>
305 {
306 using char_type = CharType;
307
308 // See RFC 2781
309 static bool is_single_codepoint(uint16_t x)
310 {
311 // Ranges [U+0000, 0+D7FF], [U+E000, U+FFFF] are numerically equal in UTF-16
312 return x <= 0xD7FF || x >= 0xE000;
313 }
314 static bool is_first_surrogate(uint16_t x)
315 {
316 // Range [U+D800, 0+DBFF]: High surrogate
317 return 0xD800 <= x && x <= 0xDBFF;
318 }
319 static bool is_second_surrogate(uint16_t x)
320 {
321 // Range [U+DC00, 0+DFFF]: Low surrogate
322 return 0xDC00 <= x && x <= 0xDFFF;
323 }
324 static code_point combine_surrogate(uint16_t w1, uint16_t w2)
325 {
326 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
327 }
328 static int trail_length(char_type c)
329 {
330 if(is_first_surrogate(c))
331 return 1;
332 if(is_second_surrogate(c))
333 return -1;
334 return 0;
335 }
337 static bool is_trail(char_type c)
338 {
339 return is_second_surrogate(c);
340 }
342 static bool is_lead(char_type c)
343 {
344 return !is_second_surrogate(c);
345 }
346
347 template<typename It>
348 static code_point decode(It& current, It last)
349 {
350 if(BOOST_UNLIKELY(current == last))
351 return incomplete;
352 uint16_t w1 = *current++;
353 if(BOOST_LIKELY(is_single_codepoint(w1)))
354 {
355 return w1;
356 }
357 // Now it's either a high or a low surrogate, the latter is invalid
358 if(w1 >= 0xDC00)
359 return illegal;
360 if(current == last)
361 return incomplete;
362 uint16_t w2 = *current++;
363 if(!is_second_surrogate(w2))
364 return illegal;
365 return combine_surrogate(w1, w2);
366 }
367 template<typename It>
368 static code_point decode_valid(It& current)
369 {
370 uint16_t w1 = *current++;
371 if(BOOST_LIKELY(is_single_codepoint(w1)))
372 {
373 return w1;
374 }
375 uint16_t w2 = *current++;
376 return combine_surrogate(w1, w2);
377 }
378
379 static const int max_width = 2;
380 static int width(code_point u) // LCOV_EXCL_LINE
381 {
382 return u >= 0x10000 ? 2 : 1;
383 }
384 template<typename It>
385 static It encode(code_point u, It out)
386 {
387 if(BOOST_LIKELY(u <= 0xFFFF))
388 {
389 *out++ = static_cast<char_type>(u);
390 } else
391 {
392 u -= 0x10000;
393 *out++ = static_cast<char_type>(0xD800 | (u >> 10));
394 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
395 }
396 return out;
397 }
398 }; // utf16;
399
400 template<typename CharType>
401 struct utf_traits<CharType, 4>
402 {
403 using char_type = CharType;
404 static int trail_length(char_type c)
405 {
406 if(is_valid_codepoint(c))
407 return 0;
408 return -1;
409 }
410 static bool is_trail(char_type /*c*/)
411 {
412 return false;
413 }
414 static bool is_lead(char_type /*c*/)
415 {
416 return true;
417 }
418
419 template<typename It>
420 static code_point decode_valid(It& current)
421 {
422 return *current++;
423 }
424
425 template<typename It>
426 static code_point decode(It& current, It last)
427 {
428 if(BOOST_UNLIKELY(current == last))
429 return incomplete;
430 code_point c = *current++;
431 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
432 return illegal;
433 return c;
434 }
435 static const int max_width = 1;
436 static int width(code_point /*u*/)
437 {
438 return 1;
439 }
440 template<typename It>
441 static It encode(code_point u, It out)
442 {
443 *out++ = static_cast<char_type>(u);
444 return out;
445 }
446 }; // utf32
447
448#endif
449
450 } // namespace utf
451} // namespace nowide
452} // namespace boost
453
454#endif
Namespace that holds basic operations on UTF encoded sequences.
Definition: convert.hpp:19
bool is_valid_codepoint(code_point v)
Checks whether v is a valid Unicode code point.
Definition: utf.hpp:42
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition: utf.hpp:27
static const code_point incomplete
Special constant that denotes an incomplete code point.
Definition: utf.hpp:37
static const code_point illegal
Special constant that denotes an illegal code point.
Definition: utf.hpp:32
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:57
static code_point decode(Iterator &p, Iterator e)
static bool is_trail(char_type c)
static code_point decode_valid(Iterator &p)
static int trail_length(char_type c)
static Iterator encode(code_point value, Iterator out)
static const int max_width
Definition: utf.hpp:86
CharType char_type
Definition: utf.hpp:61
static bool is_lead(char_type c)
static int width(code_point value)