Boost.Locale
utf.hpp
1//
2// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3//
4// Distributed under the Boost Software License, Version 1.0.
5// https://www.boost.org/LICENSE_1_0.txt
6
7#ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
8#define BOOST_LOCALE_UTF_HPP_INCLUDED
9
10#include <boost/locale/config.hpp>
11#include <cstdint>
12
13namespace boost { namespace locale {
17 namespace utf {
/// \brief The integral type that can hold a Unicode code point
using code_point = uint32_t;

/// \brief Special constant that defines illegal code point
constexpr code_point illegal = 0xFFFFFFFFu;
/// \brief Special constant that defines incomplete code point
constexpr code_point incomplete = 0xFFFFFFFEu;

/// Either a length/size or an error (illegal/incomplete)
using len_or_error = code_point;

/// \brief Checks whether \a v is a valid Unicode code point
///
/// \param v value to test
/// \return false when \a v is greater than 0x10FFFF or lies in the surrogate
///         range [0xD800, 0xDFFF], true otherwise. Both sentinel constants
///         (\c illegal and \c incomplete) exceed 0x10FFFF and are therefore
///         reported as invalid.
inline bool is_valid_codepoint(code_point v)
{
    if(v > 0x10FFFF)
        return false;
    if(0xD800 <= v && v <= 0xDFFF) // surrogates
        return false;
    return true;
}
38
39#ifdef BOOST_LOCALE_DOXYGEN
40
42 template<typename CharType, int size = sizeof(CharType)>
43 struct utf_traits {
45 typedef CharType char_type;
46
59 template<typename Iterator>
60 static code_point decode(Iterator& p, Iterator e);
61
67 static constexpr int max_width;
68
73 static int width(code_point value);
74
78 static int trail_length(char_type c);
80 static bool is_trail(char_type c);
82 static bool is_lead(char_type c);
83
92 template<typename Iterator>
93 static Iterator encode(code_point value, Iterator out);
94
98 template<typename Iterator>
99 static code_point decode_valid(Iterator& p);
100 };
101
102#else
103
/// Primary template: declared only, defined through the partial specializations
/// for code units of 1, 2 and 4 bytes.
template<typename CharType, int size = sizeof(CharType)> struct utf_traits;
106
107 template<typename CharType>
108 struct utf_traits<CharType, 1> {
109 typedef CharType char_type;
110
111 static int trail_length(char_type ci)
112 {
113 unsigned char c = ci;
114 if(c < 128)
115 return 0;
116 if(BOOST_UNLIKELY(c < 194))
117 return -1;
118 if(c < 224)
119 return 1;
120 if(c < 240)
121 return 2;
122 if(BOOST_LIKELY(c <= 244))
123 return 3;
124 return -1;
125 }
126
127 static constexpr int max_width = 4;
128
129 static int width(code_point value)
130 {
131 if(value <= 0x7F)
132 return 1;
133 else if(value <= 0x7FF)
134 return 2;
135 else if(BOOST_LIKELY(value <= 0xFFFF))
136 return 3;
137 else
138 return 4;
139 }
140
141 static bool is_trail(char_type ci)
142 {
143 unsigned char c = ci;
144 return (c & 0xC0) == 0x80;
145 }
146
147 static bool is_lead(char_type ci) { return !is_trail(ci); }
148
149 template<typename Iterator>
150 static code_point decode(Iterator& p, Iterator e)
151 {
152 if(BOOST_UNLIKELY(p == e))
153 return incomplete;
154
155 unsigned char lead = *p++;
156
157 // First byte is fully validated here
158 int trail_size = trail_length(lead);
159
160 if(BOOST_UNLIKELY(trail_size < 0))
161 return illegal;
162
163 // Ok as only ASCII may be of size = 0
164 // also optimize for ASCII text
165 if(trail_size == 0)
166 return lead;
167
168 code_point c = lead & ((1 << (6 - trail_size)) - 1);
169
170 // Read the rest
171 unsigned char tmp;
172 switch(trail_size) {
173 case 3:
174 if(BOOST_UNLIKELY(p == e))
175 return incomplete;
176 tmp = *p++;
177 if(!is_trail(tmp))
178 return illegal;
179 c = (c << 6) | (tmp & 0x3F);
180 BOOST_FALLTHROUGH;
181 case 2:
182 if(BOOST_UNLIKELY(p == e))
183 return incomplete;
184 tmp = *p++;
185 if(!is_trail(tmp))
186 return illegal;
187 c = (c << 6) | (tmp & 0x3F);
188 BOOST_FALLTHROUGH;
189 case 1:
190 if(BOOST_UNLIKELY(p == e))
191 return incomplete;
192 tmp = *p++;
193 if(!is_trail(tmp))
194 return illegal;
195 c = (c << 6) | (tmp & 0x3F);
196 }
197
198 // Check code point validity: no surrogates and
199 // valid range
200 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
201 return illegal;
202
203 // make sure it is the most compact representation
204 if(BOOST_UNLIKELY(width(c) != trail_size + 1))
205 return illegal;
206
207 return c;
208 }
209
210 template<typename Iterator>
211 static code_point decode_valid(Iterator& p)
212 {
213 unsigned char lead = *p++;
214 if(lead < 192)
215 return lead;
216
217 int trail_size;
218
219 if(lead < 224)
220 trail_size = 1;
221 else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
222 trail_size = 2;
223 else
224 trail_size = 3;
225
226 code_point c = lead & ((1 << (6 - trail_size)) - 1);
227
228 switch(trail_size) {
229 case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
230 case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
231 case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
232 }
233
234 return c;
235 }
236
237 template<typename Iterator>
238 static Iterator encode(code_point value, Iterator out)
239 {
240 if(value <= 0x7F)
241 *out++ = static_cast<char_type>(value);
242 else if(value <= 0x7FF) {
243 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
244 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
245 } else if(BOOST_LIKELY(value <= 0xFFFF)) {
246 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
247 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
248 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
249 } else {
250 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
251 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
252 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
253 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
254 }
255 return out;
256 }
257 }; // utf8
258
259 template<typename CharType>
260 struct utf_traits<CharType, 2> {
261 typedef CharType char_type;
262
263 // See RFC 2781
264 static bool is_first_surrogate(uint16_t x) { return 0xD800 <= x && x <= 0xDBFF; }
265 static bool is_second_surrogate(uint16_t x) { return 0xDC00 <= x && x <= 0xDFFF; }
266 static code_point combine_surrogate(uint16_t w1, uint16_t w2)
267 {
268 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
269 }
270 static int trail_length(char_type c)
271 {
272 if(is_first_surrogate(c))
273 return 1;
274 if(is_second_surrogate(c))
275 return -1;
276 return 0;
277 }
278
280 static bool is_trail(char_type c) { return is_second_surrogate(c); }
282 static bool is_lead(char_type c) { return !is_second_surrogate(c); }
283
284 template<typename It>
285 static code_point decode(It& current, It last)
286 {
287 if(BOOST_UNLIKELY(current == last))
288 return incomplete;
289 uint16_t w1 = *current++;
290 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
291 return w1;
292 if(w1 > 0xDBFF)
293 return illegal;
294 if(current == last)
295 return incomplete;
296 uint16_t w2 = *current++;
297 if(w2 < 0xDC00 || 0xDFFF < w2)
298 return illegal;
299 return combine_surrogate(w1, w2);
300 }
301 template<typename It>
302 static code_point decode_valid(It& current)
303 {
304 uint16_t w1 = *current++;
305 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
306 return w1;
307 uint16_t w2 = *current++;
308 return combine_surrogate(w1, w2);
309 }
310
311 static constexpr int max_width = 2;
312 static int width(code_point u) { return u >= 0x10000 ? 2 : 1; }
313 template<typename It>
314 static It encode(code_point u, It out)
315 {
316 if(BOOST_LIKELY(u <= 0xFFFF))
317 *out++ = static_cast<char_type>(u);
318 else {
319 u -= 0x10000;
320 *out++ = static_cast<char_type>(0xD800 | (u >> 10));
321 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
322 }
323 return out;
324 }
325 }; // utf16;
326
327 template<typename CharType>
328 struct utf_traits<CharType, 4> {
329 typedef CharType char_type;
330 static int trail_length(char_type c)
331 {
332 if(is_valid_codepoint(c))
333 return 0;
334 return -1;
335 }
336 static bool is_trail(char_type /*c*/) { return false; }
337 static bool is_lead(char_type /*c*/) { return true; }
338
339 template<typename It>
340 static code_point decode_valid(It& current)
341 {
342 return *current++;
343 }
344
345 template<typename It>
346 static code_point decode(It& current, It last)
347 {
348 if(BOOST_UNLIKELY(current == last))
350 code_point c = *current++;
351 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
353 return c;
354 }
355 static constexpr int max_width = 1;
356 static int width(code_point /*u*/) { return 1; }
357 template<typename It>
358 static It encode(code_point u, It out)
359 {
360 *out++ = static_cast<char_type>(u);
361 return out;
362 }
363
364 }; // utf32
365
366#endif
367
368 } // namespace utf
369}} // namespace boost::locale
370
371#endif
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition: utf.hpp:19
bool is_valid_codepoint(code_point v)
Checks whether v is a valid code point.
Definition: utf.hpp:30
code_point len_or_error
Either a length/size or an error (illegal/incomplete)
Definition: utf.hpp:27
constexpr code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:22
constexpr code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:24
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:43
static bool is_lead(char_type c)
Returns true if c is lead code unit, always true for UTF-32.
static int width(code_point value)
static int trail_length(char_type c)
static code_point decode_valid(Iterator &p)
static code_point decode(Iterator &p, Iterator e)
static constexpr int max_width
Definition: utf.hpp:67
CharType char_type
The type of the character.
Definition: utf.hpp:45
static Iterator encode(code_point value, Iterator out)
static bool is_trail(char_type c)
Returns true if c is trail code unit, always false for UTF-32.