Boost C++ Libraries

"...one of the most highly regarded and expertly designed C++ library projects in the world." — Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

This is the documentation for a snapshot of the master branch, built from commit 2b5eb21199.
Boost.Nowide
utf8_codecvt.hpp
1//
2// Copyright (c) 2015 Artyom Beilis (Tonkikh)
3// Copyright (c) 2020 Alexander Grund
4//
5// Distributed under the Boost Software License, Version 1.0.
6// https://www.boost.org/LICENSE_1_0.txt
7
8#ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
9#define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
10
#include <boost/nowide/replacement.hpp>
#include <boost/nowide/utf/utf.hpp>
#include <cassert>
#include <cstdint>
#include <locale>
16
17namespace boost {
18namespace nowide {
19
// The UTF-16 facet below keeps one pending 16-bit UTF-16 code unit (a surrogate) inside the
// conversion state via detail::read_state/write_state, which copy exactly two bytes.
// The state object therefore has to span at least two bytes.
static_assert(sizeof(std::mbstate_t) >= 2, "mbstate_t is too small to store a UTF-16 code unit");
namespace detail {
    /// Copy exactly two bytes from @p src to @p dst, byte by byte.
    /// Hand-written so that <cstring> does not have to be included just for std::memcpy.
    inline void copy_uint16_t(void* dst, const void* src)
    {
        auto* out = static_cast<unsigned char*>(dst);
        const auto* in = static_cast<const unsigned char*>(src);
        for(int i = 0; i < 2; ++i)
            out[i] = in[i];
    }

    /// Return the 16-bit value held in the first two bytes of the conversion state @p src.
    inline std::uint16_t read_state(const std::mbstate_t& src)
    {
        std::uint16_t value;
        copy_uint16_t(&value, &src);
        return value;
    }

    /// Store the 16-bit value @p src into the first two bytes of the conversion state @p dst.
    /// The remaining bytes of @p dst are left untouched.
    inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
    {
        const void* bytes = &src;
        copy_uint16_t(&dst, bytes);
    }
} // namespace detail
41
/// Primary template of the UTF-8 codecvt facet; only declared here.
///
/// @tparam CharType Wide ("internal") character type of the facet
/// @tparam CharSize Selects the implementation; defaults to sizeof(CharType).
///                  Partial specializations for 2 and 4 are defined later in this file.
template<typename CharType, int CharSize = sizeof(CharType)>
class utf8_codecvt;
50
51 BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
53 template<typename CharType>
54 class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
55 {
56 public:
57 static_assert(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
58
59 utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
60 {}
61 BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
62
63 protected:
64 using uchar = CharType;
65
66 std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
67 {
68 if(detail::read_state(s) != 0)
69 return std::codecvt_base::error;
70 next = from;
71 return std::codecvt_base::ok;
72 }
73 int do_encoding() const noexcept override
74 {
75 return 0;
76 }
77 int do_max_length() const noexcept override
78 {
79 return 4;
80 }
81 bool do_always_noconv() const noexcept override
82 {
83 return false;
84 }
85
86 // LCOV_EXCL_START
87 int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
88 {
89 // LCOV_EXCL_STOP
90 using utf16_traits = utf::utf_traits<uchar, 2>;
91 std::uint16_t state = detail::read_state(std_state);
92 const char* save_from = from;
93 if(state && max > 0)
94 {
95 max--;
96 state = 0;
97 }
98 while(max > 0 && from < from_end)
99 {
100 const char* prev_from = from;
101 std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
102 if(ch == utf::illegal)
103 {
105 } else if(ch == utf::incomplete)
106 {
107 from = prev_from;
108 break;
109 }
110 // If we can't write the char, we have to save the low surrogate in state
111 if(BOOST_LIKELY(static_cast<size_t>(utf16_traits::width(ch)) <= max))
112 {
113 max -= utf16_traits::width(ch);
114 } else
115 {
116 static_assert(utf16_traits::max_width == 2, "Required for below");
117 std::uint16_t tmpOut[2]{};
118 utf16_traits::encode(ch, tmpOut);
119 state = tmpOut[1];
120 break;
121 }
122 }
123 detail::write_state(std_state, state);
124 return static_cast<int>(from - save_from);
125 }
126
127 std::codecvt_base::result do_in(std::mbstate_t& std_state, // LCOV_EXCL_LINE
128 const char* from,
129 const char* from_end,
130 const char*& from_next,
131 uchar* to,
132 uchar* to_end,
133 uchar*& to_next) const override
134 {
135 std::codecvt_base::result r = std::codecvt_base::ok;
136 using utf16_traits = utf::utf_traits<uchar, 2>;
137
138 // mbstate_t is POD type and should be initialized to 0 (i.e. state = stateT())
139 // according to standard.
140 // We use it to store a low surrogate if it was not yet written, else state is 0
141 std::uint16_t state = detail::read_state(std_state);
142 // Write low surrogate if present
143 if(state && to < to_end)
144 {
145 *to++ = static_cast<CharType>(state);
146 state = 0;
147 }
148 while(to < to_end && from < from_end)
149 {
150 const char* from_saved = from;
151
152 uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
153
154 if(ch == utf::illegal)
155 {
157 } else if(ch == utf::incomplete)
158 {
159 from = from_saved;
160 r = std::codecvt_base::partial;
161 break;
162 }
163 // If the encoded char fits, write directly, else safe the low surrogate in state
164 if(BOOST_LIKELY(utf16_traits::width(ch) <= to_end - to))
165 {
166 to = utf16_traits::encode(ch, to);
167 } else
168 {
169 static_assert(utf16_traits::max_width == 2, "Required for below");
170 std::uint16_t tmpOut[2]{};
171 utf16_traits::encode(ch, tmpOut);
172 *to++ = static_cast<CharType>(tmpOut[0]);
173 state = tmpOut[1];
174 break;
175 }
176 }
177 from_next = from;
178 to_next = to;
179 if(r == std::codecvt_base::ok && (from != from_end || state != 0))
180 r = std::codecvt_base::partial;
181 detail::write_state(std_state, state);
182 return r;
183 }
184
185 std::codecvt_base::result do_out(std::mbstate_t& std_state,
186 const uchar* from,
187 const uchar* from_end,
188 const uchar*& from_next,
189 char* to,
190 char* to_end,
191 char*& to_next) const override
192 {
193 std::codecvt_base::result r = std::codecvt_base::ok;
194 using utf16_traits = utf::utf_traits<uchar, 2>;
195 // mbstate_t is POD type and should be initialized to 0
196 // (i.e. state = stateT()) according to standard.
197 // We use it to store the first observed surrogate pair, or 0 if there is none yet
198 std::uint16_t state = detail::read_state(std_state);
199 for(; to < to_end && from < from_end; ++from)
200 {
201 std::uint32_t ch = 0;
202 if(state != 0)
203 {
204 // We have a high surrogate, so now there should be a low surrogate
205 std::uint16_t w1 = state;
206 std::uint16_t w2 = *from;
207 if(BOOST_LIKELY(utf16_traits::is_trail(w2)))
208 {
209 ch = utf16_traits::combine_surrogate(w1, w2);
210 } else
211 {
213 }
214 } else
215 {
216 std::uint16_t w1 = *from;
217 if(BOOST_LIKELY(utf16_traits::is_single_codepoint(w1)))
218 {
219 ch = w1;
220 } else if(BOOST_LIKELY(utf16_traits::is_first_surrogate(w1)))
221 {
222 // Store into state and continue at next character
223 state = w1;
224 continue;
225 } else
226 {
227 // Neither a single codepoint nor a high surrogate so must be low surrogate.
228 // This is an error -> Replace character
230 }
231 }
232 assert(utf::is_valid_codepoint(ch)); // Any valid UTF16 sequence is a valid codepoint
233 int len = utf::utf_traits<char>::width(ch);
234 if(to_end - to < len)
235 {
236 r = std::codecvt_base::partial;
237 break;
238 }
240 state = 0;
241 }
242 from_next = from;
243 to_next = to;
244 if(r == std::codecvt_base::ok && (from != from_end || state != 0))
245 r = std::codecvt_base::partial;
246 detail::write_state(std_state, state);
247 return r;
248 }
249 };
250
251 BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
253 template<typename CharType>
254 class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
255 {
256 public:
257 utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
258 {}
259 BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
260
261 protected:
262 using uchar = CharType;
263
264 std::codecvt_base::result
265 do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
266 {
267 next = from;
268 return std::codecvt_base::noconv;
269 }
270 int do_encoding() const noexcept override
271 {
272 return 0;
273 }
274 int do_max_length() const noexcept override
275 {
276 return 4;
277 }
278 bool do_always_noconv() const noexcept override
279 {
280 return false;
281 }
282
283 int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
284 {
285 const char* start_from = from;
286
287 while(max > 0 && from < from_end)
288 {
289 const char* save_from = from;
290 std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
291 if(ch == utf::incomplete)
292 {
293 from = save_from;
294 break;
295 } else if(ch == utf::illegal)
296 {
298 }
299 max--;
300 }
301 return static_cast<int>(from - start_from);
302 }
303
304 std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
305 const char* from,
306 const char* from_end,
307 const char*& from_next,
308 uchar* to,
309 uchar* to_end,
310 uchar*& to_next) const override
311 {
312 std::codecvt_base::result r = std::codecvt_base::ok;
313
314 while(to < to_end && from < from_end)
315 {
316 const char* from_saved = from;
317
318 uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
319
320 if(ch == utf::illegal)
321 {
323 } else if(ch == utf::incomplete)
324 {
325 r = std::codecvt_base::partial;
326 from = from_saved;
327 break;
328 }
329 *to++ = ch;
330 }
331 from_next = from;
332 to_next = to;
333 if(r == std::codecvt_base::ok && from != from_end)
334 r = std::codecvt_base::partial;
335 return r;
336 }
337
338 std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
339 const uchar* from,
340 const uchar* from_end,
341 const uchar*& from_next,
342 char* to,
343 char* to_end,
344 char*& to_next) const override
345 {
346 std::codecvt_base::result r = std::codecvt_base::ok;
347 while(to < to_end && from < from_end)
348 {
349 std::uint32_t ch = 0;
350 ch = *from;
352 {
354 }
355 int len = utf::utf_traits<char>::width(ch);
356 if(to_end - to < len)
357 {
358 r = std::codecvt_base::partial;
359 break;
360 }
362 from++;
363 }
364 from_next = from;
365 to_next = to;
366 if(r == std::codecvt_base::ok && from != from_end)
367 r = std::codecvt_base::partial;
368 return r;
369 }
370 };
371
372} // namespace nowide
373} // namespace boost
374
375#endif
boost::nowide::utf8_codecvt
Definition: utf8_codecvt.hpp:49
bool is_valid_codepoint(code_point v)
Checks whether v is a valid code point.
Definition: utf.hpp:42
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:37
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:32
#define BOOST_NOWIDE_REPLACEMENT_CHARACTER
Definition: replacement.hpp:15
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:57