Boost.Locale
generic_codecvt.hpp
1//
2// Copyright (c) 2015 Artyom Beilis (Tonkikh)
3// Copyright (c) 2021-2023 Alexander Grund
4//
5// Distributed under the Boost Software License, Version 1.0.
6// https://www.boost.org/LICENSE_1_0.txt
7
8#ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
9#define BOOST_LOCALE_GENERIC_CODECVT_HPP
10
11#include <boost/locale/utf.hpp>
12#include <cstdint>
13#include <locale>
14
15namespace boost { namespace locale {
16
// The UTF-16 facet keeps a pending 16 bit value inside the std::mbstate_t,
// so the state object must provide at least two bytes of storage.
static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is too small to store a UTF-16 code unit");
namespace detail {
    /// Copy exactly two bytes from \a src to \a dst.
    ///
    /// Hand written to avoid including <cstring> just for std::memcpy.
    /// Both pointers must refer to at least two accessible bytes.
    inline void copy_uint16_t(void* dst, const void* src)
    {
        unsigned char* cdst = static_cast<unsigned char*>(dst);
        const unsigned char* csrc = static_cast<const unsigned char*>(src);
        cdst[0] = csrc[0];
        cdst[1] = csrc[1];
    }
    /// Return the 16 bit value stored in the first two bytes of \a src.
    inline std::uint16_t read_state(const std::mbstate_t& src)
    {
        std::uint16_t dst;
        copy_uint16_t(&dst, &src);
        return dst;
    }
    /// Store \a src in the first two bytes of \a dst, the remaining bytes of \a dst are left untouched.
    inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
    {
        copy_uint16_t(&dst, &src);
    }
} // namespace detail
38
/// \brief A base class that is used to define constants for generic_codecvt
class generic_codecvt_base {
public:
    /// Initial state for converting to or from Unicode code points, used by initial_state in derived classes
    enum initial_convertion_state {
        to_unicode_state,  ///< The state would be used by to_unicode functions
        from_unicode_state ///< The state would be used by from_unicode functions
    };
};
48
/// \brief Generic codecvt facet, primary template.
///
/// Only declared here: the behaviour is provided by the partial specializations
/// selected through \a CharSize, which defaults to sizeof(CharType).
template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
class generic_codecvt;
152
159 template<typename CharType, typename CodecvtImpl>
160 class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
161 public generic_codecvt_base {
162 public:
163 typedef CharType uchar;
164
165 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
166 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
167
168 protected:
169 std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
170 {
171 if(*reinterpret_cast<char*>(&s) != 0)
172 return std::codecvt_base::error;
173 next = from;
174 return std::codecvt_base::ok;
175 }
176 int do_encoding() const noexcept override { return 0; }
177 int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
178 bool do_always_noconv() const noexcept override { return false; }
179
180 int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
181 {
182 bool state = *reinterpret_cast<char*>(&std_state) != 0;
183 const char* save_from = from;
184
185 auto cvt_state = implementation().initial_state(to_unicode_state);
186 while(max > 0 && from < from_end) {
187 const char* prev_from = from;
188 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
190 from = prev_from;
191 break;
192 }
193 max--;
194 if(ch > 0xFFFF) {
195 if(!state)
196 from = prev_from;
197 state = !state;
198 }
199 }
200 *reinterpret_cast<char*>(&std_state) = state;
201 return static_cast<int>(from - save_from);
202 }
203
204 std::codecvt_base::result do_in(std::mbstate_t& std_state,
205 const char* from,
206 const char* from_end,
207 const char*& from_next,
208 uchar* to,
209 uchar* to_end,
210 uchar*& to_next) const override
211 {
212 std::codecvt_base::result r = std::codecvt_base::ok;
213
214 // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
215 // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
216 //
217 // if 0/false no codepoint above >0xFFFF observed, else a codepoint above 0xFFFF was observed
218 // and first pair is written, but no input consumed
219 bool state = *reinterpret_cast<char*>(&std_state) != 0;
220 auto cvt_state = implementation().initial_state(to_unicode_state);
221 while(to < to_end && from < from_end) {
222 const char* from_saved = from;
223
224 utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
225
227 from = from_saved;
228 r = std::codecvt_base::error;
229 break;
230 }
232 from = from_saved;
233 r = std::codecvt_base::partial;
234 break;
235 }
236 // Normal codepoints go directly to stream
237 if(ch <= 0xFFFF)
238 *to++ = static_cast<uchar>(ch);
239 else {
240 // For other codepoints we do the following
241 //
242 // 1. We can't consume our input as we may find ourselves
243 // in state where all input consumed but not all output written,i.e. only
244 // 1st pair is written
245 // 2. We only write first pair and mark this in the state, we also revert back
246 // the from pointer in order to make sure this codepoint would be read
247 // once again and then we would consume our input together with writing
248 // second surrogate pair
249 ch -= 0x10000;
250 std::uint16_t w1 = static_cast<std::uint16_t>(0xD800 | (ch >> 10));
251 std::uint16_t w2 = static_cast<std::uint16_t>(0xDC00 | (ch & 0x3FF));
252 if(!state) {
253 from = from_saved;
254 *to++ = w1;
255 } else
256 *to++ = w2;
257 state = !state;
258 }
259 }
260 from_next = from;
261 to_next = to;
262 if(r == std::codecvt_base::ok && (from != from_end || state))
263 r = std::codecvt_base::partial;
264 *reinterpret_cast<char*>(&std_state) = state;
265 return r;
266 }
267
268 std::codecvt_base::result do_out(std::mbstate_t& std_state,
269 const uchar* from,
270 const uchar* from_end,
271 const uchar*& from_next,
272 char* to,
273 char* to_end,
274 char*& to_next) const override
275 {
276 std::codecvt_base::result r = std::codecvt_base::ok;
277 // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
278 // according to standard. We assume that sizeof(mbstate_t) >=2 in order
279 // to be able to store first observed surrogate pair
280 //
281 // State: state!=0 - a first surrogate pair was observed (state = first pair),
282 // we expect the second one to come and then zero the state
283 std::uint16_t state = detail::read_state(std_state);
284 auto cvt_state = implementation().initial_state(from_unicode_state);
285 while(to < to_end && from < from_end) {
286 utf::code_point ch = 0;
287 if(state != 0) {
288 // if the state indicates that 1st surrogate pair was written
289 // we should make sure that the second one that comes is actually
290 // second surrogate
291 std::uint16_t w1 = state;
292 std::uint16_t w2 = *from;
293 // we don't forward from as writing may fail to incomplete or
294 // partial conversion
295 if(0xDC00 <= w2 && w2 <= 0xDFFF) {
296 std::uint16_t vh = w1 - 0xD800;
297 std::uint16_t vl = w2 - 0xDC00;
298 ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
299 } else {
300 // Invalid surrogate
301 r = std::codecvt_base::error;
302 break;
303 }
304 } else {
305 ch = *from;
306 if(0xD800 <= ch && ch <= 0xDBFF) {
307 // if this is a first surrogate pair we put
308 // it into the state and consume it, note we don't
309 // go forward as it should be illegal so we increase
310 // the from pointer manually
311 state = static_cast<uint16_t>(ch);
312 from++;
313 continue;
314 } else if(0xDC00 <= ch && ch <= 0xDFFF) {
315 // if we observe second surrogate pair and
316 // first only may be expected we should break from the loop with error
317 // as it is illegal input
318 r = std::codecvt_base::error;
319 break;
320 }
321 }
323 r = std::codecvt_base::error;
324 break;
325 }
326 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
328 r = std::codecvt_base::partial;
329 break;
330 } else if(len == boost::locale::utf::illegal) {
331 r = std::codecvt_base::error;
332 break;
333 } else
334 to += len;
335 state = 0;
336 from++;
337 }
338 from_next = from;
339 to_next = to;
340 if(r == std::codecvt_base::ok && (from != from_end || state != 0))
341 r = std::codecvt_base::partial;
342 detail::write_state(std_state, state);
343 return r;
344 }
345 };
346
351 template<typename CharType, typename CodecvtImpl>
352 class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
353 public generic_codecvt_base {
354 public:
355 typedef CharType uchar;
356
357 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
358
359 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
360
361 protected:
362 std::codecvt_base::result
363 do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
364 {
365 next = from;
366 return std::codecvt_base::ok;
367 }
368 int do_encoding() const noexcept override { return 0; }
369 int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
370 bool do_always_noconv() const noexcept override { return false; }
371
372 int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
373 {
374 const char* start_from = from;
375 auto cvt_state = implementation().initial_state(to_unicode_state);
376 while(max > 0 && from < from_end) {
377 const char* save_from = from;
378 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
380 from = save_from;
381 break;
382 }
383 max--;
384 }
385
386 return static_cast<int>(from - start_from);
387 }
388
389 std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
390 const char* from,
391 const char* from_end,
392 const char*& from_next,
393 uchar* to,
394 uchar* to_end,
395 uchar*& to_next) const override
396 {
397 std::codecvt_base::result r = std::codecvt_base::ok;
398
399 auto cvt_state = implementation().initial_state(to_unicode_state);
400 while(to < to_end && from < from_end) {
401 const char* from_saved = from;
402
403 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
404
406 r = std::codecvt_base::error;
407 from = from_saved;
408 break;
409 }
411 r = std::codecvt_base::partial;
412 from = from_saved;
413 break;
414 }
415 *to++ = ch;
416 }
417 from_next = from;
418 to_next = to;
419 if(r == std::codecvt_base::ok && from != from_end)
420 r = std::codecvt_base::partial;
421 return r;
422 }
423
424 std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
425 const uchar* from,
426 const uchar* from_end,
427 const uchar*& from_next,
428 char* to,
429 char* to_end,
430 char*& to_next) const override
431 {
432 std::codecvt_base::result r = std::codecvt_base::ok;
433 auto cvt_state = implementation().initial_state(from_unicode_state);
434 while(to < to_end && from < from_end) {
435 const std::uint32_t ch = *from;
437 r = std::codecvt_base::error;
438 break;
439 }
440 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
442 r = std::codecvt_base::partial;
443 break;
444 } else if(len == boost::locale::utf::illegal) {
445 r = std::codecvt_base::error;
446 break;
447 }
448 to += len;
449 from++;
450 }
451 from_next = from;
452 to_next = to;
453 if(r == std::codecvt_base::ok && from != from_end)
454 r = std::codecvt_base::partial;
455 return r;
456 }
457 };
458
459 template<typename CodecvtImpl>
460 class generic_codecvt<char, CodecvtImpl, 1> : public std::codecvt<char, char, std::mbstate_t>,
461 public generic_codecvt_base {
462 public:
463 typedef char uchar;
464
465 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
466
467 generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
468 };
469
470}} // namespace boost::locale
471
472#endif
A base class that is used to define constants for generic_codecvt.
Definition: generic_codecvt.hpp:40
initial_convertion_state
Initial state for converting to or from Unicode code points, used by initial_state in derived classes...
Definition: generic_codecvt.hpp:43
@ to_unicode_state
The state would be used by to_unicode functions.
Definition: generic_codecvt.hpp:44
@ from_unicode_state
The state would be used by from_unicode functions.
Definition: generic_codecvt.hpp:45
Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t,...
Definition: generic_codecvt.hpp:151
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition: utf.hpp:19
bool is_valid_codepoint(code_point v)
Checks whether v is a valid code point.
Definition: utf.hpp:30
constexpr code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:22
constexpr code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:24