Boost.Locale
generic_codecvt.hpp
1//
2// Copyright (c) 2015 Artyom Beilis (Tonkikh)
3// Copyright (c) 2021-2023 Alexander Grund
4//
5// Distributed under the Boost Software License, Version 1.0.
6// https://www.boost.org/LICENSE_1_0.txt
7
8#ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
9#define BOOST_LOCALE_GENERIC_CODECVT_HPP
10
11#include <boost/locale/utf.hpp>
12#include <cstdint>
13#include <locale>
14
15namespace boost { namespace locale {
16
// The UTF-16 conversion code keeps a 16-bit value (a pending surrogate) inside
// std::mbstate_t, so the state object must provide at least two bytes.
static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is too small to store a UTF-16 codepoint");
namespace detail {
    /// Copy exactly two bytes from \a src to \a dst.
    ///
    /// Hand-written byte copy so that <cstring> does not have to be included
    /// just for std::memcpy. Both pointers must refer to at least two bytes.
    inline void copy_uint16_t(void* dst, const void* src)
    {
        unsigned char* cdst = static_cast<unsigned char*>(dst);
        const unsigned char* csrc = static_cast<const unsigned char*>(src);
        cdst[0] = csrc[0];
        cdst[1] = csrc[1];
    }

    /// Return the first two bytes of \a src interpreted as a 16-bit value.
    inline std::uint16_t read_state(const std::mbstate_t& src)
    {
        std::uint16_t dst;
        copy_uint16_t(&dst, &src);
        return dst;
    }

    /// Overwrite the first two bytes of \a dst with the 16-bit value \a src.
    /// Any remaining bytes of \a dst are left untouched.
    inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
    {
        copy_uint16_t(&dst, &src);
    }
} // namespace detail
38
48
/// Primary template of the generic codecvt facet.
///
/// Only the partial specializations selected by \a CharSize are defined; the
/// default \a CharSize is sizeof(CharType), so the character width picks the
/// matching specialization automatically.
template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
class generic_codecvt;
152
159 template<typename CharType, typename CodecvtImpl>
160 class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
161 public generic_codecvt_base {
162 public:
163 typedef CharType uchar;
164
165 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
166 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
167
168 protected:
169 std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
170 {
171 if(*reinterpret_cast<char*>(&s) != 0)
172 return std::codecvt_base::error;
173 next = from;
174 return std::codecvt_base::ok;
175 }
176 int do_encoding() const noexcept override { return 0; }
177 int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
178 bool do_always_noconv() const noexcept override { return false; }
179
180 int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
181 {
182 bool state = *reinterpret_cast<char*>(&std_state) != 0;
183 const char* save_from = from;
184
185 auto cvt_state = implementation().initial_state(to_unicode_state);
186 while(max > 0 && from < from_end) {
187 const char* prev_from = from;
188 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
190 from = prev_from;
191 break;
192 }
193 max--;
194 if(ch > 0xFFFF) {
195 if(!state)
196 from = prev_from;
197 state = !state;
198 }
199 }
200 *reinterpret_cast<char*>(&std_state) = state;
201 return static_cast<int>(from - save_from);
202 }
203
204 std::codecvt_base::result do_in(std::mbstate_t& std_state,
205 const char* from,
206 const char* from_end,
207 const char*& from_next,
208 uchar* to,
209 uchar* to_end,
210 uchar*& to_next) const override
211 {
212 std::codecvt_base::result r = std::codecvt_base::ok;
213
214 // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
215 // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
216 //
217 // If true then only the high surrogate of a codepoint > 0xFFFF was written, but no input consumed.
218 bool low_surrogate_pending = *reinterpret_cast<char*>(&std_state) != 0;
219 auto cvt_state = implementation().initial_state(to_unicode_state);
220 while(to < to_end && from < from_end) {
221 const char* from_saved = from;
222
223 utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
224
226 from = from_saved;
227 r = std::codecvt_base::error;
228 break;
229 }
231 from = from_saved;
232 r = std::codecvt_base::partial;
233 break;
234 }
235 // Normal codepoints go directly to stream
236 if(ch <= 0xFFFF)
237 *to++ = static_cast<uchar>(ch);
238 else {
239 // For other codepoints we can't consume our input as we may find ourselves in a state
240 // where all input is consumed but not all output written, i.e. only the high surrogate is written.
241 //
242 // So we write only the high surrogate and mark this in the state.
243 // We also set the from pointer to the previous position, i.e. don't consume the input, so this
244 // codepoint will be read again and then we will consume our input together with writing the low
245 // surrogate.
246 ch -= 0x10000;
247 const std::uint16_t w1 = static_cast<std::uint16_t>(0xD800 | (ch >> 10));
248 const std::uint16_t w2 = static_cast<std::uint16_t>(0xDC00 | (ch & 0x3FF));
249 if(!low_surrogate_pending) {
250 from = from_saved;
251 *to++ = w1;
252 } else
253 *to++ = w2;
254 low_surrogate_pending = !low_surrogate_pending;
255 }
256 }
257 from_next = from;
258 to_next = to;
259 if(r == std::codecvt_base::ok && (from != from_end || low_surrogate_pending))
260 r = std::codecvt_base::partial;
261 *reinterpret_cast<char*>(&std_state) = low_surrogate_pending;
262 return r;
263 }
264
265 std::codecvt_base::result do_out(std::mbstate_t& std_state,
266 const uchar* from,
267 const uchar* from_end,
268 const uchar*& from_next,
269 char* to,
270 char* to_end,
271 char*& to_next) const override
272 {
273 std::codecvt_base::result r = std::codecvt_base::ok;
274 // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
275 // according to standard. We assume that sizeof(mbstate_t) >=2 in order
276 // to be able to store first observed surrogate pair
277 //
278 // State: state!=0 - a first surrogate pair was observed (state = first pair),
279 // we expect the second one to come and then zero the state
280 std::uint16_t state = detail::read_state(std_state);
281 auto cvt_state = implementation().initial_state(from_unicode_state);
282 while(to < to_end && from < from_end) {
283 utf::code_point ch = 0;
284 if(state != 0) {
285 // if the state indicates that 1st surrogate pair was written
286 // we should make sure that the second one that comes is actually
287 // second surrogate
288 std::uint16_t w1 = state;
289 std::uint16_t w2 = *from;
290 // we don't forward from as writing may fail to incomplete or
291 // partial conversion
292 if(0xDC00 <= w2 && w2 <= 0xDFFF) {
293 std::uint16_t vh = w1 - 0xD800;
294 std::uint16_t vl = w2 - 0xDC00;
295 ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
296 } else {
297 // Invalid surrogate
298 r = std::codecvt_base::error;
299 break;
300 }
301 } else {
302 ch = *from;
303 if(0xD800 <= ch && ch <= 0xDBFF) {
304 // if this is a first surrogate pair we put
305 // it into the state and consume it, note we don't
306 // go forward as it should be illegal so we increase
307 // the from pointer manually
308 state = static_cast<uint16_t>(ch);
309 from++;
310 continue;
311 } else if(0xDC00 <= ch && ch <= 0xDFFF) {
312 // if we observe second surrogate pair and
313 // first only may be expected we should break from the loop with error
314 // as it is illegal input
315 r = std::codecvt_base::error;
316 break;
317 }
318 }
320 r = std::codecvt_base::error;
321 break;
322 }
323 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
325 r = std::codecvt_base::partial;
326 break;
327 } else if(len == boost::locale::utf::illegal) {
328 r = std::codecvt_base::error;
329 break;
330 } else
331 to += len;
332 state = 0;
333 from++;
334 }
335 from_next = from;
336 to_next = to;
337 if(r == std::codecvt_base::ok && (from != from_end || state != 0))
338 r = std::codecvt_base::partial;
339 detail::write_state(std_state, state);
340 return r;
341 }
342 };
343
348 template<typename CharType, typename CodecvtImpl>
349 class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
350 public generic_codecvt_base {
351 public:
352 typedef CharType uchar;
353
354 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
355
356 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
357
358 protected:
359 std::codecvt_base::result
360 do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
361 {
362 next = from;
363 return std::codecvt_base::ok;
364 }
365 int do_encoding() const noexcept override { return 0; }
366 int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
367 bool do_always_noconv() const noexcept override { return false; }
368
369 int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
370 {
371 const char* start_from = from;
372 auto cvt_state = implementation().initial_state(to_unicode_state);
373 while(max > 0 && from < from_end) {
374 const char* save_from = from;
375 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
377 from = save_from;
378 break;
379 }
380 max--;
381 }
382
383 return static_cast<int>(from - start_from);
384 }
385
386 std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
387 const char* from,
388 const char* from_end,
389 const char*& from_next,
390 uchar* to,
391 uchar* to_end,
392 uchar*& to_next) const override
393 {
394 std::codecvt_base::result r = std::codecvt_base::ok;
395
396 auto cvt_state = implementation().initial_state(to_unicode_state);
397 while(to < to_end && from < from_end) {
398 const char* from_saved = from;
399
400 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
401
403 r = std::codecvt_base::error;
404 from = from_saved;
405 break;
406 }
408 r = std::codecvt_base::partial;
409 from = from_saved;
410 break;
411 }
412 *to++ = ch;
413 }
414 from_next = from;
415 to_next = to;
416 if(r == std::codecvt_base::ok && from != from_end)
417 r = std::codecvt_base::partial;
418 return r;
419 }
420
421 std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
422 const uchar* from,
423 const uchar* from_end,
424 const uchar*& from_next,
425 char* to,
426 char* to_end,
427 char*& to_next) const override
428 {
429 std::codecvt_base::result r = std::codecvt_base::ok;
430 auto cvt_state = implementation().initial_state(from_unicode_state);
431 while(to < to_end && from < from_end) {
432 const std::uint32_t ch = *from;
434 r = std::codecvt_base::error;
435 break;
436 }
437 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
439 r = std::codecvt_base::partial;
440 break;
441 } else if(len == boost::locale::utf::illegal) {
442 r = std::codecvt_base::error;
443 break;
444 }
445 to += len;
446 from++;
447 }
448 from_next = from;
449 to_next = to;
450 if(r == std::codecvt_base::ok && from != from_end)
451 r = std::codecvt_base::partial;
452 return r;
453 }
454 };
455
456 template<typename CodecvtImpl>
457 class generic_codecvt<char, CodecvtImpl, 1> : public std::codecvt<char, char, std::mbstate_t>,
458 public generic_codecvt_base {
459 public:
460 typedef char uchar;
461
462 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
463
464 generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
465 };
466
467}} // namespace boost::locale
468
469#endif
A base class that is used to define constants for generic_codecvt.
Definition generic_codecvt.hpp:40
initial_convertion_state
Initial state for converting to or from Unicode code points, used by initial_state in derived classes...
Definition generic_codecvt.hpp:43
@ to_unicode_state
The state would be used by to_unicode functions.
Definition generic_codecvt.hpp:44
@ from_unicode_state
The state would be used by from_unicode functions.
Definition generic_codecvt.hpp:45
Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t,...
Definition generic_codecvt.hpp:151
bool is_valid_codepoint(code_point v)
The function checks if v is a valid code point.
Definition utf.hpp:30
constexpr code_point illegal
Special constant that defines illegal code point.
Definition utf.hpp:22
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition utf.hpp:19
constexpr code_point incomplete
Special constant that defines incomplete code point.
Definition utf.hpp:24
This is the main namespace that encloses all localization classes.
Definition boundary_point.hpp:13