Boost.Locale
generic_codecvt.hpp
1//
2// Copyright (c) 2015 Artyom Beilis (Tonkikh)
3//
4// Distributed under the Boost Software License, Version 1.0.
5// https://www.boost.org/LICENSE_1_0.txt
6
7#ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
8#define BOOST_LOCALE_GENERIC_CODECVT_HPP
9
10#include <boost/locale/utf.hpp>
11#include <boost/cstdint.hpp>
12#include <locale>
13
14namespace boost { namespace locale {
15
16#ifndef BOOST_LOCALE_DOXYGEN
//
// The UTF-16 facet stores one 16-bit value (a pending surrogate or a 0/1 flag)
// in the first bytes of std::mbstate_t, so the type must be at least 2 bytes wide.
//
static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is too small");
21#endif
22
23#if defined(_MSC_VER) && _MSC_VER < 1700
24// Up to MSVC 11 (2012), do_length is non-standard: it counts wide characters instead of narrow ones
25// and does not change the mbstate.
26# define BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
27#endif
28
31 public:
36 };
37 };
38
142 template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
144
151 template<typename CharType, typename CodecvtImpl>
152 class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
153 public generic_codecvt_base {
154 public:
155 typedef CharType uchar;
156
157 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
158 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
159
160 protected:
161 std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
162 {
163 boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&s);
164 if(state != 0)
165 return std::codecvt_base::error;
166 next = from;
167 return std::codecvt_base::ok;
168 }
169 int do_encoding() const noexcept override { return 0; }
170 int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
171 bool do_always_noconv() const noexcept override { return false; }
172
// Scans [from, from_end) counting at most `max` UTF-16 code units.
//
// Default build: returns the number of input bytes covered (from - save_from) and keeps the
// "half of a surrogate pair already counted" flag in std_state.
// BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST build (MSVC < 1700): std_state is const, the flag is a
// local copy, and the number of units counted (start_max - max) is returned instead.
173 int do_length(
174#ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
175 const
176#endif
177 std::mbstate_t& std_state,
178 const char* from,
179 const char* from_end,
180 size_t max) const override
181 {
182#ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
183 const char* save_from = from;
// The first 16 bits of the mbstate are used as a 0/1 flag (see the ch > 0xFFFF branch below).
184 boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
185#else
186 const size_t start_max = max;
187 boost::uint16_t state = *reinterpret_cast<const boost::uint16_t*>(&std_state);
188#endif
189
190 typename CodecvtImpl::state_type cvt_state =
191 implementation().initial_state(generic_codecvt_base::to_unicode_state);
192 while(max > 0 && from < from_end) {
193 const char* prev_from = from;
194 boost::uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
// NOTE(review): original line 195 is missing from this listing. It presumably opens an `if`
// testing ch against utf::illegal / utf::incomplete - confirm against the upstream header.
// Inside that branch the input position is restored and scanning stops.
196 from = prev_from;
197 break;
198 }
199 max--;
// A code point above 0xFFFF is counted as two units: the first pass sets the flag and rewinds
// the input so the same code point is decoded again; the second pass clears the flag and
// leaves the input consumed.
200 if(ch > 0xFFFF) {
201 if(state == 0) {
202 from = prev_from;
203 state = 1;
204 } else {
205 state = 0;
206 }
207 }
208 }
209#ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
210 return static_cast<int>(from - save_from);
211#else
212 return static_cast<int>(start_max - max);
213#endif
214 }
215
// Converts narrow input [from, from_end) to UTF-16 code units written to [to, to_end).
// On return from_next / to_next hold the positions reached. The result is `partial` when the
// loop ended without an error but input remains or a surrogate pair is only half written.
216 std::codecvt_base::result do_in(std::mbstate_t& std_state,
217 const char* from,
218 const char* from_end,
219 const char*& from_next,
220 uchar* to,
221 uchar* to_end,
222 uchar*& to_next) const override
223 {
224 std::codecvt_base::result r = std::codecvt_base::ok;
225
226 // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
227 // according to the standard. We use it to keep a 0/1 flag for surrogate pair writing:
228 //
229 // if 0, no code point above 0xFFFF is pending; if 1, a code point above 0xFFFF was observed
230 // and its first surrogate has been written, but no input has been consumed
231 boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
232 typename CodecvtImpl::state_type cvt_state =
233 implementation().initial_state(generic_codecvt_base::to_unicode_state);
234 while(to < to_end && from < from_end) {
235 const char* from_saved = from;
236
237 uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
238
// NOTE(review): original lines 239 and 244 are missing from this listing. They presumably open
// `if` branches for an illegal result (-> error) and an incomplete result (-> partial) of
// to_unicode - confirm against the upstream header. Both branches restore the input position.
240 from = from_saved;
241 r = std::codecvt_base::error;
242 break;
243 }
245 from = from_saved;
246 r = std::codecvt_base::partial;
247 break;
248 }
249 // Code points up to 0xFFFF are written as a single unit
250 if(ch <= 0xFFFF) {
251 *to++ = static_cast<uchar>(ch);
252 } else {
253 // For other code points we do the following
254 //
255 // 1. We can't consume our input, as we may find ourselves
256 // in a state where all input is consumed but not all output is written, i.e. only
257 // the 1st surrogate is written
258 // 2. We only write the first surrogate and mark this in the state; we also revert
259 // the from pointer in order to make sure this code point is read
260 // once again, and then we consume our input together with writing
261 // the second surrogate
262 ch -= 0x10000;
263 boost::uint16_t w1 = static_cast<boost::uint16_t>(0xD800 | (ch >> 10));
264 boost::uint16_t w2 = static_cast<boost::uint16_t>(0xDC00 | (ch & 0x3FF));
265 if(state == 0) {
266 from = from_saved;
267 *to++ = w1;
268 state = 1;
269 } else {
270 *to++ = w2;
271 state = 0;
272 }
273 }
274 }
275 from_next = from;
276 to_next = to;
// No error so far, but input is left over or only the first surrogate was written.
277 if(r == std::codecvt_base::ok && (from != from_end || state != 0))
278 r = std::codecvt_base::partial;
279 return r;
280 }
281
// Converts UTF-16 code units [from, from_end) to narrow output written to [to, to_end).
// On return from_next / to_next hold the positions reached. The result is `partial` when the
// loop ended without an error but input remains or a first surrogate is still pending in the state.
282 std::codecvt_base::result do_out(std::mbstate_t& std_state,
283 const uchar* from,
284 const uchar* from_end,
285 const uchar*& from_next,
286 char* to,
287 char* to_end,
288 char*& to_next) const override
289 {
290 std::codecvt_base::result r = std::codecvt_base::ok;
291 // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
292 // according to the standard. We assume that sizeof(mbstate_t) >= 2 in order
293 // to be able to store the first observed surrogate
294 //
295 // State: state != 0 - a first surrogate was observed (state = that surrogate),
296 // we expect the second one to come and then zero the state
297 boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
298 typename CodecvtImpl::state_type cvt_state =
299 implementation().initial_state(generic_codecvt_base::from_unicode_state);
300 while(to < to_end && from < from_end) {
301 boost::uint32_t ch = 0;
302 if(state != 0) {
303 // if the state indicates that a 1st surrogate was already consumed,
304 // we should make sure that the unit that comes next is actually
305 // a second surrogate
306 boost::uint16_t w1 = state;
307 boost::uint16_t w2 = *from;
308 // we don't advance from here, as writing may still fail with an incomplete or
309 // partial conversion
310 if(0xDC00 <= w2 && w2 <= 0xDFFF) {
311 boost::uint16_t vh = w1 - 0xD800;
312 boost::uint16_t vl = w2 - 0xDC00;
313 ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
314 } else {
315 // Invalid surrogate
316 r = std::codecvt_base::error;
317 break;
318 }
319 } else {
320 ch = *from;
321 if(0xD800 <= ch && ch <= 0xDBFF) {
322 // if this is a first surrogate we put
323 // it into the state and consume it; note that we skip
324 // the rest of the loop body (a lone surrogate is not a valid code point),
325 // so the from pointer is advanced manually
326 state = static_cast<uint16_t>(ch);
327 from++;
328 continue;
329 } else if(0xDC00 <= ch && ch <= 0xDFFF) {
330 // if we observe a second surrogate while
331 // only a first one may be expected, we break from the loop with an error
332 // as it is illegal input
333 r = std::codecvt_base::error;
334 break;
335 }
336 }
// NOTE(review): original line 337 is missing from this listing. It presumably opens an `if`
// rejecting ch when it is not a valid code point (utf::is_valid_codepoint) - confirm upstream.
338 r = std::codecvt_base::error;
339 break;
340 }
341 boost::uint32_t len = implementation().from_unicode(cvt_state, ch, to, to_end);
// NOTE(review): original line 342 is missing from this listing. It presumably opens an `if`
// testing len against utf::incomplete (no room in the output) - confirm upstream.
343 r = std::codecvt_base::partial;
344 break;
345 } else if(len == boost::locale::utf::illegal) {
346 r = std::codecvt_base::error;
347 break;
348 } else
349 to += len;
// The code point was written: clear any pending surrogate and consume the current unit.
350 state = 0;
351 from++;
352 }
353 from_next = from;
354 to_next = to;
355 if(r == std::codecvt_base::ok && (from != from_end || state != 0))
356 r = std::codecvt_base::partial;
357 return r;
358 }
359 };
360
365 template<typename CharType, typename CodecvtImpl>
366 class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
367 public generic_codecvt_base {
368 public:
369 typedef CharType uchar;
370
371 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
372
373 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
374
375 protected:
376 std::codecvt_base::result
377 do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
378 {
379 next = from;
380 return std::codecvt_base::ok;
381 }
382 int do_encoding() const noexcept override { return 0; }
383 int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
384 bool do_always_noconv() const noexcept override { return false; }
385
// Scans [from, from_end) counting at most `max` decoded characters; the mbstate is not used.
//
// Default build: returns the number of input bytes covered (from - start_from).
// BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST build (MSVC < 1700): returns the number of characters
// counted (start_max - max) instead.
386 int do_length(
387#ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
388 const
389#endif
390 std::mbstate_t& /*state*/,
391 const char* from,
392 const char* from_end,
393 size_t max) const override
394 {
395#ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
396 const char* start_from = from;
397#else
398 const size_t start_max = max;
399#endif
400 typename CodecvtImpl::state_type cvt_state =
401 implementation().initial_state(generic_codecvt_base::to_unicode_state);
402 while(max > 0 && from < from_end) {
403 const char* save_from = from;
404 boost::uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
// NOTE(review): original line 405 is missing from this listing. It presumably opens an `if`
// testing ch against utf::illegal / utf::incomplete - confirm against the upstream header.
// Inside that branch the input position is restored and scanning stops.
406 from = save_from;
407 break;
408 }
409 max--;
410 }
411
412#ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
413 return static_cast<int>(from - start_from);
414#else
415 return static_cast<int>(start_max - max);
416#endif
417 }
418
// Converts narrow input [from, from_end) to code points written one per output element to
// [to, to_end). On return from_next / to_next hold the positions reached. The result is
// `partial` when the loop ended without an error but input remains.
419 std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
420 const char* from,
421 const char* from_end,
422 const char*& from_next,
423 uchar* to,
424 uchar* to_end,
425 uchar*& to_next) const override
426 {
427 std::codecvt_base::result r = std::codecvt_base::ok;
428
429 // Unlike the 2-byte (UTF-16) specialization, the mbstate_t argument is not used here:
430 // every decoded code point is stored as exactly one output element,
431 //
432 // so there is no surrogate pair to split across calls and nothing
433 // has to be remembered between iterations
434 auto cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
435 while(to < to_end && from < from_end) {
436 const char* from_saved = from;
437
438 uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
439
// NOTE(review): original lines 440 and 445 are missing from this listing. They presumably open
// `if` branches for an illegal result (-> error) and an incomplete result (-> partial) of
// to_unicode - confirm against the upstream header. Both branches restore the input position.
441 r = std::codecvt_base::error;
442 from = from_saved;
443 break;
444 }
446 r = std::codecvt_base::partial;
447 from = from_saved;
448 break;
449 }
450 *to++ = ch;
451 }
452 from_next = from;
453 to_next = to;
// No error so far, but input is left over (the loop stopped because the output is full).
454 if(r == std::codecvt_base::ok && from != from_end)
455 r = std::codecvt_base::partial;
456 return r;
457 }
458
// Converts code points [from, from_end), one per input element, to narrow output written to
// [to, to_end). The mbstate is not used. On return from_next / to_next hold the positions
// reached. The result is `partial` when the loop ended without an error but input remains.
459 std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
460 const uchar* from,
461 const uchar* from_end,
462 const uchar*& from_next,
463 char* to,
464 char* to_end,
465 char*& to_next) const override
466 {
467 std::codecvt_base::result r = std::codecvt_base::ok;
468 auto cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
469 while(to < to_end && from < from_end) {
470 boost::uint32_t ch = 0;
471 ch = *from;
// NOTE(review): original line 472 is missing from this listing. It presumably opens an `if`
// rejecting ch when it is not a valid code point (utf::is_valid_codepoint) - confirm upstream.
473 r = std::codecvt_base::error;
474 break;
475 }
476 boost::uint32_t len = implementation().from_unicode(cvt_state, ch, to, to_end);
// NOTE(review): original line 477 is missing from this listing. It presumably opens an `if`
// testing len against utf::incomplete (no room in the output) - confirm upstream.
478 r = std::codecvt_base::partial;
479 break;
480 } else if(len == boost::locale::utf::illegal) {
481 r = std::codecvt_base::error;
482 break;
483 }
// The code point was written: advance the output by the bytes produced and consume the input.
484 to += len;
485 from++;
486 }
487 from_next = from;
488 to_next = to;
489 if(r == std::codecvt_base::ok && from != from_end)
490 r = std::codecvt_base::partial;
491 return r;
492 }
493 };
494
495 template<typename CharType, typename CodecvtImpl>
496 class generic_codecvt<CharType, CodecvtImpl, 1> : public std::codecvt<CharType, char, std::mbstate_t>,
497 public generic_codecvt_base {
498 public:
499 typedef CharType uchar;
500
501 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
502
503 generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
504 };
505
506}} // namespace boost::locale
507
508#endif
A base class that is used to define constants for generic_codecvt.
Definition: generic_codecvt.hpp:30
initial_convertion_state
Initial state for converting to or from unicode code points, used by initial_state in derived classes...
Definition: generic_codecvt.hpp:33
@ to_unicode_state
The state would be used by to_unicode functions.
Definition: generic_codecvt.hpp:34
@ from_unicode_state
The state would be used by from_unicode functions.
Definition: generic_codecvt.hpp:35
Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t,...
Definition: generic_codecvt.hpp:143
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:27
constexpr code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:22
constexpr code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:24