Boost.Locale
generic_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0.
5 // https://www.boost.org/LICENSE_1_0.txt
6 
7 #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
8 #define BOOST_LOCALE_GENERIC_CODECVT_HPP
9 
10 #include <boost/locale/utf.hpp>
11 #include <boost/cstdint.hpp>
12 #include <locale>
13 
14 namespace boost { namespace locale {
15 
#ifndef BOOST_LOCALE_DOXYGEN
// The 2-byte (UTF-16) specialization below reinterprets the beginning of a
// std::mbstate_t as a 16-bit integer to remember a half-processed surrogate
// pair, so the state object must provide at least two bytes of storage.
static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is too small");
#endif
22 
23 #if defined(_MSC_VER) && _MSC_VER < 1700
24 // Before MSVC 11 (Visual Studio 2012, _MSC_VER 1700) do_length is non-standard: it counts wide characters
25 // instead of narrow ones and does not change the mbstate, so it is declared with a const std::mbstate_t&.
26 # define BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
27 #endif
28 
31  public:
36  };
37  };
38 
/// \brief Primary template of the generic codecvt facet.
///
/// Only declared here; the behaviour is provided by the partial specializations
/// selected through \a CharSize, which defaults to sizeof(CharType).
///
/// \tparam CharType    wide-side character type of the facet
/// \tparam CodecvtImpl class providing the actual encoding conversion
/// \tparam CharSize    size of CharType in bytes, used to pick the specialization
template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
class generic_codecvt;
144 
151  template<typename CharType, typename CodecvtImpl>
152  class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
153  public generic_codecvt_base {
154  public:
155  typedef CharType uchar;
156 
157  generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
158  const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
159 
160  protected:
161  std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
162  {
163  boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&s);
164 #ifdef DEBUG_CODECVT
165  std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl;
166 #endif
167  if(state != 0)
168  return std::codecvt_base::error;
169  next = from;
170  return std::codecvt_base::ok;
171  }
172  int do_encoding() const noexcept override
173  {
174  return 0;
175  }
176  int do_max_length() const noexcept override
177  {
178  return implementation().max_encoding_length();
179  }
180  bool do_always_noconv() const noexcept override
181  {
182  return false;
183  }
184 
185  int do_length(
186 #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
187  const
188 #endif
189  std::mbstate_t& std_state,
190  const char* from,
191  const char* from_end,
192  size_t max) const override
193  {
194 #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
195  const char* save_from = from;
196  boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
197 #else
198  size_t save_max = max;
199  boost::uint16_t state = *reinterpret_cast<const boost::uint16_t*>(&std_state);
200 #endif
201 
202  typename CodecvtImpl::state_type cvt_state =
203  implementation().initial_state(generic_codecvt_base::to_unicode_state);
204  while(max > 0 && from < from_end) {
205  const char* prev_from = from;
206  boost::uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
208  from = prev_from;
209  break;
210  }
211  max--;
212  if(ch > 0xFFFF) {
213  if(state == 0) {
214  from = prev_from;
215  state = 1;
216  } else {
217  state = 0;
218  }
219  }
220  }
221 #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
222  return static_cast<int>(from - save_from);
223 #else
224  return static_cast<int>(save_max - max);
225 #endif
226  }
227 
228  std::codecvt_base::result do_in(std::mbstate_t& std_state,
229  const char* from,
230  const char* from_end,
231  const char*& from_next,
232  uchar* to,
233  uchar* to_end,
234  uchar*& to_next) const override
235  {
236  std::codecvt_base::result r = std::codecvt_base::ok;
237 
238  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
239  // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
240  //
241  // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observed
242  // and first pair is written, but no input consumed
243  boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
244  typename CodecvtImpl::state_type cvt_state =
245  implementation().initial_state(generic_codecvt_base::to_unicode_state);
246  while(to < to_end && from < from_end) {
247 #ifdef DEBUG_CODECVT
248  std::cout << "Entering IN--------------\n";
249  std::cout << "State " << std::hex << state << std::endl;
250  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end - to << std::endl;
251 #endif
252  const char* from_saved = from;
253 
254  uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
255 
256  if(ch == boost::locale::utf::illegal) {
257  from = from_saved;
258  r = std::codecvt_base::error;
259  break;
260  }
262  from = from_saved;
263  r = std::codecvt_base::partial;
264  break;
265  }
266  // Normal codepoints go direcly to stream
267  if(ch <= 0xFFFF) {
268  *to++ = static_cast<uchar>(ch);
269  } else {
270  // for other codepoints we do following
271  //
272  // 1. We can't consume our input as we may find ourselves
273  // in state where all input consumed but not all output written,i.e. only
274  // 1st pair is written
275  // 2. We only write first pair and mark this in the state, we also revert back
276  // the from pointer in order to make sure this codepoint would be read
277  // once again and then we would consume our input together with writing
278  // second surrogate pair
279  ch -= 0x10000;
280  boost::uint16_t w1 = static_cast<boost::uint16_t>(0xD800 | (ch >> 10));
281  boost::uint16_t w2 = static_cast<boost::uint16_t>(0xDC00 | (ch & 0x3FF));
282  if(state == 0) {
283  from = from_saved;
284  *to++ = w1;
285  state = 1;
286  } else {
287  *to++ = w2;
288  state = 0;
289  }
290  }
291  }
292  from_next = from;
293  to_next = to;
294  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
295  r = std::codecvt_base::partial;
296 #ifdef DEBUG_CODECVT
297  std::cout << "Returning ";
298  switch(r) {
299  case std::codecvt_base::ok: std::cout << "ok\n"; break;
300  case std::codecvt_base::partial: std::cout << "partial\n"; break;
301  case std::codecvt_base::error: std::cout << "error\n"; break;
302  default: std::cout << "other\n"; break;
303  }
304  std::cout << "State " << std::hex << state << std::endl;
305  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end - to << std::endl;
306 #endif
307  return r;
308  }
309 
310  std::codecvt_base::result do_out(std::mbstate_t& std_state,
311  const uchar* from,
312  const uchar* from_end,
313  const uchar*& from_next,
314  char* to,
315  char* to_end,
316  char*& to_next) const override
317  {
318  std::codecvt_base::result r = std::codecvt_base::ok;
319  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
320  // according to standard. We assume that sizeof(mbstate_t) >=2 in order
321  // to be able to store first observed surrogate pair
322  //
323  // State: state!=0 - a first surrogate pair was observed (state = first pair),
324  // we expect the second one to come and then zero the state
325  boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
326  typename CodecvtImpl::state_type cvt_state =
327  implementation().initial_state(generic_codecvt_base::from_unicode_state);
328  while(to < to_end && from < from_end) {
329 #ifdef DEBUG_CODECVT
330  std::cout << "Entering OUT --------------\n";
331  std::cout << "State " << std::hex << state << std::endl;
332  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end - to << std::endl;
333 #endif
334  boost::uint32_t ch = 0;
335  if(state != 0) {
336  // if the state indicates that 1st surrogate pair was written
337  // we should make sure that the second one that comes is actually
338  // second surrogate
339  boost::uint16_t w1 = state;
340  boost::uint16_t w2 = *from;
341  // we don't forward from as writing may fail to incomplete or
342  // partial conversion
343  if(0xDC00 <= w2 && w2 <= 0xDFFF) {
344  boost::uint16_t vh = w1 - 0xD800;
345  boost::uint16_t vl = w2 - 0xDC00;
346  ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
347  } else {
348  // Invalid surrogate
349  r = std::codecvt_base::error;
350  break;
351  }
352  } else {
353  ch = *from;
354  if(0xD800 <= ch && ch <= 0xDBFF) {
355  // if this is a first surrogate pair we put
356  // it into the state and consume it, note we don't
357  // go forward as it should be illegal so we increase
358  // the from pointer manually
359  state = static_cast<uint16_t>(ch);
360  from++;
361  continue;
362  } else if(0xDC00 <= ch && ch <= 0xDFFF) {
363  // if we observe second surrogate pair and
364  // first only may be expected we should break from the loop with error
365  // as it is illegal input
366  r = std::codecvt_base::error;
367  break;
368  }
369  }
371  r = std::codecvt_base::error;
372  break;
373  }
374  boost::uint32_t len = implementation().from_unicode(cvt_state, ch, to, to_end);
375  if(len == boost::locale::utf::incomplete) {
376  r = std::codecvt_base::partial;
377  break;
378  } else if(len == boost::locale::utf::illegal) {
379  r = std::codecvt_base::error;
380  break;
381  } else
382  to += len;
383  state = 0;
384  from++;
385  }
386  from_next = from;
387  to_next = to;
388  if(r == std::codecvt_base::ok && from != from_end)
389  r = std::codecvt_base::partial;
390 #ifdef DEBUG_CODECVT
391  std::cout << "Returning ";
392  switch(r) {
393  case std::codecvt_base::ok: std::cout << "ok\n"; break;
394  case std::codecvt_base::partial: std::cout << "partial\n"; break;
395  case std::codecvt_base::error: std::cout << "error\n"; break;
396  default: std::cout << "other\n"; break;
397  }
398  std::cout << "State " << std::hex << state << std::endl;
399  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end - to << std::endl;
400 #endif
401  return r;
402  }
403  };
404 
409  template<typename CharType, typename CodecvtImpl>
410  class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
411  public generic_codecvt_base {
412  public:
413  typedef CharType uchar;
414 
415  generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
416 
417  const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
418 
419  protected:
420  std::codecvt_base::result
421  do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
422  {
423  next = from;
424  return std::codecvt_base::ok;
425  }
426  int do_encoding() const noexcept override { return 0; }
427  int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
428  bool do_always_noconv() const noexcept override { return false; }
429 
430  int do_length(
431 #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
432  const
433 #endif
434  std::mbstate_t& /*state*/,
435  const char* from,
436  const char* from_end,
437  size_t max) const override
438  {
439 #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
440  const char* start_from = from;
441 #else
442  size_t save_max = max;
443 #endif
444  typename CodecvtImpl::state_type cvt_state =
445  implementation().initial_state(generic_codecvt_base::to_unicode_state);
446  while(max > 0 && from < from_end) {
447  const char* save_from = from;
448  boost::uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
450  from = save_from;
451  break;
452  }
453  max--;
454  }
455 #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
456  return from - start_from;
457 #else
458  return save_max - max;
459 #endif
460  }
461 
462  std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
463  const char* from,
464  const char* from_end,
465  const char*& from_next,
466  uchar* to,
467  uchar* to_end,
468  uchar*& to_next) const override
469  {
470  std::codecvt_base::result r = std::codecvt_base::ok;
471 
472  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
473  // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
474  //
475  // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observed
476  // and first pair is written, but no input consumed
477  auto cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
478  while(to < to_end && from < from_end) {
479 #ifdef DEBUG_CODECVT
480  std::cout << "Entering IN--------------\n";
481  std::cout << "State " << std::hex << state << std::endl;
482  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end - to << std::endl;
483 #endif
484  const char* from_saved = from;
485 
486  uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
487 
488  if(ch == boost::locale::utf::illegal) {
489  r = std::codecvt_base::error;
490  from = from_saved;
491  break;
492  }
494  r = std::codecvt_base::partial;
495  from = from_saved;
496  break;
497  }
498  *to++ = ch;
499  }
500  from_next = from;
501  to_next = to;
502  if(r == std::codecvt_base::ok && from != from_end)
503  r = std::codecvt_base::partial;
504 #ifdef DEBUG_CODECVT
505  std::cout << "Returning ";
506  switch(r) {
507  case std::codecvt_base::ok: std::cout << "ok\n"; break;
508  case std::codecvt_base::partial: std::cout << "partial\n"; break;
509  case std::codecvt_base::error: std::cout << "error\n"; break;
510  default: std::cout << "other\n"; break;
511  }
512  std::cout << "State " << std::hex << state << std::endl;
513  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end - to << std::endl;
514 #endif
515  return r;
516  }
517 
518  std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
519  const uchar* from,
520  const uchar* from_end,
521  const uchar*& from_next,
522  char* to,
523  char* to_end,
524  char*& to_next) const override
525  {
526  std::codecvt_base::result r = std::codecvt_base::ok;
527  auto cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
528  while(to < to_end && from < from_end) {
529 #ifdef DEBUG_CODECVT
530  std::cout << "Entering OUT --------------\n";
531  std::cout << "State " << std::hex << state << std::endl;
532  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end - to << std::endl;
533 #endif
534  boost::uint32_t ch = 0;
535  ch = *from;
537  r = std::codecvt_base::error;
538  break;
539  }
540  boost::uint32_t len = implementation().from_unicode(cvt_state, ch, to, to_end);
541  if(len == boost::locale::utf::incomplete) {
542  r = std::codecvt_base::partial;
543  break;
544  } else if(len == boost::locale::utf::illegal) {
545  r = std::codecvt_base::error;
546  break;
547  }
548  to += len;
549  from++;
550  }
551  from_next = from;
552  to_next = to;
553  if(r == std::codecvt_base::ok && from != from_end)
554  r = std::codecvt_base::partial;
555 #ifdef DEBUG_CODECVT
556  std::cout << "Returning ";
557  switch(r) {
558  case std::codecvt_base::ok: std::cout << "ok\n"; break;
559  case std::codecvt_base::partial: std::cout << "partial\n"; break;
560  case std::codecvt_base::error: std::cout << "error\n"; break;
561  default: std::cout << "other\n"; break;
562  }
563  std::cout << "State " << std::hex << state << std::endl;
564  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end - to << std::endl;
565 #endif
566  return r;
567  }
568  };
569 
570  template<typename CharType, typename CodecvtImpl>
571  class generic_codecvt<CharType, CodecvtImpl, 1> : public std::codecvt<CharType, char, std::mbstate_t>,
572  public generic_codecvt_base {
573  public:
574  typedef CharType uchar;
575 
576  const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
577 
578  generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
579  };
580 
581 }} // namespace boost::locale
582 
583 #endif
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:27
The state would be used by to_unicode functions.
Definition: generic_codecvt.hpp:34
initial_convertion_state
Initial state for converting to or from unicode code points, used by initial_state in derived classes...
Definition: generic_codecvt.hpp:33
A base class that is used to define constants for generic_codecvt.
Definition: generic_codecvt.hpp:30
Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t,...
Definition: generic_codecvt.hpp:143
constexpr code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:22
The state would be used by from_unicode functions.
Definition: generic_codecvt.hpp:35
constexpr code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:24