Boost C++ Libraries

"...one of the most highly regarded and expertly designed C++ library projects in the world." — Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

Click here to view the latest version of this page.
Prev | Up | Home | Next
Unicode Aware Regex Iterators
u32regex_iterator

Type u32regex_iterator is in all respects the same as regex_iterator, except that, since the regular expression type is always u32regex, it takes only one template parameter (the iterator type). It also calls u32regex_search internally, allowing it to interface correctly with UTF-8, UTF-16, and UTF-32 data:

// Synopsis of the match iterator for u32regex expressions. Per the
// accompanying text it mirrors regex_iterator, but because the expression
// type is fixed to u32regex the only template parameter is the iterator
// type of the text being searched.
template <class BidirectionalIterator>
class u32regex_iterator
{
   // for members see regex_iterator
};

// Convenience names for u32regex_iterator over raw pointer ranges:
// const char* for UTF-8, const UChar* for UTF-16 and const UChar32* for
// UTF-32, as the typedef names indicate.
typedef u32regex_iterator<const char*>     utf8regex_iterator;
typedef u32regex_iterator<const UChar*>    utf16regex_iterator;
typedef u32regex_iterator<const UChar32*>  utf32regex_iterator;

In order to simplify the construction of a u32regex_iterator from a string, there are a series of non-member helper functions called make_u32regex_iterator:

// Non-member helpers that construct a u32regex_iterator from a string.
// In every overload:
//   s - the text to search
//   e - the expression to search for
//   m - match flags (regex_constants::match_default when omitted)
// The returned iterator enumerates all occurrences of e in s.

// Overload for a const char* string.
u32regex_iterator<const char*> 
   make_u32regex_iterator(const char* s, 
                          const u32regex& e, 
                          regex_constants::match_flag_type m = regex_constants::match_default);
                          
// Overload for a const wchar_t* string.
u32regex_iterator<const wchar_t*> 
   make_u32regex_iterator(const wchar_t* s, 
                          const u32regex& e, 
                          regex_constants::match_flag_type m = regex_constants::match_default);
                          
// Overload for a const UChar* string.
u32regex_iterator<const UChar*> 
   make_u32regex_iterator(const UChar* s, 
                          const u32regex& e, 
                          regex_constants::match_flag_type m = regex_constants::match_default);
                          
// Overload for any std::basic_string; the result iterates over the
// string's const_iterator type.
template <class charT, class Traits, class Alloc>
u32regex_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> 
   make_u32regex_iterator(const std::basic_string<charT, Traits, Alloc>& s, 
                          const u32regex& e, 
                          regex_constants::match_flag_type m = regex_constants::match_default);
                          
// Overload for a UnicodeString; the result iterates over const UChar*.
u32regex_iterator<const UChar*> 
   make_u32regex_iterator(const UnicodeString& s, 
                          const u32regex& e, 
                          regex_constants::match_flag_type m = regex_constants::match_default);

Each of these overloads returns an iterator that enumerates all occurrences of expression e, in text s, using match_flags m.

Example: search for international currency symbols, along with their associated numeric value:

void enumerate_currencies(const std::string& text)
{
   // enumerate and print all the currency symbols, along
   // with any associated numeric values:
   const char* re = 
      "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
      "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
      "(?(1)"
         "|(?(2)"
            "[[:Cf:][:Cc:][:Z*:]]*"
         ")"
         "[[:Sc:]]"
      ")";
   boost::u32regex r = boost::make_u32regex(re);
   boost::u32regex_iterator<std::string::const_iterator> 
         i(boost::make_u32regex_iterator(text, r)), j;
   while(i != j)
   {
      std::cout << (*i)[0] << std::endl;
      ++i;
   }
}

Calling

enumerate_currencies(" $100.23 or £198.12 ");

Yields the output:

$100.23
£198.12

Provided of course that the input is encoded as UTF-8.

u32regex_token_iterator

Type u32regex_token_iterator is in all respects the same as regex_token_iterator, except that, since the regular expression type is always u32regex, it takes only one template parameter (the iterator type). It also calls u32regex_search internally, allowing it to interface correctly with UTF-8, UTF-16, and UTF-32 data:

// Synopsis of the token iterator for u32regex expressions. Per the
// accompanying text it mirrors regex_token_iterator, but because the
// expression type is fixed to u32regex the only template parameter is the
// iterator type of the text being searched.
template <class BidirectionalIterator>
class u32regex_token_iterator
{
   // for members see regex_token_iterator
};

// Convenience names for u32regex_token_iterator over raw pointer ranges:
// const char* for UTF-8, const UChar* for UTF-16 and const UChar32* for
// UTF-32, as the typedef names indicate.
typedef u32regex_token_iterator<const char*>     utf8regex_token_iterator;
typedef u32regex_token_iterator<const UChar*>    utf16regex_token_iterator;
typedef u32regex_token_iterator<const UChar32*>  utf32regex_token_iterator;

In order to simplify the construction of a u32regex_token_iterator from a string, there are a series of non-member helper functions called make_u32regex_token_iterator:

// Non-member helpers that construct a u32regex_token_iterator selecting a
// single marked sub-expression. In every overload:
//   s   - the text to search
//   e   - the expression to search for
//   sub - index of the marked sub-expression to enumerate
//   m   - match flags (regex_constants::match_default when omitted)
// The returned iterator enumerates all occurrences of sub-expression sub
// of e found in s.

// Overload for a const char* string.
u32regex_token_iterator<const char*> 
   make_u32regex_token_iterator(
         const char* s, 
         const u32regex& e, 
         int sub, 
         regex_constants::match_flag_type m = regex_constants::match_default);
                               
// Overload for a const wchar_t* string.
u32regex_token_iterator<const wchar_t*> 
   make_u32regex_token_iterator(
         const wchar_t* s, 
         const u32regex& e, 
         int sub, 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for a const UChar* string.
u32regex_token_iterator<const UChar*> 
   make_u32regex_token_iterator(
         const UChar* s, 
         const u32regex& e, 
         int sub, 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for any std::basic_string; the result iterates over the
// string's const_iterator type.
template <class charT, class Traits, class Alloc>
u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> 
   make_u32regex_token_iterator(
         const std::basic_string<charT, Traits, Alloc>& s, 
         const u32regex& e, 
         int sub, 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for a UnicodeString; the result iterates over const UChar*.
u32regex_token_iterator<const UChar*> 
   make_u32regex_token_iterator(
         const UnicodeString& s, 
         const u32regex& e, 
         int sub, 
         regex_constants::match_flag_type m = regex_constants::match_default);

Each of these overloads returns an iterator that enumerates all occurrences of marked sub-expression sub in regular expression e, found in text s, using match_flags m.

// Non-member helpers that construct a u32regex_token_iterator selecting
// several sub-expressions given as a built-in array of N indices.
// In every overload:
//   p / s    - the text to search (named p in most overloads here and s in
//              the UnicodeString one; the surrounding description calls it s)
//   e        - the expression to search for
//   submatch - array of sub-expression indices to enumerate
//   m        - match flags (regex_constants::match_default when omitted)

// Overload for a const char* string.
template <std::size_t N>
u32regex_token_iterator<const char*> 
   make_u32regex_token_iterator(
         const char* p, 
         const u32regex& e, 
         const int (&submatch)[N], 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for a const wchar_t* string.
template <std::size_t N>
u32regex_token_iterator<const wchar_t*> 
   make_u32regex_token_iterator(
         const wchar_t* p, 
         const u32regex& e, 
         const int (&submatch)[N], 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for a const UChar* string.
template <std::size_t N>
u32regex_token_iterator<const UChar*> 
   make_u32regex_token_iterator(
         const UChar* p, 
         const u32regex& e, 
         const int (&submatch)[N], 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for any std::basic_string; the result iterates over the
// string's const_iterator type.
template <class charT, class Traits, class Alloc, std::size_t N>
u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> 
   make_u32regex_token_iterator(
         const std::basic_string<charT, Traits, Alloc>& p, 
         const u32regex& e, 
         const int (&submatch)[N], 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for a UnicodeString; the result iterates over const UChar*.
template <std::size_t N>
u32regex_token_iterator<const UChar*> 
   make_u32regex_token_iterator(
         const UnicodeString& s, 
         const u32regex& e, 
         const int (&submatch)[N], 
         regex_constants::match_flag_type m = regex_constants::match_default);

Each of these overloads returns an iterator that, for every match of regular expression e found in the text (parameter p, or s in the UnicodeString overload), enumerates one sub-expression for each index listed in submatch, using match flags m.

// Non-member helpers that construct a u32regex_token_iterator selecting
// several sub-expressions given as a std::vector<int> of indices.
// In every overload:
//   p / s    - the text to search (named p in most overloads here and s in
//              the UnicodeString one; the surrounding description calls it s)
//   e        - the expression to search for
//   submatch - vector of sub-expression indices to enumerate
//   m        - match flags (regex_constants::match_default when omitted)

// Overload for a const char* string.
u32regex_token_iterator<const char*> 
   make_u32regex_token_iterator(
         const char* p, 
         const u32regex& e, 
         const std::vector<int>& submatch, 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for a const wchar_t* string.
u32regex_token_iterator<const wchar_t*> 
   make_u32regex_token_iterator(
         const wchar_t* p, 
         const u32regex& e, 
         const std::vector<int>& submatch, 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for a const UChar* string.
u32regex_token_iterator<const UChar*> 
   make_u32regex_token_iterator(
         const UChar* p, 
         const u32regex& e, 
         const std::vector<int>& submatch, 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for any std::basic_string; the result iterates over the
// string's const_iterator type.
template <class charT, class Traits, class Alloc>
u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> 
   make_u32regex_token_iterator(
         const std::basic_string<charT, Traits, Alloc>& p, 
         const u32regex& e, 
         const std::vector<int>& submatch, 
         regex_constants::match_flag_type m = regex_constants::match_default);
                                
// Overload for a UnicodeString; the result iterates over const UChar*.
u32regex_token_iterator<const UChar*> 
   make_u32regex_token_iterator(
         const UnicodeString& s, 
         const u32regex& e, 
         const std::vector<int>& submatch, 
         regex_constants::match_flag_type m = regex_constants::match_default);

Each of these overloads returns an iterator that, for every match of regular expression e found in the text (parameter p, or s in the UnicodeString overload), enumerates one sub-expression for each index listed in submatch, using match flags m.

Example: search for international currency symbols and their associated numeric values, printing sub-expression 1 (the leading currency symbol, when present) of each match:

void enumerate_currencies2(const std::string& text)
{
   // enumerate and print all the currency symbols, along
   // with any associated numeric values:
   const char* re = 
      "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
      "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
      "(?(1)"
         "|(?(2)"
            "[[:Cf:][:Cc:][:Z*:]]*"
         ")"
         "[[:Sc:]]"
      ")";
   boost::u32regex r = boost::make_u32regex(re);
   boost::u32regex_token_iterator<std::string::const_iterator> 
      i(boost::make_u32regex_token_iterator(text, r, 1)), j;
   while(i != j)
   {
      std::cout << *i << std::endl;
      ++i;
   }
}

Prev | Up | Home | Next