Browse Source

Added support for the UTF-8 flag and the icase option

master
Artyom Beilis 8 years ago
parent
commit
42ed3a3529
3 changed files with 64 additions and 6 deletions
  1. +2
    -0
      booster/booster/perl_regex.h
  2. +13
    -2
      booster/lib/regex/src/pcre_regex.cpp
  3. +49
    -4
      booster/lib/regex/test/test_regex.cpp

+ 2
- 0
booster/booster/perl_regex.h View File

@@ -120,6 +120,8 @@ namespace booster {

static const int perl = 0; ///< Expression type constant - Perl Compatible Regular Expression syntax.
static const int normal = 0; ///< Expression type constant - synonym of perl (same value), the default.
static const int icase = 0x100; ///< Perform case insensitive comparison, for example the pattern "a" would match "A" \ver{v1_2}
static const int utf8 = 0x200; ///< Assume that the input is UTF-8, so that for example '.' matches a whole UTF-8 code point; compiling a pattern with this flag throws regex_error if the PCRE headers do not define PCRE_UTF8 \ver{v1_2}

private:
struct data;


+ 13
- 2
booster/lib/regex/src/pcre_regex.cpp View File

@@ -118,7 +118,18 @@ namespace booster {
d->flags = flags;
char const *err_ptr = 0;
int offset = 0;
pcre *p=pcre_compile(pattern.c_str(),0,&err_ptr,&offset,0);
int pcre_flags = 0;
if(flags & icase) {
pcre_flags |= PCRE_CASELESS;
}
if(flags & utf8) {
#ifndef PCRE_UTF8
throw regex_error("PCRE Library does not support UTF-8 please upgrade");
#else
pcre_flags |= PCRE_UTF8;
#endif
}
pcre *p=pcre_compile(pattern.c_str(),pcre_flags,&err_ptr,&offset,0);
if(!p) {
std::ostringstream ss;
ss << "booster::regex:" << err_ptr <<", at offset "<<offset;
@@ -137,7 +148,7 @@ namespace booster {
anchored+=pattern;
anchored+=")\\z";

p=pcre_compile(anchored.c_str(),0,&err_ptr,&offset,0);
p=pcre_compile(anchored.c_str(),pcre_flags,&err_ptr,&offset,0);
if(!p) {
throw regex_error("booster::regex: Internal error");
}


+ 49
- 4
booster/lib/regex/test/test_regex.cpp View File

@@ -7,12 +7,20 @@
//
#include <booster/regex.h>
#include "test.h"
#include <pcre.h>

// THROWS(x,te): evaluate statement/expression `x` and require that it throws
// an exception of type `te`.
//  - `x` throws `te` (or a type derived from it): the macro completes silently.
//  - `x` throws anything else, or does not throw at all: the macro throws
//    std::runtime_error whose text holds the file, the line and the
//    stringified `x`.
// Wrapped in do { ... } while(0) so it can be used like a single statement.
#define THROWS(x,te) do { \
bool got_expected_exception_ = false; \
try { \
x; \
} \
catch(te const &) { \
got_expected_exception_ = true; \
} \
catch(...) { \
} \
if(!got_expected_exception_) { \
std::ostringstream oss; \
oss << "Error " << __FILE__ << ":"<<__LINE__ << " "#x; \
throw std::runtime_error(oss.str()); \
} \
}while(0)

#include <iostream>

bool search(std::string r,std::string t)
bool search(std::string r,std::string t,int flags=0)
{
booster::regex re(r);
booster::regex re(r,flags);
bool v1 = re.search(t.c_str(),t.c_str()+t.size());
std::vector<std::pair<int,int> > m;
bool v2 = re.search(t.c_str(),t.c_str()+t.size(),m);
@@ -20,9 +28,9 @@ bool search(std::string r,std::string t)
return v1;
}

bool match(std::string r,std::string t)
bool match(std::string r,std::string t,int flags = 0)
{
booster::regex re(r);
booster::regex re(r,flags);
std::vector<std::pair<int,int> > m;
bool v1 = re.match(t.c_str(),t.c_str()+t.size());
bool v2 = re.match(t.c_str(),t.c_str()+t.size(),m);
@@ -164,6 +172,43 @@ int main()
TEST(!match("((((((((((foo))))))))))","foox"));
TEST(!match("((((((((((foo))))))))))","xfoo"));
TEST(!match("a","A",0));
TEST(!search("a","xAz",0));
TEST(match("a","A",booster::regex::icase));
TEST(search("a","xAz",booster::regex::icase));



int utf8 = 0;
#ifdef PCRE_UTF8
pcre_config(PCRE_CONFIG_UTF8,&utf8);
#endif

if(utf8) {

std::cout << "Testing UTF-8" << std::endl;
TEST(match(".","\xD7\x90",booster::regex::utf8));
TEST(match(".","\xD0\x96",booster::regex::utf8));
TEST(match("\xD0\x96","\xD0\xB6",booster::regex::icase | booster::regex::utf8));

int prop=0;
pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES,&prop);
if(prop) {
std::cout << "Testing Unicode Properties" << std::endl;
TEST(match("\\p{Hebrew}","\xD7\x90",booster::regex::utf8));
TEST(!match("\\p{Hebrew}","\xD0\x96",booster::regex::utf8));
TEST(!match("\\p{Hebrew}","a",booster::regex::utf8));
}
else {
std::cout << "Unicode properties not compiled in" << std::endl;
THROWS(match("\\p{Hebrew}","a",booster::regex::utf8),booster::regex_error);
}
}
else {
std::cout << "UTF-8 is not compiled in" << std::endl;
THROWS(match(".","a",booster::regex::utf8),booster::regex_error);
}
std::cout << "Testing match_result" << std::endl;
test_match<std::string,booster::smatch>("aab");
test_match<char const *,booster::cmatch>("aab");


Loading…
Cancel
Save