Implement support for regex capture groups.

This commit is contained in:
Sadie Powell 2022-05-06 15:06:56 +01:00
parent 5d0e5914a0
commit f7c041f560
6 changed files with 160 additions and 2 deletions

View File

@ -30,9 +30,16 @@ namespace Regex
// Forward declarations of the Regex types so they can be referenced by the
// aliases and class definitions below before they are fully defined.
class Engine;
class EngineReference;
class Exception;
class MatchCollection;
class Pattern;
template<typename> class SimpleEngine;
/** A list of matches that were captured by index. */
typedef std::vector<std::string> Captures;
/** A list of matches that were captured by name (capture name => captured text). */
typedef insp::flat_map<std::string, std::string> NamedCaptures;
/** A shared pointer to a regex pattern. */
typedef std::shared_ptr<Pattern> PatternPtr;
@ -146,6 +153,34 @@ public:
}
};
/** Holds the substrings that were extracted by a successful regex match. */
class Regex::MatchCollection
{
private:
	/** The substrings that were captured.
	 * Not declared const so that moving a MatchCollection (e.g. into a
	 * std::optional return value) moves the container instead of copying it;
	 * callers only ever get const access through GetCaptures().
	 */
	Captures captures;

	/** The substrings that were captured by name. */
	NamedCaptures namedcaptures;

public:
	/** Initializes a new instance of the Regex::MatchCollection class.
	 * Both arguments are taken by value and moved into place, so a caller
	 * that passes an rvalue (std::move(...)) pays no copy at all while a
	 * caller that passes an lvalue still gets an independent copy.
	 * @param c The substrings that were captured.
	 * @param nc The substrings that were captured by name.
	 */
	MatchCollection(Captures c, NamedCaptures nc)
		: captures(std::move(c))
		, namedcaptures(std::move(nc))
	{
	}

	/** Retrieves the substrings that were captured. */
	const Captures& GetCaptures() const { return captures; }

	/** Retrieves the substrings that were captured by name. */
	const NamedCaptures& GetNamedCaptures() const { return namedcaptures; }
};
/** Represents a compiled regular expression pattern. */
class Regex::Pattern
{
@ -182,6 +217,12 @@ public:
* @return If the text matched the pattern then true; otherwise, false.
*/
virtual bool IsMatch(const std::string& text) = 0;
/** Attempts to extract this pattern's match groups from the specified text.
* @param text The text to extract match groups from.
* @return If the text matched the pattern then a match collection; otherwise, std::nullopt.
*/
virtual std::optional<MatchCollection> Matches(const std::string& text) = 0;
};
inline Regex::PatternPtr Regex::Engine::CreateHuman(const std::string& pattern) const

View File

@ -68,11 +68,56 @@ public:
/** Determines whether the specified text matches this pattern.
 * @param text The text to check against the compiled PCRE2 pattern.
 * @return True if pcre2_match reported a non-negative result; otherwise, false.
 */
bool IsMatch(const std::string& text) override
{
	// The match data is only needed to satisfy the pcre2_match API; its
	// contents are discarded. It is sized from the compiled pattern and is
	// declared exactly once so that it is always released below.
	pcre2_match_data* unused = pcre2_match_data_create_from_pattern(regex, nullptr);
	const int result = pcre2_match(regex, reinterpret_cast<PCRE2_SPTR8>(text.c_str()), text.length(), 0, 0, unused, nullptr);
	pcre2_match_data_free(unused);
	return result >= 0;
}
std::optional<Regex::MatchCollection> Matches(const std::string& text) override
{
pcre2_match_data* data = pcre2_match_data_create_from_pattern(regex, nullptr);
int result = pcre2_match(regex, reinterpret_cast<PCRE2_SPTR8>(text.c_str()), text.length(), 0, 0, data, nullptr);
if (result < 0)
return std::nullopt;
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(data);
uint32_t capturecount;
Regex::Captures captures;
if (!pcre2_pattern_info(regex, PCRE2_INFO_CAPTURECOUNT, &capturecount) && capturecount)
{
for (uint32_t idx = 0; idx <= capturecount; ++idx)
{
PCRE2_UCHAR* bufferptr;
PCRE2_SIZE bufferlen;
if (!pcre2_substring_get_bynumber(data, idx, &bufferptr, &bufferlen))
captures.emplace_back(reinterpret_cast<const char*>(bufferptr), bufferlen);
}
}
uint32_t namedcapturecount;
Regex::NamedCaptures namedcaptures;
if (!pcre2_pattern_info(regex, PCRE2_INFO_NAMECOUNT, &namedcapturecount) && namedcapturecount)
{
uint32_t nameentrysize;
PCRE2_SPTR nametable;
if (!pcre2_pattern_info(regex, PCRE2_INFO_NAMEENTRYSIZE, &nameentrysize)
&& !pcre2_pattern_info(regex, PCRE2_INFO_NAMETABLE, &nametable))
{
for (uint32_t idx = 0; idx < namedcapturecount; ++idx)
{
int matchidx = (nametable[0] << 8) | nametable[1];
const std::string matchname(reinterpret_cast<const char*>(nametable + 2), nameentrysize - 3);
const std::string matchvalue(text.c_str() + ovector[2 * matchidx], ovector[ 2 * matchidx + 1] - ovector[2 * matchidx]);
namedcaptures.emplace(std::move(matchname), std::move(matchvalue));
nametable += nameentrysize;
}
}
}
return Regex::MatchCollection(std::move(captures), std::move(namedcaptures));
}
};
class ModuleRegexPCRE final

View File

@ -38,7 +38,7 @@ public:
POSIXPattern(const Module* mod, const std::string& pattern, uint8_t options)
: Regex::Pattern(pattern, options)
{
int flags = REG_EXTENDED | REG_NOSUB;
int flags = REG_EXTENDED;
if (options & Regex::OPT_CASE_INSENSITIVE)
flags |= REG_ICASE;
@ -66,6 +66,29 @@ public:
{
return !regexec(&regex, text.c_str(), 0, NULL, 0);
}
std::optional<Regex::MatchCollection> Matches(const std::string& text) override
{
std::vector<regmatch_t> matches(32);
int result = regexec(&regex, text.c_str(), matches.size(), &matches[0], 0);
if (result)
return std::nullopt;
Regex::Captures captures;
for (const auto& match : matches)
{
if (match.rm_so == -1 || match.rm_eo == -1)
break;
captures.emplace_back(text.c_str() + match.rm_so, match.rm_eo - match.rm_so);
}
captures.shrink_to_fit();
// The posix engine does not support named captures.
static const Regex::NamedCaptures unusednc;
return Regex::MatchCollection(std::move(captures), unusednc);
}
};
class ModuleRegexPOSIX final

View File

@ -58,6 +58,27 @@ public:
{
return RE2::FullMatch(text, regex);
}
std::optional<Regex::MatchCollection> Matches(const std::string& text) override
{
std::vector<re2::StringPiece> re2captures(regex.NumberOfCapturingGroups() + 1);
bool result = regex.Match(text, 0, text.length(), RE2::ANCHOR_BOTH, &re2captures[0], static_cast<int>(re2captures.size()));
if (!result)
return std::nullopt;
Regex::Captures captures;
Regex::NamedCaptures namedcaptures;
for (size_t idx = 0; idx < re2captures.size(); ++idx)
{
captures.emplace_back(re2captures[idx]);
auto iter = regex.CapturingGroupNames().find(static_cast<int>(idx));
if (iter != regex.CapturingGroupNames().end())
namedcaptures.emplace(iter->second, re2captures[idx]);
}
return Regex::MatchCollection(captures, namedcaptures);
}
};
class ModuleRegexRE2 final

View File

@ -39,6 +39,18 @@ public:
{
return InspIRCd::Match(text, GetPattern());
}
std::optional<Regex::MatchCollection> Matches(const std::string& text) override
{
if (!InspIRCd::Match(text, GetPattern()))
return std::nullopt;
// The glob engine does not support any kind of capture.
static const Regex::Captures unusedc;
static const Regex::NamedCaptures unusednc;
return Regex::MatchCollection(unusedc, unusednc);
}
};
class ModuleRegexGlob final

View File

@ -54,6 +54,22 @@ public:
{
return std::regex_search(text, regex);
}
/** Attempts to extract this pattern's match groups from the specified text.
 * The pattern is searched for anywhere within the text (std::regex_search).
 * @param text The text to extract match groups from.
 * @return If the text matched the pattern then a match collection; otherwise, std::nullopt.
 */
std::optional<Regex::MatchCollection> Matches(const std::string& text) override
{
	std::smatch matches;
	if (!std::regex_search(text, matches, regex))
		return std::nullopt;

	// Reserve rather than size-construct: constructing the vector with
	// matches.size() elements would pre-fill it with that many empty strings
	// and the loop below would then append the real captures after them.
	Regex::Captures captures;
	captures.reserve(matches.size());
	for (const auto& match : matches)
		captures.push_back(match.str());

	// The stdregex engine does not support named captures.
	static const Regex::NamedCaptures unusednc;
	return Regex::MatchCollection(std::move(captures), unusednc);
}
};
class StdLibEngine final