libregexis024/src/libregexis024tools/stringmatching.cpp

110 lines
4.2 KiB
C++

#include <algorithm>
#include <libregexis024tools/stringmatching.h>
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024vm/libregexis024vm_interface.h>
#include <libregexis024vm/utils.h>
#include <assert.h>
// using namespace regexis024;
namespace regexis024 {
// Copies the tracking-variable description held in `from` into `to`,
// one field at a time. Only the fields listed below are transferred.
void convert(TrackingVariableInfo& to, const SubtrackingNameInfo& from) {
    to.type = from.type;

    // Fields related to the "ca" storage.
    to.colarr_first = from.colarr_first;
    to.colarr_second = from.colarr_second;
    to.stored_in_ca = from.stored_in_ca;

    // Fields related to the "sa" storage.
    to.selarr_first = from.selarr_first;
    to.selarr_second = from.selarr_second;
    to.stored_in_sa = from.stored_in_sa;
}
/* Compiles `pattern`, runs the compiled program over the whole of `input`
 * and reports whether the input matched.
 *
 * Parameters:
 *   input           - text to match; decoded with utf8_string_iterat().
 *   pattern         - regular expression source handed to REGEX_IS024_MeaningContext.
 *   retMatchInfo    - reset on entry; on a match receives have_match = true, the
 *                     selection array values and the CA history of the matched thread.
 *   retTrackVarList - reset on entry; filled with one entry per tracking name
 *                     reported by the compiled pattern.
 *   retStatus       - reset on entry; receives a human-readable message on failure.
 *
 * Returns 0 when matching ran to completion (regardless of whether the input
 * matched - check retMatchInfo.have_match) and -1 on any failure, in which
 * case retStatus describes the failing stage.
 */
int matchStrToRegexp(const std::string& input, const std::string& pattern,
    MatchInfo& retMatchInfo, track_var_list& retTrackVarList, std::string& retStatus)
{
    // Reset every output first so that an early failure never exposes stale data.
    retTrackVarList = {};
    retMatchInfo = MatchInfo();
    retStatus = "";

    REGEX_IS024_MeaningContext regexp(pattern.size(), pattern.data());
    if (regexp.error) {
        retStatus = "Pattern compilation. " + regexp.error_msg;
        return -1;
    }

    // Publish the description of every named tracking variable of the pattern.
    for (auto& iip: regexp.ktr.track_names) {
        convert(retTrackVarList[iip.first], regexp.ktr.retrieval_info[iip.second]);
    }

    // Every numeric argument is the maximum of its integer type.
    // NOTE(review): presumably these are resource limits, i.e. "unlimited" - confirm
    // against the VirtualMachine constructor.
    VirtualMachine vm(regexp.compiled_program.size(), regexp.compiled_program.data(),
        UINT64_MAX, UINT16_MAX,
        UINT32_MAX, UINT32_MAX, UINT64_MAX);
    auto getVMErrString = [&]() -> std::string {
        return std::string(error_code_to_str(vm.getErrno()));
    };

    if (vm.initialize() != error_codes::stable) {
        retStatus = "Virtual machine initialization. " + getVMErrString();
        return -1;
    }

    // This driver can supply at most one extra character on each side of the input.
    int left_ext_feed = vm.getInputLeftExtensionSize();
    int right_ext_feed = vm.getInputRightExtensionSize();
    if (left_ext_feed > 1 || right_ext_feed > 1) {
        retStatus = "Unnatural extended input request.";
        return -1;
    }

    if (vm.addNewMatchingThread() != error_codes::stable) {
        retStatus = "Virtual machine first kick. " + getVMErrString();
        // Stop here like every other failure path: continuing would keep driving a VM
        // that already reported an error and could overwrite this status message.
        return -1;
    }

    // A requested left extension is fed as a single '\n'.
    if (left_ext_feed) {
        if (vm.extendedFeedCharacter('\n') != error_codes::stable) {
            retStatus = "VM left extended input. " + getVMErrString();
            return -1;
        }
    }

    // Feed the input one decoded code point at a time; `adj` is the number of bytes
    // the current code point occupies and is used to advance the position.
    for (size_t cur_text_pos = 0;cur_text_pos < input.size();) {
        int32_t inp_code;
        size_t adj;
        utf8_string_iterat(inp_code, adj, cur_text_pos, input.data(), input.size());
        if (inp_code < 0) {
            retStatus = "Input string encoding error.";
            return -1;
        }
        if (vm.feedCharacter(static_cast<uint64_t>(inp_code), adj) != error_codes::stable) {
            retStatus = "VM input. " + getVMErrString();
            return -1;
        }
        cur_text_pos += adj;
    }

    // A requested right extension is fed as a single '\n'.
    if (right_ext_feed) {
        if (vm.extendedFeedCharacter('\n') != error_codes::stable) {
            retStatus = "VM right extended input. " + getVMErrString();
            return -1;
        }
    }

    assert(vm.isUsable());
    if (vm.isMatched()) {
        retMatchInfo.have_match = true;
        size_t SN1 = vm.getSelectionArrayLength();
        retMatchInfo.sa.assign(SN1, 0);
        for (size_t i = 0; i < SN1; i++)
            retMatchInfo.sa[i] = vm.getMatchedThreadSAValue(i);
        // The VM returns the CA branch reversed; flip it before storing it as the history.
        retMatchInfo.ca_history = vm.getMatchedThreadCABranchReverse();
        std::reverse(retMatchInfo.ca_history.begin(), retMatchInfo.ca_history.end());
    }
    return 0;
}
// Two MatchInfo objects are equal when neither holds a match, or when both
// hold a match with identical `sa` and `ca_history` contents. The payload of
// a non-matching object is ignored.
bool MatchInfo::operator==(const MatchInfo &other) const {
    if (have_match != other.have_match) {
        return false;
    }
    if (!have_match) {
        // Neither side matched: stored arrays are irrelevant.
        return true;
    }
    if (!(sa == other.sa)) {
        return false;
    }
    return ca_history == other.ca_history;
}
// Inequality is defined strictly as the negation of operator==.
bool MatchInfo::operator!=(const MatchInfo &other) const {
    const bool same = operator==(other);
    return !same;
}
// Builds a MatchInfo that represents a successful match: the given CA history
// and selection array are copied in and have_match is set to true.
MatchInfo::MatchInfo(const std::vector<CAEvent> &ca_history, const std::vector<uint64_t> &sa)
    : ca_history(ca_history),
      sa(sa),
      have_match(true)
{}
}