libregexis024/src/libregexis024sol/expr_parse_functions/ep_sequence.cpp
2024-07-28 19:54:57 +03:00

223 lines
11 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <libregexis024sol/expr_parse_functions/epf.h>
#include <assert.h>
#include <libregexis024sol/expr_parse_functions/tracking_units.h>
#include <libregexis024sol/sol_misc_base.h>
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024sol/special_terminals.h>
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024sol/square_bracket_expression.h>
#include <libregexis024sol/expr_parse_functions/command_recognition.h>
#include <libregexis024fa/misc_fa_funcs.h>
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
#define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0)
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
/* **************************** Sequence */
/**
 * Compiles one backslash-introduced item found at the current input position into `backPart`.
 *
 * The backslash itself is consumed here. The character after it selects the construct:
 *   - `b`: fork into (look-behind non-word, look-ahead word) or (look-behind word, look-ahead non-word)
 *   - `B`: fork into (look-behind word, look-ahead word) or (look-behind non-word, look-ahead non-word)
 *   - `<`: look-behind non-word followed by look-ahead word
 *   - `>`: look-behind word followed by look-ahead non-word
 *   - anything else: delegated to backslash_expression_parsing_try_regular(), whose resulting
 *     codeset becomes a single character-reading filter.
 *
 * @param ctx       parsing context; errors are signalled through ctx.error
 * @param cc        common codesets; only cc.word_constituents is used directly here
 * @param fa        container in which the automaton nodes are created
 * @param backPart  output: start node and dangling exits of the compiled item.
 *                  Left untouched if an error is reported.
 */
void in_case_of_backslash(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, FA_Container &fa, SubExprCompiled& backPart) {
    // The read must happen outside assert(): under NDEBUG the assert argument is not evaluated
    // at all, which would leave the backslash unconsumed.
    const auto backslash = readChar(ctx);
    assert(backslash == U'\\');
    static_cast<void>(backslash); // silences the unused-variable warning in NDEBUG builds

    int32_t leader = peep(ctx); aux_ERROR_CHECK;
    if (leader == U'b') {
        // Two alternatives joined by a fork: non-word -> word, or word -> non-word.
        FA_NodeOfForking* n1 = fa.makeForking();
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'B') {
        // Two alternatives joined by a fork: word on both sides, or non-word on both sides.
        FA_NodeOfForking* n1 = fa.makeForking();
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'<') {
        // Non-word behind, word ahead.
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else if (leader == U'>') {
        // Word behind, non-word ahead.
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else {
        bool ret_is_multicode = false;
        codeset_t res_codeset;
        backslash_expression_parsing_try_regular(ctx, cc, ret_is_multicode, res_codeset);
        // Do not build a filter from the output of a failed parse.
        aux_ERROR_CHECK;
        backPart = subexpr_charset_reading_filter(res_codeset, fa);
        // Return here so the leader is not read a second time by the readChar() below;
        // presumably the helper above has already consumed it - TODO confirm against its definition.
        return;
    }
    // The four assertion forms only peeked at the leader; consume it now.
    readChar(ctx);
}
/**
 * Validates a repetition range and, if it is acceptable, applies it to `patient`
 * through apply_repeat_to_subexpression().
 *
 * An error is reported (and nothing is applied) when:
 *   - min_allowed is greater than max_allowed;
 *   - min_allowed is greater than REGEXIS024_MAX_REPEAT;
 *   - max_allowed is greater than REGEXIS024_MAX_REPEAT while patient.can_be_empty is set.
 *
 * @param ctx          parsing context used for error reporting
 * @param patient      subexpression to be repeated (modified in place)
 * @param fa           container in which the automaton nodes are created
 * @param min_allowed  lower bound of the repetition count
 * @param max_allowed  upper bound of the repetition count; callers in this file pass
 *                     REGEXIS024_MAX_REPEAT + 1 when no upper bound was given
 */
void repeat_stuff_with_check(REGEX_IS024_MeaningContext& ctx,
                             SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed){
    if (max_allowed < min_allowed) {
        aux_THROW("repeat operation: min > max");
    }
    if (REGEXIS024_MAX_REPEAT < min_allowed) {
        aux_THROW("minimum repeat factor is too high");
    }
    const bool exceeds_limit = REGEXIS024_MAX_REPEAT < max_allowed;
    if (exceeds_limit && patient.can_be_empty) {
        aux_THROW("safety abortion: possible infinite loop. Если вы считаете, что ваше регулярное "
                  "выражение корректно и не вызвает бесконечного цикла, напишите об этом в жалобную книгу: "
                  "По ссылке: file:///dev/null Ваши предложения по улучшению libregexis024 обязательно будут рассмотрены.");
    }
    apply_repeat_to_subexpression(patient, fa, min_allowed, max_allowed);
}
/**
 * Applies a `!repeat` / `!r` command to the most recently parsed subexpression.
 *
 * Argument forms handled:
 *   - no arguments, or one empty argument: range 0 .. REGEXIS024_MAX_REPEAT + 1
 *   - one non-empty argument N:            range N .. N
 *   - two arguments (each may be empty):   an empty first argument means 0,
 *                                          an empty second means REGEXIS024_MAX_REPEAT + 1
 *   - more than two arguments:             error
 *
 * @param ctx    parsing context used for error reporting
 * @param fa     container in which the automaton nodes are created
 * @param parts  subexpressions parsed so far; the last one is modified. Must not be empty.
 * @param cmd    the parsed command whose arguments are interpreted
 */
void repeat_command_processing(REGEX_IS024_MeaningContext &ctx, FA_Container &fa, std::vector<SubExprCompiled>& parts,
                               const Command& cmd){
    if (parts.empty()) {
        aux_THROW("no subexpression before !repeat command");
    }

    const size_t arg_count = cmd.arguments.size();
    if (arg_count > 2) {
        aux_THROW("too many arguments in !repeat command");
    }

    SubExprCompiled& target = parts.back();

    // `!repeat` with nothing meaningful specified.
    if (arg_count == 0 || (arg_count == 1 && cmd.arguments[0].is_empty)) {
        repeat_stuff_with_check(ctx, target, fa, 0, REGEXIS024_MAX_REPEAT + 1);
        return;
    }

    // Exact repetition count.
    if (arg_count == 1) {
        size_t exact_count;
        int_parse_with_limit_concern(cmd.arguments[0].name, ctx, exact_count, REGEXIS024_MAX_REPEAT);
        aux_ERROR_CHECK;
        repeat_stuff_with_check(ctx, target, fa, exact_count, exact_count);
        return;
    }

    // Two arguments: each bound falls back to its default when left empty.
    size_t lower_bound = 0;
    if (!cmd.arguments[0].is_empty) {
        int_parse_with_limit_concern(cmd.arguments[0].name, ctx, lower_bound, REGEXIS024_MAX_REPEAT);
        aux_ERROR_CHECK;
    }
    size_t upper_bound = REGEXIS024_MAX_REPEAT + 1;
    if (!cmd.arguments[1].is_empty) {
        int_parse_with_limit_concern(cmd.arguments[1].name, ctx, upper_bound, REGEXIS024_MAX_REPEAT);
        aux_ERROR_CHECK;
    }
    if (upper_bound < lower_bound) {
        aux_THROW("!repeat: min > max");
    }
    repeat_stuff_with_check(ctx, target, fa, lower_bound, upper_bound);
}
/**
 * Parses a sequence of regex items starting at the current position, appending each compiled
 * item to `parts`, until a character that cannot continue the sequence is met.
 *
 * Items recognised by their first character:
 *   - `!`  command: a header command stops the sequence (position is rewound to before it);
 *          `r`/`repeat` modifies the last part; charset commands append a filter; anything else is an error
 *   - `\`  backslash item, see in_case_of_backslash()
 *   - `^`  look-behind for '\n';  `$`  look-ahead for '\n'
 *   - `*`, `+`, `?`  repetition of the last part
 *   - `#name(`  tracked bracket group, `#name:value;` immediate track write, `#name;` position track write
 *   - `(`  untracked bracket group
 *   - `[`  square-bracket expression
 *   - any other non-negative character except `)`, `|`, `]`: a literal
 *
 * @return a new BracketLvl_ParseCall when a bracket group has to be parsed first (the group's
 *         result is written into the freshly appended last element of `parts`); otherwise null,
 *         after all parts have been joined into `result`. Null is also returned when an error
 *         has been reported through ctx.
 */
chekushka Sequence_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
/* Shared prologue of the postfix operators: requires a preceding part, then consumes the operator. */
#define vibe_check(sn) do { if (parts.empty()) { call_THROW("no subexpression before `" sn "` operator"); } readChar(ctx); } while (0)
    while (true) {
        const int32_t fst = peep(ctx);
        call_ERROR_CHECK;
        if (fst == U'!') {
            // Remember where the command starts so a header command can be left unread.
            const size_t before_cmd = ctx.pos;
            Command cmdBuf = command_expr_parse(ctx);
            call_ERROR_CHECK;
            if (is_header_cmd(cmdBuf)) {
                ctx.pos = before_cmd;
                break;
            } else if (cmdBuf.name == "r" || cmdBuf.name == "repeat") {
                repeat_command_processing(ctx, fa, parts, cmdBuf); call_ERROR_CHECK;
            } else if (is_command_for_charset(cmdBuf)) {
                codeset_t cs;
                interpret_command_as_charset_giving(pctx.cc, cmdBuf, cs); call_ERROR_CHECK;
                parts.push_back(subexpr_charset_reading_filter(cs, fa));
            } else {
                call_THROW("unknown command");
            }
        } else if (fst == U'\\') {
            parts.emplace_back();
            in_case_of_backslash(ctx, pctx.cc, fa, parts.back());
            call_ERROR_CHECK;
        } else if (fst == U'^') {
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneBehind(codeset_of_one_char(U'\n'))));
        } else if (fst == U'$') {
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneAhead(codeset_of_one_char(U'\n'))));
        } else if (fst == U'*') {
            vibe_check("*");
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'+') {
            vibe_check("+");
            repeat_stuff_with_check(ctx, parts.back(), fa, 1, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'?') {
            vibe_check("?");
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, 1); call_ERROR_CHECK;
        } else if (fst == U'#') {
            readChar(ctx);
            std::string name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
            if (name.empty()) {
                call_THROW("No name provided after #");
            }
            // First mention of this name: allocate the next retrieval_info slot for it.
            if (ctx.ktr.track_names.count(name) == 0) {
                ctx.ktr.track_names[name] = static_cast<int64_t>(ctx.ktr.retrieval_info.size());
                ctx.ktr.retrieval_info.emplace_back();
            }
            const int64_t id = ctx.ktr.track_names[name];
            const int32_t typeDet = peep(ctx); call_ERROR_CHECK;
            if (typeDet == U'(') {
                ensure_space_for_track_unit(ctx, name, tracking_var_types::range); call_ERROR_CHECK;
                // The `(` is left unread here, same as in the untracked bracket branch below.
                parts.emplace_back();
                return std::make_unique<BracketLvl_ParseCall>(parts.back(), id);
            } else if (typeDet == U':') {
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_immediate); call_ERROR_CHECK;
                readChar(ctx);
                std::string value_str = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
                size_t value = 0;
                // Without this check a failed parse would let an unset `value` reach the automaton.
                int_parse_with_limit_concern(value_str, ctx, value, UINT16_MAX); call_ERROR_CHECK;
                const int32_t cl = peep(ctx); call_ERROR_CHECK;
                if (cl != U';') {
                    call_THROW("Missing ; after dot track unit operator");
                }
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa) {
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_SELARR_IMM,
                                                ctx.ktr.retrieval_info[id].selarr_first, value)));
                }
                if (ctx.ktr.retrieval_info[id].stored_in_ca) {
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_COLARR_IMM,
                                                ctx.ktr.retrieval_info[id].colarr_first, value)));
                }
            } else if (typeDet == U';') {
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_cur_pos); call_ERROR_CHECK;
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa) {
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS,
                                                          ctx.ktr.retrieval_info[id].selarr_first)));
                }
                if (ctx.ktr.retrieval_info[id].stored_in_ca) {
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS,
                                                          ctx.ktr.retrieval_info[id].colarr_first)));
                }
            } else {
                call_THROW("Missing ; or ( in the beginning of tracking unit");
            }
        } else if (fst == U'(') {
            // Untracked group: -1 is passed where the tracked branch passes the track id.
            parts.emplace_back();
            return std::make_unique<BracketLvl_ParseCall>(parts.back(), -1);
        } else if (fst == U'[') {
            codeset_t filter = sq_bracket_expr_parse(ctx, pctx.cc); call_ERROR_CHECK;
            parts.push_back(subexpr_charset_reading_filter(filter, fa));
        } else if (fst >= 0 && fst != U')' && fst != U'|' && fst != U']') {
            // Plain literal character.
            readChar(ctx);
            parts.push_back(subexpr_charset_reading_filter(codeset_of_one_char(fst), fa));
        } else {
            // Negative value, `)`, `|` or `]`: this sequence ends here.
            break;
        }
    }
#undef vibe_check
    // Concatenate everything collected (including parts gathered before any bracket sub-call).
    for (SubExprCompiled& part: parts) {
        result = join(result, part);
    }
    return nullptr;
}
/* Resumes this sequence by forwarding to firstTime() with the same arguments and returning
 * whatever it returns. Parsing therefore continues from the current position in ctx, and
 * further items are appended to the `parts` already collected. */
chekushka Sequence_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
// This is possible only if I received a bracket expression
return firstTime(ctx, pctx, fa);
}