223 lines
11 KiB
C++
223 lines
11 KiB
C++
#include <libregexis024sol/expr_parse_functions/epf.h>
|
||
#include <assert.h>
|
||
#include <libregexis024sol/expr_parse_functions/tracking_units.h>
|
||
#include <libregexis024sol/sol_misc_base.h>
|
||
#include <libregexis024sol/expr_compiler.h>
|
||
#include <libregexis024sol/special_terminals.h>
|
||
#include <libregexis024vm/vm_opcodes.h>
|
||
#include <libregexis024sol/square_bracket_expression.h>
|
||
#include <libregexis024sol/expr_parse_functions/command_recognition.h>
|
||
#include <libregexis024fa/misc_fa_funcs.h>
|
||
|
||
// Error-propagation helpers. All four macros expect a variable named `ctx` with an
// `error` member to be in scope at the point of use.
//
// call_ERROR_CHECK: for functions that return a parse call; leaves the enclosing
// function with NULL as soon as ctx.error is set.
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
|
||
// call_THROW: reports "regex: " followed by `str` through report(ctx, ...) and returns
// NULL. `str` must be one or more string literals, because the prefix is attached by
// adjacent-literal concatenation.
#define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0)
|
||
// aux_ERROR_CHECK: same as call_ERROR_CHECK, but for helper functions returning void.
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
|
||
// aux_THROW: same as call_THROW, but for helper functions returning void.
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
|
||
|
||
/* **************************** Sequence */
|
||
|
||
/**
 * Parses one backslash escape starting at the current position of `ctx` and stores the
 * compiled sub-automaton in `backPart`.
 *
 * Handled directly here (zero-width look-around constructs built from
 * `cc.word_constituents`):
 *   \b  - fork between (non-word behind, word ahead) and (word behind, non-word ahead)
 *   \B  - fork between (word behind, word ahead) and (non-word behind, non-word ahead)
 *   \<  - non-word behind, word ahead
 *   \>  - word behind, non-word ahead
 * Every other escape is delegated to backslash_expression_parsing_try_regular() and
 * compiled as a character-set filter.
 *
 * @param ctx      parsing context; the current character must be the backslash.
 * @param cc       common codesets (only `word_constituents` is read here).
 * @param fa       container that creates the automaton nodes.
 * @param backPart output: receives `start` / `ends` of the compiled escape. Left
 *                 untouched when an error is reported through `ctx.error`.
 */
void in_case_of_backslash(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, FA_Container &fa, SubExprCompiled& backPart) {
    // The read must stay outside of assert(): with NDEBUG the asserted expression is not
    // evaluated at all, so the backslash would never be consumed in release builds.
    const int32_t backslash = readChar(ctx);
    assert(backslash == U'\\');
    (void)backslash; // silences the unused-variable warning when NDEBUG is defined

    int32_t leader = peep(ctx); aux_ERROR_CHECK;
    if (leader == U'b'){
        // Word boundary: exactly one side of the position is a word constituent.
        FA_NodeOfForking* n1 = fa.makeForking();
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'B'){
        // Not a word boundary: both sides are word constituents, or neither is.
        FA_NodeOfForking* n1 = fa.makeForking();
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'<'){
        // Start of word: non-word behind, word ahead.
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else if (leader == U'>'){
        // End of word: word behind, non-word ahead.
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else {
        bool ret_is_multicode; codeset_t res_codeset;
        backslash_expression_parsing_try_regular(ctx, cc, ret_is_multicode, res_codeset);
        // Do not build a filter from a codeset that the failed parse may not have filled.
        aux_ERROR_CHECK;
        backPart = subexpr_charset_reading_filter(res_codeset, fa);
        return; // To avoid reading leader again (it gets read in the end)
    }
    // Consume the leader character of the escapes handled directly above.
    readChar(ctx);
}
|
||
|
||
/**
 * Validates a repetition range and, when it is acceptable, applies it to `patient`
 * through apply_repeat_to_subexpression().
 *
 * Rejected (reported through `ctx`, nothing applied):
 *   - min_allowed greater than max_allowed;
 *   - min_allowed greater than REGEXIS024_MAX_REPEAT;
 *   - max_allowed greater than REGEXIS024_MAX_REPEAT while `patient.can_be_empty` is set.
 *
 * Callers in this file pass REGEXIS024_MAX_REPEAT + 1 as max_allowed to request
 * "no upper bound".
 *
 * @param ctx          parsing context used for error reporting.
 * @param patient      sub-expression to be repeated (modified in place).
 * @param fa           node container handed to apply_repeat_to_subexpression().
 * @param min_allowed  minimum number of repetitions.
 * @param max_allowed  maximum number of repetitions.
 */
void repeat_stuff_with_check(REGEX_IS024_MeaningContext& ctx,
        SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed){
    if (min_allowed > max_allowed) {
        aux_THROW("repeat operation: min > max");
    }
    if (min_allowed > REGEXIS024_MAX_REPEAT) {
        aux_THROW("minimum repeat factor is too high");
    }

    // An upper bound beyond the limit combined with a sub-expression that may match
    // nothing is refused.
    const bool upper_bound_exceeds_limit = max_allowed > REGEXIS024_MAX_REPEAT;
    if (upper_bound_exceeds_limit && patient.can_be_empty) {
        aux_THROW("safety abortion: possible infinite loop. Если вы считаете, что ваше регулярное "
                  "выражение корректно и не вызвает бесконечного цикла, напишите об этом в жалобную книгу: "
                  "По ссылке: file:///dev/null Ваши предложения по улучшению libregexis024 обязательно будут рассмотрены.");
    }

    apply_repeat_to_subexpression(patient, fa, min_allowed, max_allowed);
}
|
||
|
||
/**
 * Applies a `!repeat` / `!r` command to the most recently parsed sub-expression.
 *
 * Argument forms accepted:
 *   - no arguments, or a single empty argument: from 0 up to "no upper bound"
 *     (REGEXIS024_MAX_REPEAT + 1);
 *   - one non-empty argument N: exactly N repetitions;
 *   - two arguments: [min, max], where an empty min means 0 and an empty max means
 *     "no upper bound".
 * More than two arguments, a missing preceding sub-expression, or min > max are reported
 * as errors through `ctx`.
 *
 * @param ctx    parsing context used for number parsing and error reporting.
 * @param fa     node container forwarded to repeat_stuff_with_check().
 * @param parts  sub-expressions parsed so far; the last one is modified in place.
 * @param cmd    the parsed command with its arguments.
 */
void repeat_command_processing(REGEX_IS024_MeaningContext &ctx, FA_Container &fa, std::vector<SubExprCompiled>& parts,
        const Command& cmd){
    if (parts.empty()) {
        aux_THROW("no subexpression before !repeat command");
    }

    const size_t arg_count = cmd.arguments.size();

    // `!repeat` with nothing specified: any number of repetitions.
    if (arg_count == 0 || (arg_count == 1 && cmd.arguments[0].is_empty)) {
        repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1);
        return;
    }

    // Single number: exact repetition count.
    if (arg_count == 1) {
        size_t exact_count;
        int_parse_with_limit_concern(cmd.arguments[0].name, ctx, exact_count, REGEXIS024_MAX_REPEAT);
        aux_ERROR_CHECK;
        repeat_stuff_with_check(ctx, parts.back(), fa, exact_count, exact_count);
        return;
    }

    if (arg_count > 2) {
        aux_THROW("too many arguments in !repeat command");
    }

    // Exactly two arguments: lower and upper bound, each of which may be left empty.
    size_t lower_bound, upper_bound;
    if (!cmd.arguments[0].is_empty) {
        int_parse_with_limit_concern(cmd.arguments[0].name, ctx, lower_bound, REGEXIS024_MAX_REPEAT);
        aux_ERROR_CHECK;
    } else {
        lower_bound = 0;
    }
    if (!cmd.arguments[1].is_empty) {
        int_parse_with_limit_concern(cmd.arguments[1].name, ctx, upper_bound, REGEXIS024_MAX_REPEAT);
        aux_ERROR_CHECK;
    } else {
        upper_bound = REGEXIS024_MAX_REPEAT + 1;
    }

    if (lower_bound > upper_bound) {
        aux_THROW("!repeat: min > max");
    }
    repeat_stuff_with_check(ctx, parts.back(), fa, lower_bound, upper_bound);
}
|
||
|
||
|
||
/**
 * Parses a sequence of regex items (the operands of concatenation) starting at the
 * current position of `ctx`, appending each compiled item to `parts`.
 *
 * The loop stops at a header command (the position is rewound to just before it), at
 * `)`, `|`, `]`, or when peep() yields a negative value. On a normal stop all collected
 * parts are joined into `result` and NULL is returned.
 *
 * When a bracketed group is met (`(` or `#name(`), a fresh slot is appended to `parts`
 * and a BracketLvl_ParseCall targeting that slot is returned instead; parsing of the
 * sequence is then continued through afterReceive().
 *
 * NULL is also returned when an error has been reported; in that case `ctx.error` is set
 * and `result` is not updated.
 *
 * NOTE(review): `parts` and `result` are not declared in this function - presumably
 * members of Sequence_ParseCall; confirm against the class definition.
 *
 * @param ctx   parsing context (input position, error state, tracking-unit registry).
 * @param pctx  parsing-wide data; only `pctx.cc` (common codesets) is read here.
 * @param fa    container that creates the automaton nodes.
 * @return a nested parse call to run next, or NULL (see above).
 */
chekushka Sequence_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
    while (true) {
        int32_t fst = peep(ctx);
        call_ERROR_CHECK;
        if (fst == U'!') {
            Command cmdBuf;
            size_t before_cmd = ctx.pos;
            cmdBuf = command_expr_parse(ctx);
            call_ERROR_CHECK;
            if (is_header_cmd(cmdBuf)){
                // Header commands do not belong to the sequence: un-read and stop.
                ctx.pos = before_cmd;
                break;
            } else if (cmdBuf.name == "r" || cmdBuf.name == "repeat"){
                repeat_command_processing(ctx, fa, parts, cmdBuf); call_ERROR_CHECK;
            } else if (is_command_for_charset(cmdBuf)){
                codeset_t cs;
                interpret_command_as_charset_giving(pctx.cc, cmdBuf, cs); call_ERROR_CHECK;
                parts.push_back(subexpr_charset_reading_filter(cs, fa));
            } else {
                call_THROW("unknown command");
            }
        } else if (fst == U'\\') {
            parts.emplace_back();
            in_case_of_backslash(ctx, pctx.cc, fa, parts.back());
            call_ERROR_CHECK;
        } else if (fst == U'^'){
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneBehind(codeset_of_one_char(U'\n'))));
        } else if (fst == U'$'){
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneAhead(codeset_of_one_char(U'\n'))));
        } else if (fst == U'*'){
// Postfix operators need something to apply to; afterwards the operator char is consumed.
#define vibe_check(sn) if (parts.empty()) { call_THROW("no subexpression before `" sn "` operator"); } readChar(ctx);
            vibe_check("*")
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'+'){
            vibe_check("+")
            repeat_stuff_with_check(ctx, parts.back(), fa, 1, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'?'){
            vibe_check("?")
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, 1); call_ERROR_CHECK;
#undef vibe_check
        } else if (fst == U'#'){
            // Tracking unit: `#name(`...`)`, `#name:value;` or `#name;`.
            readChar(ctx);
            std::string name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
            if (name.empty())
                call_THROW("No name provided after #");
            // Register the name on first use; its id is its index in retrieval_info.
            if (ctx.ktr.track_names.count(name) == 0){
                ctx.ktr.track_names[name] = static_cast<int64_t>(ctx.ktr.retrieval_info.size());
                ctx.ktr.retrieval_info.emplace_back();
            }
            int64_t id = ctx.ktr.track_names[name];
            int32_t typeDet = peep(ctx); call_ERROR_CHECK;
            if (typeDet == U'('){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::range); call_ERROR_CHECK;
                parts.emplace_back();
                return std::make_unique<BracketLvl_ParseCall>(parts.back(), id);
            } else if (typeDet == U':'){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_immediate); call_ERROR_CHECK;
                readChar(ctx);
                std::string value_str = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
                // Initialised and error-checked so that a failed parse can never leak an
                // indeterminate value into the generated instruction.
                size_t value = 0;
                int_parse_with_limit_concern(value_str, ctx, value, UINT16_MAX); call_ERROR_CHECK;
                int32_t cl = peep(ctx); call_ERROR_CHECK;
                if (cl != U';')
                    call_THROW("Missing ; after dot track unit operator");
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_SELARR_IMM,
                            ctx.ktr.retrieval_info[id].selarr_first, value)));
                if (ctx.ktr.retrieval_info[id].stored_in_ca)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_COLARR_IMM,
                            ctx.ktr.retrieval_info[id].colarr_first, value)));
            } else if (typeDet == U';'){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_cur_pos); call_ERROR_CHECK;
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS,
                            ctx.ktr.retrieval_info[id].selarr_first)));
                if (ctx.ktr.retrieval_info[id].stored_in_ca)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS,
                            ctx.ktr.retrieval_info[id].colarr_first)));
            } else
                call_THROW("Missing ; or ( in the beginning of tracking unit");
        } else if (fst == U'(') {
            // Untracked group: id -1.
            parts.emplace_back();
            return std::make_unique<BracketLvl_ParseCall>(parts.back(), -1);
        } else if (fst == U'[') {
            codeset_t filter = sq_bracket_expr_parse(ctx, pctx.cc); call_ERROR_CHECK;
            parts.push_back(subexpr_charset_reading_filter(filter, fa));
        } else if (fst >= 0 && fst != U')' && fst != U'|' && fst != U']'){
            // Any other character matches itself.
            readChar(ctx);
            parts.push_back(subexpr_charset_reading_filter(codeset_of_one_char(fst), fa));
        } else {
            break;
        }
    }
    // Concatenate everything collected so far into the final sub-expression.
    for (SubExprCompiled& part: parts)
        result = join(result, part);
    return NULL;
}
|
||
|
||
/**
 * Continues parsing the sequence after a nested parse call has completed.
 *
 * The only nested calls firstTime() hands out are for bracket expressions, so resuming
 * simply means re-entering the main parsing loop.
 *
 * @return whatever firstTime() returns for the remainder of the sequence.
 */
chekushka Sequence_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
    chekushka next_call = firstTime(ctx, pctx, fa);
    return next_call;
}
|