#include #include #include #include #include #include #include #include #include #include #define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0) #define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0) #define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0) #define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0) /* **************************** Sequence */ void in_case_of_backslash(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, FA_Container &fa, SubExprCompiled& backPart) { assert(readChar(ctx) == U'\\'); int32_t leader = peep(ctx); aux_ERROR_CHECK; if (leader == U'b'){ FA_NodeOfForking* n1 = fa.makeForking(); FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(invert_set(cc.word_constituents)); FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents); reattach_nxt_node(n1a, n2a); FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(cc.word_constituents); FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents)); reattach_nxt_node(n1b, n2b); add_option_to_fork_node(n1, n1a); add_option_to_fork_node(n1, n1b); backPart.start = n1; backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)}; } else if (leader == U'B'){ FA_NodeOfForking* n1 = fa.makeForking(); FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(cc.word_constituents); FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents); reattach_nxt_node(n1a, n2a); FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(invert_set(cc.word_constituents)); FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents)); reattach_nxt_node(n1b, n2b); add_option_to_fork_node(n1, n1a); add_option_to_fork_node(n1, n1b); backPart.start = n1; backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)}; } else if (leader == U'<'){ FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(invert_set(cc.word_constituents)); FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(cc.word_constituents); reattach_nxt_node(n1, n2); backPart.start = n1; backPart.ends = {&(n2->nxt_node)}; } else if (leader == U'>'){ FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(cc.word_constituents); FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(invert_set(cc.word_constituents)); reattach_nxt_node(n1, n2); backPart.start = n1; backPart.ends = {&(n2->nxt_node)}; } else { bool ret_is_multicode; codeset_t res_codeset; backslash_expression_parsing_try_regular(ctx, cc, ret_is_multicode, res_codeset); backPart = subexpr_charset_reading_filter(res_codeset, fa); return; // To avoid reading leader again (it gets read in the end) } readChar(ctx); } void repeat_stuff_with_check(REGEX_IS024_MeaningContext& ctx, SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed){ if (min_allowed > max_allowed) aux_THROW("repeat operation: min > max"); if (min_allowed > REGEXIS024_MAX_REPEAT) aux_THROW("minimum repeat factor is too high"); if (max_allowed > REGEXIS024_MAX_REPEAT && patient.can_be_empty) aux_THROW("safety abortion: possible infinite loop. Если вы считаете, что ваше регулярное " "выражение корректно и не вызвает бесконечного цикла, напишите об этом в жалобную книгу: " "По ссылке: file:///dev/null Ваши предложения по улучшению libregexis024 обязательно будут рассмотрены."); apply_repeat_to_subexpression(patient, fa, min_allowed, max_allowed); } void repeat_command_processing(REGEX_IS024_MeaningContext &ctx, FA_Container &fa, std::vector& parts, const Command& cmd){ if (parts.empty()) aux_THROW("no subexpression before !repeat command"); if (cmd.arguments.empty() || (cmd.arguments.size() == 1 && cmd.arguments[0].is_empty)) { repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); aux_ERROR_CHECK; } else if (cmd.arguments.size() == 1){ size_t mm; int_parse_with_limit_concern(cmd.arguments[0].name, ctx, mm, REGEXIS024_MAX_REPEAT); aux_ERROR_CHECK; repeat_stuff_with_check(ctx, parts.back(), fa, mm, mm); aux_ERROR_CHECK; } else if (cmd.arguments.size() > 2){ aux_THROW("too many arguments in !repeat command"); } else { size_t min_allowed, max_allowed; if (cmd.arguments[0].is_empty){ min_allowed = 0; } else { int_parse_with_limit_concern(cmd.arguments[0].name, ctx, min_allowed, REGEXIS024_MAX_REPEAT); aux_ERROR_CHECK; } if (cmd.arguments[1].is_empty){ max_allowed = REGEXIS024_MAX_REPEAT + 1; } else { int_parse_with_limit_concern(cmd.arguments[1].name, ctx, max_allowed, REGEXIS024_MAX_REPEAT); aux_ERROR_CHECK; } if (min_allowed > max_allowed) aux_THROW("!repeat: min > max"); repeat_stuff_with_check(ctx, parts.back(), fa, min_allowed, max_allowed); aux_ERROR_CHECK; } } chekushka Sequence_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) { while (true) { int32_t fst = peep(ctx); call_ERROR_CHECK; if (fst == U'!') { Command cmdBuf; size_t before_cmd = ctx.pos; cmdBuf = command_expr_parse(ctx); call_ERROR_CHECK; if (is_header_cmd(cmdBuf)){ ctx.pos = before_cmd; break; } else if (cmdBuf.name == "r" || cmdBuf.name == "repeat"){ repeat_command_processing(ctx, fa, parts, cmdBuf); call_ERROR_CHECK; } else if (is_command_for_charset(cmdBuf)){ codeset_t cs; interpret_command_as_charset_giving(pctx.cc, cmdBuf, cs); call_ERROR_CHECK; parts.push_back(subexpr_charset_reading_filter(cs, fa)); } else { call_THROW("unknown command"); } } else if (fst == U'\\') { parts.emplace_back(); in_case_of_backslash(ctx, pctx.cc, fa, parts.back()); call_ERROR_CHECK; } else if (fst == U'^'){ readChar(ctx); parts.push_back(subexpression_from_path(fa.makeLookOneBehind(codeset_of_one_char(U'\n')))); } else if (fst == U'$'){ readChar(ctx); parts.push_back(subexpression_from_path(fa.makeLookOneAhead(codeset_of_one_char(U'\n')))); } else if (fst == U'*'){ #define vibe_check(sn) if (parts.empty()) { call_THROW("no subexpression before `" sn "` operator"); } readChar(ctx); vibe_check("*") repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK; } else if (fst == U'+'){ vibe_check("+") repeat_stuff_with_check(ctx, parts.back(), fa, 1, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK; } else if (fst == U'?'){ vibe_check("?") repeat_stuff_with_check(ctx, parts.back(), fa, 0, 1); call_ERROR_CHECK; #undef vibe_check } else if (fst == U'#'){ readChar(ctx); std::string name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK; if (name.empty()) call_THROW("No name provided after #"); if (ctx.ktr.track_names.count(name) == 0){ ctx.ktr.track_names[name] = static_cast(ctx.ktr.retrieval_info.size()); ctx.ktr.retrieval_info.emplace_back(); } int64_t id = ctx.ktr.track_names[name]; int32_t typeDet = peep(ctx); if (typeDet == U'('){ ensure_space_for_track_unit(ctx, name, tracking_var_types::range); call_ERROR_CHECK; parts.emplace_back(); return std::make_unique(parts.back(), id); } else if (typeDet == U':'){ ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_immediate); call_ERROR_CHECK; readChar(ctx); std::string value_str = tryRead_REGEX024_name(ctx); size_t value; int_parse_with_limit_concern(value_str, ctx, value, UINT16_MAX); int32_t cl = peep(ctx); if (cl != U';') call_THROW("Missing ; after dot track unit operator"); readChar(ctx); if (ctx.ktr.retrieval_info[id].stored_in_sa) parts.emplace_back(subexpression_from_path( fa.makeTrackArrayMovImm(regex024_opcodes::MOV_SELARR_IMM, ctx.ktr.retrieval_info[id].selarr_first, value))); if (ctx.ktr.retrieval_info[id].stored_in_ca) parts.emplace_back(subexpression_from_path( fa.makeTrackArrayMovImm(regex024_opcodes::MOV_COLARR_IMM, ctx.ktr.retrieval_info[id].colarr_first, value))); } else if (typeDet == U';'){ ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_cur_pos); call_ERROR_CHECK; readChar(ctx); if (ctx.ktr.retrieval_info[id].stored_in_sa) parts.emplace_back(subexpression_from_path( fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS, ctx.ktr.retrieval_info[id].selarr_first))); if (ctx.ktr.retrieval_info[id].stored_in_ca) parts.emplace_back(subexpression_from_path( fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS, ctx.ktr.retrieval_info[id].colarr_first))); } else call_THROW("Missing ; or ( in the beginning of tracking unit"); } else if (fst == U'(') { parts.emplace_back(); return std::make_unique(parts.back(), -1); } else if (fst == U'[') { codeset_t filter = sq_bracket_expr_parse(ctx, pctx.cc); call_ERROR_CHECK; parts.push_back(subexpr_charset_reading_filter(filter, fa)); } else if (fst >= 0 && fst != U')' && fst != U'|' && fst != U']'){ readChar(ctx); parts.push_back(subexpr_charset_reading_filter(codeset_of_one_char(fst), fa)); } else { break; } } for (SubExprCompiled& part: parts) result = join(result, part); return NULL; } chekushka Sequence_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) { // This is possible only if I received a bracket expression return firstTime(ctx, pctx, fa); }