#include #include #include /* to get exitf */ #include #include #include #include #include #include #include #if defined(LIBREGEXIS024_DEBUG) && defined(LIBREGEXIS024_ALLOW_LOUD) #include #include #include #include #define PR_DEB #endif /* debug nonsence */ void input_fa_assert(const FA_Container& fa){ assert(fa.start); for (FA_Node* node: fa.all){ if (node->type == one_char_read){ assert(!dynamic_cast(node)->second_ns); } else if (node->type == look_one_ahead || node->type == det_char_crossroads){ exitf("not allowed at this stage\n"); } } } struct OperHistoryNodeTransition { TrackingOperationInFa op; size_t u; OperHistoryNodeTransition(const TrackingOperationInFa &op, size_t u): op(op), u(u) {} }; struct OperHistoryNode { std::vector next; /* When it is part of clean history, this */ std::vector compressed_selarr; std::vector raisin; OperHistoryNode() = default; }; /* This object can describe an empty superstate (needed to describe clean history nodes without raisin) * If det_stops is empty, interpret it as empty superstate */ struct SuperState { std::vector sorted_raisin; std::vector double_compressed_selarr; bool empty() const { return sorted_raisin.empty(); } #ifdef PR_DEB std::string toString() const { std::string f1_raisin; for (uint64_t el: sorted_raisin) { if (!f1_raisin.empty()) f1_raisin += ", "; f1_raisin += std::to_string(el); } std::string f2_selarr; for (uint64_t el: double_compressed_selarr) { if (!f2_selarr.empty()) f2_selarr += ", "; f2_selarr += std::to_string(el); } return "sorted_raisin: {" + f1_raisin + "}, double_comp_selarr: {" + f2_selarr + "}"; } #endif }; struct CleanOperHistoryNode { std::vector next; SuperState exit; }; struct SelarrCompressionScheme { size_t SN1, SN2 = 0, SN3 = 0; std::vector S1_to_S2; std::vector S2_to_sifter; std::vector S3_to_sifter; const RegexPriorityTable& sifter; SelarrCompressionScheme(size_t sn1, const RegexPriorityTable &sifter) : SN1(sn1), sifter(sifter) { assert(sifter.size() <= UINT32_MAX); S1_to_S2.assign(SN1, -1); for (regex_tai_t i = 0; i < sifter.size(); i++) { auto& act = sifter[i].pos; regex_tai_t first_on_s2 = S2_to_sifter.size(); S2_to_sifter.push_back(i); S1_to_S2[act.first] = first_on_s2; if (act.type != tracking_var_types::dot_cur_pos) { S3_to_sifter.push_back(i); } if (act.type == tracking_var_types::range) { regex_tai_t second_on_s2 = S2_to_sifter.size(); S2_to_sifter.push_back(i); S1_to_S2[act.second] = second_on_s2; } } SN2 = S2_to_sifter.size(); SN3 = S3_to_sifter.size(); assert(SN3 <= SN2 && SN2 <= SN1 && SN1 <= UINT16_MAX); } }; std::vector compress_compressed_selarr(const std::vector& S2, const SelarrCompressionScheme& cmp) { std::vector S3(cmp.SN3); for (size_t i = 0; i < cmp.SN3; i++) { const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos; if (act.type == tracking_var_types::dot_immediate) { S3[i] = S2[cmp.S1_to_S2[act.first]]; } else { assert(act.type == tracking_var_types::range); // It must be range type uint64_t onFirstBorder = S2[cmp.S1_to_S2[act.first]]; uint64_t onSecondBorder = S2[cmp.S1_to_S2[act.second]]; S3[i] = (onFirstBorder > onSecondBorder) ? 1 : 0; } } return S3; } bool compressed_selarr_A_outranks_B(const std::vector& A, const std::vector& B, const SelarrCompressionScheme& cmp) { for (const RegexPriorityTableAction& act: cmp.sifter) { uint64_t valA = A[cmp.S1_to_S2[act.pos.first]]; uint64_t valB = B[cmp.S1_to_S2[act.pos.first]]; if (act.pos.type == tracking_var_types::range) { uint64_t valAsecond = A[cmp.S1_to_S2[act.pos.second]]; uint64_t valBsecond = A[cmp.S1_to_S2[act.pos.second]]; valA = valAsecond > valA ? valAsecond - valA : 0; valB = valBsecond > valB ? valBsecond - valB : 0; } if (valA == valB) continue; return (valA < valB) == act.minimize; } return false; } /* Beacuse of the way wash_history_bush builds this structure, root is te last node. * rankdir is from left to right (guaranteed). Can be empty if original history contained no raisin */ struct RaisinBush { std::vector clean_history; ssize_t start = -1; bool empty() const { return start < 0; } #ifdef PR_DEB void print() { lines text; text.push_back("Raisin bush"); if (start >= 0) { size_t n = clean_history.size(); std::vector m(n, false); TreeWithStringsNode e{""}; std::function dfs = [&] (TreeWithStringsNode& fill, size_t nodeId) { if (m[nodeId]) { fill.val = "PARADOX"; return; } m[nodeId] = true; const CleanOperHistoryNode& node = clean_history[nodeId]; fill.val = "[" + std::to_string(nodeId) + "]"; if (!node.exit.empty()) fill.val += (" EXIT: " + node.exit.toString()); size_t CN = node.next.size(); fill.childeren.resize(CN); for (size_t i = 0; i < CN; i++) { fill.childeren[i].val = node.next[i].op.toString(); fill.childeren[i].childeren = {{}}; dfs(fill.childeren[i].childeren[0], node.next[i].u); } }; dfs(e, start); size_t am = 0; for (bool el: m) am += static_cast(el); if (am < n) text[0] += ": " + std::to_string(n - am) + " nodes are unreachable by detour"; e.toLines(text); } else { if (clean_history.empty()) text[0] = "Empty Raisin Bush"; else text [0] = "Raisin bush with no root and " + std::to_string(clean_history.size()) = " nodes missed"; } printLines(wrapWithBox(text)); } #endif }; void wash_history_bush(const std::vector& history, RaisinBush& answer, const SelarrCompressionScheme& cmp) { assert(!history.empty()); std::vector has_raisin(history.size()); std::vector dirty_to_clean(history.size(), -1); std::vector > callStack = {{0, 0}}; auto hist_clean_detour_init_clean = [&](uint64_t v) -> uint64_t { if (!has_raisin[v]) { has_raisin[v] = true; dirty_to_clean[v] = answer.clean_history.size(); answer.clean_history.emplace_back(); } return dirty_to_clean[v]; }; while (!callStack.empty()) { size_t v = callStack.back().first; size_t od = callStack.back().second; if (od == 0) { if (!history[v].raisin.empty()) { size_t cleanVId = hist_clean_detour_init_clean(v); std::vector& sr = answer.clean_history[cleanVId].exit.sorted_raisin; sr = history[v].raisin; std::sort(sr.begin(), sr.end()); answer.clean_history[cleanVId].exit.double_compressed_selarr = compress_compressed_selarr(history[v].compressed_selarr, cmp); } } else { const OperHistoryNodeTransition& old_hist_tr = history[v].next[od - 1]; uint64_t ou = old_hist_tr.u; if (has_raisin[ou]) { size_t cleanVId = hist_clean_detour_init_clean(v); answer.clean_history[cleanVId].next.emplace_back(old_hist_tr.op, dirty_to_clean[ou]); } } if (od == history[v].next.size()) { callStack.pop_back(); } else { callStack.back().second++; callStack.emplace_back(history[v].next[od].u, 0); } } if (has_raisin[0]) { assert(dirty_to_clean[0] >= 0); answer.start = dirty_to_clean[0]; } } /* If is_it_after_read is false, unknown selarr range variable border and cur pos are evaluated to 0. * Otherwise, cur pos considered to be greater than previous values of selarr ange variable boundaries */ void building_detour(const SelarrCompressionScheme& cmp, const std::vector& outer_selarr, const std::vector& zeroeps, const codeset_t& I, RaisinBush& answer, bool is_it_after_read) { #ifdef PR_DEB printf("Det Debug: build_detour started with zeroeps:{"); for (FA_Node* node: zeroeps) printf("%lu,", node->nodeId); printf("}, I: {%s}\n", stringifyCodesetBase10(I).c_str()); #endif assert(cmp.SN3 == outer_selarr.size()); if (!is_it_after_read) for (uint64_t val: outer_selarr) assert(val == 0); struct SearchMark { FA_Node* domain_node; uint64_t epsilon_refs = 0; uint64_t detour_sat = 0; /* id of corresponding history node */ size_t Hv = 0; explicit SearchMark(FA_Node *domain_node) : domain_node(domain_node) {} }; /* Default values are good for me */ std::vector marks; for (size_t i = 0; i < zeroeps.size(); i++) { marks.emplace_back(zeroeps[i]); zeroeps[i]->search_mark = i; } auto lob_allows_to_pass = [&](FA_NodeOfLookOneBehind* lob) -> bool { if (!intersect_sets(lob->filter, I).empty()) { assert(merge_sets(lob->filter, I) == lob->filter); return true; } return false; }; { /* First i need to know exacly how many of MINE epsilon transitions are referencing each NODE */ std::vector domain_detour = zeroeps; while (!domain_detour.empty()) { FA_Node* v = domain_detour.back(); domain_detour.pop_back(); if (v->type == look_one_behind && !lob_allows_to_pass(dynamic_cast(v))) continue; for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) { assert(*uPtr); int64_t &rds = (**uPtr).search_mark; if (rds == -1) { rds = marks.size(); domain_detour.push_back(*uPtr); marks.emplace_back(*uPtr); } marks[rds].epsilon_refs++; } } } std::vector history = {OperHistoryNode()}; history[0].compressed_selarr.assign(cmp.SN2, 0); for (size_t i = 0; i < cmp.SN3; i++) { const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos; if (act.type == tracking_var_types::range) { if (outer_selarr[i]) { history[0].compressed_selarr[cmp.S1_to_S2[act.second]] = 1; } } else { assert(act.type == tracking_var_types::dot_immediate); history[0].compressed_selarr[cmp.S1_to_S2[act.first]] = outer_selarr[i]; } } /* As a result, dot_cur_pos variables will be initialized as zero (always) */ /* In my second detour, I will pass each vertex here only one time: after hitting the total epsilon refcount */ std::vector can_process = zeroeps; /* auto increase_sat_refcount = [&](SearchMark& mark) { mark.detour_sat++; if (mark.detour_sat == mark.epsilon_refs && mark.ever_walked_in) { can_process.push_back(mark.domain_node); } }; */ auto add_history_update = [&](TrackingOperationInFa how, uint64_t where, uint64_t from_where) { history[from_where].next.emplace_back(how, where); }; while (!can_process.empty()) { FA_Node* v = can_process.back(); can_process.pop_back(); SearchMark& Vmark = marks[v->search_mark]; assert(Vmark.detour_sat == Vmark.epsilon_refs); uint64_t Hv = Vmark.Hv; uint64_t Hop = Hv; if (v->type == look_one_behind) { FA_NodeOfLookOneBehind* tv = dynamic_cast(v); if (!lob_allows_to_pass(tv)) continue; } else if (isTrackingFaNode(v)) { Hop = history.size(); history.emplace_back(); std::vector& val2 = history.back().compressed_selarr; val2 = history[Hv].compressed_selarr; if (v->type == track_array_mov_imm) { FA_NodeOfTrackArrayMovImm* tv = dynamic_cast(v); if (isSelarrOpcode(tv->operation)) { int key_s2 = cmp.S1_to_S2[tv->key]; if (key_s2 >= 0){ assert(cmp.sifter[cmp.S2_to_sifter[key_s2]].pos.type == tracking_var_types::dot_immediate); val2[key_s2] = tv->imm_value; } } add_history_update(TrackingOperationInFa(tv->operation, tv->key, tv->imm_value), Hop, Hv); } else if (v->type == track_array_mov_halfinvariant) { FA_NodeOfTrackArrayMovHalfinvariant* tv = dynamic_cast(v); if (isSelarrOpcode(tv->operation)) { int key_s2 = cmp.S1_to_S2[tv->key]; if (key_s2 >= 0){ const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S2_to_sifter[key_s2]].pos; assert(act.type != tracking_var_types::dot_immediate); if (act.type == tracking_var_types::dot_cur_pos) { val2[key_s2] = is_it_after_read ? 1 : 0; } else { val2[key_s2] = is_it_after_read ? 2 : 0; } } } add_history_update(TrackingOperationInFa(tv->operation, tv->key), Hop, Hv); } } else if (v->type == match || v->type == one_char_read) { // Determinization stop history[Hv].raisin.push_back(v->nodeId); } for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) { assert(*uPtr); SearchMark& Umark = marks[(**uPtr).search_mark]; /* Here I use Hop to determine Hv value of u */ if (Umark.detour_sat == 0) { Umark.Hv = Hop; } else if (Umark.Hv != Hop) { if (compressed_selarr_A_outranks_B( history[Hop].compressed_selarr, history[Umark.Hv].compressed_selarr, cmp)){ Umark.Hv = Hop; } } /* Collision calculation finished */ Umark.detour_sat++; if (Umark.detour_sat == Umark.epsilon_refs) { can_process.push_back(Umark.domain_node); } } } /* Cleaning this mess */ for (auto& m: marks) { m.domain_node->search_mark = -1; } /* Packaging the answer (we do a little bit of dfs here) */ wash_history_bush(history, answer, cmp); } void update_had_to_fork_status(const RaisinBush& bush, int& had_to_fork) { for (const CleanOperHistoryNode& node: bush.clean_history) { if (node.next.size() > 1 || (!node.next.empty() && !node.exit.empty())) { had_to_fork = 1; return; } } } typedef size_t superstate_id_t; typedef std::vector> homework_t; struct LessSuperState { bool operator()(const SuperState& A, const SuperState& B) const { std::less> f1L; if (f1L(A.sorted_raisin, B.sorted_raisin)) return true; if (f1L(B.sorted_raisin, A.sorted_raisin)) return false; return f1L(A.double_compressed_selarr, B.double_compressed_selarr); } }; struct GlobalDetourProgress { std::map superstates; /* Each element is a root of some megabush in resFa */ std::vector superstate_megabush_constructed; std::vector todo_superstaes; }; /* If x was not previously achieved, it will also add it to t o d o list of global detour */ superstate_id_t convertSuperstateToId(const SuperState& x, GlobalDetourProgress& gdp) { if (gdp.superstates.count(x)) { return gdp.superstates[x]; } size_t n = gdp.superstates.size(); gdp.superstates.insert({x, n}); gdp.todo_superstaes.push_back(x); gdp.superstate_megabush_constructed.push_back(NULL); return n; } FA_Node* build_dead_end(FA_Container& resFa) { return resFa.makeForking(); } void build_bush(const RaisinBush& alpha, FA_Node** sowing_location, FA_Container& resFa, homework_t& homework, GlobalDetourProgress& gdp) { size_t n = alpha.clean_history.size(); if (n == 0) { FA_Node* dead_end = build_dead_end(resFa); reattach_fa_node_edge(sowing_location, dead_end); return; } std::vector> todo = {{sowing_location, alpha.start}}; while (!todo.empty()) { FA_Node** sl = todo.back().first; const CleanOperHistoryNode& hnode = alpha.clean_history[todo.back().second]; todo.pop_back(); auto history_transition = [&](size_t i, FA_Node** of_sl) { FA_NodePathPart* pn = convert_to_node(hnode.next[i].op, resFa); reattach_fa_node_edge(of_sl, pn); todo.emplace_back(&(pn->nxt_node), hnode.next[i].u); }; if (hnode.next.empty()) { assert(!hnode.exit.empty()); superstate_id_t w = convertSuperstateToId(hnode.exit, gdp); homework.emplace_back(sl, w); } else if (hnode.next.size() == 1 && hnode.exit.empty()) { history_transition(0, sl); } else { FA_NodeOfForking* forker = resFa.makeForking(); bool raisin = !hnode.exit.empty(); size_t k = hnode.next.size(); forker->nxt_options.assign(k + static_cast(raisin), NULL); for (size_t i = 0; i < k; i++) { history_transition(i, &(forker->nxt_options[i])); } if (raisin) { superstate_id_t w = convertSuperstateToId(hnode.exit, gdp); homework.emplace_back(&(forker->nxt_options[k]), w); } reattach_fa_node_edge(sl, forker); } } } ColoredCodeset get_pretreated_cc(FA_Container& sourceFa) { std::set little_insects; for (FA_Node* v: sourceFa.all) { if (v->type == look_one_behind) { little_insects.insert(static_cast(v)->filter); } } ColoredCodeset pretreated_cc(little_insects.size()); for (const codeset_t& cs: little_insects) { pretreated_cc.apply_divisor(cs); } return pretreated_cc; } // todo add a check on size of dfa void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, regex_tai_t selarr_sz, const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork) { /* During execuion, i will create pointers to field res.start and store them (inside the scope of this function) * Luckily res argument is already immovable in this scope. */ error = 0; had_to_fork = 0; assert(resFa.start == NULL && resFa.all.empty()); input_fa_assert(sourceFa); SelarrCompressionScheme cmp(selarr_sz, sifter); GlobalDetourProgress gdp; homework_t homework; ColoredCodeset pretreated_cc = get_pretreated_cc(sourceFa); FA_Node** res_start_ptr = &(resFa.start); if (info1.fed_chars_extend_one_left) { ColoredCodeset inp_distinction = pretreated_cc; inp_distinction.apply_divisor(codeset_of_all); std::vector starting_Is; std::vector> starting_Cids; /* Filler variable */ inp_distinction.get_splits_of_non_dummy(starting_Is, starting_Cids); size_t R = starting_Is.size(); for (auto& rdh: starting_Cids) { assert(rdh.size() == 1 && rdh[0] == 0); } FA_NodeOfDetCharCrossroads* very_first_cr = resFa.makeDetCharCrossroads(); very_first_cr->second_ns = true; reattach_fa_node_edge(res_start_ptr, very_first_cr); very_first_cr->crossroads.resize(R); /* After that, nobody has right to resize crossroads array */ for (size_t i = 0; i < R; i++) { very_first_cr->crossroads[i].input = starting_Is[i]; FA_Node** sowing_place = &(very_first_cr->crossroads[i].nxt_node); RaisinBush alpha; building_detour(cmp, std::vector(cmp.SN3, 0), {sourceFa.start}, starting_Is[i], alpha, false); #ifdef PR_DEB printf("Initialization hard %ld/%ld\n", i + 1, R); alpha.print(); #endif update_had_to_fork_status(alpha, had_to_fork); build_bush(alpha, sowing_place, resFa, homework, gdp); } } else { RaisinBush alpha; building_detour(cmp, std::vector(cmp.SN3, 0), {sourceFa.start}, codeset_of_all, alpha, false); #ifdef PR_DEB printf("Initialization easy\n"); alpha.print(); #endif update_had_to_fork_status(alpha, had_to_fork); build_bush(alpha, res_start_ptr, resFa, homework, gdp); } /* Now we start the actual detour. */ while (!gdp.todo_superstaes.empty()) { SuperState SS = gdp.todo_superstaes.back(); gdp.todo_superstaes.pop_back(); // printf("Global detour turn: %s\n", SS.toString().c_str()); std::vector reading_stops; codeset_t how_can_i_finish = {}; for (size_t v: SS.sorted_raisin) { FA_Node* node = sourceFa.all[v]; if (node->type == one_char_read) { reading_stops.push_back(static_cast(node)); } else if (node->type == match) { auto fn = static_cast(node); assert(!fn->ext_filter_added || info1.fed_chars_extend_one_right); if (fn->ext_filter_added) { how_can_i_finish = merge_sets(how_can_i_finish, fn->pending_filter); } else { how_can_i_finish = codeset_of_all; } } else assert(false); } // Determinization stop: one char read (input) ColoredCodeset inp_distinction = pretreated_cc; size_t pr = reading_stops.size(); for (size_t i = 0; i < pr; i++) { inp_distinction.apply_divisor(reading_stops[i]->filter); } std::vector Is; std::vector> Cids; inp_distinction.get_splits_of_non_dummy(Is, Cids); size_t R = Is.size(); FA_NodeOfDetCharCrossroads* my_cr = NULL; if (R > 0) { my_cr = resFa.makeDetCharCrossroads(); if (!info1.fed_chars_extend_one_right && !how_can_i_finish.empty()) { assert(how_can_i_finish == codeset_of_all); my_cr->matching = true; } my_cr->crossroads.resize(R); } for (size_t i = 0; i < R; i++) { my_cr->crossroads[i].input = Is[i]; my_cr->crossroads[i].nxt_node = NULL; std::vector fl_passed_filters; for (size_t j: Cids[i]) { fl_passed_filters.push_back(reading_stops[j]->nxt_node); } // todo: make a function out of next 6 lines of code RaisinBush alpha; building_detour(cmp, SS.double_compressed_selarr, fl_passed_filters, Is[i], alpha, true); #ifdef PR_DEB printf("That same turn, subbush %ld/%ld\n", i + 1, R); alpha.print(); #endif update_had_to_fork_status(alpha, had_to_fork); build_bush(alpha, &(my_cr->crossroads[i].nxt_node), resFa, homework, gdp); } // Determinization stop: match (finish) FA_Node* finish_route = NULL; if (!how_can_i_finish.empty() && (info1.fed_chars_extend_one_right || R == 0)) { FA_NodeOfMatch* matcher = resFa.makeMatch(); finish_route = matcher; if (info1.fed_chars_extend_one_right) { FA_NodeOfOneCharRead* right_ext_read = resFa.makeOneCharRead(how_can_i_finish, true); reattach_nxt_node(right_ext_read, matcher); finish_route = right_ext_read; } } // Combining these two cases assert(finish_route || my_cr); FA_Node*& endsUp = gdp.superstate_megabush_constructed[gdp.superstates[SS]]; if (!finish_route) { endsUp = my_cr; } else if (!my_cr) { endsUp = finish_route; } else { FA_NodeOfForking* F = resFa.makeForking(); F->nxt_options = {NULL, NULL}; reattach_fa_node_edge(&(F->nxt_options[0]), my_cr); reattach_fa_node_edge(&(F->nxt_options[1]), finish_route); endsUp = F; } } /* Now it's time to do the homework: link all megabushes */ for (auto& p: homework) { reattach_fa_node_edge(p.first, gdp.superstate_megabush_constructed[p.second]); } }