diff --git a/building/main.cpp b/building/main.cpp index edcaa54..761c5ca 100644 --- a/building/main.cpp +++ b/building/main.cpp @@ -72,7 +72,6 @@ struct Libregexis024BuildSystem { "libregexis024fa/graph_to_bytecode/core.cpp", "libregexis024sol/common_codesets.cpp", - "libregexis024sol/part_of_expr_that_tracks.cpp", "libregexis024sol/expr_compiler.cpp", "libregexis024sol/square_bracket_expression.cpp", "libregexis024sol/sol_misc_base.cpp", diff --git a/src/debugging_regexis024/debug_through_graphviz.cpp b/src/debugging_regexis024/debug_through_graphviz.cpp index 29595fc..811a7d2 100644 --- a/src/debugging_regexis024/debug_through_graphviz.cpp +++ b/src/debugging_regexis024/debug_through_graphviz.cpp @@ -10,316 +10,318 @@ #include #include -const char* one_char_read_color = "black"; -const char* forking_color = "darkorchid1"; -const char* look_one_behind_color = "darkslateblue"; -const char* look_one_ahead_color = "coral1"; -const char* track_array_mov_imm_color = "lightblue2"; -const char* track_array_mov_halfinvariant_color = "lightseagreen"; -const char* match_pending_lob_color = "darkgoldenrod2"; -const char* match_color = "gold"; -const char* det_char_crossroads_color = "navy"; -const char* error_color = "crimson"; -const char* STAR = "★"; +namespace regexis024 { + const char* one_char_read_color = "black"; + const char* forking_color = "darkorchid1"; + const char* look_one_behind_color = "darkslateblue"; + const char* look_one_ahead_color = "coral1"; + const char* track_array_mov_imm_color = "lightblue2"; + const char* track_array_mov_halfinvariant_color = "lightseagreen"; + const char* match_pending_lob_color = "darkgoldenrod2"; + const char* match_color = "gold"; + const char* det_char_crossroads_color = "navy"; + const char* error_color = "crimson"; + const char* STAR = "★"; -const char* get_associated_color(FA_Node* node){ - switch (node->type) { + const char* get_associated_color(FA_Node* node){ + switch (node->type) { #define ccase(tn) case tn: return tn##_color; - ccase(one_char_read) - ccase(forking) - ccase(look_one_behind) - ccase(look_one_ahead) - ccase(track_array_mov_imm) - ccase(track_array_mov_halfinvariant) - ccase(det_char_crossroads) - case match: - return dynamic_cast(node)->ext_filter_added ? match_pending_lob_color : match_color; - default: - return "black"; -#undef ccase - } -} - -struct NodesProblems{ - size_t actual_refcount = 0; - bool refcount_problem = false; - size_t edges_point_to_null = 0; -}; - -struct EdgesProblems { - bool points_to_null = false; - explicit EdgesProblems(bool points_to_null): points_to_null(points_to_null) {} -}; - -std::string get_applied_edge_attributes(FA_Node* node, const NodesProblems& np, const EdgesProblems& ep){ - std::string res = "color="; - if (ep.points_to_null) { - res += error_color; - } else { - res += get_associated_color(node); - if (node->type == one_char_read || node->type == det_char_crossroads) - res += " style=bold"; - } - return res; -} - -std::string get_applied_node_attributes(FA_Node* node, const NodesProblems& bd){ - std::string res = "color="; - res += get_associated_color(node); - if (bd.refcount_problem) - res += " fontcolor=crimson"; - if ((node->type == match) || - (node->type == det_char_crossroads && dynamic_cast(node)->matching)) - res += " shape=doublecircle"; - return res; -} - -void append_reverse_hex(std::string& res, uint32_t num){ - if (num == 0){ - res += "0"; - } else { - while (num){ - uint32_t r = num & 0x0F; - res += static_cast((r < 10) ? (r + '0') : (r - 10 + 'a')); - num >>= 4; - } - } -} - -std::string stringify_codeset(const codeset_t& cs){ - std::string res; - for (long i = static_cast(cs.size()) - 1; i >= 0; i--) { - uint64_t start = cs[i].first, end = cs[i].second; - if (start == end) { - append_reverse_hex(res, start); - } else { - append_reverse_hex(res, end); - res += '-'; - append_reverse_hex(res, start); - } - if (i != 0) - res += ','; - } - std::reverse(res.begin(), res.end()); /* ascii works wonders */ - return res; -} - -std::string get_extended_node_lable(FA_Node* node){ - if ((node->type == one_char_read && dynamic_cast(node)->second_ns) || - (node->type == det_char_crossroads && dynamic_cast(node)->second_ns)) { - return std::string(" ") + STAR; - } - if (node->type == match) { - FA_NodeOfMatch* mn = static_cast(node); - if (mn->ext_filter_added) - return std::string(" pending loa ") + stringify_codeset(mn->pending_filter); - } - return ""; -} - -std::string get_node_lable(FA_Node* node, const NodesProblems& bd){ - std::string res; - switch (node->type) { -#define tcase(tn, str) case tn: res = str; break; - tcase(one_char_read, "ocr") - tcase(match, "m") - tcase(forking, "f") - tcase(look_one_behind, "lob") - tcase(look_one_ahead, "loa") - tcase(track_array_mov_imm, "tami") - tcase(track_array_mov_halfinvariant, "tamh") - tcase(det_char_crossroads, "dcc") - } - res += ("[" + std::to_string(node->nodeId) + "]"); - res += get_extended_node_lable(node); - if (bd.refcount_problem) - res += ("!refcount: " + std::to_string(node->refs) + "!"); - return res; -} - -void print_edge(FA_Node* start, const FA_Node* dest, const std::string& label, FILE* fd, NodesProblems& np){ - if (!dest){ - fprintf(stderr, "NULL transition going from node %lu\n", start->nodeId); - fprintf(fd, "%lu->NULL_%lu_%lu [label=\"%s\" color=crimson]", start->nodeId, - start->nodeId, np.edges_point_to_null++, label.c_str()); - return; - } - fprintf(fd, "%lu->%lu [label=\"%s\" %s]\n", start->nodeId, dest->nodeId, label.c_str(), - get_applied_edge_attributes(start, np, EdgesProblems(false)).c_str()); -} - -void print_fa(const FA_Container& fa, FILE* fd, const KnownTrackingTools& ktr, - const RegexPriorityTable& priority_table){ - assert(fa.start); - assert(fd); - fprintf(fd, "digraph finite_automaton {\ngraph [" - "fontname = \"Helvetica\" charset = \"UTF-8\" label = \"Finite Automaton\" labelloc = \"t\" labeljust = \"c\" " - "bgcolor = \"#FFFAF4\" fontcolor = black fontsize = 18 style = \"filled\" rankdir = LR margin = 0.2 " - "splines = spline nodesep = 0.9 ranksep = 1.2 ]\n node [ style = \"solid,filled\" fontsize = 15 " - "fontcolor = black fontname = \"Helvetica\" color = black fillcolor = white margin = \"0.2,0.2\" shape=circle " - "]\n edge [ style = solid fontsize = 16 fontcolor = black fontname = \"Helvetica\" color = black " - "labelfloat = false labeldistance = 2.5 labelangle = 70 arrowhead = normal ]\n" - "start_state [label = \"start\\nfrom\\nhere\" shape=none style=\"\" ]\n"); - - size_t n = fa.all.size(); - std::vector breakdown; - breakdown.resize(n); - breakdown[fa.start->nodeId].actual_refcount++; - for (size_t i = 0; i < n; i++){ - assert(fa.all[i]->nodeId == static_cast(i)); - for (FA_Node** nxtN: fa.all[i]->get_all_transitions()) - if ((*nxtN) != NULL) - breakdown[(**nxtN).nodeId].actual_refcount++; - } - for (size_t i = 0; i < n; i++){ - if (fa.all[i]->refs != breakdown[i].actual_refcount){ - breakdown[i].refcount_problem = true; - fprintf(stderr, "Corrupted FA: wrong refcount on node %lu\n", fa.all[i]->nodeId); - } - } - for (size_t i = 0; i < n; i++){ - fprintf(fd, "%lu [label=\"%s\" %s]\n", i, get_node_lable(fa.all[i], breakdown[i]).c_str(), - get_applied_node_attributes(fa.all[i], breakdown[i]).c_str()); - } - - /* Two Infoboxes */ - - auto stringifyTrackingVarType = [](tracking_var_type type) -> std::string { - switch (type) { - case tracking_var_types::range: - return "range"; - case tracking_var_types::dot_cur_pos: - return "dot of cur pos"; + ccase(one_char_read) + ccase(forking) + ccase(look_one_behind) + ccase(look_one_ahead) + ccase(track_array_mov_imm) + ccase(track_array_mov_halfinvariant) + ccase(det_char_crossroads) + case match: + return dynamic_cast(node)->ext_filter_added ? match_pending_lob_color : match_color; default: - return "dot of immediate"; + return "black"; +#undef ccase } + } + + struct NodesProblems{ + size_t actual_refcount = 0; + bool refcount_problem = false; + size_t edges_point_to_null = 0; }; - std::string infoText; - for (auto& p: ktr.track_names){ - const SubtrackingNameInfo& tu = ktr.retrieval_info[p.second]; + struct EdgesProblems { + bool points_to_null = false; + explicit EdgesProblems(bool points_to_null): points_to_null(points_to_null) {} + }; - auto getRole = [](bool presence, tracking_var_type type, int first, int second, - const std::string& ARR_NAME) -> std::string { - if (!presence) { - assert(first == -1 && second == -1); - return "Not involved in " + ARR_NAME; - } - if (type == tracking_var_types::range){ - assert(first != -1 && second != -1); - return "In " + ARR_NAME + ": " + std::to_string(first) + " <−> " + std::to_string(second); - } - assert(first != -1 && second == -1); - return "In " + ARR_NAME + ": ( " + std::to_string(first) + " )"; - }; - char buf[2048] = {0}; - snprintf(buf, 2048, "Tracking unit name: %s\\n" "Discovered: %s\\n" "Type: %s\\n" "%s\\n%s", - p.first.c_str(), tu.discovered ? "ofcourse" : "no", - stringifyTrackingVarType(tu.type).c_str(), - getRole(tu.stored_in_ca, tu.type, tu.colarr_first, tu.colarr_second, "colarr").c_str(), - getRole(tu.stored_in_sa, tu.type, tu.selarr_first, tu.selarr_second, "selarr").c_str()); - if (!infoText.empty()) - infoText += "|"; - infoText += buf; - } - fprintf(fd, "infoBoard1 [label=\"%s\" shape = record]\n", infoText.c_str()); - infoText = ""; - for (size_t i = 0; i < priority_table.size(); i++){ - const RegexPriorityTableAction& tu = priority_table[i]; - if (!infoText.empty()) - infoText += "|"; - infoText += tu.minimize ? "Minimize " : "Maximize "; - if (tu.pos.isForRange()){ - infoText += "[" + std::to_string(tu.pos.second) + "] - [" + std::to_string(tu.pos.first) + "]"; + std::string get_applied_edge_attributes(FA_Node* node, const NodesProblems& np, const EdgesProblems& ep){ + std::string res = "color="; + if (ep.points_to_null) { + res += error_color; } else { - infoText += "[" + std::to_string(tu.pos.first) + "]"; + res += get_associated_color(node); + if (node->type == one_char_read || node->type == det_char_crossroads) + res += " style=bold"; } + return res; } - fprintf(fd, "infoBoard2 [label=\"%s\" shape = record]\n", infoText.c_str()); - assert(fa.start); - fprintf(fd, "start_state->%lu [color=gray style=dotted]\n", fa.start->nodeId); + std::string get_applied_node_attributes(FA_Node* node, const NodesProblems& bd){ + std::string res = "color="; + res += get_associated_color(node); + if (bd.refcount_problem) + res += " fontcolor=crimson"; + if ((node->type == match) || + (node->type == det_char_crossroads && dynamic_cast(node)->matching)) + res += " shape=doublecircle"; + return res; + } - - for (FA_Node* node: fa.all){ - NodesProblems& bd = breakdown[node->nodeId]; - if (node->type == one_char_read){ - FA_NodeOfOneCharRead* cn = dynamic_cast(node); - std::string str = stringify_codeset(cn->filter); - print_edge(node, cn->nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""), fd, bd); - } else if (node->type == forking){ - FA_NodeOfForking* cn = dynamic_cast(node); - for (FA_Node* nxt: cn->nxt_options){ - print_edge(node, nxt, "", fd, bd); - } - } else if (node->type == look_one_behind){ - FA_NodeOfLookOneBehind* cn = dynamic_cast(node); - print_edge(node, cn->nxt_node, stringify_codeset(cn->filter), fd, bd); - } else if (node->type == look_one_ahead){ - FA_NodeOfLookOneAhead* cn = dynamic_cast(node); - print_edge(node, cn->nxt_node, stringify_codeset(cn->restriction), fd, bd); - } else if (node->type == track_array_mov_imm){ - FA_NodeOfTrackArrayMovImm* cn = dynamic_cast(node); - char buf[1024]; - if (!isImmMovOpcode(cn->operation)) - fprintf(stderr, "bad operation in node %lu\n", node->nodeId); - snprintf(buf, 1024, "%s %hu %lu", - regex024_opcode_tostr(cn->operation), cn->key, cn->imm_value); - print_edge(node, cn->nxt_node,std::string(buf), fd, bd); - } else if (node->type == track_array_mov_halfinvariant){ - FA_NodeOfTrackArrayMovHalfinvariant* cn = dynamic_cast(node); - char buf[1024]; - if (!isCurPosMovOpcode(cn->operation)) - fprintf(stderr, "bad operation in node %lu\n", node->nodeId); - snprintf(buf, 1024, "%s %hu", - regex024_opcode_tostr(cn->operation), cn->key); - print_edge(node, cn->nxt_node,std::string(buf), fd, bd); - } else if (node->type == det_char_crossroads){ - FA_NodeOfDetCharCrossroads* cn = dynamic_cast(node); - for (const auto& transition: cn->crossroads){ - std::string str = stringify_codeset(transition.input); - print_edge(node, transition.nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""), - fd, bd); + void append_reverse_hex(std::string& res, uint32_t num){ + if (num == 0){ + res += "0"; + } else { + while (num){ + uint32_t r = num & 0x0F; + res += static_cast((r < 10) ? (r + '0') : (r - 10 + 'a')); + num >>= 4; } } } - fprintf(fd, "}\n"); -} -FILE* get_fd(const char* apath){ - errno = 0; - FILE *fd = fopen(apath, "w"); - if (!fd) - perror("fopen w"); - if (ftruncate(fileno(fd), 0) != 0) - perror("truncation"); - fd = fopen(apath, "a"); - if (!fd) - perror("fopen a"); - return fd; -} + std::string stringify_codeset(const codeset_t& cs){ + std::string res; + for (long i = static_cast(cs.size()) - 1; i >= 0; i--) { + uint64_t start = cs[i].first, end = cs[i].second; + if (start == end) { + append_reverse_hex(res, start); + } else { + append_reverse_hex(res, end); + res += '-'; + append_reverse_hex(res, start); + } + if (i != 0) + res += ','; + } + std::reverse(res.begin(), res.end()); /* ascii works wonders */ + return res; + } -void show_fa_with_sxiv_after_dot(const FA_Container& fa, const KnownTrackingTools& ktr, - const RegexPriorityTable& priority_table) { - const char* temp_gv = "FAGraph.gv"; - const char* temp_png = "FAGraph.png"; - int temp_descriptor = open(temp_gv, O_CLOEXEC | O_APPEND | O_CREAT | O_WRONLY, S_IRWXU | S_IRWXG); - assert(temp_descriptor >= 0); - assert(fa.start); - FILE* fd = get_fd(temp_gv); - print_fa(fa, fd, ktr, priority_table); - fclose(fd); - char cmdBuf[1024]; - // todo: get rid of temporary dot file and shell usage - snprintf(cmdBuf, 1024, "dot %s -Tpng >%s", temp_gv, temp_png); - int chw = system(cmdBuf); - assert(WIFEXITED(chw)); - assert(WEXITSTATUS(chw) == 0); - snprintf(cmdBuf, 1024, "sxiv %s", temp_png); - chw = system(cmdBuf); - assert(WIFEXITED(chw)); - assert(WEXITSTATUS(chw) == 0); - assert(chw >= 0); - unlink(temp_gv); - unlink(temp_png); + std::string get_extended_node_lable(FA_Node* node){ + if ((node->type == one_char_read && dynamic_cast(node)->second_ns) || + (node->type == det_char_crossroads && dynamic_cast(node)->second_ns)) { + return std::string(" ") + STAR; + } + if (node->type == match) { + FA_NodeOfMatch* mn = static_cast(node); + if (mn->ext_filter_added) + return std::string(" pending loa ") + stringify_codeset(mn->pending_filter); + } + return ""; + } + + std::string get_node_lable(FA_Node* node, const NodesProblems& bd){ + std::string res; + switch (node->type) { +#define tcase(tn, str) case tn: res = str; break; + tcase(one_char_read, "ocr") + tcase(match, "m") + tcase(forking, "f") + tcase(look_one_behind, "lob") + tcase(look_one_ahead, "loa") + tcase(track_array_mov_imm, "tami") + tcase(track_array_mov_halfinvariant, "tamh") + tcase(det_char_crossroads, "dcc") + } + res += ("[" + std::to_string(node->nodeId) + "]"); + res += get_extended_node_lable(node); + if (bd.refcount_problem) + res += ("!refcount: " + std::to_string(node->refs) + "!"); + return res; + } + + void print_edge(FA_Node* start, const FA_Node* dest, const std::string& label, FILE* fd, NodesProblems& np){ + if (!dest){ + fprintf(stderr, "NULL transition going from node %lu\n", start->nodeId); + fprintf(fd, "%lu->NULL_%lu_%lu [label=\"%s\" color=crimson]", start->nodeId, + start->nodeId, np.edges_point_to_null++, label.c_str()); + return; + } + fprintf(fd, "%lu->%lu [label=\"%s\" %s]\n", start->nodeId, dest->nodeId, label.c_str(), + get_applied_edge_attributes(start, np, EdgesProblems(false)).c_str()); + } + + void print_fa(const FA_Container& fa, FILE* fd, const KnownTrackingTools& ktr, + const RegexPriorityTable& priority_table){ + assert(fa.start); + assert(fd); + fprintf(fd, "digraph finite_automaton {\ngraph [" + "fontname = \"Helvetica\" charset = \"UTF-8\" label = \"Finite Automaton\" labelloc = \"t\" labeljust = \"c\" " + "bgcolor = \"#FFFAF4\" fontcolor = black fontsize = 18 style = \"filled\" rankdir = LR margin = 0.2 " + "splines = spline nodesep = 0.9 ranksep = 1.2 ]\n node [ style = \"solid,filled\" fontsize = 15 " + "fontcolor = black fontname = \"Helvetica\" color = black fillcolor = white margin = \"0.2,0.2\" shape=circle " + "]\n edge [ style = solid fontsize = 16 fontcolor = black fontname = \"Helvetica\" color = black " + "labelfloat = false labeldistance = 2.5 labelangle = 70 arrowhead = normal ]\n" + "start_state [label = \"start\\nfrom\\nhere\" shape=none style=\"\" ]\n"); + + size_t n = fa.all.size(); + std::vector breakdown; + breakdown.resize(n); + breakdown[fa.start->nodeId].actual_refcount++; + for (size_t i = 0; i < n; i++){ + assert(fa.all[i]->nodeId == static_cast(i)); + for (FA_Node** nxtN: fa.all[i]->get_all_transitions()) + if ((*nxtN) != NULL) + breakdown[(**nxtN).nodeId].actual_refcount++; + } + for (size_t i = 0; i < n; i++){ + if (fa.all[i]->refs != breakdown[i].actual_refcount){ + breakdown[i].refcount_problem = true; + fprintf(stderr, "Corrupted FA: wrong refcount on node %lu\n", fa.all[i]->nodeId); + } + } + for (size_t i = 0; i < n; i++){ + fprintf(fd, "%lu [label=\"%s\" %s]\n", i, get_node_lable(fa.all[i], breakdown[i]).c_str(), + get_applied_node_attributes(fa.all[i], breakdown[i]).c_str()); + } + + /* Two Infoboxes */ + + auto stringifyTrackingVarType = [](tracking_var_type_t type) -> std::string { + switch (type) { + case tracking_var_types::range: + return "range"; + case tracking_var_types::dot_cur_pos: + return "dot of cur pos"; + default: + return "dot of immediate"; + } + }; + + std::string infoText; + for (auto& p: ktr.track_names){ + const SubtrackingNameInfo& tu = ktr.retrieval_info[p.second]; + + auto getRole = [](bool presence, tracking_var_type_t type, int first, int second, + const std::string& ARR_NAME) -> std::string { + if (!presence) { + assert(first == -1 && second == -1); + return "Not involved in " + ARR_NAME; + } + if (type == tracking_var_types::range){ + assert(first != -1 && second != -1); + return "In " + ARR_NAME + ": " + std::to_string(first) + " <−> " + std::to_string(second); + } + assert(first != -1 && second == -1); + return "In " + ARR_NAME + ": ( " + std::to_string(first) + " )"; + }; + char buf[2048] = {0}; + snprintf(buf, 2048, "Tracking unit name: %s\\n" "Discovered: %s\\n" "Type: %s\\n" "%s\\n%s", + p.first.c_str(), tu.discovered ? "ofcourse" : "no", + stringifyTrackingVarType(tu.type).c_str(), + getRole(tu.stored_in_ca, tu.type, tu.colarr_first, tu.colarr_second, "colarr").c_str(), + getRole(tu.stored_in_sa, tu.type, tu.selarr_first, tu.selarr_second, "selarr").c_str()); + if (!infoText.empty()) + infoText += "|"; + infoText += buf; + } + fprintf(fd, "infoBoard1 [label=\"%s\" shape = record]\n", infoText.c_str()); + infoText = ""; + for (size_t i = 0; i < priority_table.size(); i++){ + const RegexPriorityTableAction& tu = priority_table[i]; + if (!infoText.empty()) + infoText += "|"; + infoText += tu.minimize ? "Minimize " : "Maximize "; + if (tu.pos.isForRange()){ + infoText += "[" + std::to_string(tu.pos.second) + "] - [" + std::to_string(tu.pos.first) + "]"; + } else { + infoText += "[" + std::to_string(tu.pos.first) + "]"; + } + } + fprintf(fd, "infoBoard2 [label=\"%s\" shape = record]\n", infoText.c_str()); + + assert(fa.start); + fprintf(fd, "start_state->%lu [color=gray style=dotted]\n", fa.start->nodeId); + + + for (FA_Node* node: fa.all){ + NodesProblems& bd = breakdown[node->nodeId]; + if (node->type == one_char_read){ + FA_NodeOfOneCharRead* cn = dynamic_cast(node); + std::string str = stringify_codeset(cn->filter); + print_edge(node, cn->nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""), fd, bd); + } else if (node->type == forking){ + FA_NodeOfForking* cn = dynamic_cast(node); + for (FA_Node* nxt: cn->nxt_options){ + print_edge(node, nxt, "", fd, bd); + } + } else if (node->type == look_one_behind){ + FA_NodeOfLookOneBehind* cn = dynamic_cast(node); + print_edge(node, cn->nxt_node, stringify_codeset(cn->filter), fd, bd); + } else if (node->type == look_one_ahead){ + FA_NodeOfLookOneAhead* cn = dynamic_cast(node); + print_edge(node, cn->nxt_node, stringify_codeset(cn->restriction), fd, bd); + } else if (node->type == track_array_mov_imm){ + FA_NodeOfTrackArrayMovImm* cn = dynamic_cast(node); + char buf[1024]; + if (!isImmMovOpcode(cn->operation)) + fprintf(stderr, "bad operation in node %lu\n", node->nodeId); + snprintf(buf, 1024, "%s %hu %lu", + opcode_to_str(cn->operation), cn->key, cn->imm_value); + print_edge(node, cn->nxt_node,std::string(buf), fd, bd); + } else if (node->type == track_array_mov_halfinvariant){ + FA_NodeOfTrackArrayMovHalfinvariant* cn = dynamic_cast(node); + char buf[1024]; + if (!isCurPosMovOpcode(cn->operation)) + fprintf(stderr, "bad operation in node %lu\n", node->nodeId); + snprintf(buf, 1024, "%s %hu", + opcode_to_str(cn->operation), cn->key); + print_edge(node, cn->nxt_node,std::string(buf), fd, bd); + } else if (node->type == det_char_crossroads){ + FA_NodeOfDetCharCrossroads* cn = dynamic_cast(node); + for (const auto& transition: cn->crossroads){ + std::string str = stringify_codeset(transition.input); + print_edge(node, transition.nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""), + fd, bd); + } + } + } + fprintf(fd, "}\n"); + } + + FILE* get_fd(const char* apath){ + errno = 0; + FILE *fd = fopen(apath, "w"); + if (!fd) + perror("fopen w"); + if (ftruncate(fileno(fd), 0) != 0) + perror("truncation"); + fd = fopen(apath, "a"); + if (!fd) + perror("fopen a"); + return fd; + } + + void show_fa_with_sxiv_after_dot(const FA_Container& fa, const KnownTrackingTools& ktr, + const RegexPriorityTable& priority_table) { + const char* temp_gv = "FAGraph.gv"; + const char* temp_png = "FAGraph.png"; + int temp_descriptor = open(temp_gv, O_CLOEXEC | O_APPEND | O_CREAT | O_WRONLY, S_IRWXU | S_IRWXG); + assert(temp_descriptor >= 0); + assert(fa.start); + FILE* fd = get_fd(temp_gv); + print_fa(fa, fd, ktr, priority_table); + fclose(fd); + char cmdBuf[1024]; + // todo: get rid of temporary dot file and shell usage + snprintf(cmdBuf, 1024, "dot %s -Tpng >%s", temp_gv, temp_png); + int chw = system(cmdBuf); + assert(WIFEXITED(chw)); + assert(WEXITSTATUS(chw) == 0); + snprintf(cmdBuf, 1024, "sxiv %s", temp_png); + chw = system(cmdBuf); + assert(WIFEXITED(chw)); + assert(WEXITSTATUS(chw) == 0); + assert(chw >= 0); + unlink(temp_gv); + unlink(temp_png); + } } diff --git a/src/debugging_regexis024/debug_through_graphviz.h b/src/debugging_regexis024/debug_through_graphviz.h index e341248..0e6c857 100644 --- a/src/debugging_regexis024/debug_through_graphviz.h +++ b/src/debugging_regexis024/debug_through_graphviz.h @@ -5,8 +5,10 @@ #include #include -/* Uses temporary file FAGraph.gv,png, dot command and sxiv */ -void show_fa_with_sxiv_after_dot(const FA_Container& fa, const KnownTrackingTools& ktr, - const RegexPriorityTable& priority_table); +namespace regexis024 { + /* Uses temporary file FAGraph.gv,png, dot command and sxiv */ + void show_fa_with_sxiv_after_dot(const FA_Container& fa, const KnownTrackingTools& ktr, + const RegexPriorityTable& priority_table); +} #endif diff --git a/src/debugging_regexis024/prettyprint/prettyprint_util.cpp b/src/debugging_regexis024/prettyprint/prettyprint_util.cpp index 878ff9e..eb04087 100644 --- a/src/debugging_regexis024/prettyprint/prettyprint_util.cpp +++ b/src/debugging_regexis024/prettyprint/prettyprint_util.cpp @@ -2,88 +2,86 @@ #include #include -TreeWithStringsNode::TreeWithStringsNode(const std::string &val): val(val) { -} +namespace regexis024 { + static const char* ch_empty = " "; + static const char* ch_passing_by = "\u2502 "; + static const char* ch_connect_right_and_forward = "\u251c\u2500\u2500\u2500"; + static const char* ch_connect_right_last = "\u2514\u2500\u2500\u2500"; -static const char* ch_empty = " "; -static const char* ch_passing_by = "\u2502 "; -static const char* ch_connect_right_and_forward = "\u251c\u2500\u2500\u2500"; -static const char* ch_connect_right_last = "\u2514\u2500\u2500\u2500"; + static const char* ch_box_left_side = "\u2551"; + static const char* ch_box_right_side = "\u2551"; + static const char* ch_box_top_side = "\u2550"; + static const char* ch_box_bottom_side = "\u2550"; + static const char* ch_box_crn_top_left = "\u2554"; + static const char* ch_box_crn_top_right = "\u2557"; + static const char* ch_box_crn_bottom_left = "\u255A"; + static const char* ch_box_crn_bottom_right = "\u255D"; -static const char* ch_box_left_side = "\u2551"; -static const char* ch_box_right_side = "\u2551"; -static const char* ch_box_top_side = "\u2550"; -static const char* ch_box_bottom_side = "\u2550"; -static const char* ch_box_crn_top_left = "\u2554"; -static const char* ch_box_crn_top_right = "\u2557"; -static const char* ch_box_crn_bottom_left = "\u255A"; -static const char* ch_box_crn_bottom_right = "\u255D"; - -size_t length_of_line(const std::string& str) { - size_t ch = 0; - size_t pos = 0; - while (pos < str.size()) { - int32_t code; - size_t adj; - utf8_string_iterat(code, adj, pos, reinterpret_cast(str.data()), str.size()); - if (code < 0) - return ch; - ch++; - pos += adj; - } - return ch; -} - -/* Warning: recursion used */ -void toLines_dfs(const TreeWithStringsNode& node, lines& out, std::vector& prefix) { - out.push_back(""); - size_t n = prefix.size(); - for (size_t i = 0; i < n; i++) { - if (i + 1 < n) { - out.back() += prefix[i] ? ch_passing_by : ch_empty; - } else { - out.back() += prefix[i] ? ch_connect_right_and_forward : ch_connect_right_last; + size_t length_of_line(const std::string& str) { + size_t ch = 0; + size_t pos = 0; + while (pos < str.size()) { + int32_t code; + size_t adj; + utf8_string_iterat(code, adj, pos, str.data(), str.size()); + if (code < 0) + return ch; + ch++; + pos += adj; } + return ch; } - out.back() += node.val; - prefix.push_back(true); - size_t m = node.childeren.size(); - for (size_t i = 0; i < m; i++) { - if (i + 1 == m) - prefix[n] = false; - toLines_dfs(node.childeren[i], out, prefix); + + /* Warning: recursion used */ + void toLines_dfs(const TreeWithStringsNode& node, lines& out, std::vector& prefix) { + out.push_back(""); + size_t n = prefix.size(); + for (size_t i = 0; i < n; i++) { + if (i + 1 < n) { + out.back() += prefix[i] ? ch_passing_by : ch_empty; + } else { + out.back() += prefix[i] ? ch_connect_right_and_forward : ch_connect_right_last; + } + } + out.back() += node.val; + prefix.push_back(true); + size_t m = node.childeren.size(); + for (size_t i = 0; i < m; i++) { + if (i + 1 == m) + prefix[n] = false; + toLines_dfs(node.childeren[i], out, prefix); + } + prefix.pop_back(); } - prefix.pop_back(); -} -void TreeWithStringsNode::toLines(lines &out) const { - std::vector prefix; - toLines_dfs(*this, out, prefix); -} - -std::string strMul(size_t n, const char* str) { - std::string res; - for (size_t i = 0; i < n; i++) - res += str; - return res; -} - -lines wrapWithBox(const lines &in) { - lines out; - size_t max_width = 0; - for (auto& l: in) - max_width = std::max(max_width, length_of_line(l)); - out.push_back(ch_box_crn_top_left + strMul(max_width, ch_box_top_side) + ch_box_crn_top_right); - for (auto& line: in) { - size_t s = length_of_line(line); - out.push_back(ch_box_left_side + line + strMul(max_width - s, " ") + ch_box_right_side); + void TreeWithStringsNode::toLines(lines &out) const { + std::vector prefix; + toLines_dfs(*this, out, prefix); } - out.push_back(ch_box_crn_bottom_left + strMul(max_width, ch_box_bottom_side) + ch_box_crn_bottom_right); - return out; -} -void printLines(const lines &in) { - for (auto& l: in) - printf("%s\n", l.c_str()); -} + std::string strMul(size_t n, const char* str) { + std::string res; + for (size_t i = 0; i < n; i++) + res += str; + return res; + } + lines wrapWithBox(const lines &in) { + lines out; + size_t max_width = 0; + for (auto& l: in) + max_width = std::max(max_width, length_of_line(l)); + out.push_back(ch_box_crn_top_left + strMul(max_width, ch_box_top_side) + ch_box_crn_top_right); + for (auto& line: in) { + size_t s = length_of_line(line); + out.push_back(ch_box_left_side + line + strMul(max_width - s, " ") + ch_box_right_side); + } + out.push_back(ch_box_crn_bottom_left + strMul(max_width, ch_box_bottom_side) + ch_box_crn_bottom_right); + return out; + } + + void printLines(const lines &in) { + for (auto& l: in) + printf("%s\n", l.c_str()); + } +} diff --git a/src/debugging_regexis024/prettyprint/prettyprint_util.h b/src/debugging_regexis024/prettyprint/prettyprint_util.h index 72ecb94..71559e2 100644 --- a/src/debugging_regexis024/prettyprint/prettyprint_util.h +++ b/src/debugging_regexis024/prettyprint/prettyprint_util.h @@ -6,20 +6,19 @@ #include #include -typedef std::vector lines; +namespace regexis024 { + typedef std::vector lines; -struct TreeWithStringsNode { - std::string val; - std::vector childeren; + struct TreeWithStringsNode { + std::string val; + std::vector childeren; - explicit TreeWithStringsNode(const std::string &val); - TreeWithStringsNode() = default; + void toLines(lines& out) const; + }; - void toLines(lines& out) const; -}; + lines wrapWithBox(const lines& in); -lines wrapWithBox(const lines& in); - -void printLines(const lines& in); + void printLines(const lines& in); +} #endif diff --git a/src/debugging_regexis024/vm/libregexis024vm_debug.cpp b/src/debugging_regexis024/vm/libregexis024vm_debug.cpp index 068d014..cac2b54 100644 --- a/src/debugging_regexis024/vm/libregexis024vm_debug.cpp +++ b/src/debugging_regexis024/vm/libregexis024vm_debug.cpp @@ -2,57 +2,59 @@ #include #include -std::string thread_to_str(const REGEX_IS024_Thread& thread){ - if (!(thread.slot_occupation_status & SLOT_OCCUPIED)) - return "{ unoccupied }"; - char buf[1024]; - snprintf(buf, 1024, "{ IP = %lu }", thread.IP); - return buf; -} - -std::string stack_to_str(const REGEX_IS024_Stack& stack){ - std::string res = "{ "; - for (uint32_t i = 0; i < stack.sz; i++){ - if (i != 0) - res += ", "; - res += std::to_string(stack.slots[i]); +namespace regexis024 { + std::string thread_to_str(const Thread& thread){ + if (!(thread.slot_occupation_status & SLOT_OCCUPIED)) + return "{ unoccupied }"; + char buf[1024]; + snprintf(buf, 1024, "{ IP = %lu }", thread.IP); + return buf; } - res += " }"; - return res; -} -std::string slots_to_str(const REGEX_IS024_CONTEXT& ctx){ - if (!ctx.initialized) - return "uninitialized"; - std::string READ_slots; - for (size_t i = 0; i < ctx.read_slots_number; i++){ - uint8_t stat = ctx.READ_halted_slots[i].slot_occupation_status; - READ_slots += (stat & SLOT_OCCUPIED) ? ((stat & SLOT_NEW) ? "N" : "O") : "x"; + std::string stack_to_str(const SSID_Stack& stack){ + std::string res = "{ "; + for (uint32_t i = 0; i < stack.sz; i++){ + if (i != 0) + res += ", "; + res += std::to_string(stack.slots[i]); + } + res += " }"; + return res; } - std::string FORK_slots; - for (size_t i = 0; i < ctx.fork_slots_number; i++){ - uint8_t stat = ctx.FORK_halted_slots[i].slot_occupation_status; - FORK_slots += (stat & SLOT_OCCUPIED) ? "O" : "x"; + + std::string slots_to_str(const VMContext& ctx){ + if (!ctx.initialized) + return "uninitialized"; + std::string READ_slots; + for (size_t i = 0; i < ctx.read_slots_number; i++){ + uint8_t stat = ctx.READ_halted_slots[i].slot_occupation_status; + READ_slots += (stat & SLOT_OCCUPIED) ? ((stat & SLOT_NEW) ? "N" : "O") : "x"; + } + std::string FORK_slots; + for (size_t i = 0; i < ctx.fork_slots_number; i++){ + uint8_t stat = ctx.FORK_halted_slots[i].slot_occupation_status; + FORK_slots += (stat & SLOT_OCCUPIED) ? "O" : "x"; + } + char buf[4096]; + snprintf(buf, 4096, "READ_slots: %s ; FORK_slots: %s ; READ_stack_new_main: %s ; " + "READ_stack_new_second: %s ; READ_stack_old: %s ; FORK_stack: %s", + READ_slots.c_str(), FORK_slots.c_str(), stack_to_str(ctx.READ_halted_stack_new_first).c_str(), + stack_to_str(ctx.READ_halted_stack_new_second).c_str(), + stack_to_str(ctx.READ_halted_stack_old).c_str(), stack_to_str(ctx.FORK_halted_stack).c_str()); + return buf; } - char buf[4096]; - snprintf(buf, 4096, "READ_slots: %s ; FORK_slots: %s ; READ_stack_new_main: %s ; " - "READ_stack_new_second: %s ; READ_stack_old: %s ; FORK_stack: %s", - READ_slots.c_str(), FORK_slots.c_str(), stack_to_str(ctx.READ_halted_stack_new_first).c_str(), - stack_to_str(ctx.READ_halted_stack_new_second).c_str(), - stack_to_str(ctx.READ_halted_stack_old).c_str(), stack_to_str(ctx.FORK_halted_stack).c_str()); - return buf; -} -void debug_print_context(const REGEX_IS024_CONTEXT& ctx, const char* place) { - printf("== DEBUG `%s` ==\n", place); + void debug_print_context(const VMContext& ctx, const char* place) { + printf("== DEBUG `%s` ==\n", place); - printf("Active thread: %s, sifting_with: %s, match: %s\n%s\n", - thread_to_str(ctx.active_thread).c_str(), - ctx.sifting_with ? thread_to_str(*ctx.sifting_with).c_str() : "NO", thread_to_str(ctx.matched_thread).c_str(), - slots_to_str(ctx).c_str()); -} + printf("Active thread: %s, sifting_with: %s, match: %s\n%s\n", + thread_to_str(ctx.active_thread).c_str(), + ctx.sifting_with ? thread_to_str(*ctx.sifting_with).c_str() : "NO", thread_to_str(ctx.matched_thread).c_str(), + slots_to_str(ctx).c_str()); + } -void debug_print_thread(const REGEX_IS024_Thread& thr, const char *place) { - printf("== DEBUG `%s` ==\n", place); - printf("This thread: %s\n", thread_to_str(thr).c_str()); + void debug_print_thread(const Thread& thr, const char *place) { + printf("== DEBUG `%s` ==\n", place); + printf("This thread: %s\n", thread_to_str(thr).c_str()); + } } diff --git a/src/debugging_regexis024/vm/libregexis024vm_debug.h b/src/debugging_regexis024/vm/libregexis024vm_debug.h index 9bf2265..5ef5031 100644 --- a/src/debugging_regexis024/vm/libregexis024vm_debug.h +++ b/src/debugging_regexis024/vm/libregexis024vm_debug.h @@ -4,8 +4,10 @@ #include #include -void debug_print_context(const REGEX_IS024_CONTEXT& ctx, const char* place); +namespace regexis024 { + void debug_print_context(const VMContext& ctx, const char* place); -void debug_print_thread(const REGEX_IS024_Thread& thr, const char *place); + void debug_print_thread(const Thread& thr, const char *place); +} #endif diff --git a/src/libregexis024fa/codeset.cpp b/src/libregexis024fa/codeset.cpp index a70c831..30be413 100644 --- a/src/libregexis024fa/codeset.cpp +++ b/src/libregexis024fa/codeset.cpp @@ -1,19 +1,20 @@ #include #include -codeset_t invert_set(const codeset_t &X) { - if (X.empty()) - return {{0, UINT32_MAX}}; - codeset_t res; - if (X[0].first != 0) - res.emplace_back(0, X[0].first - 1); - for (size_t i = 0; i + 1 < X.size(); i++){ - res.emplace_back(X[i].second + 1, X[i + 1].first - 1); +namespace regexis024 { + codeset_t invert_set(const codeset_t &X) { + if (X.empty()) + return {{0, UINT32_MAX}}; + codeset_t res; + if (X[0].first != 0) + res.emplace_back(0, X[0].first - 1); + for (size_t i = 0; i + 1 < X.size(); i++){ + res.emplace_back(X[i].second + 1, X[i + 1].first - 1); + } + if (X.back().second != UINT32_MAX) + res.emplace_back(X.back().second + 1, UINT32_MAX); + return res; } - if (X.back().second != UINT32_MAX) - res.emplace_back(X.back().second + 1, UINT32_MAX); - return res; -} #define elA (A[i]) #define elB (B[j]) @@ -23,98 +24,99 @@ codeset_t invert_set(const codeset_t &X) { #define Aended (i == An) #define Bended (j == Bn) -codeset_t merge_sets(const codeset_t &A, const codeset_t &B) { - codeset_t res; - prepare - std::pair cur; - while (true){ - if (Aended && Bended) - break; - if (i == An){ - cur = elB; - Binc; - } else if (j == Bn){ - cur = elA; - Ainc; - } else { - if (elA.first < elB.first) { + codeset_t merge_sets(const codeset_t &A, const codeset_t &B) { + codeset_t res; + prepare + std::pair cur; + while (true){ + if (Aended && Bended) + break; + if (i == An){ + cur = elB; + Binc; + } else if (j == Bn){ cur = elA; Ainc; } else { - cur = elB; - Binc; + if (elA.first < elB.first) { + cur = elA; + Ainc; + } else { + cur = elB; + Binc; + } + } + while (true){ + if (Aended && Bended){ + res.push_back(cur); + break; + } + if (i < An && (cur.second == UINT32_MAX || elA.first <= cur.second + 1)){ + cur.second = std::max(elA.second, cur.second); + Ainc; + } else if (j < Bn && (cur.second == UINT32_MAX || elB.first <= cur.second + 1)){ + cur.second = std::max(elB.second, cur.second); + Binc; + } else { + res.push_back(cur); + break; + } } } + return res; + } + + codeset_t intersect_sets(const codeset_t &A, const codeset_t &B) { + codeset_t res; + prepare while (true){ - if (Aended && Bended){ - res.push_back(cur); + if (Aended || Bended) break; - } - if (i < An && (cur.second == UINT32_MAX || elA.first <= cur.second + 1)){ - cur.second = std::max(elA.second, cur.second); + if (elB.first <= elA.first && elA.first <= elB.second) + res.emplace_back(elA.first, std::min(elA.second, elB.second)); + else if (elA.first <= elB.first && elB.first <= elA.second) + res.emplace_back(elB.first, std::min(elA.second, elB.second)); + + if (elA.second <= elB.second) Ainc; - } else if (j < Bn && (cur.second == UINT32_MAX || elB.first <= cur.second + 1)){ - cur.second = std::max(elB.second, cur.second); + else Binc; - } else { - res.push_back(cur); - break; - } } + return res; } - return res; -} -codeset_t intersect_sets(const codeset_t &A, const codeset_t &B) { - codeset_t res; - prepare - while (true){ - if (Aended || Bended) - break; - if (elB.first <= elA.first && elA.first <= elB.second) - res.emplace_back(elA.first, std::min(elA.second, elB.second)); - else if (elA.first <= elB.first && elB.first <= elA.second) - res.emplace_back(elB.first, std::min(elA.second, elB.second)); - - if (elA.second <= elB.second) - Ainc; - else - Binc; + codeset_t subtract_sets(const codeset_t &A, const codeset_t &B) { + return intersect_sets(A, invert_set(B)); } - return res; -} -codeset_t subtract_sets(const codeset_t &A, const codeset_t &B) { - return intersect_sets(A, invert_set(B)); -} - -bool is_inside(uint32_t start, uint32_t end, codeset_t &X) { - for (auto& p: X){ - if (p.first <= start && end <= p.second) - return true; - assert(end < p.first || p.second < start); + bool is_inside(uint32_t start, uint32_t end, codeset_t &X) { + for (auto& p: X){ + if (p.first <= start && end <= p.second) + return true; + assert(end < p.first || p.second < start); + } + return false; } - return false; -} -codeset_t set_add_char(const codeset_t& X, uint32_t cp) { - return merge_sets(X, {{cp, cp}}); -} - -codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end) { - return merge_sets(X, {{start, end}}); -} - -codeset_t codeset_of_one_char(uint32_t ch) { - return codeset_t({{ch, ch}}); -} - -std::string stringifyCodesetBase10(const codeset_t& CS) { - std::string cs; - for (auto p: CS) { - if (!cs.empty()) - cs += "; "; - cs += std::to_string(p.first) + "-" + std::to_string(p.second); + codeset_t set_add_char(const codeset_t& X, uint32_t cp) { + return merge_sets(X, {{cp, cp}}); + } + + codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end) { + return merge_sets(X, {{start, end}}); + } + + codeset_t codeset_of_one_char(uint32_t ch) { + return codeset_t({{ch, ch}}); + } + + std::string stringifyCodesetBase10(const codeset_t& CS) { + std::string cs; + for (auto p: CS) { + if (!cs.empty()) + cs += "; "; + cs += std::to_string(p.first) + "-" + std::to_string(p.second); + } + return cs; } - return cs; } diff --git a/src/libregexis024fa/codeset.h b/src/libregexis024fa/codeset.h index b936589..e48aa53 100644 --- a/src/libregexis024fa/codeset.h +++ b/src/libregexis024fa/codeset.h @@ -6,22 +6,24 @@ #include #include -typedef std::vector> codeset_t; +namespace regexis024 { + typedef std::vector> codeset_t; -codeset_t invert_set(const codeset_t& X); -codeset_t merge_sets(const codeset_t& A, const codeset_t& B); -codeset_t intersect_sets(const codeset_t& A, const codeset_t& B); -codeset_t subtract_sets(const codeset_t& A, const codeset_t& B); + codeset_t invert_set(const codeset_t& X); + codeset_t merge_sets(const codeset_t& A, const codeset_t& B); + codeset_t intersect_sets(const codeset_t& A, const codeset_t& B); + codeset_t subtract_sets(const codeset_t& A, const codeset_t& B); -/* Aborts if segment in question hit the edge (unsafe function) */ -bool is_inside(uint32_t start, uint32_t end, codeset_t& X); + /* Aborts if segment in question hit the edge (unsafe function) */ + bool is_inside(uint32_t start, uint32_t end, codeset_t& X); -codeset_t set_add_char(const codeset_t& X, uint32_t cp); -codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end); + codeset_t set_add_char(const codeset_t& X, uint32_t cp); + codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end); -codeset_t codeset_of_one_char(uint32_t ch); + codeset_t codeset_of_one_char(uint32_t ch); #define codeset_of_all codeset_t({{0, UINT32_MAX}}) -std::string stringifyCodesetBase10(const codeset_t& CS); + std::string stringifyCodesetBase10(const codeset_t& CS); +} #endif //LIBREGEXIS024_CODESET_H \ No newline at end of file diff --git a/src/libregexis024fa/colored_codeset.cpp b/src/libregexis024fa/colored_codeset.cpp index 267e7a8..653f5a3 100644 --- a/src/libregexis024fa/colored_codeset.cpp +++ b/src/libregexis024fa/colored_codeset.cpp @@ -2,182 +2,184 @@ #include -ColoredCodesetSegment::ColoredCodesetSegment(uint32_t color, uint32_t right_code): color(color), right_code(right_code) {} +namespace regexis024 { + ColoredCodesetSegment::ColoredCodesetSegment(uint32_t color, uint32_t right_code): color(color), right_code(right_code) {} -ColoredCodesetSegmentList::ColoredCodesetSegmentList() { - first = new ColoredCodesetSegment(0, UINT32_MAX); -} - -void ColoredCodesetSegmentList::replace_myself(const ColoredCodesetSegmentList &other) { - assert(other.first); - ColoredCodesetSegment** in_cur = &first; - ColoredCodesetSegment* in_other = other.first; - while (in_other) { - *in_cur = new ColoredCodesetSegment(*in_other); - in_cur = &((**in_cur).next); - in_other = in_other->next; + ColoredCodesetSegmentList::ColoredCodesetSegmentList() { + first = new ColoredCodesetSegment(0, UINT32_MAX); } -} -ColoredCodesetSegmentList::ColoredCodesetSegmentList(const ColoredCodesetSegmentList &other) { - replace_myself(other); -} - -void ColoredCodesetSegmentList::free_myself() { - ColoredCodesetSegment* cur = first; - while (cur) { - ColoredCodesetSegment* nxt = cur->next; - delete cur; - cur = nxt; + void ColoredCodesetSegmentList::replace_myself(const ColoredCodesetSegmentList &other) { + assert(other.first); + ColoredCodesetSegment** in_cur = &first; + ColoredCodesetSegment* in_other = other.first; + while (in_other) { + *in_cur = new ColoredCodesetSegment(*in_other); + in_cur = &((**in_cur).next); + in_other = in_other->next; + } } -} -ColoredCodesetSegmentList::~ColoredCodesetSegmentList() { - free_myself(); -} + ColoredCodesetSegmentList::ColoredCodesetSegmentList(const ColoredCodesetSegmentList &other) { + replace_myself(other); + } -ColoredCodesetSegmentList& ColoredCodesetSegmentList::operator=(const ColoredCodesetSegmentList &other) { - free_myself(); - replace_myself(other); - return *this; -} + void ColoredCodesetSegmentList::free_myself() { + ColoredCodesetSegment* cur = first; + while (cur) { + ColoredCodesetSegment* nxt = cur->next; + delete cur; + cur = nxt; + } + } -ColoredCodeset::ColoredCodeset(uint64_t dummy_n): DummyN(dummy_n) { - requests = {{}}; -} + ColoredCodesetSegmentList::~ColoredCodesetSegmentList() { + free_myself(); + } -void ColoredCodeset::split_phase(const codeset_t &X) { + ColoredCodesetSegmentList& ColoredCodesetSegmentList::operator=(const ColoredCodesetSegmentList &other) { + free_myself(); + replace_myself(other); + return *this; + } - uint32_t cA = 0; - ColoredCodesetSegment* cur_seg = list.first; + ColoredCodeset::ColoredCodeset(uint64_t dummy_n): DummyN(dummy_n) { + requests = {{}}; + } - uint32_t pi = 0; + void ColoredCodeset::split_phase(const codeset_t &X) { - auto advance_old = [&]()->void{ - cA = cur_seg->right_code + 1; - cur_seg = cur_seg->next; - }; + uint32_t cA = 0; + ColoredCodesetSegment* cur_seg = list.first; - /* How to use: splits are made from left to right. After each split cur_seg - * points to the rightest among sub-segments of cur_segment. */ - auto SPLIT = [&](uint32_t code_before_split)->void { - assert(code_before_split < cur_seg->right_code); - ColoredCodesetSegment* new_next = new ColoredCodesetSegment(cur_seg->color, cur_seg->right_code); - new_next->divisor_on_left = true; - cur_seg->right_code = code_before_split; - new_next->next = cur_seg->next; - cur_seg->next = new_next; - advance_old(); - }; + uint32_t pi = 0; - while (cur_seg && pi < X.size()) { - uint32_t cB = cur_seg->right_code; - uint32_t L = X[pi].first, R = X[pi].second; + auto advance_old = [&]()->void{ + cA = cur_seg->right_code + 1; + cur_seg = cur_seg->next; + }; - if (L < cA) { - if (R != UINT32_MAX && R + 1 < cA) { - pi++; - } else if (R != UINT32_MAX && R + 1 == cA) { - cur_seg->divisor_on_left = true; - pi++; - } else if (R < cB) { - SPLIT(R); - pi++; - } else { - advance_old(); - } - } else if (L == cA) { - cur_seg->divisor_on_left = true; - if (R < cB) { - SPLIT(R); - pi++; - } else { - advance_old(); - } - } else if (L <= cB) { - SPLIT(L - 1); - if (R < cB) { - SPLIT(R); - pi++; - } else { - advance_old(); - } - } else { + /* How to use: splits are made from left to right. After each split cur_seg + * points to the rightest among sub-segments of cur_segment. */ + auto SPLIT = [&](uint32_t code_before_split)->void { + assert(code_before_split < cur_seg->right_code); + ColoredCodesetSegment* new_next = new ColoredCodesetSegment(cur_seg->color, cur_seg->right_code); + new_next->divisor_on_left = true; + cur_seg->right_code = code_before_split; + new_next->next = cur_seg->next; + cur_seg->next = new_next; advance_old(); - } - } -} + }; -void ColoredCodeset::apply_divisor(const codeset_t &X) { - split_phase(X); - size_t X_id = nxt_request_id++; - size_t m = requests.size(); - size_t bm = m; - std::vector skipped(bm, false); - std::vector overlapped(bm, false); - { - bool inside = false; - ColoredCodesetSegment* cur = list.first; - while (cur) { - inside = (inside != cur->divisor_on_left); - if (inside) { - overlapped[cur->color] = true; + while (cur_seg && pi < X.size()) { + uint32_t cB = cur_seg->right_code; + uint32_t L = X[pi].first, R = X[pi].second; + + if (L < cA) { + if (R != UINT32_MAX && R + 1 < cA) { + pi++; + } else if (R != UINT32_MAX && R + 1 == cA) { + cur_seg->divisor_on_left = true; + pi++; + } else if (R < cB) { + SPLIT(R); + pi++; + } else { + advance_old(); + } + } else if (L == cA) { + cur_seg->divisor_on_left = true; + if (R < cB) { + SPLIT(R); + pi++; + } else { + advance_old(); + } + } else if (L <= cB) { + SPLIT(L - 1); + if (R < cB) { + SPLIT(R); + pi++; + } else { + advance_old(); + } } else { - skipped[cur->color] = true; + advance_old(); } - cur = cur->next; } } - std::vector alt_color(bm, 0); - for (size_t i = 0; i < bm; i++) { - if (skipped[i] && overlapped[i]) { - alt_color[i] = m++; - requests.push_back(requests[i]); - if (X_id >= DummyN) - requests.back().push_back(X_id - DummyN); - } else if (overlapped[i]) { - if (X_id >= DummyN) - requests[i].push_back(X_id - DummyN); - } else - assert(skipped[i]); + + void ColoredCodeset::apply_divisor(const codeset_t &X) { + split_phase(X); + size_t X_id = nxt_request_id++; + size_t m = requests.size(); + size_t bm = m; + std::vector skipped(bm, false); + std::vector overlapped(bm, false); + { + bool inside = false; + ColoredCodesetSegment* cur = list.first; + while (cur) { + inside = (inside != cur->divisor_on_left); + if (inside) { + overlapped[cur->color] = true; + } else { + skipped[cur->color] = true; + } + cur = cur->next; + } + } + std::vector alt_color(bm, 0); + for (size_t i = 0; i < bm; i++) { + if (skipped[i] && overlapped[i]) { + alt_color[i] = m++; + requests.push_back(requests[i]); + if (X_id >= DummyN) + requests.back().push_back(X_id - DummyN); + } else if (overlapped[i]) { + if (X_id >= DummyN) + requests[i].push_back(X_id - DummyN); + } else + assert(skipped[i]); + } + { + bool inside = false; + ColoredCodesetSegment* cur = list.first; + while (cur) { + inside = (inside != cur->divisor_on_left); + cur->divisor_on_left = false; + uint32_t c = cur->color; + if (inside && skipped[c] && overlapped[c]) { + cur->color = alt_color[c]; + } + cur = cur->next; + } + } } - { - bool inside = false; + + void ColoredCodeset::get_splits_of_non_dummy(std::vector &res_input, + std::vector> &res_color_to_requests) { + size_t n = requests.size(); + std::vector nonclean_to_clean(n, -1); + res_color_to_requests = {}; + + for (size_t i = 0; i < n; i++) { + if (!requests[i].empty()) { + nonclean_to_clean[i] = res_color_to_requests.size(); + res_color_to_requests.push_back(requests[i]); + } + } + ColoredCodesetSegment* cur = list.first; + uint32_t L = 0; + res_input.assign(res_color_to_requests.size(), {}); while (cur) { - inside = (inside != cur->divisor_on_left); - cur->divisor_on_left = false; - uint32_t c = cur->color; - if (inside && skipped[c] && overlapped[c]) { - cur->color = alt_color[c]; + size_t Sc = cur->color; + if (nonclean_to_clean[Sc] >= 0) { + res_input[nonclean_to_clean[Sc]].emplace_back(L, cur->right_code); } + L = cur->right_code + 1; cur = cur->next; } } -} - -void ColoredCodeset::get_splits_of_non_dummy(std::vector &res_input, - std::vector> &res_color_to_requests) { - size_t n = requests.size(); - std::vector nonclean_to_clean(n, -1); - res_color_to_requests = {}; - - for (size_t i = 0; i < n; i++) { - if (!requests[i].empty()) { - nonclean_to_clean[i] = res_color_to_requests.size(); - res_color_to_requests.push_back(requests[i]); - } - } - - ColoredCodesetSegment* cur = list.first; - uint32_t L = 0; - res_input.assign(res_color_to_requests.size(), {}); - while (cur) { - size_t Sc = cur->color; - if (nonclean_to_clean[Sc] >= 0) { - res_input[nonclean_to_clean[Sc]].emplace_back(L, cur->right_code); - } - L = cur->right_code + 1; - cur = cur->next; - } -} +} \ No newline at end of file diff --git a/src/libregexis024fa/colored_codeset.h b/src/libregexis024fa/colored_codeset.h index 9adec03..e992f1d 100644 --- a/src/libregexis024fa/colored_codeset.h +++ b/src/libregexis024fa/colored_codeset.h @@ -7,60 +7,60 @@ #include -/* Used for determinizer. Nowhere else */ +namespace regexis024 { + /* Used for determinizer. Nowhere else */ + struct ColoredCodesetSegment { + uint32_t color; + uint32_t right_code; + ColoredCodesetSegment* next = NULL; -struct ColoredCodesetSegment { - uint32_t color; - uint32_t right_code; - ColoredCodesetSegment* next = NULL; + /* Temporary varaible (used by apply_divisor() method) */ + bool divisor_on_left = false; - /* Temporary varaible (used by apply_divisor() method) */ - bool divisor_on_left = false; + ColoredCodesetSegment(uint32_t color, uint32_t right_code); + }; - ColoredCodesetSegment(uint32_t color, uint32_t right_code); -}; + /* Warning!!! This stupid class is OOM-unsafe!!! + * This is not an issue as far as you don't show any of it's instance to the user of libregexis024 */ + struct ColoredCodesetSegmentList { + ColoredCodesetSegment* first = NULL; -/* Warning!!! This stupid class is OOM-unsafe!!! - * This is not an issue as far as you don't show any of it's instance to the user of libregexis024 */ -struct ColoredCodesetSegmentList { - ColoredCodesetSegment* first = NULL; - - ColoredCodesetSegmentList(); + ColoredCodesetSegmentList(); - void replace_myself(const ColoredCodesetSegmentList& other); + void replace_myself(const ColoredCodesetSegmentList& other); - ColoredCodesetSegmentList(const ColoredCodesetSegmentList& other); + ColoredCodesetSegmentList(const ColoredCodesetSegmentList& other); - /* Use only internally */ - void free_myself(); + /* Use only internally */ + void free_myself(); - ~ColoredCodesetSegmentList(); + ~ColoredCodesetSegmentList(); - ColoredCodesetSegmentList& operator=(const ColoredCodesetSegmentList& other); -}; + ColoredCodesetSegmentList& operator=(const ColoredCodesetSegmentList& other); + }; -/* Highly unoptimized algorithm on this data structure O(C^2) time*/ -class ColoredCodeset { - ColoredCodesetSegmentList list; - /* Size of this vector is equal to the number of colors */ - std::vector> requests; - uint64_t DummyN; - size_t nxt_request_id = 0; + /* Highly unoptimized algorithm on this data structure O(C^2) time*/ + class ColoredCodeset { + ColoredCodesetSegmentList list; + /* Size of this vector is equal to the number of colors */ + std::vector> requests; + uint64_t DummyN; + size_t nxt_request_id = 0; - void split_phase(const codeset_t& X); -public: - /* First dummy_n split requests will be viewed as 'dummy requests', when complete map of splits is requested, - * colors that are registed indide only dummy requests won't be returned. */ - ColoredCodeset(uint64_t dummy_n); + void split_phase(const codeset_t& X); + public: + /* First dummy_n split requests will be viewed as 'dummy requests', when complete map of splits is requested, + * colors that are registed indide only dummy requests won't be returned. */ + ColoredCodeset(uint64_t dummy_n); - /* O(C, which is bad, but my library's compiler is already slow by itself, so who cares) */ - void apply_divisor(const codeset_t& X); - - /* Returned 'requests' mapping will feature request id's with DummyN substituted from them */ - void get_splits_of_non_dummy(std::vector& res_input, - std::vector>& res_color_to_requests); -}; + /* O(C, which is bad, but my library's compiler is already slow by itself, so who cares) */ + void apply_divisor(const codeset_t& X); + /* Returned 'requests' mapping will feature request id's with DummyN substituted from them */ + void get_splits_of_non_dummy(std::vector& res_input, + std::vector>& res_color_to_requests); + }; +} #endif diff --git a/src/libregexis024fa/fa_first_stage_fix.cpp b/src/libregexis024fa/fa_first_stage_fix.cpp index 94d5801..4f1c235 100644 --- a/src/libregexis024fa/fa_first_stage_fix.cpp +++ b/src/libregexis024fa/fa_first_stage_fix.cpp @@ -7,189 +7,191 @@ // #include // #endif -REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa) { - assert(sourceFa.start); - REGEX_IS024_FA_FirstStageFixInfo info; +namespace regexis024 { + REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa) { + assert(sourceFa.start); + REGEX_IS024_FA_FirstStageFixInfo info; - for (size_t I_scans = 0; I_scans < sourceFa.all.size(); I_scans++){ - FA_Node* beg = sourceFa.all[I_scans]; - if (beg->type != look_one_ahead) - continue; - FA_NodeOfLookOneAhead& loa = (*(FA_NodeOfLookOneAhead*)beg); - codeset_t& restriction = loa.restriction; - assert(loa.nxt_node); + for (size_t I_scans = 0; I_scans < sourceFa.all.size(); I_scans++){ + FA_Node* beg = sourceFa.all[I_scans]; + if (beg->type != look_one_ahead) + continue; + FA_NodeOfLookOneAhead& loa = (*(FA_NodeOfLookOneAhead*)beg); + codeset_t& restriction = loa.restriction; + assert(loa.nxt_node); - struct Marked{ - FA_Node* node; - size_t refs_from_my = 1; - bool making_copy = false; - FA_Node* copy = NULL; + struct Marked{ + FA_Node* node; + size_t refs_from_my = 1; + bool making_copy = false; + FA_Node* copy = NULL; - explicit Marked(FA_Node *node) : node(node) {} - }; + explicit Marked(FA_Node *node) : node(node) {} + }; - std::vector searched; - searched.emplace_back(loa.nxt_node); - loa.nxt_node->search_mark = 0; + std::vector searched; + searched.emplace_back(loa.nxt_node); + loa.nxt_node->search_mark = 0; - for (size_t done = 0; done < searched.size(); done++){ - FA_Node& cur = *searched[done].node; - for (FA_Node** nxtN : cur.get_all_empty_valid_transitions()){ - if ((**nxtN).search_mark == -1){ - assert((**nxtN).nodeId != loa.nodeId); - (**nxtN).search_mark = (int64_t)searched.size(); - searched.emplace_back(*nxtN); - } else { - searched[(**nxtN).search_mark].refs_from_my++; + for (size_t done = 0; done < searched.size(); done++){ + FA_Node& cur = *searched[done].node; + for (FA_Node** nxtN : cur.get_all_empty_valid_transitions()){ + if ((**nxtN).search_mark == -1){ + assert((**nxtN).nodeId != loa.nodeId); + (**nxtN).search_mark = (int64_t)searched.size(); + searched.emplace_back(*nxtN); + } else { + searched[(**nxtN).search_mark].refs_from_my++; + } } } - } - std::vector s2s; - for (auto& v_sete: searched){ - if (v_sete.refs_from_my < v_sete.node->refs){ - v_sete.making_copy = true; - s2s.push_back(v_sete.node); - } - } - while (!s2s.empty()){ - FA_Node& m = *s2s.back(); s2s.pop_back(); - assert(searched[m.search_mark].making_copy); - /* Beacuse of this operation source Fa is not read-only. It becomes useless after renerating resultFa */ - searched[m.search_mark].copy = copy_fa_node(m, sourceFa); - - for (FA_Node** nxtN: m.get_all_empty_valid_transitions()){ - Marked& nxtNaux = searched[(**nxtN).search_mark]; - if (!nxtNaux.making_copy){ - nxtNaux.making_copy = true; - s2s.push_back(*nxtN); + std::vector s2s; + for (auto& v_sete: searched){ + if (v_sete.refs_from_my < v_sete.node->refs){ + v_sete.making_copy = true; + s2s.push_back(v_sete.node); } } + while (!s2s.empty()){ + FA_Node& m = *s2s.back(); s2s.pop_back(); + assert(searched[m.search_mark].making_copy); + /* Beacuse of this operation source Fa is not read-only. It becomes useless after renerating resultFa */ + searched[m.search_mark].copy = copy_fa_node(m, sourceFa); + + for (FA_Node** nxtN: m.get_all_empty_valid_transitions()){ + Marked& nxtNaux = searched[(**nxtN).search_mark]; + if (!nxtNaux.making_copy){ + nxtNaux.making_copy = true; + s2s.push_back(*nxtN); + } + } + } + + for (auto& v_sete : searched){ + FA_Node* my = v_sete.making_copy ? v_sete.copy : v_sete.node; + for (FA_Node** nxtN: my->get_all_empty_valid_transitions()){ + Marked& nxtNaux = searched[(**nxtN).search_mark]; + if (nxtNaux.making_copy) + reattach_fa_node_edge(nxtN, nxtNaux.copy); + } + my->apply_lookahead_restriction(restriction); + if (my->type == match) + info.fed_chars_extend_one_right = true; + } + { + Marked& loa_nxt_aux = searched[loa.nxt_node->search_mark]; + if (loa_nxt_aux.making_copy) + reattach_nxt_node(&loa, loa_nxt_aux.copy); + } + + for (auto& v_sete: searched) + v_sete.node->search_mark = -1; } - for (auto& v_sete : searched){ - FA_Node* my = v_sete.making_copy ? v_sete.copy : v_sete.node; - for (FA_Node** nxtN: my->get_all_empty_valid_transitions()){ - Marked& nxtNaux = searched[(**nxtN).search_mark]; - if (nxtNaux.making_copy) - reattach_fa_node_edge(nxtN, nxtNaux.copy); - } - my->apply_lookahead_restriction(restriction); - if (my->type == match) - info.fed_chars_extend_one_right = true; - } + // show_fa_with_sxiv_after_dot(sourceFa, {{}, {}}, {}); + { - Marked& loa_nxt_aux = searched[loa.nxt_node->search_mark]; - if (loa_nxt_aux.making_copy) - reattach_nxt_node(&loa, loa_nxt_aux.copy); + /* Now it's time to fill resultFa. Skipping all look one ahead's */ + auto skip_useless = [&](FA_Node* v) -> FA_Node* { + while (v->type == look_one_ahead){ + v = ((FA_NodeOfLookOneAhead*)v)->nxt_node; + } + return v; + }; + + resultFa.start = sourceFa.start; + std::vector homework = {&(resultFa.start)}; + std::vector sourceIdToResNode(sourceFa.all.size(), NULL); + + while (!homework.empty()) { + FA_Node** vPtr = homework.back(); homework.pop_back(); + FA_Node* right_source_v = skip_useless(*vPtr); + size_t vid = right_source_v->nodeId; + if (!sourceIdToResNode[vid]) { + sourceIdToResNode[vid] = copy_fa_node_to_another_fa(*right_source_v, resultFa); + for (FA_Node** uuPtr: sourceIdToResNode[vid]->get_all_transitions()) + homework.push_back(uuPtr); + } + *vPtr = sourceIdToResNode[vid]; + sourceIdToResNode[vid]->refs++; + } } - for (auto& v_sete: searched) - v_sete.node->search_mark = -1; + + { + /* Guessing info.fed_chars_extend_one_left */ + size_t done = 0; + std::vector searched; + searched.push_back(resultFa.start); + resultFa.start->search_mark = 0; + while (done < searched.size()){ + if (searched[done]->type == look_one_behind){ + info.fed_chars_extend_one_left = true; + break; + } + for (FA_Node** nxtN: searched[done]->get_all_empty_valid_transitions()){ + if ((**nxtN).search_mark < 0){ + (**nxtN).search_mark = 0; + searched.push_back(*nxtN); + } + } + done++; + } + for (FA_Node* d: searched) + d->search_mark = -1; + } + return info; } - // show_fa_with_sxiv_after_dot(sourceFa, {{}, {}}, {}); + FA_NodeOfOneCharRead* generate_alt_ending(const codeset_t& restriction, FA_Container& fa){ + FA_NodeOfOneCharRead* n1 = fa.makeOneCharRead(restriction, true); + FA_NodeOfMatch* n2 = fa.makeMatch(); + n2->ext_filter_added = true; // Won't actually be used + reattach_fa_node_edge(&(n1->nxt_node), n2); + return n1; + } + void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa, + const REGEX_IS024_FA_FirstStageFixInfo &info1) { - /* Now it's time to fill resultFa. Skipping all look one ahead's */ - auto skip_useless = [&](FA_Node* v) -> FA_Node* { - while (v->type == look_one_ahead){ - v = ((FA_NodeOfLookOneAhead*)v)->nxt_node; - } - return v; - }; - + assert(resultFa.all.empty() && !resultFa.start); + if (!sourceFa.start) + return; resultFa.start = sourceFa.start; + // A vector of pointers in resutFa to nodes that belong to sourceFa. They should undergo a little bit of copying. std::vector homework = {&(resultFa.start)}; + // source node id s index. Element is NULL if no copy (in resultFa) exists and resFa node if copying was performed std::vector sourceIdToResNode(sourceFa.all.size(), NULL); - while (!homework.empty()) { FA_Node** vPtr = homework.back(); homework.pop_back(); - FA_Node* right_source_v = skip_useless(*vPtr); - size_t vid = right_source_v->nodeId; - if (!sourceIdToResNode[vid]) { - sourceIdToResNode[vid] = copy_fa_node_to_another_fa(*right_source_v, resultFa); - for (FA_Node** uuPtr: sourceIdToResNode[vid]->get_all_transitions()) - homework.push_back(uuPtr); - } - *vPtr = sourceIdToResNode[vid]; - sourceIdToResNode[vid]->refs++; - } - } - - - { - /* Guessing info.fed_chars_extend_one_left */ - size_t done = 0; - std::vector searched; - searched.push_back(resultFa.start); - resultFa.start->search_mark = 0; - while (done < searched.size()){ - if (searched[done]->type == look_one_behind){ - info.fed_chars_extend_one_left = true; - break; - } - for (FA_Node** nxtN: searched[done]->get_all_empty_valid_transitions()){ - if ((**nxtN).search_mark < 0){ - (**nxtN).search_mark = 0; - searched.push_back(*nxtN); - } - } - done++; - } - for (FA_Node* d: searched) - d->search_mark = -1; - } - return info; -} - -FA_NodeOfOneCharRead* generate_alt_ending(const codeset_t& restriction, FA_Container& fa){ - FA_NodeOfOneCharRead* n1 = fa.makeOneCharRead(restriction, true); - FA_NodeOfMatch* n2 = fa.makeMatch(); - n2->ext_filter_added = true; // Won't actually be used - reattach_fa_node_edge(&(n1->nxt_node), n2); - return n1; -} - -void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa, - const REGEX_IS024_FA_FirstStageFixInfo &info1) -{ - assert(resultFa.all.empty() && !resultFa.start); - if (!sourceFa.start) - return; - resultFa.start = sourceFa.start; - // A vector of pointers in resutFa to nodes that belong to sourceFa. They should undergo a little bit of copying. - std::vector homework = {&(resultFa.start)}; - // source node id s index. Element is NULL if no copy (in resultFa) exists and resFa node if copying was performed - std::vector sourceIdToResNode(sourceFa.all.size(), NULL); - while (!homework.empty()) { - FA_Node** vPtr = homework.back(); homework.pop_back(); - FA_Node* sourceV = *vPtr; assert(sourceV); - size_t sourceVId = sourceV->nodeId; - if (!sourceIdToResNode[sourceVId]) { - if (sourceV->type == match) { - FA_NodeOfMatch& mn = dynamic_cast(*sourceV); - FA_NodeOfMatch* res_mn = resultFa.makeMatch(); - if (info1.fed_chars_extend_one_right) { - FA_NodeOfOneCharRead* res_ocr2n = resultFa.makeOneCharRead( - mn.ext_filter_added ? mn.pending_filter : codeset_of_all, true); - reattach_nxt_node(res_ocr2n, res_mn); - sourceIdToResNode[sourceVId] = res_ocr2n; + FA_Node* sourceV = *vPtr; assert(sourceV); + size_t sourceVId = sourceV->nodeId; + if (!sourceIdToResNode[sourceVId]) { + if (sourceV->type == match) { + FA_NodeOfMatch& mn = dynamic_cast(*sourceV); + FA_NodeOfMatch* res_mn = resultFa.makeMatch(); + if (info1.fed_chars_extend_one_right) { + FA_NodeOfOneCharRead* res_ocr2n = resultFa.makeOneCharRead( + mn.ext_filter_added ? mn.pending_filter : codeset_of_all, true); + reattach_nxt_node(res_ocr2n, res_mn); + sourceIdToResNode[sourceVId] = res_ocr2n; + } else { + sourceIdToResNode[sourceVId] = res_mn; + } } else { - sourceIdToResNode[sourceVId] = res_mn; + sourceIdToResNode[sourceVId] = copy_fa_node_to_another_fa(*sourceV, resultFa); + /* O_o */ + for (FA_Node** uuPtr: sourceIdToResNode[sourceVId]->get_all_transitions()) + homework.push_back(uuPtr); } - } else { - sourceIdToResNode[sourceVId] = copy_fa_node_to_another_fa(*sourceV, resultFa); - /* O_o */ - for (FA_Node** uuPtr: sourceIdToResNode[sourceVId]->get_all_transitions()) - homework.push_back(uuPtr); } + *vPtr = sourceIdToResNode[sourceVId]; + sourceIdToResNode[sourceVId]->refs++; } - *vPtr = sourceIdToResNode[sourceVId]; - sourceIdToResNode[sourceVId]->refs++; - } - if (info1.fed_chars_extend_one_left) { - FA_NodeOfOneCharRead* ns = resultFa.makeOneCharRead(codeset_of_all, true); - yay_new_start(resultFa, ns); + if (info1.fed_chars_extend_one_left) { + FA_NodeOfOneCharRead* ns = resultFa.makeOneCharRead(codeset_of_all, true); + yay_new_start(resultFa, ns); + } } } diff --git a/src/libregexis024fa/fa_first_stage_fix.h b/src/libregexis024fa/fa_first_stage_fix.h index 0be0a4d..1c77e98 100644 --- a/src/libregexis024fa/fa_first_stage_fix.h +++ b/src/libregexis024fa/fa_first_stage_fix.h @@ -3,16 +3,18 @@ #include "finite_automaton.h" -struct REGEX_IS024_FA_FirstStageFixInfo{ - bool fed_chars_extend_one_left = false; - bool fed_chars_extend_one_right = false; -}; +namespace regexis024 { + struct REGEX_IS024_FA_FirstStageFixInfo{ + bool fed_chars_extend_one_left = false; + bool fed_chars_extend_one_right = false; + }; -/* Will look for look_one_ahead nodes and apply their filter to reading filters ahead * - * sourceFa will be ruined. The output will be in resultFa */ -REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa); + /* Will look for look_one_ahead nodes and apply their filter to reading filters ahead * + * sourceFa will be ruined. The output will be in resultFa */ + REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa); -void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa, - const REGEX_IS024_FA_FirstStageFixInfo &info1); + void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa, + const REGEX_IS024_FA_FirstStageFixInfo &info1); +} #endif //LIBREGEXIS024_FA_FIRST_STAGE_FIX_H diff --git a/src/libregexis024fa/fa_make_deterministic.cpp b/src/libregexis024fa/fa_make_deterministic.cpp index 9f86c32..930fb0e 100644 --- a/src/libregexis024fa/fa_make_deterministic.cpp +++ b/src/libregexis024fa/fa_make_deterministic.cpp @@ -1,6 +1,5 @@ #include #include -#include /* to get exitf */ #include #include #include @@ -17,649 +16,649 @@ #define PR_DEB #endif -/* debug nonsence */ -void input_fa_assert(const FA_Container& fa){ - assert(fa.start); - for (FA_Node* node: fa.all){ - if (node->type == one_char_read){ - assert(!dynamic_cast(node)->second_ns); - } else if (node->type == look_one_ahead || - node->type == det_char_crossroads){ - exitf("not allowed at this stage\n"); +namespace regexis024 { + /* debug nonsence */ + void input_fa_assert(const FA_Container& fa){ + assert(fa.start); + for (FA_Node* node: fa.all){ + if (node->type == one_char_read){ + assert(!dynamic_cast(node)->second_ns); + } else if (node->type == look_one_ahead || node->type == det_char_crossroads) { + assert(false); + } } } -} -struct OperHistoryNodeTransition { - TrackingOperationInFa op; - size_t u; + struct OperHistoryNodeTransition { + TrackingOperationInFa op; + size_t u; - OperHistoryNodeTransition(const TrackingOperationInFa &op, size_t u): op(op), u(u) {} -}; + OperHistoryNodeTransition(const TrackingOperationInFa &op, size_t u): op(op), u(u) {} + }; -struct OperHistoryNode { - std::vector next; - /* When it is part of clean history, this */ - std::vector compressed_selarr; - std::vector raisin; + struct OperHistoryNode { + std::vector next; + /* When it is part of clean history, this */ + std::vector compressed_selarr; + std::vector raisin; - OperHistoryNode() = default; -}; + OperHistoryNode() = default; + }; -/* This object can describe an empty superstate (needed to describe clean history nodes without raisin) - * If det_stops is empty, interpret it as empty superstate */ -struct SuperState { - std::vector sorted_raisin; - std::vector double_compressed_selarr; + /* This object can describe an empty superstate (needed to describe clean history nodes without raisin) + * If det_stops is empty, interpret it as empty superstate */ + struct SuperState { + std::vector sorted_raisin; + std::vector double_compressed_selarr; - bool empty() const { - return sorted_raisin.empty(); - } + bool empty() const { + return sorted_raisin.empty(); + } #ifdef PR_DEB - std::string toString() const { - std::string f1_raisin; - for (uint64_t el: sorted_raisin) { - if (!f1_raisin.empty()) - f1_raisin += ", "; - f1_raisin += std::to_string(el); - } - std::string f2_selarr; - for (uint64_t el: double_compressed_selarr) { - if (!f2_selarr.empty()) - f2_selarr += ", "; - f2_selarr += std::to_string(el); - } - - return "sorted_raisin: {" + f1_raisin + "}, double_comp_selarr: {" + f2_selarr + "}"; - } -#endif -}; - -struct CleanOperHistoryNode { - std::vector next; - SuperState exit; -}; - -struct SelarrCompressionScheme { - size_t SN1, SN2 = 0, SN3 = 0; - std::vector S1_to_S2; - std::vector S2_to_sifter; - std::vector S3_to_sifter; - const RegexPriorityTable& sifter; - - SelarrCompressionScheme(size_t sn1, const RegexPriorityTable &sifter) : SN1(sn1), sifter(sifter) { - assert(sifter.size() <= UINT32_MAX); - S1_to_S2.assign(SN1, -1); - for (regex_tai_t i = 0; i < sifter.size(); i++) { - auto& act = sifter[i].pos; - regex_tai_t first_on_s2 = S2_to_sifter.size(); - S2_to_sifter.push_back(i); - S1_to_S2[act.first] = first_on_s2; - if (act.type != tracking_var_types::dot_cur_pos) { - S3_to_sifter.push_back(i); + std::string toString() const { + std::string f1_raisin; + for (uint64_t el: sorted_raisin) { + if (!f1_raisin.empty()) + f1_raisin += ", "; + f1_raisin += std::to_string(el); } - if (act.type == tracking_var_types::range) { - regex_tai_t second_on_s2 = S2_to_sifter.size(); + std::string f2_selarr; + for (uint64_t el: double_compressed_selarr) { + if (!f2_selarr.empty()) + f2_selarr += ", "; + f2_selarr += std::to_string(el); + } + + return "sorted_raisin: {" + f1_raisin + "}, double_comp_selarr: {" + f2_selarr + "}"; + } +#endif + }; + + struct CleanOperHistoryNode { + std::vector next; + SuperState exit; + }; + + struct SelarrCompressionScheme { + size_t SN1, SN2 = 0, SN3 = 0; + std::vector S1_to_S2; + std::vector S2_to_sifter; + std::vector S3_to_sifter; + const RegexPriorityTable& sifter; + + SelarrCompressionScheme(size_t sn1, const RegexPriorityTable &sifter) : SN1(sn1), sifter(sifter) { + assert(sifter.size() <= UINT32_MAX); + S1_to_S2.assign(SN1, -1); + for (tai_t i = 0; i < sifter.size(); i++) { + auto& act = sifter[i].pos; + tai_t first_on_s2 = S2_to_sifter.size(); S2_to_sifter.push_back(i); - S1_to_S2[act.second] = second_on_s2; + S1_to_S2[act.first] = first_on_s2; + if (act.type != tracking_var_types::dot_cur_pos) { + S3_to_sifter.push_back(i); + } + if (act.type == tracking_var_types::range) { + tai_t second_on_s2 = S2_to_sifter.size(); + S2_to_sifter.push_back(i); + S1_to_S2[act.second] = second_on_s2; + } } + SN2 = S2_to_sifter.size(); + SN3 = S3_to_sifter.size(); + assert(SN3 <= SN2 && SN2 <= SN1 && SN1 <= UINT16_MAX); + } - SN2 = S2_to_sifter.size(); - SN3 = S3_to_sifter.size(); - assert(SN3 <= SN2 && SN2 <= SN1 && SN1 <= UINT16_MAX); - - } -}; - -std::vector compress_compressed_selarr(const std::vector& S2, - const SelarrCompressionScheme& cmp) { - std::vector S3(cmp.SN3); - for (size_t i = 0; i < cmp.SN3; i++) { - const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos; - if (act.type == tracking_var_types::dot_immediate) { - S3[i] = S2[cmp.S1_to_S2[act.first]]; - } else { - assert(act.type == tracking_var_types::range); // It must be range type - uint64_t onFirstBorder = S2[cmp.S1_to_S2[act.first]]; - uint64_t onSecondBorder = S2[cmp.S1_to_S2[act.second]]; - S3[i] = (onFirstBorder > onSecondBorder) ? 1 : 0; - } - } - return S3; -} - -bool compressed_selarr_A_outranks_B(const std::vector& A, const std::vector& B, - const SelarrCompressionScheme& cmp) { - for (const RegexPriorityTableAction& act: cmp.sifter) { - uint64_t valA = A[cmp.S1_to_S2[act.pos.first]]; - uint64_t valB = B[cmp.S1_to_S2[act.pos.first]]; - if (act.pos.type == tracking_var_types::range) { - uint64_t valAsecond = A[cmp.S1_to_S2[act.pos.second]]; - uint64_t valBsecond = A[cmp.S1_to_S2[act.pos.second]]; - valA = valAsecond > valA ? valAsecond - valA : 0; - valB = valBsecond > valB ? valBsecond - valB : 0; - } - if (valA == valB) - continue; - return (valA < valB) == act.minimize; - } - return false; -} - -/* Beacuse of the way wash_history_bush builds this structure, root is te last node. - * rankdir is from left to right (guaranteed). Can be empty if original history contained no raisin */ -struct RaisinBush { - std::vector clean_history; - ssize_t start = -1; - - bool empty() const { - return start < 0; - } - -#ifdef PR_DEB - void print() { - lines text; - text.push_back("Raisin bush"); - if (start >= 0) { - size_t n = clean_history.size(); - std::vector m(n, false); - TreeWithStringsNode e{""}; - std::function dfs = [&] - (TreeWithStringsNode& fill, size_t nodeId) - { - if (m[nodeId]) { - fill.val = "PARADOX"; - return; - } - m[nodeId] = true; - const CleanOperHistoryNode& node = clean_history[nodeId]; - fill.val = "[" + std::to_string(nodeId) + "]"; - if (!node.exit.empty()) - fill.val += (" EXIT: " + node.exit.toString()); - size_t CN = node.next.size(); - fill.childeren.resize(CN); - for (size_t i = 0; i < CN; i++) { - fill.childeren[i].val = node.next[i].op.toString(); - fill.childeren[i].childeren = {{}}; - dfs(fill.childeren[i].childeren[0], node.next[i].u); - } - }; - dfs(e, start); - size_t am = 0; - for (bool el: m) - am += static_cast(el); - if (am < n) - text[0] += ": " + std::to_string(n - am) + " nodes are unreachable by detour"; - e.toLines(text); - } else { - if (clean_history.empty()) - text[0] = "Empty Raisin Bush"; - else - text [0] = "Raisin bush with no root and " + std::to_string(clean_history.size()) = " nodes missed"; - } - printLines(wrapWithBox(text)); - } -#endif -}; - -void wash_history_bush(const std::vector& history, RaisinBush& answer, - const SelarrCompressionScheme& cmp) { - assert(!history.empty()); - std::vector has_raisin(history.size()); - std::vector dirty_to_clean(history.size(), -1); - std::vector > callStack = {{0, 0}}; - - auto hist_clean_detour_init_clean = [&](uint64_t v) -> uint64_t { - if (!has_raisin[v]) { - has_raisin[v] = true; - dirty_to_clean[v] = answer.clean_history.size(); - answer.clean_history.emplace_back(); - } - return dirty_to_clean[v]; }; - while (!callStack.empty()) { - size_t v = callStack.back().first; - size_t od = callStack.back().second; - if (od == 0) { - if (!history[v].raisin.empty()) { - size_t cleanVId = hist_clean_detour_init_clean(v); - std::vector& sr = answer.clean_history[cleanVId].exit.sorted_raisin; - sr = history[v].raisin; - std::sort(sr.begin(), sr.end()); - answer.clean_history[cleanVId].exit.double_compressed_selarr = compress_compressed_selarr(history[v].compressed_selarr, cmp); - } - } else { - const OperHistoryNodeTransition& old_hist_tr = history[v].next[od - 1]; - uint64_t ou = old_hist_tr.u; - if (has_raisin[ou]) { - size_t cleanVId = hist_clean_detour_init_clean(v); - answer.clean_history[cleanVId].next.emplace_back(old_hist_tr.op, dirty_to_clean[ou]); + std::vector compress_compressed_selarr(const std::vector& S2, + const SelarrCompressionScheme& cmp) { + std::vector S3(cmp.SN3); + for (size_t i = 0; i < cmp.SN3; i++) { + const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos; + if (act.type == tracking_var_types::dot_immediate) { + S3[i] = S2[cmp.S1_to_S2[act.first]]; + } else { + assert(act.type == tracking_var_types::range); // It must be range type + uint64_t onFirstBorder = S2[cmp.S1_to_S2[act.first]]; + uint64_t onSecondBorder = S2[cmp.S1_to_S2[act.second]]; + S3[i] = (onFirstBorder > onSecondBorder) ? 1 : 0; } } - - if (od == history[v].next.size()) { - callStack.pop_back(); - } else { - callStack.back().second++; - callStack.emplace_back(history[v].next[od].u, 0); - } + return S3; } - if (has_raisin[0]) { - assert(dirty_to_clean[0] >= 0); - answer.start = dirty_to_clean[0]; - } - -} - -/* If is_it_after_read is false, unknown selarr range variable border and cur pos are evaluated to 0. - * Otherwise, cur pos considered to be greater than previous values of selarr ange variable boundaries */ -void building_detour(const SelarrCompressionScheme& cmp, - const std::vector& outer_selarr, const std::vector& zeroeps, const codeset_t& I, - RaisinBush& answer, bool is_it_after_read) -{ -#ifdef PR_DEB - printf("Det Debug: build_detour started with zeroeps:{"); - for (FA_Node* node: zeroeps) - printf("%lu,", node->nodeId); - printf("}, I: {%s}\n", stringifyCodesetBase10(I).c_str()); -#endif - assert(cmp.SN3 == outer_selarr.size()); - if (!is_it_after_read) - for (uint64_t val: outer_selarr) - assert(val == 0); - - struct SearchMark { - FA_Node* domain_node; - uint64_t epsilon_refs = 0; - uint64_t detour_sat = 0; - /* id of corresponding history node */ - size_t Hv = 0; - - explicit SearchMark(FA_Node *domain_node) : domain_node(domain_node) {} - }; - - /* Default values are good for me */ - std::vector marks; - for (size_t i = 0; i < zeroeps.size(); i++) { - marks.emplace_back(zeroeps[i]); - zeroeps[i]->search_mark = i; - } - - auto lob_allows_to_pass = [&](FA_NodeOfLookOneBehind* lob) -> bool { - if (!intersect_sets(lob->filter, I).empty()) { - assert(merge_sets(lob->filter, I) == lob->filter); - return true; + bool compressed_selarr_A_outranks_B(const std::vector& A, const std::vector& B, + const SelarrCompressionScheme& cmp) { + for (const RegexPriorityTableAction& act: cmp.sifter) { + uint64_t valA = A[cmp.S1_to_S2[act.pos.first]]; + uint64_t valB = B[cmp.S1_to_S2[act.pos.first]]; + if (act.pos.type == tracking_var_types::range) { + uint64_t valAsecond = A[cmp.S1_to_S2[act.pos.second]]; + uint64_t valBsecond = A[cmp.S1_to_S2[act.pos.second]]; + valA = valAsecond > valA ? valAsecond - valA : 0; + valB = valBsecond > valB ? valBsecond - valB : 0; + } + if (valA == valB) + continue; + return (valA < valB) == act.minimize; } return false; - }; - - { /* First i need to know exacly how many of MINE epsilon transitions are referencing each NODE */ - std::vector domain_detour = zeroeps; - while (!domain_detour.empty()) { - FA_Node* v = domain_detour.back(); domain_detour.pop_back(); - if (v->type == look_one_behind && !lob_allows_to_pass(dynamic_cast(v))) - continue; - for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) { - assert(*uPtr); - int64_t &rds = (**uPtr).search_mark; - if (rds == -1) { - rds = marks.size(); - domain_detour.push_back(*uPtr); - marks.emplace_back(*uPtr); - } - marks[rds].epsilon_refs++; - } - } } - std::vector history = {OperHistoryNode()}; - history[0].compressed_selarr.assign(cmp.SN2, 0); - for (size_t i = 0; i < cmp.SN3; i++) { - const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos; - if (act.type == tracking_var_types::range) { - if (outer_selarr[i]) { - history[0].compressed_selarr[cmp.S1_to_S2[act.second]] = 1; - } - } else { - assert(act.type == tracking_var_types::dot_immediate); - history[0].compressed_selarr[cmp.S1_to_S2[act.first]] = outer_selarr[i]; + + /* Beacuse of the way wash_history_bush builds this structure, root is te last node. + * rankdir is from left to right (guaranteed). Can be empty if original history contained no raisin */ + struct RaisinBush { + std::vector clean_history; + ssize_t start = -1; + + bool empty() const { + return start < 0; } - } - /* As a result, dot_cur_pos variables will be initialized as zero (always) */ - /* In my second detour, I will pass each vertex here only one time: after hitting the total epsilon refcount */ - std::vector can_process = zeroeps; - /* - auto increase_sat_refcount = [&](SearchMark& mark) { - mark.detour_sat++; - if (mark.detour_sat == mark.epsilon_refs && mark.ever_walked_in) { - can_process.push_back(mark.domain_node); - } - }; - */ - - auto add_history_update = [&](TrackingOperationInFa how, uint64_t where, uint64_t from_where) { - history[from_where].next.emplace_back(how, where); - }; - - while (!can_process.empty()) { - FA_Node* v = can_process.back(); can_process.pop_back(); - SearchMark& Vmark = marks[v->search_mark]; - assert(Vmark.detour_sat == Vmark.epsilon_refs); - uint64_t Hv = Vmark.Hv; - uint64_t Hop = Hv; - if (v->type == look_one_behind) { - FA_NodeOfLookOneBehind* tv = dynamic_cast(v); - if (!lob_allows_to_pass(tv)) - continue; - } else if (isTrackingFaNode(v)) { - Hop = history.size(); - history.emplace_back(); - std::vector& val2 = history.back().compressed_selarr; - val2 = history[Hv].compressed_selarr; - if (v->type == track_array_mov_imm) { - FA_NodeOfTrackArrayMovImm* tv = dynamic_cast(v); - if (isSelarrOpcode(tv->operation)) { - int key_s2 = cmp.S1_to_S2[tv->key]; - if (key_s2 >= 0){ - assert(cmp.sifter[cmp.S2_to_sifter[key_s2]].pos.type == tracking_var_types::dot_immediate); - val2[key_s2] = tv->imm_value; +#ifdef PR_DEB + void print() { + lines text; + text.push_back("Raisin bush"); + if (start >= 0) { + size_t n = clean_history.size(); + std::vector m(n, false); + TreeWithStringsNode e{""}; + std::function dfs = [&] + (TreeWithStringsNode& fill, size_t nodeId) + { + if (m[nodeId]) { + fill.val = "PARADOX"; + return; } - } - add_history_update(TrackingOperationInFa(tv->operation, tv->key, tv->imm_value), Hop, Hv); - } else if (v->type == track_array_mov_halfinvariant) { - FA_NodeOfTrackArrayMovHalfinvariant* tv = dynamic_cast(v); - if (isSelarrOpcode(tv->operation)) { - int key_s2 = cmp.S1_to_S2[tv->key]; - if (key_s2 >= 0){ - const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S2_to_sifter[key_s2]].pos; - assert(act.type != tracking_var_types::dot_immediate); - if (act.type == tracking_var_types::dot_cur_pos) { - val2[key_s2] = is_it_after_read ? 1 : 0; - } else { - val2[key_s2] = is_it_after_read ? 2 : 0; - } + m[nodeId] = true; + const CleanOperHistoryNode& node = clean_history[nodeId]; + fill.val = "[" + std::to_string(nodeId) + "]"; + if (!node.exit.empty()) + fill.val += (" EXIT: " + node.exit.toString()); + size_t CN = node.next.size(); + fill.childeren.resize(CN); + for (size_t i = 0; i < CN; i++) { + fill.childeren[i].val = node.next[i].op.toString(); + fill.childeren[i].childeren = {{}}; + dfs(fill.childeren[i].childeren[0], node.next[i].u); } - } - add_history_update(TrackingOperationInFa(tv->operation, tv->key), Hop, Hv); + }; + dfs(e, start); + size_t am = 0; + for (bool el: m) + am += static_cast(el); + if (am < n) + text[0] += ": " + std::to_string(n - am) + " nodes are unreachable by detour"; + e.toLines(text); + } else { + if (clean_history.empty()) + text[0] = "Empty Raisin Bush"; + else + text [0] = "Raisin bush with no root and " + std::to_string(clean_history.size()) = " nodes missed"; } - } else if (v->type == match || v->type == one_char_read) { - // Determinization stop - history[Hv].raisin.push_back(v->nodeId); + printLines(wrapWithBox(text)); } - for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) { - assert(*uPtr); - SearchMark& Umark = marks[(**uPtr).search_mark]; - /* Here I use Hop to determine Hv value of u */ - if (Umark.detour_sat == 0) { - Umark.Hv = Hop; - } else if (Umark.Hv != Hop) { - if (compressed_selarr_A_outranks_B( - history[Hop].compressed_selarr, history[Umark.Hv].compressed_selarr, cmp)){ - Umark.Hv = Hop; - } +#endif + }; + + void wash_history_bush(const std::vector& history, RaisinBush& answer, + const SelarrCompressionScheme& cmp) { + assert(!history.empty()); + std::vector has_raisin(history.size()); + std::vector dirty_to_clean(history.size(), -1); + std::vector > callStack = {{0, 0}}; + + auto hist_clean_detour_init_clean = [&](uint64_t v) -> uint64_t { + if (!has_raisin[v]) { + has_raisin[v] = true; + dirty_to_clean[v] = answer.clean_history.size(); + answer.clean_history.emplace_back(); } - /* Collision calculation finished */ - Umark.detour_sat++; - if (Umark.detour_sat == Umark.epsilon_refs) { - can_process.push_back(Umark.domain_node); - } - } - } - /* Cleaning this mess */ - for (auto& m: marks) { - m.domain_node->search_mark = -1; - } - /* Packaging the answer (we do a little bit of dfs here) */ - wash_history_bush(history, answer, cmp); -} - -void update_had_to_fork_status(const RaisinBush& bush, int& had_to_fork) { - for (const CleanOperHistoryNode& node: bush.clean_history) { - if (node.next.size() > 1 || (!node.next.empty() && !node.exit.empty())) { - had_to_fork = 1; - return; - } - } -} - -typedef size_t superstate_id_t; - -typedef std::vector> homework_t; - -struct LessSuperState { - bool operator()(const SuperState& A, const SuperState& B) const { - std::less> f1L; - if (f1L(A.sorted_raisin, B.sorted_raisin)) - return true; - if (f1L(B.sorted_raisin, A.sorted_raisin)) - return false; - return f1L(A.double_compressed_selarr, B.double_compressed_selarr); - } -}; - -struct GlobalDetourProgress { - std::map superstates; - /* Each element is a root of some megabush in resFa */ - std::vector superstate_megabush_constructed; - std::vector todo_superstaes; -}; - -/* If x was not previously achieved, it will also add it to t o d o list of global detour */ -superstate_id_t convertSuperstateToId(const SuperState& x, GlobalDetourProgress& gdp) { - if (gdp.superstates.count(x)) { - return gdp.superstates[x]; - } - size_t n = gdp.superstates.size(); - gdp.superstates.insert({x, n}); - gdp.todo_superstaes.push_back(x); - gdp.superstate_megabush_constructed.push_back(NULL); - return n; -} - -FA_Node* build_dead_end(FA_Container& resFa) { - return resFa.makeForking(); -} - -void build_bush(const RaisinBush& alpha, FA_Node** sowing_location, FA_Container& resFa, - homework_t& homework, GlobalDetourProgress& gdp) { - size_t n = alpha.clean_history.size(); - if (n == 0) { - FA_Node* dead_end = build_dead_end(resFa); - reattach_fa_node_edge(sowing_location, dead_end); - return; - } - std::vector> todo = {{sowing_location, alpha.start}}; - - while (!todo.empty()) { - FA_Node** sl = todo.back().first; - const CleanOperHistoryNode& hnode = alpha.clean_history[todo.back().second]; - todo.pop_back(); - auto history_transition = [&](size_t i, FA_Node** of_sl) { - FA_NodePathPart* pn = convert_to_node(hnode.next[i].op, resFa); - reattach_fa_node_edge(of_sl, pn); - todo.emplace_back(&(pn->nxt_node), hnode.next[i].u); + return dirty_to_clean[v]; }; - if (hnode.next.empty()) { - assert(!hnode.exit.empty()); - superstate_id_t w = convertSuperstateToId(hnode.exit, gdp); - homework.emplace_back(sl, w); - } else if (hnode.next.size() == 1 && hnode.exit.empty()) { - history_transition(0, sl); - } else { - FA_NodeOfForking* forker = resFa.makeForking(); - bool raisin = !hnode.exit.empty(); - size_t k = hnode.next.size(); - forker->nxt_options.assign(k + static_cast(raisin), NULL); - for (size_t i = 0; i < k; i++) { - history_transition(i, &(forker->nxt_options[i])); - } - if (raisin) { - superstate_id_t w = convertSuperstateToId(hnode.exit, gdp); - homework.emplace_back(&(forker->nxt_options[k]), w); - } - reattach_fa_node_edge(sl, forker); - } - } -} - -ColoredCodeset get_pretreated_cc(FA_Container& sourceFa) { - std::set little_insects; - for (FA_Node* v: sourceFa.all) { - if (v->type == look_one_behind) { - little_insects.insert(static_cast(v)->filter); - } - } - ColoredCodeset pretreated_cc(little_insects.size()); - for (const codeset_t& cs: little_insects) { - pretreated_cc.apply_divisor(cs); - } - return pretreated_cc; -} - -// todo add a check on size of dfa -void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, regex_tai_t selarr_sz, - const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork) -{ - /* During execuion, i will create pointers to field res.start and store them (inside the scope of this function) - * Luckily res argument is already immovable in this scope. */ - error = 0; - had_to_fork = 0; - assert(resFa.start == NULL && resFa.all.empty()); - input_fa_assert(sourceFa); - SelarrCompressionScheme cmp(selarr_sz, sifter); - - GlobalDetourProgress gdp; - homework_t homework; - - ColoredCodeset pretreated_cc = get_pretreated_cc(sourceFa); - - FA_Node** res_start_ptr = &(resFa.start); - if (info1.fed_chars_extend_one_left) { - ColoredCodeset inp_distinction = pretreated_cc; - inp_distinction.apply_divisor(codeset_of_all); - std::vector starting_Is; - std::vector> starting_Cids; /* Filler variable */ - inp_distinction.get_splits_of_non_dummy(starting_Is, starting_Cids); - size_t R = starting_Is.size(); - for (auto& rdh: starting_Cids) { - assert(rdh.size() == 1 && rdh[0] == 0); - } - FA_NodeOfDetCharCrossroads* very_first_cr = resFa.makeDetCharCrossroads(); - very_first_cr->second_ns = true; - reattach_fa_node_edge(res_start_ptr, very_first_cr); - very_first_cr->crossroads.resize(R); /* After that, nobody has right to resize crossroads array */ - for (size_t i = 0; i < R; i++) { - very_first_cr->crossroads[i].input = starting_Is[i]; - FA_Node** sowing_place = &(very_first_cr->crossroads[i].nxt_node); - RaisinBush alpha; - building_detour(cmp, std::vector(cmp.SN3, 0), {sourceFa.start}, starting_Is[i], alpha, false); -#ifdef PR_DEB - printf("Initialization hard %ld/%ld\n", i + 1, R); - alpha.print(); -#endif - update_had_to_fork_status(alpha, had_to_fork); - build_bush(alpha, sowing_place, resFa, homework, gdp); - } - } else { - RaisinBush alpha; - building_detour(cmp, std::vector(cmp.SN3, 0), {sourceFa.start}, codeset_of_all, alpha, false); -#ifdef PR_DEB - printf("Initialization easy\n"); - alpha.print(); -#endif - update_had_to_fork_status(alpha, had_to_fork); - build_bush(alpha, res_start_ptr, resFa, homework, gdp); - } - /* Now we start the actual detour. */ - while (!gdp.todo_superstaes.empty()) { - SuperState SS = gdp.todo_superstaes.back(); gdp.todo_superstaes.pop_back(); - // printf("Global detour turn: %s\n", SS.toString().c_str()); - std::vector reading_stops; - codeset_t how_can_i_finish = {}; - for (size_t v: SS.sorted_raisin) { - FA_Node* node = sourceFa.all[v]; - if (node->type == one_char_read) { - reading_stops.push_back(static_cast(node)); - } else if (node->type == match) { - auto fn = static_cast(node); - assert(!fn->ext_filter_added || info1.fed_chars_extend_one_right); - if (fn->ext_filter_added) { - how_can_i_finish = merge_sets(how_can_i_finish, fn->pending_filter); - } else { - how_can_i_finish = codeset_of_all; + while (!callStack.empty()) { + size_t v = callStack.back().first; + size_t od = callStack.back().second; + if (od == 0) { + if (!history[v].raisin.empty()) { + size_t cleanVId = hist_clean_detour_init_clean(v); + std::vector& sr = answer.clean_history[cleanVId].exit.sorted_raisin; + sr = history[v].raisin; + std::sort(sr.begin(), sr.end()); + answer.clean_history[cleanVId].exit.double_compressed_selarr = compress_compressed_selarr(history[v].compressed_selarr, cmp); + } + } else { + const OperHistoryNodeTransition& old_hist_tr = history[v].next[od - 1]; + uint64_t ou = old_hist_tr.u; + if (has_raisin[ou]) { + size_t cleanVId = hist_clean_detour_init_clean(v); + answer.clean_history[cleanVId].next.emplace_back(old_hist_tr.op, dirty_to_clean[ou]); } - } else - assert(false); - } - // Determinization stop: one char read (input) - ColoredCodeset inp_distinction = pretreated_cc; - size_t pr = reading_stops.size(); - for (size_t i = 0; i < pr; i++) { - inp_distinction.apply_divisor(reading_stops[i]->filter); - } - std::vector Is; - std::vector> Cids; - inp_distinction.get_splits_of_non_dummy(Is, Cids); - size_t R = Is.size(); - FA_NodeOfDetCharCrossroads* my_cr = NULL; - if (R > 0) { - my_cr = resFa.makeDetCharCrossroads(); - if (!info1.fed_chars_extend_one_right && !how_can_i_finish.empty()) { - assert(how_can_i_finish == codeset_of_all); - my_cr->matching = true; } - my_cr->crossroads.resize(R); - } - for (size_t i = 0; i < R; i++) { - my_cr->crossroads[i].input = Is[i]; - my_cr->crossroads[i].nxt_node = NULL; - std::vector fl_passed_filters; - for (size_t j: Cids[i]) { - fl_passed_filters.push_back(reading_stops[j]->nxt_node); + + if (od == history[v].next.size()) { + callStack.pop_back(); + } else { + callStack.back().second++; + callStack.emplace_back(history[v].next[od].u, 0); } - // todo: make a function out of next 6 lines of code - RaisinBush alpha; - building_detour(cmp, SS.double_compressed_selarr, fl_passed_filters, Is[i], alpha, true); + } + + if (has_raisin[0]) { + assert(dirty_to_clean[0] >= 0); + answer.start = dirty_to_clean[0]; + } + + } + + /* If is_it_after_read is false, unknown selarr range variable border and cur pos are evaluated to 0. + * Otherwise, cur pos considered to be greater than previous values of selarr ange variable boundaries */ + void building_detour(const SelarrCompressionScheme& cmp, + const std::vector& outer_selarr, const std::vector& zeroeps, const codeset_t& I, + RaisinBush& answer, bool is_it_after_read) + { #ifdef PR_DEB - printf("That same turn, subbush %ld/%ld\n", i + 1, R); + printf("Det Debug: build_detour started with zeroeps:{"); + for (FA_Node* node: zeroeps) + printf("%lu,", node->nodeId); + printf("}, I: {%s}\n", stringifyCodesetBase10(I).c_str()); +#endif + assert(cmp.SN3 == outer_selarr.size()); + if (!is_it_after_read) + for (uint64_t val: outer_selarr) + assert(val == 0); + + struct SearchMark { + FA_Node* domain_node; + uint64_t epsilon_refs = 0; + uint64_t detour_sat = 0; + /* id of corresponding history node */ + size_t Hv = 0; + + explicit SearchMark(FA_Node *domain_node) : domain_node(domain_node) {} + }; + + /* Default values are good for me */ + std::vector marks; + for (size_t i = 0; i < zeroeps.size(); i++) { + marks.emplace_back(zeroeps[i]); + zeroeps[i]->search_mark = i; + } + + auto lob_allows_to_pass = [&](FA_NodeOfLookOneBehind* lob) -> bool { + if (!intersect_sets(lob->filter, I).empty()) { + assert(merge_sets(lob->filter, I) == lob->filter); + return true; + } + return false; + }; + + { /* First i need to know exacly how many of MINE epsilon transitions are referencing each NODE */ + std::vector domain_detour = zeroeps; + while (!domain_detour.empty()) { + FA_Node* v = domain_detour.back(); domain_detour.pop_back(); + if (v->type == look_one_behind && !lob_allows_to_pass(dynamic_cast(v))) + continue; + for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) { + assert(*uPtr); + int64_t &rds = (**uPtr).search_mark; + if (rds == -1) { + rds = marks.size(); + domain_detour.push_back(*uPtr); + marks.emplace_back(*uPtr); + } + marks[rds].epsilon_refs++; + } + } + } + std::vector history = {OperHistoryNode()}; + history[0].compressed_selarr.assign(cmp.SN2, 0); + for (size_t i = 0; i < cmp.SN3; i++) { + const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos; + if (act.type == tracking_var_types::range) { + if (outer_selarr[i]) { + history[0].compressed_selarr[cmp.S1_to_S2[act.second]] = 1; + } + } else { + assert(act.type == tracking_var_types::dot_immediate); + history[0].compressed_selarr[cmp.S1_to_S2[act.first]] = outer_selarr[i]; + } + } + /* As a result, dot_cur_pos variables will be initialized as zero (always) */ + + /* In my second detour, I will pass each vertex here only one time: after hitting the total epsilon refcount */ + std::vector can_process = zeroeps; + /* + auto increase_sat_refcount = [&](SearchMark& mark) { + mark.detour_sat++; + if (mark.detour_sat == mark.epsilon_refs && mark.ever_walked_in) { + can_process.push_back(mark.domain_node); + } + }; + */ + + auto add_history_update = [&](TrackingOperationInFa how, uint64_t where, uint64_t from_where) { + history[from_where].next.emplace_back(how, where); + }; + + while (!can_process.empty()) { + FA_Node* v = can_process.back(); can_process.pop_back(); + SearchMark& Vmark = marks[v->search_mark]; + assert(Vmark.detour_sat == Vmark.epsilon_refs); + uint64_t Hv = Vmark.Hv; + uint64_t Hop = Hv; + if (v->type == look_one_behind) { + FA_NodeOfLookOneBehind* tv = dynamic_cast(v); + if (!lob_allows_to_pass(tv)) + continue; + } else if (isTrackingFaNode(v)) { + Hop = history.size(); + history.emplace_back(); + std::vector& val2 = history.back().compressed_selarr; + val2 = history[Hv].compressed_selarr; + if (v->type == track_array_mov_imm) { + FA_NodeOfTrackArrayMovImm* tv = dynamic_cast(v); + if (isSelarrOpcode(tv->operation)) { + int key_s2 = cmp.S1_to_S2[tv->key]; + if (key_s2 >= 0){ + assert(cmp.sifter[cmp.S2_to_sifter[key_s2]].pos.type == tracking_var_types::dot_immediate); + val2[key_s2] = tv->imm_value; + } + } + add_history_update(TrackingOperationInFa{tv->operation, tv->key, tv->imm_value}, Hop, Hv); + } else if (v->type == track_array_mov_halfinvariant) { + FA_NodeOfTrackArrayMovHalfinvariant* tv = dynamic_cast(v); + if (isSelarrOpcode(tv->operation)) { + int key_s2 = cmp.S1_to_S2[tv->key]; + if (key_s2 >= 0){ + const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S2_to_sifter[key_s2]].pos; + assert(act.type != tracking_var_types::dot_immediate); + if (act.type == tracking_var_types::dot_cur_pos) { + val2[key_s2] = is_it_after_read ? 1 : 0; + } else { + val2[key_s2] = is_it_after_read ? 2 : 0; + } + } + } + add_history_update(TrackingOperationInFa{tv->operation, tv->key}, Hop, Hv); + } + } else if (v->type == match || v->type == one_char_read) { + // Determinization stop + history[Hv].raisin.push_back(v->nodeId); + } + for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) { + assert(*uPtr); + SearchMark& Umark = marks[(**uPtr).search_mark]; + /* Here I use Hop to determine Hv value of u */ + if (Umark.detour_sat == 0) { + Umark.Hv = Hop; + } else if (Umark.Hv != Hop) { + if (compressed_selarr_A_outranks_B( + history[Hop].compressed_selarr, history[Umark.Hv].compressed_selarr, cmp)){ + Umark.Hv = Hop; + } + } + /* Collision calculation finished */ + Umark.detour_sat++; + if (Umark.detour_sat == Umark.epsilon_refs) { + can_process.push_back(Umark.domain_node); + } + } + } + /* Cleaning this mess */ + for (auto& m: marks) { + m.domain_node->search_mark = -1; + } + /* Packaging the answer (we do a little bit of dfs here) */ + wash_history_bush(history, answer, cmp); + } + + void update_had_to_fork_status(const RaisinBush& bush, int& had_to_fork) { + for (const CleanOperHistoryNode& node: bush.clean_history) { + if (node.next.size() > 1 || (!node.next.empty() && !node.exit.empty())) { + had_to_fork = 1; + return; + } + } + } + + typedef size_t superstate_id_t; + + typedef std::vector> homework_t; + + struct LessSuperState { + bool operator()(const SuperState& A, const SuperState& B) const { + std::less> f1L; + if (f1L(A.sorted_raisin, B.sorted_raisin)) + return true; + if (f1L(B.sorted_raisin, A.sorted_raisin)) + return false; + return f1L(A.double_compressed_selarr, B.double_compressed_selarr); + } + }; + + struct GlobalDetourProgress { + std::map superstates; + /* Each element is a root of some megabush in resFa */ + std::vector superstate_megabush_constructed; + std::vector todo_superstaes; + }; + + /* If x was not previously achieved, it will also add it to t o d o list of global detour */ + superstate_id_t convertSuperstateToId(const SuperState& x, GlobalDetourProgress& gdp) { + if (gdp.superstates.count(x)) { + return gdp.superstates[x]; + } + size_t n = gdp.superstates.size(); + gdp.superstates.insert({x, n}); + gdp.todo_superstaes.push_back(x); + gdp.superstate_megabush_constructed.push_back(NULL); + return n; + } + + FA_Node* build_dead_end(FA_Container& resFa) { + return resFa.makeForking(); + } + + void build_bush(const RaisinBush& alpha, FA_Node** sowing_location, FA_Container& resFa, + homework_t& homework, GlobalDetourProgress& gdp) { + size_t n = alpha.clean_history.size(); + if (n == 0) { + FA_Node* dead_end = build_dead_end(resFa); + reattach_fa_node_edge(sowing_location, dead_end); + return; + } + std::vector> todo = {{sowing_location, alpha.start}}; + + while (!todo.empty()) { + FA_Node** sl = todo.back().first; + const CleanOperHistoryNode& hnode = alpha.clean_history[todo.back().second]; + todo.pop_back(); + auto history_transition = [&](size_t i, FA_Node** of_sl) { + FA_NodePathPart* pn = convert_to_node(hnode.next[i].op, resFa); + reattach_fa_node_edge(of_sl, pn); + todo.emplace_back(&(pn->nxt_node), hnode.next[i].u); + }; + + if (hnode.next.empty()) { + assert(!hnode.exit.empty()); + superstate_id_t w = convertSuperstateToId(hnode.exit, gdp); + homework.emplace_back(sl, w); + } else if (hnode.next.size() == 1 && hnode.exit.empty()) { + history_transition(0, sl); + } else { + FA_NodeOfForking* forker = resFa.makeForking(); + bool raisin = !hnode.exit.empty(); + size_t k = hnode.next.size(); + forker->nxt_options.assign(k + static_cast(raisin), NULL); + for (size_t i = 0; i < k; i++) { + history_transition(i, &(forker->nxt_options[i])); + } + if (raisin) { + superstate_id_t w = convertSuperstateToId(hnode.exit, gdp); + homework.emplace_back(&(forker->nxt_options[k]), w); + } + reattach_fa_node_edge(sl, forker); + } + } + } + + ColoredCodeset get_pretreated_cc(FA_Container& sourceFa) { + std::set little_insects; + for (FA_Node* v: sourceFa.all) { + if (v->type == look_one_behind) { + little_insects.insert(static_cast(v)->filter); + } + } + ColoredCodeset pretreated_cc(little_insects.size()); + for (const codeset_t& cs: little_insects) { + pretreated_cc.apply_divisor(cs); + } + return pretreated_cc; + } + + // todo add a check on size of dfa + void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, tai_t selarr_sz, + const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork) + { + /* During execuion, i will create pointers to field res.start and store them (inside the scope of this function) + * Luckily res argument is already immovable in this scope. */ + error = 0; + had_to_fork = 0; + assert(resFa.start == NULL && resFa.all.empty()); + input_fa_assert(sourceFa); + SelarrCompressionScheme cmp(selarr_sz, sifter); + + GlobalDetourProgress gdp; + homework_t homework; + + ColoredCodeset pretreated_cc = get_pretreated_cc(sourceFa); + + FA_Node** res_start_ptr = &(resFa.start); + if (info1.fed_chars_extend_one_left) { + ColoredCodeset inp_distinction = pretreated_cc; + inp_distinction.apply_divisor(codeset_of_all); + std::vector starting_Is; + std::vector> starting_Cids; /* Filler variable */ + inp_distinction.get_splits_of_non_dummy(starting_Is, starting_Cids); + size_t R = starting_Is.size(); + for (auto& rdh: starting_Cids) { + assert(rdh.size() == 1 && rdh[0] == 0); + } + FA_NodeOfDetCharCrossroads* very_first_cr = resFa.makeDetCharCrossroads(); + very_first_cr->second_ns = true; + reattach_fa_node_edge(res_start_ptr, very_first_cr); + very_first_cr->crossroads.resize(R); /* After that, nobody has right to resize crossroads array */ + for (size_t i = 0; i < R; i++) { + very_first_cr->crossroads[i].input = starting_Is[i]; + FA_Node** sowing_place = &(very_first_cr->crossroads[i].nxt_node); + RaisinBush alpha; + building_detour(cmp, std::vector(cmp.SN3, 0), {sourceFa.start}, starting_Is[i], alpha, false); +#ifdef PR_DEB + printf("Initialization hard %ld/%ld\n", i + 1, R); + alpha.print(); +#endif + update_had_to_fork_status(alpha, had_to_fork); + build_bush(alpha, sowing_place, resFa, homework, gdp); + } + } else { + RaisinBush alpha; + building_detour(cmp, std::vector(cmp.SN3, 0), {sourceFa.start}, codeset_of_all, alpha, false); +#ifdef PR_DEB + printf("Initialization easy\n"); alpha.print(); #endif update_had_to_fork_status(alpha, had_to_fork); - build_bush(alpha, &(my_cr->crossroads[i].nxt_node), resFa, homework, gdp); + build_bush(alpha, res_start_ptr, resFa, homework, gdp); } - // Determinization stop: match (finish) - FA_Node* finish_route = NULL; - if (!how_can_i_finish.empty() && (info1.fed_chars_extend_one_right || R == 0)) { - FA_NodeOfMatch* matcher = resFa.makeMatch(); - finish_route = matcher; - if (info1.fed_chars_extend_one_right) { - FA_NodeOfOneCharRead* right_ext_read = resFa.makeOneCharRead(how_can_i_finish, true); - reattach_nxt_node(right_ext_read, matcher); - finish_route = right_ext_read; + /* Now we start the actual detour. */ + while (!gdp.todo_superstaes.empty()) { + SuperState SS = gdp.todo_superstaes.back(); gdp.todo_superstaes.pop_back(); + // printf("Global detour turn: %s\n", SS.toString().c_str()); + std::vector reading_stops; + codeset_t how_can_i_finish = {}; + for (size_t v: SS.sorted_raisin) { + FA_Node* node = sourceFa.all[v]; + if (node->type == one_char_read) { + reading_stops.push_back(static_cast(node)); + } else if (node->type == match) { + auto fn = static_cast(node); + assert(!fn->ext_filter_added || info1.fed_chars_extend_one_right); + if (fn->ext_filter_added) { + how_can_i_finish = merge_sets(how_can_i_finish, fn->pending_filter); + } else { + how_can_i_finish = codeset_of_all; + } + } else + assert(false); + } + // Determinization stop: one char read (input) + ColoredCodeset inp_distinction = pretreated_cc; + size_t pr = reading_stops.size(); + for (size_t i = 0; i < pr; i++) { + inp_distinction.apply_divisor(reading_stops[i]->filter); + } + std::vector Is; + std::vector> Cids; + inp_distinction.get_splits_of_non_dummy(Is, Cids); + size_t R = Is.size(); + FA_NodeOfDetCharCrossroads* my_cr = NULL; + if (R > 0) { + my_cr = resFa.makeDetCharCrossroads(); + if (!info1.fed_chars_extend_one_right && !how_can_i_finish.empty()) { + assert(how_can_i_finish == codeset_of_all); + my_cr->matching = true; + } + my_cr->crossroads.resize(R); + } + for (size_t i = 0; i < R; i++) { + my_cr->crossroads[i].input = Is[i]; + my_cr->crossroads[i].nxt_node = NULL; + std::vector fl_passed_filters; + for (size_t j: Cids[i]) { + fl_passed_filters.push_back(reading_stops[j]->nxt_node); + } + // todo: make a function out of next 6 lines of code + RaisinBush alpha; + building_detour(cmp, SS.double_compressed_selarr, fl_passed_filters, Is[i], alpha, true); +#ifdef PR_DEB + printf("That same turn, subbush %ld/%ld\n", i + 1, R); + alpha.print(); +#endif + update_had_to_fork_status(alpha, had_to_fork); + build_bush(alpha, &(my_cr->crossroads[i].nxt_node), resFa, homework, gdp); + } + // Determinization stop: match (finish) + FA_Node* finish_route = NULL; + if (!how_can_i_finish.empty() && (info1.fed_chars_extend_one_right || R == 0)) { + FA_NodeOfMatch* matcher = resFa.makeMatch(); + finish_route = matcher; + if (info1.fed_chars_extend_one_right) { + FA_NodeOfOneCharRead* right_ext_read = resFa.makeOneCharRead(how_can_i_finish, true); + reattach_nxt_node(right_ext_read, matcher); + finish_route = right_ext_read; + } + } + // Combining these two cases + assert(finish_route || my_cr); + FA_Node*& endsUp = gdp.superstate_megabush_constructed[gdp.superstates[SS]]; + if (!finish_route) { + endsUp = my_cr; + } else if (!my_cr) { + endsUp = finish_route; + } else { + FA_NodeOfForking* F = resFa.makeForking(); + F->nxt_options = {NULL, NULL}; + reattach_fa_node_edge(&(F->nxt_options[0]), my_cr); + reattach_fa_node_edge(&(F->nxt_options[1]), finish_route); + endsUp = F; } } - // Combining these two cases - assert(finish_route || my_cr); - FA_Node*& endsUp = gdp.superstate_megabush_constructed[gdp.superstates[SS]]; - if (!finish_route) { - endsUp = my_cr; - } else if (!my_cr) { - endsUp = finish_route; - } else { - FA_NodeOfForking* F = resFa.makeForking(); - F->nxt_options = {NULL, NULL}; - reattach_fa_node_edge(&(F->nxt_options[0]), my_cr); - reattach_fa_node_edge(&(F->nxt_options[1]), finish_route); - endsUp = F; + /* Now it's time to do the homework: link all megabushes */ + for (auto& p: homework) { + reattach_fa_node_edge(p.first, gdp.superstate_megabush_constructed[p.second]); } } - /* Now it's time to do the homework: link all megabushes */ - for (auto& p: homework) { - reattach_fa_node_edge(p.first, gdp.superstate_megabush_constructed[p.second]); - } } - diff --git a/src/libregexis024fa/fa_make_deterministic.h b/src/libregexis024fa/fa_make_deterministic.h index 6c08469..c15585d 100644 --- a/src/libregexis024fa/fa_make_deterministic.h +++ b/src/libregexis024fa/fa_make_deterministic.h @@ -4,7 +4,9 @@ #include #include -void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, regex_tai_t selarr_sz, - const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork); +namespace regexis024 { + void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, tai_t selarr_sz, + const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork); +} #endif //LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H diff --git a/src/libregexis024fa/finite_automaton.cpp b/src/libregexis024fa/finite_automaton.cpp index 727ce3d..4305102 100644 --- a/src/libregexis024fa/finite_automaton.cpp +++ b/src/libregexis024fa/finite_automaton.cpp @@ -2,140 +2,142 @@ #include #include -bool FA_Node::empty() { - return type != one_char_read && type != det_char_crossroads; -} - -void FA_Node::apply_lookahead_restriction(const codeset_t &restriction) {} - -void FA_Node::reAdd_references() { - for (FA_Node** nxtPtr: get_all_transitions()){ - if (*nxtPtr) - (**nxtPtr).refs++; +namespace regexis024 { + bool FA_Node::empty() { + return type != one_char_read && type != det_char_crossroads; } -} -std::vector FA_Node::get_all_transitions() { - return {}; -} + void FA_Node::apply_lookahead_restriction(const codeset_t &restriction) {} -std::vector FA_Node::get_all_empty_valid_transitions() { - return {}; -} + void FA_Node::reAdd_references() { + for (FA_Node** nxtPtr: get_all_transitions()){ + if (*nxtPtr) + (**nxtPtr).refs++; + } + } -std::vector FA_NodePathPart::get_all_transitions() { - return {&nxt_node}; -} + std::vector FA_Node::get_all_transitions() { + return {}; + } -std::vector FA_NodePathPart::get_all_empty_valid_transitions() { - if (nxt_node) + std::vector FA_Node::get_all_empty_valid_transitions() { + return {}; + } + + std::vector FA_NodePathPart::get_all_transitions() { return {&nxt_node}; - return {}; -} - -FA_NodeOfMatch::FA_NodeOfMatch() {type = match;} - -void FA_NodeOfMatch::apply_lookahead_restriction(const codeset_t &restriction) { - ext_filter_added = true; - pending_filter = restriction; -} - -FA_NodeOfOneCharRead::FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace) : filter(filter), - second_ns(second_namespace) { type = one_char_read;} - -void FA_NodeOfOneCharRead::apply_lookahead_restriction(const codeset_t &restriction) { - filter = intersect_sets(filter, restriction); -} - -std::vector FA_NodeOfOneCharRead::get_all_empty_valid_transitions() { - return {}; -} - -FA_NodeOfForking::FA_NodeOfForking() {type = forking;} - -std::vector FA_NodeOfForking::get_all_empty_valid_transitions() { - std::vector res; - for (size_t i = 0; i < nxt_options.size(); i++) - if (nxt_options[i]) - res.push_back(&nxt_options[i]); - return res; -} - -std::vector FA_NodeOfForking::get_all_transitions() { - std::vector res; - for (size_t i = 0; i < nxt_options.size(); i++) - res.push_back(&nxt_options[i]); - return res; -} - -FA_NodeOfLookOneBehind::FA_NodeOfLookOneBehind(const codeset_t &filter) : filter(filter) {type = look_one_behind;} - -FA_NodeOfLookOneAhead::FA_NodeOfLookOneAhead(const codeset_t &restriction) : restriction(restriction) { - type = look_one_ahead; -} - -FA_NodeOfTrackArrayMovImm::FA_NodeOfTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue) : - operation(operation), key(key), imm_value(immValue) {type = track_array_mov_imm;} -// - -FA_NodeOfTrackArrayMovHalfinvariant::FA_NodeOfTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key): - operation(operation), key(key){type = track_array_mov_halfinvariant;} -// - -void FA_NodeOfDetCharCrossroads::apply_lookahead_restriction(const codeset_t &restriction) { - exitf("What?? Oh, no, no. I am NOT doing it"); -} - -FA_NodeOfDetCharCrossroads::FA_NodeOfDetCharCrossroads(const std::vector &crossroads) - : crossroads(crossroads) {type = det_char_crossroads;} - -std::vector FA_NodeOfDetCharCrossroads::get_all_empty_valid_transitions() { - return {}; -} - -std::vector FA_NodeOfDetCharCrossroads::get_all_transitions() { - std::vector res; - for (auto& tr: crossroads) - res.push_back(&tr.nxt_node); - return res; -} - -/* If transferring ownership of node to container has failed, node is freed (which means it is ivalidated) - * If this semi-ownership transfer succeded (no std::bad_alloc), then node is still valid to use, and at the end - * of FA_Container lifetime it is guaranteed to be deleted - */ -void FA_Container::registerNew(FA_Node *node) { - try { - node->nodeId = (int64_t)all.size(); - all.push_back(node); - } catch (const std::bad_alloc& ba) { - delete node; - throw; } -} -DFA_CrossroadPath::DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node): input(input),nxt_node(nxt_node) {} -// + std::vector FA_NodePathPart::get_all_empty_valid_transitions() { + if (nxt_node) + return {&nxt_node}; + return {}; + } -FA_Container::~FA_Container() { - for (FA_Node* n: all) - delete n; -} + FA_NodeOfMatch::FA_NodeOfMatch() {type = match;} + + void FA_NodeOfMatch::apply_lookahead_restriction(const codeset_t &restriction) { + ext_filter_added = true; + pending_filter = restriction; + } + + FA_NodeOfOneCharRead::FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace) : filter(filter), + second_ns(second_namespace) { type = one_char_read;} + + void FA_NodeOfOneCharRead::apply_lookahead_restriction(const codeset_t &restriction) { + filter = intersect_sets(filter, restriction); + } + + std::vector FA_NodeOfOneCharRead::get_all_empty_valid_transitions() { + return {}; + } + + FA_NodeOfForking::FA_NodeOfForking() {type = forking;} + + std::vector FA_NodeOfForking::get_all_empty_valid_transitions() { + std::vector res; + for (size_t i = 0; i < nxt_options.size(); i++) + if (nxt_options[i]) + res.push_back(&nxt_options[i]); + return res; + } + + std::vector FA_NodeOfForking::get_all_transitions() { + std::vector res; + for (size_t i = 0; i < nxt_options.size(); i++) + res.push_back(&nxt_options[i]); + return res; + } + + FA_NodeOfLookOneBehind::FA_NodeOfLookOneBehind(const codeset_t &filter) : filter(filter) {type = look_one_behind;} + + FA_NodeOfLookOneAhead::FA_NodeOfLookOneAhead(const codeset_t &restriction) : restriction(restriction) { + type = look_one_ahead; + } + + FA_NodeOfTrackArrayMovImm::FA_NodeOfTrackArrayMovImm(opcode_t operation, uint16_t key, uint64_t immValue) : + operation(operation), key(key), imm_value(immValue) {type = track_array_mov_imm;} + // + + FA_NodeOfTrackArrayMovHalfinvariant::FA_NodeOfTrackArrayMovHalfinvariant(opcode_t operation, uint16_t key): + operation(operation), key(key){type = track_array_mov_halfinvariant;} + // + + void FA_NodeOfDetCharCrossroads::apply_lookahead_restriction(const codeset_t &restriction) { + assert(false); + } + + FA_NodeOfDetCharCrossroads::FA_NodeOfDetCharCrossroads(const std::vector &crossroads) + : crossroads(crossroads) {type = det_char_crossroads;} + + std::vector FA_NodeOfDetCharCrossroads::get_all_empty_valid_transitions() { + return {}; + } + + std::vector FA_NodeOfDetCharCrossroads::get_all_transitions() { + std::vector res; + for (auto& tr: crossroads) + res.push_back(&tr.nxt_node); + return res; + } + + /* If transferring ownership of node to container has failed, node is freed (which means it is ivalidated) + * If this semi-ownership transfer succeded (no std::bad_alloc), then node is still valid to use, and at the end + * of FA_Container lifetime it is guaranteed to be deleted + */ + void FA_Container::registerNew(FA_Node *node) { + try { + node->nodeId = (int64_t)all.size(); + all.push_back(node); + } catch (const std::exception& ba) { + delete node; + throw; + } + } + + DFA_CrossroadPath::DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node): input(input),nxt_node(nxt_node) {} + // + + FA_Container::~FA_Container() { + for (FA_Node* n: all) + delete n; + } #define bs(name, args, params) \ - FA_NodeOf ## name *FA_Container::make ## name(args) { \ - FA_NodeOf ## name *node = new FA_NodeOf ## name(params); \ - registerNew(node); \ - return node; \ - } +FA_NodeOf ## name *FA_Container::make ## name(args) { \ +FA_NodeOf ## name *node = new FA_NodeOf ## name(params); \ +registerNew(node); \ +return node; \ +} #define COMMA , -bs(Match, , ) -bs(OneCharRead, const codeset_t& filter COMMA bool second_namespace, filter COMMA second_namespace) -bs(Forking, , ) -bs(LookOneBehind, const codeset_t& filter, filter) -bs(LookOneAhead, const codeset_t& filter, filter) -bs(TrackArrayMovImm, regex024_opcode operation COMMA uint16_t key COMMA uint64_t immValue, - operation COMMA key COMMA immValue) -bs(TrackArrayMovHalfinvariant, regex024_opcode operation COMMA uint16_t key, operation COMMA key) -bs(DetCharCrossroads, ,{}) + bs(Match, , ) + bs(OneCharRead, const codeset_t& filter COMMA bool second_namespace, filter COMMA second_namespace) + bs(Forking, , ) + bs(LookOneBehind, const codeset_t& filter, filter) + bs(LookOneAhead, const codeset_t& filter, filter) + bs(TrackArrayMovImm, opcode_t operation COMMA uint16_t key COMMA uint64_t immValue, + operation COMMA key COMMA immValue) + bs(TrackArrayMovHalfinvariant, opcode_t operation COMMA uint16_t key, operation COMMA key) + bs(DetCharCrossroads, ,{}) +} diff --git a/src/libregexis024fa/finite_automaton.h b/src/libregexis024fa/finite_automaton.h index b12e1d4..db45215 100644 --- a/src/libregexis024fa/finite_automaton.h +++ b/src/libregexis024fa/finite_automaton.h @@ -6,144 +6,146 @@ #include #include -enum FA_Node_type: uint8_t { - match, - one_char_read, - forking, - look_one_behind, - look_one_ahead, - track_array_mov_imm, - track_array_mov_halfinvariant, - /* Used for DFA */ - det_char_crossroads, -}; +namespace regexis024 { + enum FA_Node_type: uint8_t { + match, + one_char_read, + forking, + look_one_behind, + look_one_ahead, + track_array_mov_imm, + track_array_mov_halfinvariant, + /* Used for DFA */ + det_char_crossroads, + }; -struct FA_Node{ - size_t refs = 0; - /* If node is not in searched subset (at least yet), `search mark == -1`, otherwise - * it is an index (for that particular node) in the vector that captures all nodes in - * searched subset*/ - int64_t search_mark = -1; - FA_Node_type type; - int64_t nodeId; + struct FA_Node{ + size_t refs = 0; + /* If node is not in searched subset (at least yet), `search mark == -1`, otherwise + * it is an index (for that particular node) in the vector that captures all nodes in + * searched subset*/ + int64_t search_mark = -1; + FA_Node_type type; + int64_t nodeId; - bool empty(); - virtual std::vector get_all_empty_valid_transitions(); - virtual void apply_lookahead_restriction(const codeset_t &restriction); - void reAdd_references(); - virtual ~FA_Node() = default; - virtual std::vector get_all_transitions(); -}; + bool empty(); + virtual std::vector get_all_empty_valid_transitions(); + virtual void apply_lookahead_restriction(const codeset_t &restriction); + void reAdd_references(); + virtual ~FA_Node() = default; + virtual std::vector get_all_transitions(); + }; -struct FA_NodePathPart: public FA_Node{ - FA_Node* nxt_node = NULL; + struct FA_NodePathPart: public FA_Node{ + FA_Node* nxt_node = NULL; - std::vector get_all_empty_valid_transitions() override; - std::vector get_all_transitions() override; -}; + std::vector get_all_empty_valid_transitions() override; + std::vector get_all_transitions() override; + }; -struct FA_NodeOfMatch: public FA_Node{ - bool ext_filter_added = false; - codeset_t pending_filter; + struct FA_NodeOfMatch: public FA_Node{ + bool ext_filter_added = false; + codeset_t pending_filter; - explicit FA_NodeOfMatch(); - void apply_lookahead_restriction(const codeset_t &restriction) override; -}; + explicit FA_NodeOfMatch(); + void apply_lookahead_restriction(const codeset_t &restriction) override; + }; -/* .type == one_char_read */ -struct FA_NodeOfOneCharRead: public FA_NodePathPart{ - codeset_t filter; - bool second_ns = false; + /* .type == one_char_read */ + struct FA_NodeOfOneCharRead: public FA_NodePathPart{ + codeset_t filter; + bool second_ns = false; - FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace); - void apply_lookahead_restriction(const codeset_t &restriction) override; - std::vector get_all_empty_valid_transitions() override; -}; + FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace); + void apply_lookahead_restriction(const codeset_t &restriction) override; + std::vector get_all_empty_valid_transitions() override; + }; -/* .type == forking */ -struct FA_NodeOfForking: public FA_Node{ - /* Won't be modified after init (in regexp compilation into NFA) */ - std::vector nxt_options; - int64_t stopId = -1; + /* .type == forking */ + struct FA_NodeOfForking: public FA_Node{ + /* Won't be modified after init (in regexp compilation into NFA) */ + std::vector nxt_options; + int64_t stopId = -1; - explicit FA_NodeOfForking(); - std::vector get_all_empty_valid_transitions() override; - std::vector get_all_transitions() override; -}; + explicit FA_NodeOfForking(); + std::vector get_all_empty_valid_transitions() override; + std::vector get_all_transitions() override; + }; -/* .type == look_one_behind */ -struct FA_NodeOfLookOneBehind: public FA_NodePathPart{ - /* [0; UINT32_MAX] is equivalent to no filter */ - codeset_t filter; + /* .type == look_one_behind */ + struct FA_NodeOfLookOneBehind: public FA_NodePathPart{ + /* [0; UINT32_MAX] is equivalent to no filter */ + codeset_t filter; - explicit FA_NodeOfLookOneBehind(const codeset_t &filter); -}; + explicit FA_NodeOfLookOneBehind(const codeset_t &filter); + }; -/* .type == look_one_ahead */ -struct FA_NodeOfLookOneAhead: public FA_NodePathPart{ - /* [0; UINT32_MAX] is equivalent to no restriction */ - codeset_t restriction; + /* .type == look_one_ahead */ + struct FA_NodeOfLookOneAhead: public FA_NodePathPart{ + /* [0; UINT32_MAX] is equivalent to no restriction */ + codeset_t restriction; - explicit FA_NodeOfLookOneAhead(const codeset_t &restriction); -}; + explicit FA_NodeOfLookOneAhead(const codeset_t &restriction); + }; -/* .type == track_array_mov_imm */ -struct FA_NodeOfTrackArrayMovImm: public FA_NodePathPart{ - regex024_opcode operation; - uint16_t key; - uint64_t imm_value; + /* .type == track_array_mov_imm */ + struct FA_NodeOfTrackArrayMovImm: public FA_NodePathPart{ + opcode_t operation; + uint16_t key; + uint64_t imm_value; - FA_NodeOfTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue); -}; + FA_NodeOfTrackArrayMovImm(opcode_t operation, uint16_t key, uint64_t immValue); + }; -/* .type == track_array_mov_halfinvariant */ -struct FA_NodeOfTrackArrayMovHalfinvariant: public FA_NodePathPart{ - regex024_opcode operation; - uint16_t key; + /* .type == track_array_mov_halfinvariant */ + struct FA_NodeOfTrackArrayMovHalfinvariant: public FA_NodePathPart{ + opcode_t operation; + uint16_t key; - FA_NodeOfTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key); -}; + FA_NodeOfTrackArrayMovHalfinvariant(opcode_t operation, uint16_t key); + }; -struct DFA_CrossroadPath{ - codeset_t input; - FA_Node* nxt_node = NULL; + struct DFA_CrossroadPath{ + codeset_t input; + FA_Node* nxt_node = NULL; - DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node); - DFA_CrossroadPath() = default; -}; + DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node); + DFA_CrossroadPath() = default; + }; -/* .type == det_char_crossroads */ -struct FA_NodeOfDetCharCrossroads: public FA_Node{ - std::vector crossroads; - bool matching = false; - bool second_ns = false; + /* .type == det_char_crossroads */ + struct FA_NodeOfDetCharCrossroads: public FA_Node{ + std::vector crossroads; + bool matching = false; + bool second_ns = false; - explicit FA_NodeOfDetCharCrossroads(const std::vector &crossroads); - void apply_lookahead_restriction(const codeset_t &restriction) override; - std::vector get_all_empty_valid_transitions() override; - std::vector get_all_transitions() override; -}; + explicit FA_NodeOfDetCharCrossroads(const std::vector &crossroads); + void apply_lookahead_restriction(const codeset_t &restriction) override; + std::vector get_all_empty_valid_transitions() override; + std::vector get_all_transitions() override; + }; -struct FA_Container{ - FA_Container(const FA_Container&) = delete; - FA_Container& operator=(const FA_Container&) = delete; - FA_Container() = default; + struct FA_Container{ + FA_Container(const FA_Container&) = delete; + FA_Container& operator=(const FA_Container&) = delete; + FA_Container() = default; - std::vector all; - FA_Node* start = NULL; + std::vector all; + FA_Node* start = NULL; - void registerNew(FA_Node* node); + void registerNew(FA_Node* node); - FA_NodeOfMatch* makeMatch(); - FA_NodeOfOneCharRead* makeOneCharRead(const codeset_t& filter, bool second_namespace); - FA_NodeOfForking* makeForking(); - FA_NodeOfLookOneBehind* makeLookOneBehind(const codeset_t& filter); - FA_NodeOfLookOneAhead* makeLookOneAhead(const codeset_t& filter); - FA_NodeOfTrackArrayMovImm* makeTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue); - FA_NodeOfTrackArrayMovHalfinvariant* makeTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key); - FA_NodeOfDetCharCrossroads* makeDetCharCrossroads(); + FA_NodeOfMatch* makeMatch(); + FA_NodeOfOneCharRead* makeOneCharRead(const codeset_t& filter, bool second_namespace); + FA_NodeOfForking* makeForking(); + FA_NodeOfLookOneBehind* makeLookOneBehind(const codeset_t& filter); + FA_NodeOfLookOneAhead* makeLookOneAhead(const codeset_t& filter); + FA_NodeOfTrackArrayMovImm* makeTrackArrayMovImm(opcode_t operation, uint16_t key, uint64_t immValue); + FA_NodeOfTrackArrayMovHalfinvariant* makeTrackArrayMovHalfinvariant(opcode_t operation, uint16_t key); + FA_NodeOfDetCharCrossroads* makeDetCharCrossroads(); - ~FA_Container(); -}; + ~FA_Container(); + }; +} #endif //LIBREGEXIS024_FINITE_AUTOMATON_H diff --git a/src/libregexis024fa/graph_to_bytecode/core.cpp b/src/libregexis024fa/graph_to_bytecode/core.cpp index df2f9bc..38c97fa 100644 --- a/src/libregexis024fa/graph_to_bytecode/core.cpp +++ b/src/libregexis024fa/graph_to_bytecode/core.cpp @@ -5,113 +5,114 @@ #include +namespace regexis024 { #define nonthrowing_assert(expr) if (!(expr)) {error = -1; return; } + void compilation_core(std::vector& result, FA_Container& fa, explicit_bookmarks& bookmark_manager, + size_t& first_read_ns, size_t& second_read_ns, size_t& fork_ss_ns, int& error) + { + bookmark_id_t node_start_bm_offset = bookmark_manager.new_range_of_bookmarks(fa.all.size()); + std::vector not_yet_dedicated_second_read_ns_ssids; + first_read_ns = 0; + second_read_ns = 0; + fork_ss_ns = 0; + assert(fa.start); + std::vector todo = {fa.start}; + // std::vector promised(fa.all.size(), false); + // promised[fa.start->nodeId] = true; -void compilation_core(std::vector& result, FA_Container& fa, explicit_bookmarks& bookmark_manager, - size_t& first_read_ns, size_t& second_read_ns, size_t& fork_ss_ns, int& error) -{ - bookmark_id_t node_start_bm_offset = bookmark_manager.new_range_of_bookmarks(fa.all.size()); - std::vector not_yet_dedicated_second_read_ns_ssids; - first_read_ns = 0; - second_read_ns = 0; - fork_ss_ns = 0; - assert(fa.start); - std::vector todo = {fa.start}; - // std::vector promised(fa.all.size(), false); - // promised[fa.start->nodeId] = true; + auto nodesBookmark = [&](FA_Node* node) -> bookmark_id_t { + assert(node); + return node_start_bm_offset + node->nodeId; + }; - auto nodesBookmark = [&](FA_Node* node) -> bookmark_id_t { - assert(node); - return node_start_bm_offset + node->nodeId; - }; + auto addBranching = [&](FA_Node* node) { + todo.push_back(node); + }; - auto addBranching = [&](FA_Node* node) { - todo.push_back(node); - }; - - auto reading_head = [&](bool is_in_second_ns) { - if (is_in_second_ns) { - cmd_READ_second_ns(result, not_yet_dedicated_second_read_ns_ssids); - second_read_ns++; - } else { - cmd_READ_first_ns(result, first_read_ns++); - } - }; - - while (!todo.empty()) { - FA_Node* node = todo.back(); todo.pop_back(); - if (bookmark_manager.has_landed(nodesBookmark(node))) { - continue; - } - while (true) { - if (bookmark_manager.has_landed(nodesBookmark(node))) { - cmd_JUMP(result, bookmark_manager, nodesBookmark(node)); - break; + auto reading_head = [&](bool is_in_second_ns) { + if (is_in_second_ns) { + cmd_READ_second_ns(result, not_yet_dedicated_second_read_ns_ssids); + second_read_ns++; + } else { + cmd_READ_first_ns(result, first_read_ns++); } - bookmark_manager.land_bookmark(result, nodesBookmark(node)); - if (node->type == match) { - cmd_MATCH(result); - cmd_DIE(result); - break; - } else if (node->type == one_char_read) { - FA_NodeOfOneCharRead* ocr = dynamic_cast(node); - nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX); - reading_head(ocr->second_ns); - write_filter(result, bookmark_manager, {ocr->filter},{nodesBookmark(ocr->nxt_node)}); - node = ocr->nxt_node; - } else if (node->type == look_one_behind) { - FA_NodeOfLookOneBehind* lob = dynamic_cast(node); - write_filter(result, bookmark_manager, {lob->filter}, {nodesBookmark(lob->nxt_node)}); - node = lob->nxt_node; - } else if (node->type == forking) { - FA_NodeOfForking* fn = dynamic_cast(node); - std::vector& nxt_options = fn->nxt_options; - if (nxt_options.empty()) { + }; + + while (!todo.empty()) { + FA_Node* node = todo.back(); todo.pop_back(); + if (bookmark_manager.has_landed(nodesBookmark(node))) { + continue; + } + while (true) { + if (bookmark_manager.has_landed(nodesBookmark(node))) { + cmd_JUMP(result, bookmark_manager, nodesBookmark(node)); + break; + } + bookmark_manager.land_bookmark(result, nodesBookmark(node)); + if (node->type == match) { + cmd_MATCH(result); cmd_DIE(result); break; - } - if (nxt_options.size() >= 2) { - nonthrowing_assert(fork_ss_ns < UINT32_MAX); - regex_sslot_id_t sslot = fork_ss_ns++; - for (size_t i = 0; i + 1 < nxt_options.size(); i++) { - cmd_FORK(result, bookmark_manager, sslot, nodesBookmark(nxt_options[i])); - addBranching(nxt_options[i]); + } else if (node->type == one_char_read) { + FA_NodeOfOneCharRead* ocr = dynamic_cast(node); + nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX); + reading_head(ocr->second_ns); + write_filter(result, bookmark_manager, {ocr->filter},{nodesBookmark(ocr->nxt_node)}); + node = ocr->nxt_node; + } else if (node->type == look_one_behind) { + FA_NodeOfLookOneBehind* lob = dynamic_cast(node); + write_filter(result, bookmark_manager, {lob->filter}, {nodesBookmark(lob->nxt_node)}); + node = lob->nxt_node; + } else if (node->type == forking) { + FA_NodeOfForking* fn = dynamic_cast(node); + std::vector& nxt_options = fn->nxt_options; + if (nxt_options.empty()) { + cmd_DIE(result); + break; } - } - node = nxt_options.back(); - } else if (node->type == track_array_mov_imm) { - FA_NodeOfTrackArrayMovImm* tami = dynamic_cast(node); - write_byte(result, tami->operation); - write_tai(result, tami->key); - write_quadword(result, tami->imm_value); - node = tami->nxt_node; - } else if (node->type == track_array_mov_halfinvariant) { - FA_NodeOfTrackArrayMovHalfinvariant* tamh = dynamic_cast(node); - write_byte(result, tamh->operation); - write_tai(result, tamh->key); - node = tamh->nxt_node; - } else if (node->type == det_char_crossroads) { - FA_NodeOfDetCharCrossroads* dcc = dynamic_cast(node); - nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX); - if (dcc->matching) - cmd_MATCH(result); - reading_head(dcc->second_ns); - std::vector codesets; - std::vector branches; - for (const DFA_CrossroadPath& p: dcc->crossroads) { - codesets.push_back(p.input); - branches.push_back(nodesBookmark(p.nxt_node)); - addBranching(p.nxt_node); - } - write_filter(result, bookmark_manager, codesets, branches); - if (dcc->crossroads.empty()) - break; - node = dcc->crossroads[0].nxt_node; - } else - assert(false); + if (nxt_options.size() >= 2) { + nonthrowing_assert(fork_ss_ns < UINT32_MAX); + sslot_id_t sslot = fork_ss_ns++; + for (size_t i = 0; i + 1 < nxt_options.size(); i++) { + cmd_FORK(result, bookmark_manager, sslot, nodesBookmark(nxt_options[i])); + addBranching(nxt_options[i]); + } + } + node = nxt_options.back(); + } else if (node->type == track_array_mov_imm) { + FA_NodeOfTrackArrayMovImm* tami = dynamic_cast(node); + write_byte(result, tami->operation); + write_tai(result, tami->key); + write_quadword(result, tami->imm_value); + node = tami->nxt_node; + } else if (node->type == track_array_mov_halfinvariant) { + FA_NodeOfTrackArrayMovHalfinvariant* tamh = dynamic_cast(node); + write_byte(result, tamh->operation); + write_tai(result, tamh->key); + node = tamh->nxt_node; + } else if (node->type == det_char_crossroads) { + FA_NodeOfDetCharCrossroads* dcc = dynamic_cast(node); + nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX); + if (dcc->matching) + cmd_MATCH(result); + reading_head(dcc->second_ns); + std::vector codesets; + std::vector branches; + for (const DFA_CrossroadPath& p: dcc->crossroads) { + codesets.push_back(p.input); + branches.push_back(nodesBookmark(p.nxt_node)); + addBranching(p.nxt_node); + } + write_filter(result, bookmark_manager, codesets, branches); + if (dcc->crossroads.empty()) + break; + node = dcc->crossroads[0].nxt_node; + } else + assert(false); + } + } + for (size_t j = 0; j < not_yet_dedicated_second_read_ns_ssids.size(); j++) { + belated_sslot_id(result, not_yet_dedicated_second_read_ns_ssids[j], j + first_read_ns); } } - for (size_t j = 0; j < not_yet_dedicated_second_read_ns_ssids.size(); j++) { - belated_sslot_id(result, not_yet_dedicated_second_read_ns_ssids[j], j + first_read_ns); - } } diff --git a/src/libregexis024fa/graph_to_bytecode/core.h b/src/libregexis024fa/graph_to_bytecode/core.h index ef71883..c1fb732 100644 --- a/src/libregexis024fa/graph_to_bytecode/core.h +++ b/src/libregexis024fa/graph_to_bytecode/core.h @@ -4,7 +4,9 @@ #include #include -void compilation_core(std::vector& result, FA_Container& fa, explicit_bookmarks& bookmark_manager, - size_t& first_read_ns, size_t& second_read_ns, size_t& fork_ss_ns, int& error); +namespace regexis024 { + void compilation_core(std::vector& result, FA_Container& fa, explicit_bookmarks& bookmark_manager, + size_t& first_read_ns, size_t& second_read_ns, size_t& fork_ss_ns, int& error); +} #endif diff --git a/src/libregexis024fa/graph_to_bytecode/fa_compiler.cpp b/src/libregexis024fa/graph_to_bytecode/fa_compiler.cpp index 79f3e62..8584491 100644 --- a/src/libregexis024fa/graph_to_bytecode/fa_compiler.cpp +++ b/src/libregexis024fa/graph_to_bytecode/fa_compiler.cpp @@ -7,96 +7,98 @@ #include -void write_priority_table_actions(std::vector& result, RegexPriorityTable &priority_table) { - for (RegexPriorityTableAction& act: priority_table) { - if (act.pos.isForRange()) { - write_byte(result, regex024_opcodes::DDIST_RABX_SELARR); - write_tai(result, act.pos.first); - write_tai(result, act.pos.second); - } else { - write_byte(result, regex024_opcodes::DMOV_RABX_SELARR); - write_tai(result, act.pos.first); +namespace regexis024 { + void write_priority_table_actions(std::vector& result, RegexPriorityTable &priority_table) { + for (RegexPriorityTableAction& act: priority_table) { + if (act.pos.isForRange()) { + write_byte(result, opcodes::DDIST_RABX_SELARR); + write_tai(result, act.pos.first); + write_tai(result, act.pos.second); + } else { + write_byte(result, opcodes::DMOV_RABX_SELARR); + write_tai(result, act.pos.first); + } + write_byte(result, act.minimize ? + opcodes::SIFTPRIOR_MIN_RABX : + opcodes::SIFTPRIOR_MAX_RABX); } - write_byte(result, act.minimize ? - regex024_opcodes::SIFTPRIOR_MIN_RABX : - regex024_opcodes::SIFTPRIOR_MAX_RABX); + write_byte(result, opcodes::SIFT_DONE); } - write_byte(result, regex024_opcodes::SIFT_DONE); -} -struct belate_initialization_parameters { - size_t todo_pos_read_ss_n; - size_t todo_pos_fork_ss_n; - size_t todo_pos_second_ns_size; + struct belate_initialization_parameters { + size_t todo_pos_read_ss_n; + size_t todo_pos_fork_ss_n; + size_t todo_pos_second_ns_size; - void complete_it(std::vector& result, - regex_sslot_id_t first_read_ns, regex_sslot_id_t second_read_ns, regex_sslot_id_t fork_ss_ns) + void complete_it(std::vector& result, + sslot_id_t first_read_ns, sslot_id_t second_read_ns, sslot_id_t fork_ss_ns) + { + assert((uint64_t)first_read_ns + (uint64_t)second_read_ns <= UINT32_MAX); + belated_sslot_id(result, todo_pos_read_ss_n , first_read_ns + second_read_ns); + belated_sslot_id(result, todo_pos_fork_ss_n, fork_ss_ns); + belated_sslot_id(result, todo_pos_second_ns_size, second_read_ns); + } + }; + + /* when I compile initializational part of program, I don't yet know what to put in + * PARAM_READ_SS_NUMBER, PARAM_FORK_SS_NUMBER and MSG_FED_INPUT_EXTENDED (second namespace size). + * These values are belate. */ + belate_initialization_parameters write_some_normal_initialization(std::vector& result, + size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1) { - assert((uint64_t)first_read_ns + (uint64_t)second_read_ns <= UINT32_MAX); - belated_sslot_id(result, todo_pos_read_ss_n , first_read_ns + second_read_ns); - belated_sslot_id(result, todo_pos_fork_ss_n, fork_ss_ns); - belated_sslot_id(result, todo_pos_second_ns_size, second_read_ns); - } -}; + belate_initialization_parameters todo; -/* when I compile initializational part of program, I don't yet know what to put in - * PARAM_READ_SS_NUMBER, PARAM_FORK_SS_NUMBER and MSG_FED_INPUT_EXTENDED (second namespace size). - * These values are belate. */ -belate_initialization_parameters write_some_normal_initialization(std::vector& result, - size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1) -{ - belate_initialization_parameters todo; + write_byte(result, opcodes::PARAM_READ_SS_NUMBER); + todo.todo_pos_read_ss_n = result.size(); + write_sslot_id(result, 0); // Belate - write_byte(result, regex024_opcodes::PARAM_READ_SS_NUMBER); - todo.todo_pos_read_ss_n = result.size(); - write_sslot_id(result, 0); // Belate + write_byte(result, opcodes::PARAM_FORK_SS_NUMBER); + todo.todo_pos_fork_ss_n = result.size(); + write_sslot_id(result, 0); // Belate - write_byte(result, regex024_opcodes::PARAM_FORK_SS_NUMBER); - todo.todo_pos_fork_ss_n = result.size(); - write_sslot_id(result, 0); // Belate + write_byte(result, opcodes::PARAM_SELARR_LEN); + write_tai(result, selarr_size); - write_byte(result, regex024_opcodes::PARAM_SELARR_LEN); - write_tai(result, selarr_size); + write_byte(result, opcodes::MSG_MULTISTART_ALLOWED); + write_byte(result, 1); - write_byte(result, regex024_opcodes::MSG_MULTISTART_ALLOWED); - write_byte(result, 1); + write_byte(result, opcodes::MSG_FED_INPUT_EXTENDED); + write_byte(result, info1.fed_chars_extend_one_left ? 1 : 0); + write_byte(result, info1.fed_chars_extend_one_right ? 1 : 0); + todo.todo_pos_second_ns_size = result.size(); + write_sslot_id(result, 0); // Belate - write_byte(result, regex024_opcodes::MSG_FED_INPUT_EXTENDED); - write_byte(result, info1.fed_chars_extend_one_left ? 1 : 0); - write_byte(result, info1.fed_chars_extend_one_right ? 1 : 0); - todo.todo_pos_second_ns_size = result.size(); - write_sslot_id(result, 0); // Belate - - write_byte(result, regex024_opcodes::INIT); - return todo; -} - -void compile_fa_to_regexis024_bytecode(std::vector& result, - FA_Container &fa, RegexPriorityTable &priority_table, - size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error) -{ - error = 0; - explicit_bookmarks bookmark_manager; - - if (!priority_table.empty()) { - bookmark_id_t BM_sift_function = bookmark_manager.new_bookmark(); - bookmark_id_t BM_after_sift = bookmark_manager.new_bookmark(); - - cmd_JUMP(result, bookmark_manager, BM_after_sift); - bookmark_manager.land_bookmark(result, BM_sift_function); - write_priority_table_actions(result, priority_table); - bookmark_manager.land_bookmark(result, BM_after_sift); - - write_byte(result, regex024_opcodes::PARAM_COLSIFTFUNC_SET); - bookmark_manager.write_unresolved_reference(result, BM_sift_function); + write_byte(result, opcodes::INIT); + return todo; } - belate_initialization_parameters init_param_todo = write_some_normal_initialization(result, selarr_size, info1); + void compile_fa_to_regexis024_bytecode(std::vector& result, + FA_Container &fa, RegexPriorityTable &priority_table, + size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error) + { + error = 0; + explicit_bookmarks bookmark_manager; - size_t first_read_ns, second_read_ns, fork_ss_ns; - compilation_core(result, fa, bookmark_manager, first_read_ns, second_read_ns, fork_ss_ns, error); - if (error < 0) - return; - init_param_todo.complete_it(result, first_read_ns, second_read_ns, fork_ss_ns); - bookmark_manager.finish(result); + if (!priority_table.empty()) { + bookmark_id_t BM_sift_function = bookmark_manager.new_bookmark(); + bookmark_id_t BM_after_sift = bookmark_manager.new_bookmark(); + + cmd_JUMP(result, bookmark_manager, BM_after_sift); + bookmark_manager.land_bookmark(result, BM_sift_function); + write_priority_table_actions(result, priority_table); + bookmark_manager.land_bookmark(result, BM_after_sift); + + write_byte(result, opcodes::PARAM_COLSIFTFUNC_SET); + bookmark_manager.write_unresolved_reference(result, BM_sift_function); + } + + belate_initialization_parameters init_param_todo = write_some_normal_initialization(result, selarr_size, info1); + + size_t first_read_ns, second_read_ns, fork_ss_ns; + compilation_core(result, fa, bookmark_manager, first_read_ns, second_read_ns, fork_ss_ns, error); + if (error < 0) + return; + init_param_todo.complete_it(result, first_read_ns, second_read_ns, fork_ss_ns); + bookmark_manager.finish(result); + } } diff --git a/src/libregexis024fa/graph_to_bytecode/fa_compiler.h b/src/libregexis024fa/graph_to_bytecode/fa_compiler.h index 96f340d..d190fa9 100644 --- a/src/libregexis024fa/graph_to_bytecode/fa_compiler.h +++ b/src/libregexis024fa/graph_to_bytecode/fa_compiler.h @@ -7,8 +7,10 @@ #include #include -void compile_fa_to_regexis024_bytecode(std::vector& result, FA_Container& fa, RegexPriorityTable& priority_table, - size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error); +namespace regexis024 { + void compile_fa_to_regexis024_bytecode(std::vector& result, FA_Container& fa, RegexPriorityTable& priority_table, + size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error); +} #endif diff --git a/src/libregexis024fa/graph_to_bytecode/filter.cpp b/src/libregexis024fa/graph_to_bytecode/filter.cpp index 74d9600..e438a48 100644 --- a/src/libregexis024fa/graph_to_bytecode/filter.cpp +++ b/src/libregexis024fa/graph_to_bytecode/filter.cpp @@ -4,117 +4,116 @@ #include #include -std::vector convert_to_compSeg(const std::vector& crossroad_codesets) -{ - std::vector compSeg; - std::vector seg; - for (size_t i = 0; i < crossroad_codesets.size(); i++) { - for (auto& p: crossroad_codesets[i]) { - seg.emplace_back(i, p.first, p.second); - } - } - std::sort(seg.begin(), seg.end(), - [](const FilterSegment& a, const FilterSegment& b)->bool{return a.L < b.L;}); - if (seg.empty()) { - compSeg.emplace_back(-1, 0, UINT32_MAX); - } else { - if (seg[0].L > 0) - compSeg.emplace_back(-1, 0, seg[0].L - 1); - size_t N = seg.size(); - for (size_t i = 0; i + 1 < N; i++) { - compSeg.push_back(seg[i]); - assert(seg[i].R < seg[i + 1].L); - if (seg[i].R + 1 < seg[i + 1].L) - compSeg.emplace_back(-1, seg[i].R + 1, seg[i + 1].L - 1); - } - compSeg.push_back(seg.back()); - if (seg.back().R < UINT32_MAX) - compSeg.emplace_back(-1, seg[N - 1].R + 1, UINT32_MAX); - } - assert(!compSeg.empty()); - return compSeg; -} - -/* Return whether the resulting bytecode relies on me placing [0]'th node at the end */ -void write_filter_exit(std::vector& result, explicit_bookmarks& bookmark_manager, - const std::vector& crossroad_marks, - ssize_t color, bool at_the_end, bool& relies_on_proper_ending) -{ - if (color < 0) { - cmd_DIE(result); - } else if (color != 0 || !at_the_end) { - cmd_JUMP(result, bookmark_manager, crossroad_marks[color]); - } else { - relies_on_proper_ending = true; - } -} - -// todo: use return value of this function -bool write_filter(std::vector& result, explicit_bookmarks& bookmark_manager, - const std::vector& crossroad_codesets, const std::vector& crossroad_marks) -{ - bool relies_on_proper_ending = false; - - std::vector compSeg = convert_to_compSeg(crossroad_codesets); - size_t N = compSeg.size(); - struct RecFrame { - size_t Li; - size_t Ri; - bool second_part = false; - bookmark_id_t to_the_right_part; - - RecFrame(size_t li, size_t ri): Li(li),Ri(ri) {} - }; - - std::vector call_stack = {RecFrame(0, N - 1)}; - - auto is_sandwich = [&](size_t Li, size_t Ri) -> bool { - return Li + 2 == Ri && compSeg[Li].color == compSeg[Ri].color && compSeg[Li + 1].L == compSeg[Li + 1].R; - }; - - while (!call_stack.empty()) { - RecFrame& cur_frame = call_stack.back(); - size_t Li = cur_frame.Li; - size_t Ri = cur_frame.Ri; - if (Li == Ri) { - write_filter_exit(result, bookmark_manager, crossroad_marks, compSeg[Li].color, - Ri + 1 == N, relies_on_proper_ending); - call_stack.pop_back(); - } else if (is_sandwich(Li, Ri)){ - ssize_t A = compSeg[Li].color; - ssize_t B = compSeg[Li + 1].color; - size_t midVal = compSeg[Li + 1].L; - if (B < 0) { - assert(A >= 0); - bookmark_id_t b_to_end = bookmark_manager.new_bookmark(); - cmd_JCEQUAL(result, bookmark_manager, midVal, b_to_end); - cmd_JUMP(result, bookmark_manager, crossroad_marks[A]); - bookmark_manager.land_bookmark(result, b_to_end); - cmd_DIE(result); - } else { - cmd_JCEQUAL(result, bookmark_manager, midVal, crossroad_marks[B]); - write_filter_exit(result, bookmark_manager, crossroad_marks, A, - Ri + 1 == N, relies_on_proper_ending); +namespace regexis024 { + std::vector convert_to_compSeg(const std::vector& crossroad_codesets) + { + std::vector compSeg; + std::vector seg; + for (size_t i = 0; i < crossroad_codesets.size(); i++) { + for (auto& p: crossroad_codesets[i]) { + seg.push_back({(ssize_t)i, p.first, p.second}); } - call_stack.pop_back(); + } + std::sort(seg.begin(), seg.end(), + [](const FilterSegment& a, const FilterSegment& b)->bool{return a.L < b.L;}); + if (seg.empty()) { + compSeg.push_back({-1, 0, UINT32_MAX}); } else { - size_t m = (Li + Ri) / 2; - if (!cur_frame.second_part) { - cur_frame.to_the_right_part = bookmark_manager.new_bookmark(); - cmd_JCGRTR(result, bookmark_manager, compSeg[m].R, cur_frame.to_the_right_part); - cur_frame.second_part = true; - /* cur_frame was just invalidated */ - call_stack.emplace_back(Li, m); - } else { - bookmark_manager.land_bookmark(result, cur_frame.to_the_right_part); - /* cur_frame was invalidated */ - call_stack.pop_back(); - call_stack.emplace_back(m + 1, Ri); + if (seg[0].L > 0) + compSeg.push_back({-1, 0, seg[0].L - 1}); + size_t N = seg.size(); + for (size_t i = 0; i + 1 < N; i++) { + compSeg.push_back(seg[i]); + assert(seg[i].R < seg[i + 1].L); + if (seg[i].R + 1 < seg[i + 1].L) + compSeg.push_back({-1, seg[i].R + 1, seg[i + 1].L - 1}); } + compSeg.push_back(seg.back()); + if (seg.back().R < UINT32_MAX) + compSeg.push_back({-1, seg[N - 1].R + 1, UINT32_MAX}); + } + assert(!compSeg.empty()); + return compSeg; + } + + /* Return whether the resulting bytecode relies on me placing [0]'th node at the end */ + void write_filter_exit(std::vector& result, explicit_bookmarks& bookmark_manager, + const std::vector& crossroad_marks, + ssize_t color, bool at_the_end, bool& relies_on_proper_ending) + { + if (color < 0) { + cmd_DIE(result); + } else if (color != 0 || !at_the_end) { + cmd_JUMP(result, bookmark_manager, crossroad_marks[color]); + } else { + relies_on_proper_ending = true; } } - return relies_on_proper_ending; -} -FilterSegment::FilterSegment(ssize_t color, uint32_t l, uint32_t r): color(color), L(l), R(r) {} -// + // todo: use return value of this function + bool write_filter(std::vector& result, explicit_bookmarks& bookmark_manager, + const std::vector& crossroad_codesets, const std::vector& crossroad_marks) + { + bool relies_on_proper_ending = false; + + std::vector compSeg = convert_to_compSeg(crossroad_codesets); + size_t N = compSeg.size(); + struct RecFrame { + size_t Li; + size_t Ri; + bool second_part = false; + bookmark_id_t to_the_right_part; + + RecFrame(size_t li, size_t ri): Li(li),Ri(ri) {} + }; + + std::vector call_stack = {RecFrame(0, N - 1)}; + + auto is_sandwich = [&](size_t Li, size_t Ri) -> bool { + return Li + 2 == Ri && compSeg[Li].color == compSeg[Ri].color && compSeg[Li + 1].L == compSeg[Li + 1].R; + }; + + while (!call_stack.empty()) { + RecFrame& cur_frame = call_stack.back(); + size_t Li = cur_frame.Li; + size_t Ri = cur_frame.Ri; + if (Li == Ri) { + write_filter_exit(result, bookmark_manager, crossroad_marks, compSeg[Li].color, + Ri + 1 == N, relies_on_proper_ending); + call_stack.pop_back(); + } else if (is_sandwich(Li, Ri)){ + ssize_t A = compSeg[Li].color; + ssize_t B = compSeg[Li + 1].color; + size_t midVal = compSeg[Li + 1].L; + if (B < 0) { + assert(A >= 0); + bookmark_id_t b_to_end = bookmark_manager.new_bookmark(); + cmd_JCEQUAL(result, bookmark_manager, midVal, b_to_end); + cmd_JUMP(result, bookmark_manager, crossroad_marks[A]); + bookmark_manager.land_bookmark(result, b_to_end); + cmd_DIE(result); + } else { + cmd_JCEQUAL(result, bookmark_manager, midVal, crossroad_marks[B]); + write_filter_exit(result, bookmark_manager, crossroad_marks, A, + Ri + 1 == N, relies_on_proper_ending); + } + call_stack.pop_back(); + } else { + size_t m = (Li + Ri) / 2; + if (!cur_frame.second_part) { + cur_frame.to_the_right_part = bookmark_manager.new_bookmark(); + cmd_JCGRTR(result, bookmark_manager, compSeg[m].R, cur_frame.to_the_right_part); + cur_frame.second_part = true; + /* cur_frame was just invalidated */ + call_stack.emplace_back(Li, m); + } else { + bookmark_manager.land_bookmark(result, cur_frame.to_the_right_part); + /* cur_frame was invalidated */ + call_stack.pop_back(); + call_stack.emplace_back(m + 1, Ri); + } + } + } + return relies_on_proper_ending; + } +} diff --git a/src/libregexis024fa/graph_to_bytecode/filter.h b/src/libregexis024fa/graph_to_bytecode/filter.h index 5284526..31c9cc1 100644 --- a/src/libregexis024fa/graph_to_bytecode/filter.h +++ b/src/libregexis024fa/graph_to_bytecode/filter.h @@ -6,16 +6,17 @@ #include #include -struct FilterSegment { - ssize_t color; - uint32_t L, R; +namespace regexis024 { + struct FilterSegment { + ssize_t color; + uint32_t L; + uint32_t R; + }; - FilterSegment(ssize_t color, uint32_t l, uint32_t r); -}; - -/* Return whether user of function must place [0]'th option after the filter - * The filter can end up being written in such a way that the end will never be reached */ -bool write_filter(std::vector& result, explicit_bookmarks& bookmark_manager, - const std::vector& crossroad_codesets, const std::vector& crossroad_marks); + /* Return whether user of function must place [0]'th option after the filter + * The filter can end up being written in such a way that the end will never be reached */ + bool write_filter(std::vector& result, explicit_bookmarks& bookmark_manager, + const std::vector& crossroad_codesets, const std::vector& crossroad_marks); +} #endif diff --git a/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp b/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp index 16fa2ca..351f5e0 100644 --- a/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp +++ b/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp @@ -2,114 +2,115 @@ #include #include +namespace regexis024 { #define push_to_res_least_signif result.push_back(x & 0xffLU); x >>= 8 - -void write_byte(std::vector& result, uint8_t x) { - result.push_back(x); -} - -void write_word(std::vector& result, uint16_t x) { - push_to_res_least_signif; push_to_res_least_signif; -} - -void write_doubleword(std::vector& result, uint32_t x) { - push_to_res_least_signif; push_to_res_least_signif; push_to_res_least_signif; push_to_res_least_signif; -} - -void write_quadword(std::vector& result, uint64_t x) { - for (int i = 0; i < 8; i++) { - push_to_res_least_signif; + void write_byte(std::vector& result, uint8_t x) { + result.push_back(x); + } + + void write_word(std::vector& result, uint16_t x) { + push_to_res_least_signif; push_to_res_least_signif; + } + + void write_doubleword(std::vector& result, uint32_t x) { + push_to_res_least_signif; push_to_res_least_signif; push_to_res_least_signif; push_to_res_least_signif; + } + + void write_quadword(std::vector& result, uint64_t x) { + for (int i = 0; i < 8; i++) { + push_to_res_least_signif; + } } -} #undef push_to_res_least_signif #define put_belated_to_res assert(result[pos] == 0); result[pos++] = value & 0xffLU; value >>= 8 -void belated_byte(std::vector& result, size_t pos, uint8_t value) { - assert(pos < result.size()); - result[pos] = value; -} - -void belated_word(std::vector& result, size_t pos, uint16_t value) { - assert(pos + 2 <= result.size()); - put_belated_to_res; put_belated_to_res; -} - -void belated_doubleword(std::vector& result, size_t pos, uint32_t value) { - assert(pos + 4 <= result.size()); - put_belated_to_res; put_belated_to_res; put_belated_to_res; put_belated_to_res; -} - -void belated_quadword(std::vector& result, size_t pos, uint64_t value) { - assert(pos + 8 <= result.size()); - for (int i = 0; i < 8; i++) { - put_belated_to_res; + void belated_byte(std::vector& result, size_t pos, uint8_t value) { + assert(pos < result.size()); + result[pos] = value; + } + + void belated_word(std::vector& result, size_t pos, uint16_t value) { + assert(pos + 2 <= result.size()); + put_belated_to_res; put_belated_to_res; + } + + void belated_doubleword(std::vector& result, size_t pos, uint32_t value) { + assert(pos + 4 <= result.size()); + put_belated_to_res; put_belated_to_res; put_belated_to_res; put_belated_to_res; + } + + void belated_quadword(std::vector& result, size_t pos, uint64_t value) { + assert(pos + 8 <= result.size()); + for (int i = 0; i < 8; i++) { + put_belated_to_res; + } } -} #undef put_belated_to_res -void write_sslot_id(std::vector& result, regex_sslot_id_t x) { - write_doubleword(result, x); -} + void write_sslot_id(std::vector& result, sslot_id_t x) { + write_doubleword(result, x); + } -void write_tai(std::vector& result, regex_tai_t x) { - write_word(result, x); -} + void write_tai(std::vector& result, tai_t x) { + write_word(result, x); + } -void write_near_ptr(std::vector& result, regex_near_ptr_t x) { - write_quadword(result, x); -} + void write_near_ptr(std::vector& result, near_ptr_t x) { + write_quadword(result, x); + } -void belated_sslot_id(std::vector& result, size_t pos, regex_sslot_id_t value) { - belated_doubleword(result, pos, value); -} + void belated_sslot_id(std::vector& result, size_t pos, sslot_id_t value) { + belated_doubleword(result, pos, value); + } -void belated_tai(std::vector& result, size_t pos, regex_tai_t value) { - belated_word(result, pos, value); -} + void belated_tai(std::vector& result, size_t pos, tai_t value) { + belated_word(result, pos, value); + } -void belated_near_ptr(std::vector& result, size_t pos, regex_near_ptr_t value) { - belated_quadword(result, pos, value); -} + void belated_near_ptr(std::vector& result, size_t pos, near_ptr_t value) { + belated_quadword(result, pos, value); + } -bookmark_id_t explicit_bookmarks::new_bookmark() { - pile.emplace_back(); - return free_bid++; -} + bookmark_id_t explicit_bookmarks::new_bookmark() { + pile.emplace_back(); + return free_bid++; + } -void explicit_bookmarks::write_unresolved_reference(std::vector &result, bookmark_id_t bm) { - size_t where_to_fill_later = result.size(); - write_near_ptr(result, 0); - pile[bm].positions_of_belated_refs.push_back(where_to_fill_later); -} + void explicit_bookmarks::write_unresolved_reference(std::vector &result, bookmark_id_t bm) { + size_t where_to_fill_later = result.size(); + write_near_ptr(result, 0); + pile[bm].positions_of_belated_refs.push_back(where_to_fill_later); + } -void explicit_bookmarks::land_bookmark(std::vector &result, bookmark_id_t bm) { - assert(!pile[bm].placed_somewhere); - pile[bm].placed_somewhere = true; - pile[bm].actual_position = result.size(); -} + void explicit_bookmarks::land_bookmark(std::vector &result, bookmark_id_t bm) { + assert(!pile[bm].placed_somewhere); + pile[bm].placed_somewhere = true; + pile[bm].actual_position = result.size(); + } -void explicit_bookmarks::finish(std::vector &result) { - for (explicit_bookmark_info& bmi: pile) { - assert(bmi.positions_of_belated_refs.empty() || bmi.placed_somewhere); - if (bmi.placed_somewhere) { - for (size_t ref_to_mine_belate: bmi.positions_of_belated_refs) { - belated_near_ptr(result, ref_to_mine_belate, bmi.actual_position); + void explicit_bookmarks::finish(std::vector &result) { + for (explicit_bookmark_info& bmi: pile) { + assert(bmi.positions_of_belated_refs.empty() || bmi.placed_somewhere); + if (bmi.placed_somewhere) { + for (size_t ref_to_mine_belate: bmi.positions_of_belated_refs) { + belated_near_ptr(result, ref_to_mine_belate, bmi.actual_position); + } } } } -} -bookmark_id_t explicit_bookmarks::new_range_of_bookmarks(size_t n) { - bookmark_id_t offset = free_bid; - free_bid += n; - for (size_t i = 0; i < n; i++) { - pile.emplace_back(); + bookmark_id_t explicit_bookmarks::new_range_of_bookmarks(size_t n) { + bookmark_id_t offset = free_bid; + free_bid += n; + for (size_t i = 0; i < n; i++) { + pile.emplace_back(); + } + return offset; } - return offset; -} -bool explicit_bookmarks::has_landed(bookmark_id_t bm) { - return pile[bm].placed_somewhere; + bool explicit_bookmarks::has_landed(bookmark_id_t bm) { + return pile[bm].placed_somewhere; + } } #undef put_belated_to_res diff --git a/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.h b/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.h index b5a4b96..f10b41b 100644 --- a/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.h +++ b/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.h @@ -4,60 +4,60 @@ #include #include #include +namespace regexis024 { + void write_byte(std::vector& result, uint8_t x); + void write_word(std::vector& result, uint16_t x); + void write_doubleword(std::vector& result, uint32_t x); + void write_quadword(std::vector& result, uint64_t x); -void write_byte(std::vector& result, uint8_t x); -void write_word(std::vector& result, uint16_t x); -void write_doubleword(std::vector& result, uint32_t x); -void write_quadword(std::vector& result, uint64_t x); - -void belated_byte(std::vector& result, size_t pos, uint8_t value); -void belated_word(std::vector& result, size_t pos, uint16_t value); -void belated_doubleword(std::vector& result, size_t pos, uint32_t value); -void belated_quadword(std::vector& result, size_t pos, uint64_t value); + void belated_byte(std::vector& result, size_t pos, uint8_t value); + void belated_word(std::vector& result, size_t pos, uint16_t value); + void belated_doubleword(std::vector& result, size_t pos, uint32_t value); + void belated_quadword(std::vector& result, size_t pos, uint64_t value); -void write_sslot_id(std::vector& result, regex_sslot_id_t x); -void write_tai(std::vector& result, regex_tai_t x); -void write_near_ptr(std::vector& result, regex_near_ptr_t x); + void write_sslot_id(std::vector& result, sslot_id_t x); + void write_tai(std::vector& result, tai_t x); + void write_near_ptr(std::vector& result, near_ptr_t x); -void belated_sslot_id(std::vector& result, size_t pos, regex_sslot_id_t value); -void belated_tai(std::vector& result, size_t pos, regex_tai_t value); -void belated_near_ptr(std::vector& result, size_t pos, regex_near_ptr_t value); + void belated_sslot_id(std::vector& result, size_t pos, sslot_id_t value); + void belated_tai(std::vector& result, size_t pos, tai_t value); + void belated_near_ptr(std::vector& result, size_t pos, near_ptr_t value); -// constexpr uint64_t INSTRUCTION_SZ = REGEX024_BYTECODE_INSTRUCTION_SZ; -// constexpr uint64_t SSLOT_ID_SZ = REGEX024_BYTECODE_SSLOT_ID_SZ; -// constexpr uint64_t TRACK_ARRAY_INDEX_ID_SZ = REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ; -// constexpr uint64_t NEAR_POINTER_SZ = REGEX024_BYTECODE_NEAR_POINTER_SZ; + // constexpr uint64_t INSTRUCTION_SZ = REGEX024_BYTECODE_INSTRUCTION_SZ; + // constexpr uint64_t SSLOT_ID_SZ = REGEX024_BYTECODE_SSLOT_ID_SZ; + // constexpr uint64_t TRACK_ARRAY_INDEX_ID_SZ = REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ; + // constexpr uint64_t NEAR_POINTER_SZ = REGEX024_BYTECODE_NEAR_POINTER_SZ; -typedef size_t bookmark_id_t; + typedef size_t bookmark_id_t; -struct explicit_bookmark_info { - std::vector positions_of_belated_refs; - bool placed_somewhere = false; - size_t actual_position; -}; + struct explicit_bookmark_info { + std::vector positions_of_belated_refs; + bool placed_somewhere = false; + size_t actual_position; + }; -struct explicit_bookmarks { - bookmark_id_t free_bid = 0; - /* For each named explicit bookmark there is an element in PILE */ - std::vector pile; + struct explicit_bookmarks { + bookmark_id_t free_bid = 0; + /* For each named explicit bookmark there is an element in PILE */ + std::vector pile; - bookmark_id_t new_bookmark(); + bookmark_id_t new_bookmark(); - /* bm is the bookmark I refer to. Each bookmark has an id. It is like a name, but fits in 8 bytes */ - void write_unresolved_reference(std::vector& result, bookmark_id_t bm); + /* bm is the bookmark I refer to. Each bookmark has an id. It is like a name, but fits in 8 bytes */ + void write_unresolved_reference(std::vector& result, bookmark_id_t bm); - /* bm is the bookmark I place into program `result` */ - void land_bookmark(std::vector& result, bookmark_id_t bm); + /* bm is the bookmark I place into program `result` */ + void land_bookmark(std::vector& result, bookmark_id_t bm); - /* call it at the very end of bytecode-building */ - void finish(std::vector& result); + /* call it at the very end of bytecode-building */ + void finish(std::vector& result); - /* Returns offset of range of bookmark id's */ - bookmark_id_t new_range_of_bookmarks(size_t n); - - bool has_landed(bookmark_id_t bm); -}; + /* Returns offset of range of bookmark id's */ + bookmark_id_t new_range_of_bookmarks(size_t n); + bool has_landed(bookmark_id_t bm); + }; +} #endif diff --git a/src/libregexis024fa/graph_to_bytecode/writing_commands.cpp b/src/libregexis024fa/graph_to_bytecode/writing_commands.cpp index ffc42e3..b57ff8f 100644 --- a/src/libregexis024fa/graph_to_bytecode/writing_commands.cpp +++ b/src/libregexis024fa/graph_to_bytecode/writing_commands.cpp @@ -2,74 +2,76 @@ #include #include -void cmd_JUMP(std::vector& result, explicit_bookmarks& bookmark_manager, bookmark_id_t dest) { - write_byte(result, regex024_opcodes::JUMP); - bookmark_manager.write_unresolved_reference(result, dest); -} - -constexpr regex024_opcode cmp_EQUAL[4] = {regex024_opcodes::JCEQUAL_B, regex024_opcodes::JCEQUAL_W, - regex024_opcodes::JCEQUAL_DW, regex024_opcodes::JCEQUAL_QW}; -constexpr regex024_opcode cmp_LESS[4] = {regex024_opcodes::JCLESS_B, regex024_opcodes::JCLESS_W, - regex024_opcodes::JCLESS_DW, regex024_opcodes::JCLESS_QW}; -constexpr regex024_opcode cmp_GRTR[4] = {regex024_opcodes::JCGRTR_B, regex024_opcodes::JCGRTR_W, - regex024_opcodes::JCGRTR_DW, regex024_opcodes::JCGRTR_QW}; - - -void cmd_JC(const regex024_opcode cmpT[4], - std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) -{ - if (val <= UINT8_MAX) { - write_byte(result, cmpT[0]); - write_byte(result, static_cast(val)); - } else if (val <= UINT16_MAX) { - write_byte(result, cmpT[1]); - write_word(result, static_cast(val)); - } else if (val <= UINT32_MAX) { - write_byte(result, cmpT[2]); - write_doubleword(result, static_cast(val)); - } else { - write_byte(result, cmpT[3]); - write_quadword(result, val); +namespace regexis024 { + void cmd_JUMP(std::vector& result, explicit_bookmarks& bookmark_manager, bookmark_id_t dest) { + write_byte(result, opcodes::JUMP); + bookmark_manager.write_unresolved_reference(result, dest); } - bookmark_manager.write_unresolved_reference(result, dest); -} + + constexpr opcode_t cmp_EQUAL[4] = {opcodes::JCEQUAL_B, opcodes::JCEQUAL_W, + opcodes::JCEQUAL_DW, opcodes::JCEQUAL_QW}; + constexpr opcode_t cmp_LESS[4] = {opcodes::JCLESS_B, opcodes::JCLESS_W, + opcodes::JCLESS_DW, opcodes::JCLESS_QW}; + constexpr opcode_t cmp_GRTR[4] = {opcodes::JCGRTR_B, opcodes::JCGRTR_W, + opcodes::JCGRTR_DW, opcodes::JCGRTR_QW}; -void cmd_JCEQUAL(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) { - cmd_JC(cmp_EQUAL, result, bookmark_manager, val, dest); -} + void cmd_JC(const opcode_t cmpT[4], + std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) + { + if (val <= UINT8_MAX) { + write_byte(result, cmpT[0]); + write_byte(result, static_cast(val)); + } else if (val <= UINT16_MAX) { + write_byte(result, cmpT[1]); + write_word(result, static_cast(val)); + } else if (val <= UINT32_MAX) { + write_byte(result, cmpT[2]); + write_doubleword(result, static_cast(val)); + } else { + write_byte(result, cmpT[3]); + write_quadword(result, val); + } + bookmark_manager.write_unresolved_reference(result, dest); + } -void cmd_JCLESS(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) { - cmd_JC(cmp_LESS, result, bookmark_manager, val, dest); -} -void cmd_JCGRTR(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) { - cmd_JC(cmp_GRTR, result, bookmark_manager, val, dest); -} + void cmd_JCEQUAL(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) { + cmd_JC(cmp_EQUAL, result, bookmark_manager, val, dest); + } -void cmd_DIE(std::vector &result) { - write_byte(result, regex024_opcodes::DIE); -} + void cmd_JCLESS(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) { + cmd_JC(cmp_LESS, result, bookmark_manager, val, dest); + } -void cmd_MATCH(std::vector &result) { - write_byte(result, regex024_opcodes::MATCH); -} + void cmd_JCGRTR(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) { + cmd_JC(cmp_GRTR, result, bookmark_manager, val, dest); + } -void cmd_READ_first_ns(std::vector& result, size_t slot) { - assert(slot <= UINT32_MAX); - write_byte(result, regex024_opcodes::READ); - write_sslot_id(result, slot); -} + void cmd_DIE(std::vector &result) { + write_byte(result, opcodes::DIE); + } -void cmd_FORK(std::vector &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest) { - assert(slot <= UINT32_MAX); - write_byte(result, regex024_opcodes::FORK); - write_sslot_id(result, slot); - bookmark_manager.write_unresolved_reference(result, dest); -} + void cmd_MATCH(std::vector &result) { + write_byte(result, opcodes::MATCH); + } -void cmd_READ_second_ns(std::vector& result, std::vector& belate_second_read_ns_slot_args) { - write_byte(result, regex024_opcodes::READ); - belate_second_read_ns_slot_args.push_back(result.size()); - write_sslot_id(result, 0); -} + void cmd_READ_first_ns(std::vector& result, size_t slot) { + assert(slot <= UINT32_MAX); + write_byte(result, opcodes::READ); + write_sslot_id(result, slot); + } + + void cmd_FORK(std::vector &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest) { + assert(slot <= UINT32_MAX); + write_byte(result, opcodes::FORK); + write_sslot_id(result, slot); + bookmark_manager.write_unresolved_reference(result, dest); + } + + void cmd_READ_second_ns(std::vector& result, std::vector& belate_second_read_ns_slot_args) { + write_byte(result, opcodes::READ); + belate_second_read_ns_slot_args.push_back(result.size()); + write_sslot_id(result, 0); + } +} \ No newline at end of file diff --git a/src/libregexis024fa/graph_to_bytecode/writing_commands.h b/src/libregexis024fa/graph_to_bytecode/writing_commands.h index 43ed9e1..64efac5 100644 --- a/src/libregexis024fa/graph_to_bytecode/writing_commands.h +++ b/src/libregexis024fa/graph_to_bytecode/writing_commands.h @@ -4,17 +4,19 @@ #include #include -void cmd_JUMP(std::vector& result, explicit_bookmarks& bookmark_manager, bookmark_id_t dest); +namespace regexis024 { + void cmd_JUMP(std::vector& result, explicit_bookmarks& bookmark_manager, bookmark_id_t dest); -void cmd_JCEQUAL(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest); -void cmd_JCLESS(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest); -void cmd_JCGRTR(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest); + void cmd_JCEQUAL(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest); + void cmd_JCLESS(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest); + void cmd_JCGRTR(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest); -void cmd_DIE(std::vector& result); -void cmd_MATCH(std::vector& result); + void cmd_DIE(std::vector& result); + void cmd_MATCH(std::vector& result); -void cmd_READ_first_ns(std::vector& result, size_t slot); -void cmd_READ_second_ns(std::vector& result, std::vector& belate_second_read_ns_slot_args); -void cmd_FORK(std::vector &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest); + void cmd_READ_first_ns(std::vector& result, size_t slot); + void cmd_READ_second_ns(std::vector& result, std::vector& belate_second_read_ns_slot_args); + void cmd_FORK(std::vector &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest); +} #endif diff --git a/src/libregexis024fa/misc_fa_funcs.cpp b/src/libregexis024fa/misc_fa_funcs.cpp index 3b0496e..9b2bcdc 100644 --- a/src/libregexis024fa/misc_fa_funcs.cpp +++ b/src/libregexis024fa/misc_fa_funcs.cpp @@ -3,69 +3,70 @@ #include #include -void reattach_fa_node_edge(FA_Node **old_node_ptr, FA_Node *new_node) { - assert(old_node_ptr); - if (*old_node_ptr){ - assert((**old_node_ptr).refs); - (**old_node_ptr).refs--; +namespace regexis024 { + void reattach_fa_node_edge(FA_Node **old_node_ptr, FA_Node *new_node) { + assert(old_node_ptr); + if (*old_node_ptr){ + assert((**old_node_ptr).refs); + (**old_node_ptr).refs--; + } + if (new_node) + new_node->refs++; + *old_node_ptr = new_node; } - if (new_node) - new_node->refs++; - *old_node_ptr = new_node; -} -/* We basically reattch fa.start to node */ -void yay_new_start(FA_Container &fa, FA_NodePathPart *node) { - assert(node); - node->refs++; - node->nxt_node = fa.start; - fa.start = node; -} + /* We basically reattch fa.start to node */ + void yay_new_start(FA_Container &fa, FA_NodePathPart *node) { + assert(node); + node->refs++; + node->nxt_node = fa.start; + fa.start = node; + } -void add_option_to_fork_node(FA_NodeOfForking *fnode, FA_Node *transition_dest) { - fnode->nxt_options.push_back(transition_dest); - if(transition_dest) - transition_dest->refs++; -} + void add_option_to_fork_node(FA_NodeOfForking *fnode, FA_Node *transition_dest) { + fnode->nxt_options.push_back(transition_dest); + if(transition_dest) + transition_dest->refs++; + } -void reattach_nxt_node(FA_NodePathPart *node, FA_Node *dest) { - reattach_fa_node_edge(&(node->nxt_node), dest); -} + void reattach_nxt_node(FA_NodePathPart *node, FA_Node *dest) { + reattach_fa_node_edge(&(node->nxt_node), dest); + } -// todo: get rid of exitf in the whole project -FA_Node* copy_node_no_container_adjustments(FA_Node& node){ - FA_Node* res; - /* Using implicitly defined copy constructors */ + FA_Node* copy_node_no_container_adjustments(FA_Node& node){ + FA_Node* res; + /* Using implicitly defined copy constructors */ #define typeCase(etype, ctype) case etype: res = new ctype((ctype&)node); break; - switch (node.type) { - typeCase(match, FA_NodeOfMatch) - typeCase(one_char_read, FA_NodeOfOneCharRead) - typeCase(forking, FA_NodeOfForking) - typeCase(look_one_behind, FA_NodeOfLookOneBehind) - typeCase(look_one_ahead, FA_NodeOfLookOneAhead) - typeCase(track_array_mov_imm, FA_NodeOfTrackArrayMovImm) - typeCase(track_array_mov_halfinvariant, FA_NodeOfTrackArrayMovHalfinvariant) - typeCase(det_char_crossroads, FA_NodeOfDetCharCrossroads) - default: - assert(false); - } + switch (node.type) { + typeCase(match, FA_NodeOfMatch) + typeCase(one_char_read, FA_NodeOfOneCharRead) + typeCase(forking, FA_NodeOfForking) + typeCase(look_one_behind, FA_NodeOfLookOneBehind) + typeCase(look_one_ahead, FA_NodeOfLookOneAhead) + typeCase(track_array_mov_imm, FA_NodeOfTrackArrayMovImm) + typeCase(track_array_mov_halfinvariant, FA_NodeOfTrackArrayMovHalfinvariant) + typeCase(det_char_crossroads, FA_NodeOfDetCharCrossroads) + default: + assert(false); + } #undef typeCase - res->refs = 0; - res->search_mark = -1; - return res; -} + res->refs = 0; + res->search_mark = -1; + return res; + } -/* In case when transferring the ownership of this new raw pointer has failed, node is destroyed, exception is thrown */ -FA_Node *copy_fa_node(FA_Node& node, FA_Container &fa) { - FA_Node* res = copy_node_no_container_adjustments(node); - /* Can invalidate ponter res (in which case it also throws exeption, so none of this matters in the end) */ - fa.registerNew(res); - res->reAdd_references(); - return res; -} + /* In case when transferring the ownership of this new raw pointer has failed, node is destroyed, exception is thrown */ + FA_Node *copy_fa_node(FA_Node& node, FA_Container &fa) { + FA_Node* res = copy_node_no_container_adjustments(node); + /* Can invalidate ponter res (in which case it also throws exeption, so none of this matters in the end) */ + fa.registerNew(res); + res->reAdd_references(); + return res; + } -FA_Node *copy_fa_node_to_another_fa(FA_Node& node, FA_Container &resultFa) { - FA_Node* res = copy_node_no_container_adjustments(node); - resultFa.registerNew(res); - return res; -} + FA_Node *copy_fa_node_to_another_fa(FA_Node& node, FA_Container &resultFa) { + FA_Node* res = copy_node_no_container_adjustments(node); + resultFa.registerNew(res); + return res; + } +} \ No newline at end of file diff --git a/src/libregexis024fa/misc_fa_funcs.h b/src/libregexis024fa/misc_fa_funcs.h index 7510dd6..39d4ec2 100644 --- a/src/libregexis024fa/misc_fa_funcs.h +++ b/src/libregexis024fa/misc_fa_funcs.h @@ -4,14 +4,16 @@ #include "finite_automaton.h" #include "fa_first_stage_fix.h" -FA_Node* copy_fa_node(FA_Node& node, FA_Container& fa); -void yay_new_start(FA_Container& fa, FA_NodePathPart* node); -void reattach_fa_node_edge(FA_Node** old_node_ptr, FA_Node* new_node); -void add_option_to_fork_node(FA_NodeOfForking* fnode, FA_Node* transition_dest); -void reattach_nxt_node(FA_NodePathPart* node, FA_Node* dest); +namespace regexis024 { + FA_Node* copy_fa_node(FA_Node& node, FA_Container& fa); + void yay_new_start(FA_Container& fa, FA_NodePathPart* node); + void reattach_fa_node_edge(FA_Node** old_node_ptr, FA_Node* new_node); + void add_option_to_fork_node(FA_NodeOfForking* fnode, FA_Node* transition_dest); + void reattach_nxt_node(FA_NodePathPart* node, FA_Node* dest); -/* This is a one weird operation. New node in resultFa will still point to nodes in sourceFa, - * without increasing refcount of those nodes. YOU HAVE TO FIX IT ASAP */ -FA_Node* copy_fa_node_to_another_fa(FA_Node& node, FA_Container& resultFa); + /* This is a one weird operation. New node in resultFa will still point to nodes in sourceFa, + * without increasing refcount of those nodes. YOU HAVE TO FIX IT ASAP */ + FA_Node* copy_fa_node_to_another_fa(FA_Node& node, FA_Container& resultFa); +} #endif //LIBREGEXIS024_MISC_FA_FUNCS_H diff --git a/src/libregexis024fa/selarr_priority_table.cpp b/src/libregexis024fa/selarr_priority_table.cpp index 21f07a9..3ea6544 100644 --- a/src/libregexis024fa/selarr_priority_table.cpp +++ b/src/libregexis024fa/selarr_priority_table.cpp @@ -1,15 +1,16 @@ #include #include +namespace regexis024 { + bool RegexPriorityTableAction_Pos::isForRange() const { + return second >= 0; + } -bool RegexPriorityTableAction_Pos::isForRange() const { - return second >= 0; + RegexPriorityTableAction_Pos::RegexPriorityTableAction_Pos(int first, int second, tracking_var_type_t type): + first(first),second(second), type(type) {} + // + + RegexPriorityTableAction::RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type_t type): + minimize(minimize), pos(first, second, type) {} + // } - -RegexPriorityTableAction_Pos::RegexPriorityTableAction_Pos(int first, int second, tracking_var_type type): - first(first),second(second), type(type) {} -// - -RegexPriorityTableAction::RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type type): - minimize(minimize), pos(first, second, type) {} -// diff --git a/src/libregexis024fa/selarr_priority_table.h b/src/libregexis024fa/selarr_priority_table.h index bf1de74..5675d5b 100644 --- a/src/libregexis024fa/selarr_priority_table.h +++ b/src/libregexis024fa/selarr_priority_table.h @@ -5,22 +5,24 @@ #include #include -struct RegexPriorityTableAction_Pos{ - /* first and second are indexes in selarr (but second can be -1 if it is unused) */ - int first; - int second; - tracking_var_type type; - bool isForRange() const; +namespace regexis024 { + struct RegexPriorityTableAction_Pos{ + /* first and second are indexes in selarr (but second can be -1 if it is unused) */ + int first; + int second; + tracking_var_type_t type; + bool isForRange() const; - RegexPriorityTableAction_Pos(int first, int second, tracking_var_type type); -}; + RegexPriorityTableAction_Pos(int first, int second, tracking_var_type_t type); + }; -struct RegexPriorityTableAction{ - bool minimize; - RegexPriorityTableAction_Pos pos; - RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type type); -}; + struct RegexPriorityTableAction{ + bool minimize; + RegexPriorityTableAction_Pos pos; + RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type_t type); + }; -typedef std::vector RegexPriorityTable; + typedef std::vector RegexPriorityTable; +} #endif //LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H diff --git a/src/libregexis024fa/tracking_fa_nodes.cpp b/src/libregexis024fa/tracking_fa_nodes.cpp index 12a92d0..3f0286f 100644 --- a/src/libregexis024fa/tracking_fa_nodes.cpp +++ b/src/libregexis024fa/tracking_fa_nodes.cpp @@ -1,53 +1,48 @@ #include #include -bool isImmMovOpcode(regex024_opcode inst) { - return inst == regex024_opcodes::MOV_COLARR_IMM || inst == regex024_opcodes::MOV_SELARR_IMM; -} +namespace regexis024 { + bool isImmMovOpcode(opcode_t inst) { + return inst == opcodes::MOV_COLARR_IMM || inst == opcodes::MOV_SELARR_IMM; + } -bool isCurPosMovOpcode(regex024_opcode inst) { - return inst == regex024_opcodes::MOV_COLARR_BTPOS || inst == regex024_opcodes::MOV_SELARR_CHPOS; -} + bool isCurPosMovOpcode(opcode_t inst) { + return inst == opcodes::MOV_COLARR_BTPOS || inst == opcodes::MOV_SELARR_CHPOS; + } -bool isColarrOpcode(regex024_opcode inst) { - return inst == regex024_opcodes::MOV_COLARR_IMM || inst == regex024_opcodes::MOV_COLARR_BTPOS; -} + bool isColarrOpcode(opcode_t inst) { + return inst == opcodes::MOV_COLARR_IMM || inst == opcodes::MOV_COLARR_BTPOS; + } -bool isSelarrOpcode(regex024_opcode inst) { - return inst == regex024_opcodes::MOV_SELARR_IMM || inst == regex024_opcodes::MOV_SELARR_CHPOS; -} + bool isSelarrOpcode(opcode_t inst) { + return inst == opcodes::MOV_SELARR_IMM || inst == opcodes::MOV_SELARR_CHPOS; + } -bool isTrackingFaNode(const FA_Node *n) { - return n->type == track_array_mov_imm || n->type == track_array_mov_halfinvariant; -} + bool isTrackingFaNode(const FA_Node *n) { + return n->type == track_array_mov_imm || n->type == track_array_mov_halfinvariant; + } -TrackingOperationInFa::TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key, uint64_t imm_value) - : opcode(opcode), key(key), immValue(imm_value) {} + std::string TrackingOperationInFa::toString() const { + switch (opcode){ + case opcodes::MOV_COLARR_IMM: + return "colarr[" + std::to_string(key) + "] := " + std::to_string(immValue); + case opcodes::MOV_SELARR_IMM: + return "selarr[" + std::to_string(key) + "] := " + std::to_string(immValue); + case opcodes::MOV_COLARR_BTPOS: + return "colarr[" + std::to_string(key) + "] := cur byte position"; + case opcodes::MOV_SELARR_CHPOS: + return "selarr[" + std::to_string(key) + "] := cur char position"; + default: + return "wrong collection operation"; + } + } -TrackingOperationInFa::TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key) - : opcode(opcode), key(key) {} + FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa) { + if (isImmMovOpcode(op.opcode)) { + return fa.makeTrackArrayMovImm(op.opcode, op.key, op.immValue); + } + assert(isCurPosMovOpcode(op.opcode)); + return fa.makeTrackArrayMovHalfinvariant(op.opcode, op.key); -std::string TrackingOperationInFa::toString() const { - switch (opcode){ - case regex024_opcodes::MOV_COLARR_IMM: - return "colarr[" + std::to_string(key) + "] := " + std::to_string(immValue); - case regex024_opcodes::MOV_SELARR_IMM: - return "selarr[" + std::to_string(key) + "] := " + std::to_string(immValue); - case regex024_opcodes::MOV_COLARR_BTPOS: - return "colarr[" + std::to_string(key) + "] := cur byte position"; - case regex024_opcodes::MOV_SELARR_CHPOS: - return "selarr[" + std::to_string(key) + "] := cur char position"; - default: - return "wrong collection operation"; } } - -FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa) { - if (isImmMovOpcode(op.opcode)) { - return fa.makeTrackArrayMovImm(op.opcode, op.key, op.immValue); - } - assert(isCurPosMovOpcode(op.opcode)); - return fa.makeTrackArrayMovHalfinvariant(op.opcode, op.key); - -} - diff --git a/src/libregexis024fa/tracking_fa_nodes.h b/src/libregexis024fa/tracking_fa_nodes.h index 618869a..93043cd 100644 --- a/src/libregexis024fa/tracking_fa_nodes.h +++ b/src/libregexis024fa/tracking_fa_nodes.h @@ -5,27 +5,24 @@ #include #include -bool isImmMovOpcode(regex024_opcode inst); -bool isCurPosMovOpcode(regex024_opcode inst); -bool isColarrOpcode(regex024_opcode inst); -bool isSelarrOpcode(regex024_opcode inst); +namespace regexis024 { + bool isImmMovOpcode(opcode_t inst); + bool isCurPosMovOpcode(opcode_t inst); + bool isColarrOpcode(opcode_t inst); + bool isSelarrOpcode(opcode_t inst); -bool isTrackingFaNode(const FA_Node* n); + bool isTrackingFaNode(const FA_Node* n); -struct TrackingOperationInFa { - regex024_opcode opcode; - regex_tai_t key; - /* Not needed for halfinvariant operations */ - uint64_t immValue; + struct TrackingOperationInFa { + opcode_t opcode; + tai_t key; + /* Not needed for halfinvariant operations */ + uint64_t immValue; - TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key, uint64_t imm_value); - - TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key); - - std::string toString() const; -}; - -FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa); + std::string toString() const; + }; + FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa); +} #endif diff --git a/src/libregexis024fa/tracking_variables.h b/src/libregexis024fa/tracking_variables.h index b80f4ec..a882adb 100644 --- a/src/libregexis024fa/tracking_variables.h +++ b/src/libregexis024fa/tracking_variables.h @@ -1,14 +1,16 @@ #ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_TRACKING_VARIABLES_H #define LIBREGEXIS024_SRC_LIBREGEXIS024FA_TRACKING_VARIABLES_H -namespace tracking_var_types { - enum tracking_var_type_I { - range, - dot_cur_pos, - dot_immediate - }; +namespace regexis024 { + namespace tracking_var_types { + enum tracking_var_type_I { + range, + dot_cur_pos, + dot_immediate + }; + } + + typedef tracking_var_types::tracking_var_type_I tracking_var_type_t; } -typedef tracking_var_types::tracking_var_type_I tracking_var_type; - #endif diff --git a/src/libregexis024sol/backslash_expression.cpp b/src/libregexis024sol/backslash_expression.cpp index 0f97d2b..f265e3e 100644 --- a/src/libregexis024sol/backslash_expression.cpp +++ b/src/libregexis024sol/backslash_expression.cpp @@ -2,61 +2,63 @@ #include #include -uint32_t read_hex(REGEX_IS024_MeaningContext& ctx, int sz){ - uint32_t res = 0; - for (int i = 0; i < sz; i++){ - int32_t ch = peep(ctx); - if ('0' <= ch && ch <= '9') - res = ((res << 4) | ((uint32_t)ch - '0')); - else if ('a' <= ch && ch <= 'z') - res = ((res << 4) | ((uint32_t)ch - 'a' + 10)); - else if ('A' <= ch && ch <= 'Z') - res = ((res << 4) | ((uint32_t)ch - 'A' + 10)); - else{ - report(ctx, "escape backslash expression: bad unicode code"); - return 0; - } - readChar(ctx); - } - return res; -} - -void unicode_in_bs_case(REGEX_IS024_MeaningContext &ctx, bool &ret_is_multicode, codeset_t &ret_set, int sz){ - ret_is_multicode = false; - readChar(ctx); - uint32_t hc = read_hex(ctx, sz); // Might create an error - ret_set = codeset_of_one_char(hc); -} - -void -backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, - bool &ret_is_multicode, codeset_t &ret_set) -{ - int32_t leader = peep(ctx); - if (ctx.error) - return; -#define block(l, b, E) case l: ret_is_multicode = b; ret_set = E; readChar(ctx); break; - switch (leader) { - block('s', false, codeset_of_one_char(U' ')) - block('t', false, codeset_of_one_char(U'\t')) - block('n', false, codeset_of_one_char(U'\n')) - block('r', false, codeset_of_one_char(U'\r')) - block('e', true, cc.spaces); - block('E', true, invert_set(cc.spaces)) - block('w', true, cc.word_constituents); - block('W', true, invert_set(cc.word_constituents)); - case 'u': - unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 4); - break; - case 'U': - unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 8); - break; - default: - if (leader >= 0){ - ret_is_multicode = false; - ret_set = codeset_of_one_char(leader); - } else { - report(ctx, "backslash in the wrong place"); +namespace regexis024 { + uint32_t read_hex(REGEX_IS024_MeaningContext& ctx, int sz){ + uint32_t res = 0; + for (int i = 0; i < sz; i++){ + int32_t ch = peep(ctx); + if ('0' <= ch && ch <= '9') + res = ((res << 4) | ((uint32_t)ch - '0')); + else if ('a' <= ch && ch <= 'z') + res = ((res << 4) | ((uint32_t)ch - 'a' + 10)); + else if ('A' <= ch && ch <= 'Z') + res = ((res << 4) | ((uint32_t)ch - 'A' + 10)); + else{ + report(ctx, "escape backslash expression: bad unicode code"); + return 0; } + readChar(ctx); + } + return res; + } + + void unicode_in_bs_case(REGEX_IS024_MeaningContext &ctx, bool &ret_is_multicode, codeset_t &ret_set, int sz){ + ret_is_multicode = false; + readChar(ctx); + uint32_t hc = read_hex(ctx, sz); // Might create an error + ret_set = codeset_of_one_char(hc); + } + + void + backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, + bool &ret_is_multicode, codeset_t &ret_set) + { + int32_t leader = peep(ctx); + if (ctx.error) + return; +#define block(l, b, E) case l: ret_is_multicode = b; ret_set = E; readChar(ctx); break; + switch (leader) { + block('s', false, codeset_of_one_char(U' ')) + block('t', false, codeset_of_one_char(U'\t')) + block('n', false, codeset_of_one_char(U'\n')) + block('r', false, codeset_of_one_char(U'\r')) + block('e', true, cc.spaces); + block('E', true, invert_set(cc.spaces)) + block('w', true, cc.word_constituents); + block('W', true, invert_set(cc.word_constituents)); + case 'u': + unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 4); + break; + case 'U': + unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 8); + break; + default: + if (leader >= 0){ + ret_is_multicode = false; + ret_set = codeset_of_one_char(leader); + } else { + report(ctx, "backslash in the wrong place"); + } + } } } \ No newline at end of file diff --git a/src/libregexis024sol/command_expression.cpp b/src/libregexis024sol/command_expression.cpp index fb61eba..3afdfe6 100644 --- a/src/libregexis024sol/command_expression.cpp +++ b/src/libregexis024sol/command_expression.cpp @@ -5,139 +5,141 @@ #include #include -struct ParseCall{ - virtual ~ParseCall() = default; - virtual std::unique_ptr afterReceive(REGEX_IS024_MeaningContext& ctx) { assert(false); } - virtual std::unique_ptr firstTime(REGEX_IS024_MeaningContext& ctx) { assert(false); } -}; +namespace regexis024 { + struct ParseCall{ + virtual ~ParseCall() = default; + virtual std::unique_ptr afterReceive(REGEX_IS024_MeaningContext& ctx) { assert(false); } + virtual std::unique_ptr firstTime(REGEX_IS024_MeaningContext& ctx) { assert(false); } + }; -struct Top_ParseCall: public ParseCall{ - Command& res; - explicit Top_ParseCall(Command &res) : res(res) {} - std::unique_ptr firstTime(REGEX_IS024_MeaningContext &ctx) override; - std::unique_ptr afterReceive(REGEX_IS024_MeaningContext &ctx) override; -}; + struct Top_ParseCall: public ParseCall{ + Command& res; + explicit Top_ParseCall(Command &res) : res(res) {} + std::unique_ptr firstTime(REGEX_IS024_MeaningContext &ctx) override; + std::unique_ptr afterReceive(REGEX_IS024_MeaningContext &ctx) override; + }; -struct Bracker_ParseCall: public ParseCall{ - std::vector& res; - bool closingBraceEnded = false; - explicit Bracker_ParseCall(std::vector &res) : res(res) {} - std::unique_ptr argReadProc(REGEX_IS024_MeaningContext& ctx); - std::unique_ptr firstTime(REGEX_IS024_MeaningContext &ctx) override; - std::unique_ptr afterReceive(REGEX_IS024_MeaningContext &ctx) override; -}; + struct Bracker_ParseCall: public ParseCall{ + std::vector& res; + bool closingBraceEnded = false; + explicit Bracker_ParseCall(std::vector &res) : res(res) {} + std::unique_ptr argReadProc(REGEX_IS024_MeaningContext& ctx); + std::unique_ptr firstTime(REGEX_IS024_MeaningContext &ctx) override; + std::unique_ptr afterReceive(REGEX_IS024_MeaningContext &ctx) override; + }; #define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0) #define call_THROW(str) do { report(ctx, "command expression: " str); return NULL; } while (0) -std::unique_ptr Top_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) { - assert(readChar(ctx) == U'!'); - int32_t ch = peep(ctx); call_ERROR_CHECK; - if (ch == U'~'){ - /* I assume during construction I received reference to newly initialized struct */ - res.tilda = true; - return NULL; - } - res.name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK; - if (res.name.empty()) - call_THROW("top lvl: no command name specified"); - ch = peep(ctx); call_ERROR_CHECK; - if (ch == U';'){ - readChar(ctx); - return NULL; - } - if (ch == U'{'){ - return std::make_unique(res.arguments); - } - call_THROW("top lvl: command call should be ended with ';' or '{...}'"); -} - -std::unique_ptr Top_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) { - return NULL; -} - -std::unique_ptr Bracker_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) { - assert(readChar(ctx) == U'{'); - return argReadProc(ctx); -} - -std::unique_ptr Bracker_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) { - closingBraceEnded = true; - return argReadProc(ctx); -} - -std::unique_ptr Bracker_ParseCall::argReadProc(REGEX_IS024_MeaningContext &ctx) { - repeat: - int32_t ch = peep(ctx); call_ERROR_CHECK; - if (ch == U';'){ - res.emplace_back(); - readChar(ctx); - closingBraceEnded = false; - goto repeat; - } else if (ch == U'}'){ - readChar(ctx); - if (!closingBraceEnded){ - res.emplace_back(); + std::unique_ptr Top_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) { + assert(readChar(ctx) == U'!'); + int32_t ch = peep(ctx); call_ERROR_CHECK; + if (ch == U'~'){ + /* I assume during construction I received reference to newly initialized struct */ + res.tilda = true; + return NULL; } - return NULL; - } else if (is_REGEX024_nameConstituent(ch)){ - res.emplace_back(); - res.back().is_empty = false; - res.back().name = tryRead_REGEX024_name(ctx); - int32_t eCh = peep(ctx); call_ERROR_CHECK; - if (eCh == U';'){ - readChar(ctx); - closingBraceEnded = false; - goto repeat; - } else if (eCh == U'{'){ - return std::make_unique(res.back().arguments); - } else if (eCh == U'}'){ + res.name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK; + if (res.name.empty()) + call_THROW("top lvl: no command name specified"); + ch = peep(ctx); call_ERROR_CHECK; + if (ch == U';'){ readChar(ctx); return NULL; } - call_THROW("brace lvl: argument ends with ';' or {...}"); - } - call_THROW("brace lvl: argument starts with ';' or it's name"); -} - -Command command_expr_parse(REGEX_IS024_MeaningContext &ctx) { - std::vector> callStack; - Command res; - callStack.push_back(std::make_unique(res)); - bool first_time = true; - while (!callStack.empty()){ - if (ctx.error) - return {}; - auto nxt = first_time ? callStack.back()->firstTime(ctx) : callStack.back()->afterReceive(ctx); - if (nxt){ - callStack.push_back(std::move(nxt)); - first_time = true; - } else { - callStack.pop_back(); - first_time = false; + if (ch == U'{'){ + return std::make_unique(res.arguments); } + call_THROW("top lvl: command call should be ended with ';' or '{...}'"); } - return res; -} -const char* commands_for_codesets[] = {"word", "space", "digit", "variable", "any", "A", NULL}; + std::unique_ptr Top_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) { + return NULL; + } -bool is_command_for_charset(const Command &cmd) { - return !cmd.tilda && cmd.arguments.empty() && is_string_in_stringset(cmd.name.c_str(), commands_for_codesets); -} + std::unique_ptr Bracker_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) { + assert(readChar(ctx) == U'{'); + return argReadProc(ctx); + } -void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command &cmd, codeset_t& ret) -{ - if (cmd.name == "word") - ret = cc.word_constituents; - else if (cmd.name == "space") - ret = cc.spaces; - else if (cmd.name == "digit") - ret = cc.digits; - else if (cmd.name == "variable") - ret = cc.variable_constituents; - else if (cmd.name == "any" || cmd.name == "A") - ret = codeset_of_all; - else - assert(false); + std::unique_ptr Bracker_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) { + closingBraceEnded = true; + return argReadProc(ctx); + } + + std::unique_ptr Bracker_ParseCall::argReadProc(REGEX_IS024_MeaningContext &ctx) { + repeat: + int32_t ch = peep(ctx); call_ERROR_CHECK; + if (ch == U';'){ + res.emplace_back(); + readChar(ctx); + closingBraceEnded = false; + goto repeat; + } else if (ch == U'}'){ + readChar(ctx); + if (!closingBraceEnded){ + res.emplace_back(); + } + return NULL; + } else if (is_REGEX024_nameConstituent(ch)){ + res.emplace_back(); + res.back().is_empty = false; + res.back().name = tryRead_REGEX024_name(ctx); + int32_t eCh = peep(ctx); call_ERROR_CHECK; + if (eCh == U';'){ + readChar(ctx); + closingBraceEnded = false; + goto repeat; + } else if (eCh == U'{'){ + return std::make_unique(res.back().arguments); + } else if (eCh == U'}'){ + readChar(ctx); + return NULL; + } + call_THROW("brace lvl: argument ends with ';' or {...}"); + } + call_THROW("brace lvl: argument starts with ';' or it's name"); + } + + Command command_expr_parse(REGEX_IS024_MeaningContext &ctx) { + std::vector> callStack; + Command res; + callStack.push_back(std::make_unique(res)); + bool first_time = true; + while (!callStack.empty()){ + if (ctx.error) + return {}; + auto nxt = first_time ? callStack.back()->firstTime(ctx) : callStack.back()->afterReceive(ctx); + if (nxt){ + callStack.push_back(std::move(nxt)); + first_time = true; + } else { + callStack.pop_back(); + first_time = false; + } + } + return res; + } + + const char* commands_for_codesets[] = {"word", "space", "digit", "variable", "any", "A", NULL}; + + bool is_command_for_charset(const Command &cmd) { + return !cmd.tilda && cmd.arguments.empty() && is_string_in_stringset(cmd.name.c_str(), commands_for_codesets); + } + + void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command &cmd, codeset_t& ret) + { + if (cmd.name == "word") + ret = cc.word_constituents; + else if (cmd.name == "space") + ret = cc.spaces; + else if (cmd.name == "digit") + ret = cc.digits; + else if (cmd.name == "variable") + ret = cc.variable_constituents; + else if (cmd.name == "any" || cmd.name == "A") + ret = codeset_of_all; + else + assert(false); + } } diff --git a/src/libregexis024sol/common_codesets.cpp b/src/libregexis024sol/common_codesets.cpp index 791c6c7..0ac20af 100644 --- a/src/libregexis024sol/common_codesets.cpp +++ b/src/libregexis024sol/common_codesets.cpp @@ -1,13 +1,15 @@ #include -CommonCodesets::CommonCodesets() { - spaces = set_add_char(spaces, U'\n'); - spaces = set_add_char(spaces, U' '); - spaces = set_add_char(spaces, U'\t'); - spaces = set_add_char(spaces, U'\r'); - word_constituents = set_add_range(word_constituents, U'a', U'z'); - word_constituents = set_add_range(word_constituents, U'A', U'Z'); - digits = codeset_t({{'0', '9'}}); - variable_constituents = set_add_char(word_constituents, U'-'); - variable_constituents = merge_sets(variable_constituents, digits); +namespace regexis024 { + CommonCodesets::CommonCodesets() { + spaces = set_add_char(spaces, U'\n'); + spaces = set_add_char(spaces, U' '); + spaces = set_add_char(spaces, U'\t'); + spaces = set_add_char(spaces, U'\r'); + word_constituents = set_add_range(word_constituents, U'a', U'z'); + word_constituents = set_add_range(word_constituents, U'A', U'Z'); + digits = codeset_t({{'0', '9'}}); + variable_constituents = set_add_char(word_constituents, U'-'); + variable_constituents = merge_sets(variable_constituents, digits); + } } diff --git a/src/libregexis024sol/common_codesets.h b/src/libregexis024sol/common_codesets.h index f2d9ee6..4d2b370 100644 --- a/src/libregexis024sol/common_codesets.h +++ b/src/libregexis024sol/common_codesets.h @@ -3,12 +3,14 @@ #include -struct CommonCodesets { - codeset_t spaces; - codeset_t word_constituents; - codeset_t digits; - codeset_t variable_constituents; - CommonCodesets(); -}; +namespace regexis024 { + struct CommonCodesets { + codeset_t spaces; + codeset_t word_constituents; + codeset_t digits; + codeset_t variable_constituents; + CommonCodesets(); + }; +} #endif diff --git a/src/libregexis024sol/expr_compiler.cpp b/src/libregexis024sol/expr_compiler.cpp index 382154d..25ea0f6 100644 --- a/src/libregexis024sol/expr_compiler.cpp +++ b/src/libregexis024sol/expr_compiler.cpp @@ -23,258 +23,260 @@ #define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0) #define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0) -/* ****************************** Top */ +namespace regexis024 { + /* ****************************** Top */ -const char* dfa_arg_aliases_condone[] = {"forgive", "condone", "okay", "optional", "nonimportant", "ifpossible", NULL}; -const char* dfa_arg_aliases_acerbic[] = {"acerbic", "angry", "pedantic", "nofork", "pure", "important", "fierce", NULL}; + const char* dfa_arg_aliases_condone[] = {"forgive", "condone", "okay", "optional", "nonimportant", "ifpossible", NULL}; + const char* dfa_arg_aliases_acerbic[] = {"acerbic", "angry", "pedantic", "nofork", "pure", "important", "fierce", NULL}; -void dfa_command_processing(REGEX_IS024_MeaningContext &ctx, ParsingContext& pctx, const Command& cmdBuf){ - if (pctx.dfa_cmd_activated){ - report(ctx, "repeating !dfa command"); - return; - } - pctx.dfa_cmd_activated = true; - if (cmdBuf.arguments.empty()) - return; - if (cmdBuf.arguments.size() == 1 && cmdBuf.arguments[0].arguments.empty()){ - const std::string& arg_name = cmdBuf.arguments[0].name; - if (is_string_in_stringset(arg_name.c_str(), dfa_arg_aliases_acerbic)) { - pctx.dfa_cmd_unforgiving = true; + void dfa_command_processing(REGEX_IS024_MeaningContext &ctx, ParsingContext& pctx, const Command& cmdBuf){ + if (pctx.dfa_cmd_activated){ + report(ctx, "repeating !dfa command"); return; } - if (is_string_in_stringset(arg_name.c_str(), dfa_arg_aliases_condone)) { - pctx.dfa_cmd_nonimportant = true; + pctx.dfa_cmd_activated = true; + if (cmdBuf.arguments.empty()) return; - } - } - report(ctx, "wrong arguments in !dfa command"); -} - -void select_command_processing(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, const Command& cmdBuf){ - if (pctx.select_cmd_encountered) - aux_THROW("repeating !select command"); - pctx.select_cmd_encountered = true; - for (const CommandArgument& arg: cmdBuf.arguments){ - if (arg.is_empty) - aux_THROW("wrong arguments in !select command"); - if (ctx.ktr.track_names.count(arg.name) != 0) - aux_THROW("repeated names in !select command"); - int64_t namedThingId = static_cast(ctx.ktr.track_names.size()); - ctx.ktr.track_names.insert({arg.name, namedThingId}); - ctx.ktr.retrieval_info.emplace_back(); - ctx.ktr.retrieval_info.back().stored_in_sa = true; - ctx.ktr.retrieval_info.back().stored_in_ca = false; - bool mm = false, coll = false; - for (const CommandArgument& argarg: arg.arguments){ -#define mm_shenanigans if (mm) {aux_THROW("bad argument to !select command");} mm = true; - if (argarg.name == "ca" || argarg.name == "col") { - if (coll) - aux_THROW("bad argument to !select command"); - coll = true; - ctx.ktr.retrieval_info.back().stored_in_ca = true; - } else if (argarg.name == "min") { - mm_shenanigans - ctx.ktr.retrieval_info.back().used_in_sifting = true; - ctx.ktr.retrieval_info.back().minimizing = true; - } else if (argarg.name == "max"){ - mm_shenanigans - ctx.ktr.retrieval_info.back().used_in_sifting = true; - } else if (argarg.name == "ign") { - mm_shenanigans - } else { - aux_THROW("wrong parameter for prioritized parameter in !select command"); + if (cmdBuf.arguments.size() == 1 && cmdBuf.arguments[0].arguments.empty()){ + const std::string& arg_name = cmdBuf.arguments[0].name; + if (is_string_in_stringset(arg_name.c_str(), dfa_arg_aliases_acerbic)) { + pctx.dfa_cmd_unforgiving = true; + return; } + if (is_string_in_stringset(arg_name.c_str(), dfa_arg_aliases_condone)) { + pctx.dfa_cmd_nonimportant = true; + return; + } + } + report(ctx, "wrong arguments in !dfa command"); + } + + void select_command_processing(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, const Command& cmdBuf){ + if (pctx.select_cmd_encountered) + aux_THROW("repeating !select command"); + pctx.select_cmd_encountered = true; + for (const CommandArgument& arg: cmdBuf.arguments){ + if (arg.is_empty) + aux_THROW("wrong arguments in !select command"); + if (ctx.ktr.track_names.count(arg.name) != 0) + aux_THROW("repeated names in !select command"); + int64_t namedThingId = static_cast(ctx.ktr.track_names.size()); + ctx.ktr.track_names.insert({arg.name, namedThingId}); + ctx.ktr.retrieval_info.emplace_back(); + ctx.ktr.retrieval_info.back().stored_in_sa = true; + ctx.ktr.retrieval_info.back().stored_in_ca = false; + bool mm = false, coll = false; + for (const CommandArgument& argarg: arg.arguments){ +#define mm_shenanigans if (mm) {aux_THROW("bad argument to !select command");} mm = true; + if (argarg.name == "ca" || argarg.name == "col") { + if (coll) + aux_THROW("bad argument to !select command"); + coll = true; + ctx.ktr.retrieval_info.back().stored_in_ca = true; + } else if (argarg.name == "min") { + mm_shenanigans + ctx.ktr.retrieval_info.back().used_in_sifting = true; + ctx.ktr.retrieval_info.back().minimizing = true; + } else if (argarg.name == "max"){ + mm_shenanigans + ctx.ktr.retrieval_info.back().used_in_sifting = true; + } else if (argarg.name == "ign") { + mm_shenanigans + } else { + aux_THROW("wrong parameter for prioritized parameter in !select command"); + } #undef mm_shenanigans - } - pctx.is_inside_of_these_sa_subexpressions.assign(ctx.ktr.retrieval_info.size(), false); - /* Other info will be filled once a tracking-unit with such name will be actually found in regex */ - } -} - -void jump_into_madness(ctx_t& ctx, ParsingContext& pctx, FA_Container &fa, int hn){ - while (true){ - int32_t pch = peep(ctx); aux_ERROR_CHECK; - if (pch != U'!'){ - return; - } - size_t before_it = ctx.pos; - Command cmd = command_expr_parse(ctx); aux_ERROR_CHECK; - if (cmd.tilda){ - ctx.have_comment_tail = true; - ctx.comment_tail_start = ctx.pos; - ctx.pos = ctx.input_size; - } else if (is_header_dfa_cmd(cmd)){ - dfa_command_processing(ctx, pctx, cmd); - } else if (is_header_select_cmd(cmd)){ - if (hn != 1) - aux_THROW("!select command at the wrong place"); - select_command_processing(ctx, pctx, cmd); - } else { - assert(!is_header_cmd(cmd)); - ctx.pos = before_it; - break; + } + pctx.is_inside_of_these_sa_subexpressions.assign(ctx.ktr.retrieval_info.size(), false); + /* Other info will be filled once a tracking-unit with such name will be actually found in regex */ } } -} -chekushka TopLvl_ParseCall::firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) { - result.assertDefault(); - jump_into_madness(ctx, pctx, fa, 1); - if (ctx.have_comment_tail) + void jump_into_madness(ctx_t& ctx, ParsingContext& pctx, FA_Container &fa, int hn){ + while (true){ + int32_t pch = peep(ctx); aux_ERROR_CHECK; + if (pch != U'!'){ + return; + } + size_t before_it = ctx.pos; + Command cmd = command_expr_parse(ctx); aux_ERROR_CHECK; + if (cmd.tilda){ + ctx.have_comment_tail = true; + ctx.comment_tail_start = ctx.pos; + ctx.pos = ctx.input_size; + } else if (is_header_dfa_cmd(cmd)){ + dfa_command_processing(ctx, pctx, cmd); + } else if (is_header_select_cmd(cmd)){ + if (hn != 1) + aux_THROW("!select command at the wrong place"); + select_command_processing(ctx, pctx, cmd); + } else { + assert(!is_header_cmd(cmd)); + ctx.pos = before_it; + break; + } + } + } + + chekushka TopLvl_ParseCall::firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) { + result.assertDefault(); + jump_into_madness(ctx, pctx, fa, 1); + if (ctx.have_comment_tail) + return NULL; + return std::make_unique(result); + } + + chekushka TopLvl_ParseCall::afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) { + jump_into_madness(ctx, pctx, fa, 2); + if (!isEnd(ctx)) + call_THROW("top lvl: EOF expected"); return NULL; - return std::make_unique(result); -} - -chekushka TopLvl_ParseCall::afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) { - jump_into_madness(ctx, pctx, fa, 2); - if (!isEnd(ctx)) - call_THROW("top lvl: EOF expected"); - return NULL; -} - -/* ********************************* Bracket */ - -chekushka BracketLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { - result.assertDefault(); - assert(readChar(ctx) == U'('); - /* sequence lvl already took care about resolving name and configuring SubtrackingNameInfo */ - if (namedSubexpressionId >= 0){ - assert(ctx.ktr.retrieval_info[namedSubexpressionId].type == tracking_var_types::range); - if (ctx.ktr.retrieval_info[namedSubexpressionId].stored_in_sa){ - assert(namedSubexpressionId < (int64_t)pctx.is_inside_of_these_sa_subexpressions.size()); - if (pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId]) - call_THROW("subexpression that selection array tracks is nested"); - pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = true; - } } - return std::make_unique(tmp_ret_buff); -} -chekushka BracketLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { - if (peep(ctx) != U')') - call_THROW("missing ')'"); - readChar(ctx); - result = tmp_ret_buff; - if (namedSubexpressionId >= 0) { - SubtrackingNameInfo& tai_slots = ctx.ktr.retrieval_info[namedSubexpressionId]; - if (tai_slots.stored_in_ca){ - assert(tai_slots.colarr_first >= 0 && tai_slots.colarr_first < UINT16_MAX); - assert(tai_slots.colarr_second >= 0 && tai_slots.colarr_second < UINT16_MAX); - result = join(subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( - regex024_opcodes::MOV_COLARR_BTPOS, tai_slots.colarr_first)), result); - result = join(result, subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( - regex024_opcodes::MOV_COLARR_BTPOS, tai_slots.colarr_second))); - } - if (tai_slots.stored_in_sa){ - assert(tai_slots.selarr_first >= 0 && tai_slots.selarr_first < UINT16_MAX); - assert(tai_slots.selarr_second >= 0 && tai_slots.selarr_second < UINT16_MAX); - result = join(subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( - regex024_opcodes::MOV_SELARR_CHPOS, tai_slots.selarr_first)), result); - result = join(result, subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( - regex024_opcodes::MOV_SELARR_CHPOS, tai_slots.selarr_second))); - pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = false; + /* ********************************* Bracket */ + + chekushka BracketLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { + result.assertDefault(); + assert(readChar(ctx) == U'('); + /* sequence lvl already took care about resolving name and configuring SubtrackingNameInfo */ + if (namedSubexpressionId >= 0){ + assert(ctx.ktr.retrieval_info[namedSubexpressionId].type == tracking_var_types::range); + if (ctx.ktr.retrieval_info[namedSubexpressionId].stored_in_sa){ + assert(namedSubexpressionId < (int64_t)pctx.is_inside_of_these_sa_subexpressions.size()); + if (pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId]) + call_THROW("subexpression that selection array tracks is nested"); + pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = true; + } } + return std::make_unique(tmp_ret_buff); } - return NULL; -} -/* ******************************* Fork */ - -chekushka ForkLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { - result.assertDefault(); - options.emplace_back(); // Default one contains nothing. It will be overwritten - return std::make_unique(options.back()); -} - -chekushka ForkLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { - int32_t end_reason = peep(ctx); call_ERROR_CHECK; - if (end_reason == U'|'){ + chekushka BracketLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { + if (peep(ctx) != U')') + call_THROW("missing ')'"); readChar(ctx); - return firstTime(ctx, pctx, fa); + result = tmp_ret_buff; + if (namedSubexpressionId >= 0) { + SubtrackingNameInfo& tai_slots = ctx.ktr.retrieval_info[namedSubexpressionId]; + if (tai_slots.stored_in_ca){ + assert(tai_slots.colarr_first >= 0 && tai_slots.colarr_first < UINT16_MAX); + assert(tai_slots.colarr_second >= 0 && tai_slots.colarr_second < UINT16_MAX); + result = join(subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( + opcodes::MOV_COLARR_BTPOS, tai_slots.colarr_first)), result); + result = join(result, subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( + opcodes::MOV_COLARR_BTPOS, tai_slots.colarr_second))); + } + if (tai_slots.stored_in_sa){ + assert(tai_slots.selarr_first >= 0 && tai_slots.selarr_first < UINT16_MAX); + assert(tai_slots.selarr_second >= 0 && tai_slots.selarr_second < UINT16_MAX); + result = join(subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( + opcodes::MOV_SELARR_CHPOS, tai_slots.selarr_first)), result); + result = join(result, subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( + opcodes::MOV_SELARR_CHPOS, tai_slots.selarr_second))); + pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = false; + } + } + return NULL; } - result = forkify(options, fa); - return NULL; -} -void parseBody(REGEX_IS024_MeaningContext& ctx, FA_Container& fa, SubExprCompiled& result, ParsingContext& pctx){ - std::vector> callStack; - callStack.push_back(std::make_unique(result)); - bool first_time = true; - while (!callStack.empty()){ - aux_ERROR_CHECK; - auto nxt = first_time ? callStack.back()->firstTime(ctx, pctx, fa) : \ - callStack.back()->afterReceive(ctx, pctx, fa); - if (nxt){ - callStack.push_back(std::move(nxt)); - first_time = true; + /* ******************************* Fork */ + + chekushka ForkLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { + result.assertDefault(); + options.emplace_back(); // Default one contains nothing. It will be overwritten + return std::make_unique(options.back()); + } + + chekushka ForkLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { + int32_t end_reason = peep(ctx); call_ERROR_CHECK; + if (end_reason == U'|'){ + readChar(ctx); + return firstTime(ctx, pctx, fa); + } + result = forkify(options, fa); + return NULL; + } + + void parseBody(REGEX_IS024_MeaningContext& ctx, FA_Container& fa, SubExprCompiled& result, ParsingContext& pctx){ + std::vector> callStack; + callStack.push_back(std::make_unique(result)); + bool first_time = true; + while (!callStack.empty()){ + aux_ERROR_CHECK; + auto nxt = first_time ? callStack.back()->firstTime(ctx, pctx, fa) : \ + callStack.back()->afterReceive(ctx, pctx, fa); + if (nxt){ + callStack.push_back(std::move(nxt)); + first_time = true; + } else { + callStack.pop_back(); + first_time = false; + } + } + /* Generating priority table (sifting program) */ + for (const SubtrackingNameInfo& sni: ctx.ktr.retrieval_info) { + if (!sni.discovered) + aux_THROW("tracking tool named in !select is not used anywhere"); + if (sni.used_in_sifting) { + assert(sni.selarr_first >= 0); + assert((sni.type == tracking_var_types::range) == (sni.selarr_second != -1)); + pctx.priority_table.emplace_back(sni.minimizing, sni.selarr_first, sni.selarr_second, sni.type); + } + } + } + + REGEX_IS024_MeaningContext::REGEX_IS024_MeaningContext(size_t inputSize, const char *input) : input_size(inputSize), + input(input) { + CommonCodesets codeset_collection; + FA_Container fa; + FA_Container fa_1f; + FA_Container fa_2f; + SubExprCompiled result; + ParsingContext pctx(codeset_collection); + parseBody(*this, fa, result, pctx); + /* CLion gone crazy here. It thinks error is always false (It doesn't know about such thing as macros) */ + if (error) + return; + + FA_NodeOfMatch* matcher = fa.makeMatch(); + if (!result.start){ + fa.start = matcher; } else { - callStack.pop_back(); - first_time = false; + fa.start = result.start; + for (FA_Node** ending: result.ends) + reattach_fa_node_edge(ending, matcher); } - } - /* Generating priority table (sifting program) */ - for (const SubtrackingNameInfo& sni: ctx.ktr.retrieval_info) { - if (!sni.discovered) - aux_THROW("tracking tool named in !select is not used anywhere"); - if (sni.used_in_sifting) { - assert(sni.selarr_first >= 0); - assert((sni.type == tracking_var_types::range) == (sni.selarr_second != -1)); - pctx.priority_table.emplace_back(sni.minimizing, sni.selarr_first, sni.selarr_second, sni.type); + fa.start->refs++; + + // show_fa_with_sxiv_after_dot(fa, ktr, pctx.priority_table); // todo debug + + REGEX_IS024_FA_FirstStageFixInfo info1 = first_stage_fix_fa(fa, fa_1f); + + // show_fa_with_sxiv_after_dot(fa_1f, ktr, pctx.priority_table); // todo debug + + if (pctx.dfa_cmd_activated) { + int det_err; + int had_to_fork; + try_determinize_fa(fa_1f, pctx.priority_table, free_selarr_tai, info1, fa_2f, det_err, had_to_fork); + if (det_err < 0 && !pctx.dfa_cmd_nonimportant) { + report(*this, "Unable to determinize dfa"); + return; + } + if (pctx.dfa_cmd_unforgiving && had_to_fork < 0) { + report(*this, "Attempt to determinize dfa was not good enough"); + return; + } + } else { + regular_second_stage_fix(fa_1f, fa_2f, info1); + } + + // show_fa_with_sxiv_after_dot(fa_2f, ktr, pctx.priority_table); // todo debug + + int compilation_error; + compile_fa_to_regexis024_bytecode(compiled_program, fa_2f, pctx.priority_table, free_selarr_tai, info1, compilation_error); + if (compilation_error) { + report(*this, "Failed to compile graph representation to bytecode representation"); + return; } } } - -REGEX_IS024_MeaningContext::REGEX_IS024_MeaningContext(size_t inputSize, const char *input) : input_size(inputSize), - input(reinterpret_cast(input)) { - CommonCodesets codeset_collection; - FA_Container fa; - FA_Container fa_1f; - FA_Container fa_2f; - SubExprCompiled result; - ParsingContext pctx(codeset_collection); - parseBody(*this, fa, result, pctx); - /* CLion gone crazy here. It thinks error is always false (It doesn't know about such thing as macros) */ - if (error) - return; - - FA_NodeOfMatch* matcher = fa.makeMatch(); - if (!result.start){ - fa.start = matcher; - } else { - fa.start = result.start; - for (FA_Node** ending: result.ends) - reattach_fa_node_edge(ending, matcher); - } - fa.start->refs++; - - // show_fa_with_sxiv_after_dot(fa, ktr, pctx.priority_table); // todo debug - - REGEX_IS024_FA_FirstStageFixInfo info1 = first_stage_fix_fa(fa, fa_1f); - - // show_fa_with_sxiv_after_dot(fa_1f, ktr, pctx.priority_table); // todo debug - - if (pctx.dfa_cmd_activated) { - int det_err; - int had_to_fork; - try_determinize_fa(fa_1f, pctx.priority_table, free_selarr_tai, info1, fa_2f, det_err, had_to_fork); - if (det_err < 0 && !pctx.dfa_cmd_nonimportant) { - report(*this, "Unable to determinize dfa"); - return; - } - if (pctx.dfa_cmd_unforgiving && had_to_fork < 0) { - report(*this, "Attempt to determinize dfa was not good enough"); - return; - } - } else { - regular_second_stage_fix(fa_1f, fa_2f, info1); - } - - // show_fa_with_sxiv_after_dot(fa_2f, ktr, pctx.priority_table); // todo debug - - int compilation_error; - compile_fa_to_regexis024_bytecode(compiled_program, fa_2f, pctx.priority_table, free_selarr_tai, info1, compilation_error); - if (compilation_error) { - report(*this, "Failed to compile graph representation to bytecode representation"); - return; - } -} diff --git a/src/libregexis024sol/expr_compiler.h b/src/libregexis024sol/expr_compiler.h index 62dcb20..32dd260 100644 --- a/src/libregexis024sol/expr_compiler.h +++ b/src/libregexis024sol/expr_compiler.h @@ -5,30 +5,27 @@ #include #include - -// todo: SUPER HIGHT PRIORITY: MOVE all this spaces digits variable_constituents junk out of this class -// todo: also PLEEEASE, write static before literally nearly every single one little stupid function in this library #include +namespace regexis024 { + struct REGEX_IS024_MeaningContext{ + size_t input_size; + const char* input; -struct REGEX_IS024_MeaningContext{ - size_t input_size; - const uint8_t* input; + bool error = false; + std::string error_msg; - bool error = false; - std::string error_msg; + size_t pos = 0; - size_t pos = 0; + bool have_comment_tail = false; + size_t comment_tail_start; + std::vector compiled_program; - bool have_comment_tail = false; - size_t comment_tail_start; - std::vector compiled_program; + KnownTrackingTools ktr; - KnownTrackingTools ktr; - - uint16_t free_selarr_tai = 0; - uint16_t free_colarr_tai = 0; - - REGEX_IS024_MeaningContext(size_t inputSize, const char *input); -}; + uint16_t free_selarr_tai = 0; + uint16_t free_colarr_tai = 0; + REGEX_IS024_MeaningContext(size_t inputSize, const char *input); + }; +} #endif //LIBREGEXIS024_EXPR_COMPILER_H diff --git a/src/libregexis024sol/expr_parse_functions/command_recognition.cpp b/src/libregexis024sol/expr_parse_functions/command_recognition.cpp index 15cae70..248b9ff 100644 --- a/src/libregexis024sol/expr_parse_functions/command_recognition.cpp +++ b/src/libregexis024sol/expr_parse_functions/command_recognition.cpp @@ -6,29 +6,31 @@ #define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0) #define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0) -const char* header_command_dfa_names[] = {"dfa", "determinize", NULL}; +namespace regexis024 { + const char* header_command_dfa_names[] = {"dfa", "determinize", NULL}; -const char* header_command_select_names[] = {"s", "select", "selarr", "selectional", NULL}; + const char* header_command_select_names[] = {"s", "select", "selarr", "selectional", NULL}; -bool is_header_cmd(const Command &cmd) { - return cmd.tilda || is_header_dfa_cmd(cmd), is_header_dfa_cmd(cmd); -} + bool is_header_cmd(const Command &cmd) { + return cmd.tilda || is_header_dfa_cmd(cmd), is_header_dfa_cmd(cmd); + } -bool is_header_dfa_cmd(const Command &cmd) { - return is_string_in_stringset(cmd.name.c_str(), header_command_dfa_names); -} + bool is_header_dfa_cmd(const Command &cmd) { + return is_string_in_stringset(cmd.name.c_str(), header_command_dfa_names); + } -bool is_header_select_cmd(const Command &cmd) { - return is_string_in_stringset(cmd.name.c_str(), header_command_select_names); -} + bool is_header_select_cmd(const Command &cmd) { + return is_string_in_stringset(cmd.name.c_str(), header_command_select_names); + } -void int_parse_with_limit_concern(const std::string &str, REGEX_IS024_MeaningContext &ctx, size_t &res, int lim) { - res = 0; - for (char ch: str){ - if (!('0' <= ch && ch <= '9')) - aux_THROW("bad integer argument"); - res = res * 10 + (ch - '0'); - if (res > (size_t)lim) - aux_THROW("integer is too big"); + void int_parse_with_limit_concern(const std::string &str, REGEX_IS024_MeaningContext &ctx, size_t &res, int lim) { + res = 0; + for (char ch: str){ + if (!('0' <= ch && ch <= '9')) + aux_THROW("bad integer argument"); + res = res * 10 + (ch - '0'); + if (res > (size_t)lim) + aux_THROW("integer is too big"); + } } } diff --git a/src/libregexis024sol/expr_parse_functions/command_recognition.h b/src/libregexis024sol/expr_parse_functions/command_recognition.h index 9be9ace..027b0b3 100644 --- a/src/libregexis024sol/expr_parse_functions/command_recognition.h +++ b/src/libregexis024sol/expr_parse_functions/command_recognition.h @@ -4,10 +4,11 @@ #include -bool is_header_cmd(const Command& cmd); -bool is_header_dfa_cmd(const Command& cmd); -bool is_header_select_cmd(const Command& cmd); -void int_parse_with_limit_concern(const std::string& str, REGEX_IS024_MeaningContext &ctx, size_t& res, int lim); - +namespace regexis024 { + bool is_header_cmd(const Command& cmd); + bool is_header_dfa_cmd(const Command& cmd); + bool is_header_select_cmd(const Command& cmd); + void int_parse_with_limit_concern(const std::string& str, REGEX_IS024_MeaningContext &ctx, size_t& res, int lim); +} #endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H diff --git a/src/libregexis024sol/expr_parse_functions/ep_sequence.cpp b/src/libregexis024sol/expr_parse_functions/ep_sequence.cpp index 1e4b04f..400c61b 100644 --- a/src/libregexis024sol/expr_parse_functions/ep_sequence.cpp +++ b/src/libregexis024sol/expr_parse_functions/ep_sequence.cpp @@ -14,209 +14,211 @@ #define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0) #define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0) -/* **************************** Sequence */ +namespace regexis024 { + /* **************************** Sequence */ -void in_case_of_backslash(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, FA_Container &fa, SubExprCompiled& backPart) { - assert(readChar(ctx) == U'\\'); - int32_t leader = peep(ctx); aux_ERROR_CHECK; - if (leader == U'b'){ - FA_NodeOfForking* n1 = fa.makeForking(); - FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(invert_set(cc.word_constituents)); - FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents); - reattach_nxt_node(n1a, n2a); - FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(cc.word_constituents); - FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents)); - reattach_nxt_node(n1b, n2b); - add_option_to_fork_node(n1, n1a); - add_option_to_fork_node(n1, n1b); - backPart.start = n1; - backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)}; - } else if (leader == U'B'){ - FA_NodeOfForking* n1 = fa.makeForking(); - FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(cc.word_constituents); - FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents); - reattach_nxt_node(n1a, n2a); - FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(invert_set(cc.word_constituents)); - FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents)); - reattach_nxt_node(n1b, n2b); - add_option_to_fork_node(n1, n1a); - add_option_to_fork_node(n1, n1b); - backPart.start = n1; - backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)}; - } else if (leader == U'<'){ - FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(invert_set(cc.word_constituents)); - FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(cc.word_constituents); - reattach_nxt_node(n1, n2); - backPart.start = n1; - backPart.ends = {&(n2->nxt_node)}; - } else if (leader == U'>'){ - FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(cc.word_constituents); - FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(invert_set(cc.word_constituents)); - reattach_nxt_node(n1, n2); - backPart.start = n1; - backPart.ends = {&(n2->nxt_node)}; - } else { - bool ret_is_multicode; codeset_t res_codeset; - backslash_expression_parsing_try_regular(ctx, cc, ret_is_multicode, res_codeset); - backPart = subexpr_charset_reading_filter(res_codeset, fa); - return; // To avoid reading leader again (it gets read in the end) + void in_case_of_backslash(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, FA_Container &fa, SubExprCompiled& backPart) { + assert(readChar(ctx) == U'\\'); + int32_t leader = peep(ctx); aux_ERROR_CHECK; + if (leader == U'b'){ + FA_NodeOfForking* n1 = fa.makeForking(); + FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(invert_set(cc.word_constituents)); + FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents); + reattach_nxt_node(n1a, n2a); + FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(cc.word_constituents); + FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents)); + reattach_nxt_node(n1b, n2b); + add_option_to_fork_node(n1, n1a); + add_option_to_fork_node(n1, n1b); + backPart.start = n1; + backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)}; + } else if (leader == U'B'){ + FA_NodeOfForking* n1 = fa.makeForking(); + FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(cc.word_constituents); + FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents); + reattach_nxt_node(n1a, n2a); + FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(invert_set(cc.word_constituents)); + FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents)); + reattach_nxt_node(n1b, n2b); + add_option_to_fork_node(n1, n1a); + add_option_to_fork_node(n1, n1b); + backPart.start = n1; + backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)}; + } else if (leader == U'<'){ + FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(invert_set(cc.word_constituents)); + FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(cc.word_constituents); + reattach_nxt_node(n1, n2); + backPart.start = n1; + backPart.ends = {&(n2->nxt_node)}; + } else if (leader == U'>'){ + FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(cc.word_constituents); + FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(invert_set(cc.word_constituents)); + reattach_nxt_node(n1, n2); + backPart.start = n1; + backPart.ends = {&(n2->nxt_node)}; + } else { + bool ret_is_multicode; codeset_t res_codeset; + backslash_expression_parsing_try_regular(ctx, cc, ret_is_multicode, res_codeset); + backPart = subexpr_charset_reading_filter(res_codeset, fa); + return; // To avoid reading leader again (it gets read in the end) + } + readChar(ctx); } - readChar(ctx); -} -void repeat_stuff_with_check(REGEX_IS024_MeaningContext& ctx, - SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed){ - if (min_allowed > max_allowed) - aux_THROW("repeat operation: min > max"); - if (min_allowed > REGEXIS024_MAX_REPEAT) - aux_THROW("minimum repeat factor is too high"); - if (max_allowed > REGEXIS024_MAX_REPEAT && patient.can_be_empty) - aux_THROW("safety abortion: possible infinite loop. Если вы считаете, что ваше регулярное " - "выражение корректно и не вызвает бесконечного цикла, напишите об этом в жалобную книгу: " - "По ссылке: file:///dev/null Ваши предложения по улучшению libregexis024 обязательно будут рассмотрены."); - apply_repeat_to_subexpression(patient, fa, min_allowed, max_allowed); -} - -void repeat_command_processing(REGEX_IS024_MeaningContext &ctx, FA_Container &fa, std::vector& parts, - const Command& cmd){ - if (parts.empty()) - aux_THROW("no subexpression before !repeat command"); - if (cmd.arguments.empty() || (cmd.arguments.size() == 1 && cmd.arguments[0].is_empty)) { - repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); aux_ERROR_CHECK; - } else if (cmd.arguments.size() == 1){ - size_t mm; - int_parse_with_limit_concern(cmd.arguments[0].name, ctx, mm, REGEXIS024_MAX_REPEAT); aux_ERROR_CHECK; - repeat_stuff_with_check(ctx, parts.back(), fa, mm, mm); aux_ERROR_CHECK; - } else if (cmd.arguments.size() > 2){ - aux_THROW("too many arguments in !repeat command"); - } else { - size_t min_allowed, max_allowed; - if (cmd.arguments[0].is_empty){ - min_allowed = 0; - } else { - int_parse_with_limit_concern(cmd.arguments[0].name, ctx, min_allowed, REGEXIS024_MAX_REPEAT); - aux_ERROR_CHECK; - } - if (cmd.arguments[1].is_empty){ - max_allowed = REGEXIS024_MAX_REPEAT + 1; - } else { - int_parse_with_limit_concern(cmd.arguments[1].name, ctx, max_allowed, REGEXIS024_MAX_REPEAT); - aux_ERROR_CHECK; - } + void repeat_stuff_with_check(REGEX_IS024_MeaningContext& ctx, + SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed){ if (min_allowed > max_allowed) - aux_THROW("!repeat: min > max"); - repeat_stuff_with_check(ctx, parts.back(), fa, min_allowed, max_allowed); aux_ERROR_CHECK; + aux_THROW("repeat operation: min > max"); + if (min_allowed > REGEXIS024_MAX_REPEAT) + aux_THROW("minimum repeat factor is too high"); + if (max_allowed > REGEXIS024_MAX_REPEAT && patient.can_be_empty) + aux_THROW("safety abortion: possible infinite loop. Если вы считаете, что ваше регулярное " + "выражение корректно и не вызвает бесконечного цикла, напишите об этом в жалобную книгу: " + "По ссылке: file:///dev/null Ваши предложения по улучшению libregexis024 обязательно будут рассмотрены."); + apply_repeat_to_subexpression(patient, fa, min_allowed, max_allowed); } -} - -chekushka Sequence_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) { - while (true) { - int32_t fst = peep(ctx); - call_ERROR_CHECK; - if (fst == U'!') { - Command cmdBuf; - size_t before_cmd = ctx.pos; - cmdBuf = command_expr_parse(ctx); - call_ERROR_CHECK; - if (is_header_cmd(cmdBuf)){ - ctx.pos = before_cmd; - break; - } else if (cmdBuf.name == "r" || cmdBuf.name == "repeat"){ - repeat_command_processing(ctx, fa, parts, cmdBuf); call_ERROR_CHECK; - } else if (is_command_for_charset(cmdBuf)){ - codeset_t cs; - interpret_command_as_charset_giving(pctx.cc, cmdBuf, cs); call_ERROR_CHECK; - parts.push_back(subexpr_charset_reading_filter(cs, fa)); - } else { - call_THROW("unknown command"); - } - } else if (fst == U'\\') { - parts.emplace_back(); - in_case_of_backslash(ctx, pctx.cc, fa, parts.back()); - call_ERROR_CHECK; - } else if (fst == U'^'){ - readChar(ctx); - parts.push_back(subexpression_from_path(fa.makeLookOneBehind(codeset_of_one_char(U'\n')))); - } else if (fst == U'$'){ - readChar(ctx); - parts.push_back(subexpression_from_path(fa.makeLookOneAhead(codeset_of_one_char(U'\n')))); - } else if (fst == U'*'){ -#define vibe_check(sn) if (parts.empty()) { call_THROW("no subexpression before `" sn "` operator"); } readChar(ctx); - vibe_check("*") - repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK; - } else if (fst == U'+'){ - vibe_check("+") - repeat_stuff_with_check(ctx, parts.back(), fa, 1, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK; - } else if (fst == U'?'){ - vibe_check("?") - repeat_stuff_with_check(ctx, parts.back(), fa, 0, 1); call_ERROR_CHECK; -#undef vibe_check - } else if (fst == U'#'){ - readChar(ctx); - std::string name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK; - if (name.empty()) - call_THROW("No name provided after #"); - if (ctx.ktr.track_names.count(name) == 0){ - ctx.ktr.track_names[name] = static_cast(ctx.ktr.retrieval_info.size()); - ctx.ktr.retrieval_info.emplace_back(); - } - int64_t id = ctx.ktr.track_names[name]; - int32_t typeDet = peep(ctx); - if (typeDet == U'('){ - ensure_space_for_track_unit(ctx, name, tracking_var_types::range); call_ERROR_CHECK; - parts.emplace_back(); - return std::make_unique(parts.back(), id); - } else if (typeDet == U':'){ - ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_immediate); call_ERROR_CHECK; - readChar(ctx); - std::string value_str = tryRead_REGEX024_name(ctx); - size_t value; - int_parse_with_limit_concern(value_str, ctx, value, UINT16_MAX); - int32_t cl = peep(ctx); - if (cl != U';') - call_THROW("Missing ; after dot track unit operator"); - readChar(ctx); - if (ctx.ktr.retrieval_info[id].stored_in_sa) - parts.emplace_back(subexpression_from_path( - fa.makeTrackArrayMovImm(regex024_opcodes::MOV_SELARR_IMM, - ctx.ktr.retrieval_info[id].selarr_first, value))); - if (ctx.ktr.retrieval_info[id].stored_in_ca) - parts.emplace_back(subexpression_from_path( - fa.makeTrackArrayMovImm(regex024_opcodes::MOV_COLARR_IMM, - ctx.ktr.retrieval_info[id].colarr_first, value))); - } else if (typeDet == U';'){ - ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_cur_pos); call_ERROR_CHECK; - readChar(ctx); - if (ctx.ktr.retrieval_info[id].stored_in_sa) - parts.emplace_back(subexpression_from_path( - fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS, - ctx.ktr.retrieval_info[id].selarr_first))); - if (ctx.ktr.retrieval_info[id].stored_in_ca) - parts.emplace_back(subexpression_from_path( - fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS, - ctx.ktr.retrieval_info[id].colarr_first))); - } else - call_THROW("Missing ; or ( in the beginning of tracking unit"); - } else if (fst == U'(') { - parts.emplace_back(); - return std::make_unique(parts.back(), -1); - } else if (fst == U'[') { - codeset_t filter = sq_bracket_expr_parse(ctx, pctx.cc); call_ERROR_CHECK; - parts.push_back(subexpr_charset_reading_filter(filter, fa)); - } else if (fst >= 0 && fst != U')' && fst != U'|' && fst != U']'){ - readChar(ctx); - parts.push_back(subexpr_charset_reading_filter(codeset_of_one_char(fst), fa)); + void repeat_command_processing(REGEX_IS024_MeaningContext &ctx, FA_Container &fa, std::vector& parts, + const Command& cmd){ + if (parts.empty()) + aux_THROW("no subexpression before !repeat command"); + if (cmd.arguments.empty() || (cmd.arguments.size() == 1 && cmd.arguments[0].is_empty)) { + repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); aux_ERROR_CHECK; + } else if (cmd.arguments.size() == 1){ + size_t mm; + int_parse_with_limit_concern(cmd.arguments[0].name, ctx, mm, REGEXIS024_MAX_REPEAT); aux_ERROR_CHECK; + repeat_stuff_with_check(ctx, parts.back(), fa, mm, mm); aux_ERROR_CHECK; + } else if (cmd.arguments.size() > 2){ + aux_THROW("too many arguments in !repeat command"); } else { - break; + size_t min_allowed, max_allowed; + if (cmd.arguments[0].is_empty){ + min_allowed = 0; + } else { + int_parse_with_limit_concern(cmd.arguments[0].name, ctx, min_allowed, REGEXIS024_MAX_REPEAT); + aux_ERROR_CHECK; + } + if (cmd.arguments[1].is_empty){ + max_allowed = REGEXIS024_MAX_REPEAT + 1; + } else { + int_parse_with_limit_concern(cmd.arguments[1].name, ctx, max_allowed, REGEXIS024_MAX_REPEAT); + aux_ERROR_CHECK; + } + if (min_allowed > max_allowed) + aux_THROW("!repeat: min > max"); + repeat_stuff_with_check(ctx, parts.back(), fa, min_allowed, max_allowed); aux_ERROR_CHECK; } } - for (SubExprCompiled& part: parts) - result = join(result, part); - return NULL; -} -chekushka Sequence_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) { - // This is possible only if I received a bracket expression - return firstTime(ctx, pctx, fa); -} + + chekushka Sequence_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) { + while (true) { + int32_t fst = peep(ctx); + call_ERROR_CHECK; + if (fst == U'!') { + Command cmdBuf; + size_t before_cmd = ctx.pos; + cmdBuf = command_expr_parse(ctx); + call_ERROR_CHECK; + if (is_header_cmd(cmdBuf)){ + ctx.pos = before_cmd; + break; + } else if (cmdBuf.name == "r" || cmdBuf.name == "repeat"){ + repeat_command_processing(ctx, fa, parts, cmdBuf); call_ERROR_CHECK; + } else if (is_command_for_charset(cmdBuf)){ + codeset_t cs; + interpret_command_as_charset_giving(pctx.cc, cmdBuf, cs); call_ERROR_CHECK; + parts.push_back(subexpr_charset_reading_filter(cs, fa)); + } else { + call_THROW("unknown command"); + } + } else if (fst == U'\\') { + parts.emplace_back(); + in_case_of_backslash(ctx, pctx.cc, fa, parts.back()); + call_ERROR_CHECK; + } else if (fst == U'^'){ + readChar(ctx); + parts.push_back(subexpression_from_path(fa.makeLookOneBehind(codeset_of_one_char(U'\n')))); + } else if (fst == U'$'){ + readChar(ctx); + parts.push_back(subexpression_from_path(fa.makeLookOneAhead(codeset_of_one_char(U'\n')))); + } else if (fst == U'*'){ +#define vibe_check(sn) if (parts.empty()) { call_THROW("no subexpression before `" sn "` operator"); } readChar(ctx); + vibe_check("*") + repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK; + } else if (fst == U'+'){ + vibe_check("+") + repeat_stuff_with_check(ctx, parts.back(), fa, 1, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK; + } else if (fst == U'?'){ + vibe_check("?") + repeat_stuff_with_check(ctx, parts.back(), fa, 0, 1); call_ERROR_CHECK; +#undef vibe_check + } else if (fst == U'#'){ + readChar(ctx); + std::string name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK; + if (name.empty()) + call_THROW("No name provided after #"); + if (ctx.ktr.track_names.count(name) == 0){ + ctx.ktr.track_names[name] = static_cast(ctx.ktr.retrieval_info.size()); + ctx.ktr.retrieval_info.emplace_back(); + } + int64_t id = ctx.ktr.track_names[name]; + int32_t typeDet = peep(ctx); + if (typeDet == U'('){ + ensure_space_for_track_unit(ctx, name, tracking_var_types::range); call_ERROR_CHECK; + parts.emplace_back(); + return std::make_unique(parts.back(), id); + } else if (typeDet == U':'){ + ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_immediate); call_ERROR_CHECK; + readChar(ctx); + std::string value_str = tryRead_REGEX024_name(ctx); + size_t value; + int_parse_with_limit_concern(value_str, ctx, value, UINT16_MAX); + int32_t cl = peep(ctx); + if (cl != U';') + call_THROW("Missing ; after dot track unit operator"); + readChar(ctx); + if (ctx.ktr.retrieval_info[id].stored_in_sa) + parts.emplace_back(subexpression_from_path( + fa.makeTrackArrayMovImm(opcodes::MOV_SELARR_IMM, + ctx.ktr.retrieval_info[id].selarr_first, value))); + if (ctx.ktr.retrieval_info[id].stored_in_ca) + parts.emplace_back(subexpression_from_path( + fa.makeTrackArrayMovImm(opcodes::MOV_COLARR_IMM, + ctx.ktr.retrieval_info[id].colarr_first, value))); + } else if (typeDet == U';'){ + ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_cur_pos); call_ERROR_CHECK; + readChar(ctx); + if (ctx.ktr.retrieval_info[id].stored_in_sa) + parts.emplace_back(subexpression_from_path( + fa.makeTrackArrayMovHalfinvariant(opcodes::MOV_SELARR_CHPOS, + ctx.ktr.retrieval_info[id].selarr_first))); + if (ctx.ktr.retrieval_info[id].stored_in_ca) + parts.emplace_back(subexpression_from_path( + fa.makeTrackArrayMovHalfinvariant(opcodes::MOV_COLARR_BTPOS, + ctx.ktr.retrieval_info[id].colarr_first))); + } else + call_THROW("Missing ; or ( in the beginning of tracking unit"); + } else if (fst == U'(') { + parts.emplace_back(); + return std::make_unique(parts.back(), -1); + } else if (fst == U'[') { + codeset_t filter = sq_bracket_expr_parse(ctx, pctx.cc); call_ERROR_CHECK; + parts.push_back(subexpr_charset_reading_filter(filter, fa)); + } else if (fst >= 0 && fst != U')' && fst != U'|' && fst != U']'){ + readChar(ctx); + parts.push_back(subexpr_charset_reading_filter(codeset_of_one_char(fst), fa)); + } else { + break; + } + } + for (SubExprCompiled& part: parts) + result = join(result, part); + return NULL; + } + + chekushka Sequence_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) { + // This is possible only if I received a bracket expression + return firstTime(ctx, pctx, fa); + } +} \ No newline at end of file diff --git a/src/libregexis024sol/expr_parse_functions/epf.h b/src/libregexis024sol/expr_parse_functions/epf.h index 8b04132..90ed134 100644 --- a/src/libregexis024sol/expr_parse_functions/epf.h +++ b/src/libregexis024sol/expr_parse_functions/epf.h @@ -10,65 +10,64 @@ #include #include -struct ParsingContext{ - /* Those subexpressions, that are tracket by s`a are forbidden from nesting inside themselves */ - std::vector is_inside_of_these_sa_subexpressions; - bool select_cmd_encountered = false; - RegexPriorityTable priority_table; - bool dfa_cmd_activated = false; - /* Completely failing to build dfa with this flag on will result in no error */ - bool dfa_cmd_nonimportant = false; - /* With this flag, your dfa should be absolutely pure, no forks are allowed. */ - bool dfa_cmd_unforgiving = false; +namespace regexis024 { + struct ParsingContext{ + /* Those subexpressions, that are tracket by s`a are forbidden from nesting inside themselves */ + std::vector is_inside_of_these_sa_subexpressions; + bool select_cmd_encountered = false; + RegexPriorityTable priority_table; + bool dfa_cmd_activated = false; + /* Completely failing to build dfa with this flag on will result in no error */ + bool dfa_cmd_nonimportant = false; + /* With this flag, your dfa should be absolutely pure, no forks are allowed. */ + bool dfa_cmd_unforgiving = false; - /* Reference to active cc set (actually, there is only one cc, but who cares, I placed - * it here to lower the number of arguments in ParseCall methods, again WHO CARES?) */ - const CommonCodesets& cc; - explicit ParsingContext(const CommonCodesets& cc_): cc(cc_){} - }; + /* Reference to active cc set (actually, there is only one cc, but who cares, I placed + * it here to lower the number of arguments in ParseCall methods, again WHO CARES?) */ + const CommonCodesets& cc; + explicit ParsingContext(const CommonCodesets& cc_): cc(cc_){} + }; -typedef REGEX_IS024_MeaningContext ctx_t; -struct ParseCall; -typedef std::unique_ptr chekushka; -struct ParseCall{ - SubExprCompiled& result; - explicit ParseCall(SubExprCompiled &result) : result(result) {} - virtual ~ParseCall() = default; - virtual chekushka afterReceive(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); } - virtual chekushka firstTime(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); } -}; + typedef REGEX_IS024_MeaningContext ctx_t; + struct ParseCall; + typedef std::unique_ptr chekushka; + struct ParseCall{ + SubExprCompiled& result; + explicit ParseCall(SubExprCompiled &result) : result(result) {} + virtual ~ParseCall() = default; + virtual chekushka afterReceive(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); } + virtual chekushka firstTime(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); } + }; -struct TopLvl_ParseCall: public ParseCall{ - explicit TopLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {} - chekushka afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override; - chekushka firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override; -}; + struct TopLvl_ParseCall: public ParseCall{ + explicit TopLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override; + chekushka firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override; + }; -struct BracketLvl_ParseCall: public ParseCall{ - /* -1 if this is a normal bracket expression. Otherwise, it is an index in ctx.retrieval_info vector */ - int64_t namedSubexpressionId; - SubExprCompiled tmp_ret_buff; - explicit BracketLvl_ParseCall(SubExprCompiled& result, int64_t namedSubexpressionId) : - ParseCall(result), namedSubexpressionId(namedSubexpressionId) {} - chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override; - chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override; -}; + struct BracketLvl_ParseCall: public ParseCall{ + /* -1 if this is a normal bracket expression. Otherwise, it is an index in ctx.retrieval_info vector */ + int64_t namedSubexpressionId; + SubExprCompiled tmp_ret_buff; + explicit BracketLvl_ParseCall(SubExprCompiled& result, int64_t namedSubexpressionId) : + ParseCall(result), namedSubexpressionId(namedSubexpressionId) {} + chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override; + chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override; + }; -struct ForkLvl_ParseCall: public ParseCall{ - std::vector options; - explicit ForkLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {} - chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); - chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); -}; - -struct Sequence_ParseCall: public ParseCall{ - std::vector parts; - explicit Sequence_ParseCall(SubExprCompiled &result) :ParseCall(result) {} - chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); - chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); -}; - -/* Some auxilary functions */ + struct ForkLvl_ParseCall: public ParseCall{ + std::vector options; + explicit ForkLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {} + chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); + chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); + }; + struct Sequence_ParseCall: public ParseCall{ + std::vector parts; + explicit Sequence_ParseCall(SubExprCompiled &result) :ParseCall(result) {} + chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); + chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); + }; +} #endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H diff --git a/src/libregexis024sol/expr_parse_functions/tracking_units.cpp b/src/libregexis024sol/expr_parse_functions/tracking_units.cpp index 546edba..148d048 100644 --- a/src/libregexis024sol/expr_parse_functions/tracking_units.cpp +++ b/src/libregexis024sol/expr_parse_functions/tracking_units.cpp @@ -4,35 +4,36 @@ #define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0) #define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0) - -void for_one_type(REGEX_IS024_MeaningContext &ctx, uint16_t& free_ARR_tai, int& ARR_first, int& ARR_second, - const std::string& ARR_NAME, tracking_var_type type){ +namespace regexis024 { + void for_one_type(REGEX_IS024_MeaningContext &ctx, uint16_t& free_ARR_tai, int& ARR_first, int& ARR_second, + const std::string& ARR_NAME, tracking_var_type_t type){ #define check_is_available() if (free_ARR_tai == UINT16_MAX) { \ - report(ctx, ("regex: " + ARR_NAME + ": key namespace overflow").c_str()); return;} - check_is_available() - ARR_first = free_ARR_tai++; - if (type == tracking_var_types::range){ +report(ctx, ("regex: " + ARR_NAME + ": key namespace overflow").c_str()); return;} check_is_available() - ARR_second = free_ARR_tai++; + ARR_first = free_ARR_tai++; + if (type == tracking_var_types::range){ + check_is_available() + ARR_second = free_ARR_tai++; + } } -} -void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type type) { - size_t id = ctx.ktr.track_names[name]; - /* Size of this verctor won't be changed. THis is a safe reference */ - SubtrackingNameInfo& info = ctx.ktr.retrieval_info[id]; - if (!info.discovered){ - info.type = type; - if (info.stored_in_ca) { - for_one_type(ctx, ctx.free_colarr_tai, info.colarr_first, info.colarr_second, "collection array", type); - aux_ERROR_CHECK; + void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type_t type) { + size_t id = ctx.ktr.track_names[name]; + /* Size of this verctor won't be changed. THis is a safe reference */ + SubtrackingNameInfo& info = ctx.ktr.retrieval_info[id]; + if (!info.discovered){ + info.type = type; + if (info.stored_in_ca) { + for_one_type(ctx, ctx.free_colarr_tai, info.colarr_first, info.colarr_second, "collection array", type); + aux_ERROR_CHECK; + } + if (info.stored_in_sa) { + for_one_type(ctx, ctx.free_selarr_tai, info.selarr_first, info.selarr_second, "selection array", type); + aux_ERROR_CHECK; + } + info.discovered = true; + } else if (info.type != type){ + aux_THROW("tracking tool unit type mismatch"); } - if (info.stored_in_sa) { - for_one_type(ctx, ctx.free_selarr_tai, info.selarr_first, info.selarr_second, "selection array", type); - aux_ERROR_CHECK; - } - info.discovered = true; - } else if (info.type != type){ - aux_THROW("tracking tool unit type mismatch"); } -} +} \ No newline at end of file diff --git a/src/libregexis024sol/expr_parse_functions/tracking_units.h b/src/libregexis024sol/expr_parse_functions/tracking_units.h index 74e5f22..1299856 100644 --- a/src/libregexis024sol/expr_parse_functions/tracking_units.h +++ b/src/libregexis024sol/expr_parse_functions/tracking_units.h @@ -4,7 +4,8 @@ #include -void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type type); - +namespace regexis024 { + void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type_t type); +} #endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H diff --git a/src/libregexis024sol/part_of_expr_that_tracks.cpp b/src/libregexis024sol/part_of_expr_that_tracks.cpp deleted file mode 100644 index db5928f..0000000 --- a/src/libregexis024sol/part_of_expr_that_tracks.cpp +++ /dev/null @@ -1,2 +0,0 @@ -// #include - diff --git a/src/libregexis024sol/part_of_expr_that_tracks.h b/src/libregexis024sol/part_of_expr_that_tracks.h index 9aaf8a0..7faf007 100644 --- a/src/libregexis024sol/part_of_expr_that_tracks.h +++ b/src/libregexis024sol/part_of_expr_that_tracks.h @@ -6,26 +6,27 @@ #include #include -struct SubtrackingNameInfo{ - bool stored_in_ca = true; - bool stored_in_sa = false; +namespace regexis024 { + struct SubtrackingNameInfo{ + bool stored_in_ca = true; + bool stored_in_sa = false; - bool discovered = false; - tracking_var_type type; - /* These fields will be -1 if unused */ - int colarr_first = -1; - int colarr_second = -1; + bool discovered = false; + tracking_var_type_t type; + /* These fields will be -1 if unused */ + int colarr_first = -1; + int colarr_second = -1; - bool used_in_sifting = false; - bool minimizing = false; - int selarr_first = -1; - int selarr_second = -1; -}; - -struct KnownTrackingTools { - std::map track_names; - std::vector retrieval_info; -}; + bool used_in_sifting = false; + bool minimizing = false; + int selarr_first = -1; + int selarr_second = -1; + }; + struct KnownTrackingTools { + std::map track_names; + std::vector retrieval_info; + }; +} #endif //PART_OF_EXPR_THAT_TRACKS_H diff --git a/src/libregexis024sol/sol_misc_base.cpp b/src/libregexis024sol/sol_misc_base.cpp index a0177cc..abe2a1c 100644 --- a/src/libregexis024sol/sol_misc_base.cpp +++ b/src/libregexis024sol/sol_misc_base.cpp @@ -1,55 +1,55 @@ #include #include -void report(REGEX_IS024_MeaningContext &ctx, const char *error) { - if (!ctx.error){ - ctx.error = true; - ctx.error_msg = error; - } -} - -bool isEnd(REGEX_IS024_MeaningContext &ctx) { - return ctx.pos == ctx.input_size; -} - -int32_t peep(REGEX_IS024_MeaningContext &ctx) { -// printf("pos = %lu\n", ctx.pos); - if (isEnd(ctx)) - return -1; // This is probably the only place where getting negative return does not generate error - int32_t cp; size_t sz; - utf8_string_iterat(cp, sz, ctx.pos, ctx.input, ctx.input_size); - if (cp < 0) - report(ctx, "encoding error"); - return cp; -} - -int32_t readChar(REGEX_IS024_MeaningContext &ctx) { -// printf("READ pos = %lu\n", ctx.pos); - int32_t cp; size_t sz; - utf8_string_iterat(cp, sz, ctx.pos, ctx.input, ctx.input_size); - if (cp >= 0) - ctx.pos += sz; - else - report(ctx, "bruh what?? How this even happened"); - return cp; -} - -bool is_REGEX024_nameConstituent(int32_t ch) { - return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'); -} - -std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext &ctx) { - std::string res; - while (true){ - int32_t ch = peep(ctx); - if (is_REGEX024_nameConstituent(ch)){ - res += (char)ch; - readChar(ctx); - } else { - break; +namespace regexis024 { + void report(REGEX_IS024_MeaningContext &ctx, const char *error) { + if (!ctx.error){ + ctx.error = true; + ctx.error_msg = error; } } - return res; + + bool isEnd(REGEX_IS024_MeaningContext &ctx) { + return ctx.pos == ctx.input_size; + } + + int32_t peep(REGEX_IS024_MeaningContext &ctx) { + // printf("pos = %lu\n", ctx.pos); + if (isEnd(ctx)) + return -1; // This is probably the only place where getting negative return does not generate error + int32_t cp; size_t sz; + utf8_string_iterat(cp, sz, ctx.pos, ctx.input, ctx.input_size); + if (cp < 0) + report(ctx, "encoding error"); + return cp; + } + + int32_t readChar(REGEX_IS024_MeaningContext &ctx) { + // printf("READ pos = %lu\n", ctx.pos); + int32_t cp; size_t sz; + utf8_string_iterat(cp, sz, ctx.pos, ctx.input, ctx.input_size); + if (cp >= 0) + ctx.pos += sz; + else + report(ctx, "bruh what?? How this even happened"); + return cp; + } + + bool is_REGEX024_nameConstituent(int32_t ch) { + return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'); + } + + std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext &ctx) { + std::string res; + while (true){ + int32_t ch = peep(ctx); + if (is_REGEX024_nameConstituent(ch)){ + res += (char)ch; + readChar(ctx); + } else { + break; + } + } + return res; + } } - - diff --git a/src/libregexis024sol/sol_misc_base.h b/src/libregexis024sol/sol_misc_base.h index 8ecb3c1..503743f 100644 --- a/src/libregexis024sol/sol_misc_base.h +++ b/src/libregexis024sol/sol_misc_base.h @@ -5,16 +5,17 @@ #include #include -void report(REGEX_IS024_MeaningContext& ctx, const char* error); +namespace regexis024 { + void report(REGEX_IS024_MeaningContext& ctx, const char* error); -bool isEnd(REGEX_IS024_MeaningContext& ctx); -int32_t peep(REGEX_IS024_MeaningContext& ctx); -int32_t readChar(REGEX_IS024_MeaningContext& ctx); + bool isEnd(REGEX_IS024_MeaningContext& ctx); + int32_t peep(REGEX_IS024_MeaningContext& ctx); + int32_t readChar(REGEX_IS024_MeaningContext& ctx); -bool is_REGEX024_nameConstituent(int32_t ch); -/* Name in my library consists of [0-9a-zA-Z]. If the first peeped letter is not name constituent, - * empty string is returned */ -std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext& ctx); - + bool is_REGEX024_nameConstituent(int32_t ch); + /* Name in my library consists of [0-9a-zA-Z]. If the first peeped letter is not name constituent, + * empty string is returned */ + std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext& ctx); +} #endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H diff --git a/src/libregexis024sol/special_terminals.h b/src/libregexis024sol/special_terminals.h index ab10fcb..4315c71 100644 --- a/src/libregexis024sol/special_terminals.h +++ b/src/libregexis024sol/special_terminals.h @@ -5,32 +5,34 @@ #include #include -/* This option of backslash usage should be checked last. - * Function can generate error. Always check the error first */ -void -backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc, - bool& ret_is_multicode, codeset_t& ret_set); +namespace regexis024 { + /* This option of backslash usage should be checked last. + * Function can generate error. Always check the error first */ + void + backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc, + bool& ret_is_multicode, codeset_t& ret_set); -struct CommandEntity; -struct Command; -struct CommandArgument; + struct CommandEntity; + struct Command; + struct CommandArgument; -struct CommandEntity{ - std::string name; - std::vector arguments; -}; + struct CommandEntity{ + std::string name; + std::vector arguments; + }; -struct CommandArgument: CommandEntity{ - bool is_empty = true; -}; + struct CommandArgument: CommandEntity{ + bool is_empty = true; + }; -struct Command: CommandEntity{ - bool tilda = false; -}; + struct Command: CommandEntity{ + bool tilda = false; + }; -/* Zlaya sobaka. Kidaet oshibki v context */ -Command command_expr_parse(REGEX_IS024_MeaningContext& ctx); -bool is_command_for_charset(const Command& cmd); -void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command& cmd, codeset_t& ret); + /* Zlaya sobaka. Kidaet oshibki v context */ + Command command_expr_parse(REGEX_IS024_MeaningContext& ctx); + bool is_command_for_charset(const Command& cmd); + void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command& cmd, codeset_t& ret); +} #endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H diff --git a/src/libregexis024sol/square_bracket_expression.cpp b/src/libregexis024sol/square_bracket_expression.cpp index f2aaad2..f40968b 100644 --- a/src/libregexis024sol/square_bracket_expression.cpp +++ b/src/libregexis024sol/square_bracket_expression.cpp @@ -6,184 +6,186 @@ #include #include -/* Can allow backslash (later should check that backslash expression is not multicharar or empty */ -bool soundsLikeCharOrRangeStart(int32_t peeped) { - return peeped >= 0 && (peeped != U'[' && peeped != U']' && peeped != U'!' && \ - peeped != '^' && peeped != '&' && peeped != '-'); -} +namespace regexis024 { + /* Can allow backslash (later should check that backslash expression is not multicharar or empty */ + bool soundsLikeCharOrRangeStart(int32_t peeped) { + return peeped >= 0 && (peeped != U'[' && peeped != U']' && peeped != U'!' && \ + peeped != '^' && peeped != '&' && peeped != '-'); + } -typedef REGEX_IS024_MeaningContext ctx_t; + typedef REGEX_IS024_MeaningContext ctx_t; -struct ParseCall; -typedef std::shared_ptr chekushka; + struct ParseCall; + typedef std::shared_ptr chekushka; -struct ParseCall{ - codeset_t& result; + struct ParseCall{ + codeset_t& result; - explicit ParseCall(codeset_t &result) : result(result) {} - virtual ~ParseCall() = default; - virtual chekushka afterReceive(ctx_t& ctx, const CommonCodesets& cc) { assert(false); } - virtual chekushka firstTime(ctx_t& ctx, const CommonCodesets& cc) { assert(false); } -}; + explicit ParseCall(codeset_t &result) : result(result) {} + virtual ~ParseCall() = default; + virtual chekushka afterReceive(ctx_t& ctx, const CommonCodesets& cc) { assert(false); } + virtual chekushka firstTime(ctx_t& ctx, const CommonCodesets& cc) { assert(false); } + }; #define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0) #define call_THROW(str) do { report(ctx, "square bracket expression: " str); return NULL; } while (0) -/* [...] */ -struct ZeroLvl_ParseCall: public ParseCall{ - explicit ZeroLvl_ParseCall(codeset_t &result) : ParseCall(result) {} - chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; - chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; -}; + /* [...] */ + struct ZeroLvl_ParseCall: public ParseCall{ + explicit ZeroLvl_ParseCall(codeset_t &result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; + chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; + }; -/* ...&...&... */ -struct FirstLvl_ParseCall: public ParseCall{ - codeset_t ret_buf_for_new; - bool got_one = false; - explicit FirstLvl_ParseCall(codeset_t& result) : ParseCall(result) {} - chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; - chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; -}; + /* ...&...&... */ + struct FirstLvl_ParseCall: public ParseCall{ + codeset_t ret_buf_for_new; + bool got_one = false; + explicit FirstLvl_ParseCall(codeset_t& result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; + chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; + }; -/* ab[]vgd[]eyo[]zhz */ -struct SecondLvl_ParseCall: public ParseCall{ - codeset_t ret_buf_for_new; - explicit SecondLvl_ParseCall(codeset_t& result) : ParseCall(result) {} - chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; - chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; -}; + /* ab[]vgd[]eyo[]zhz */ + struct SecondLvl_ParseCall: public ParseCall{ + codeset_t ret_buf_for_new; + explicit SecondLvl_ParseCall(codeset_t& result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; + chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; + }; -/* ^... */ -struct CircumflexLvl_ParseCall: public ParseCall{ - codeset_t ret_buf_for_new; - explicit CircumflexLvl_ParseCall(codeset_t& result) : ParseCall(result) {} - chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; - chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; -}; + /* ^... */ + struct CircumflexLvl_ParseCall: public ParseCall{ + codeset_t ret_buf_for_new; + explicit CircumflexLvl_ParseCall(codeset_t& result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; + chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; + }; -/* ********* ZeroLvl_ParseCall ********** */ + /* ********* ZeroLvl_ParseCall ********** */ -chekushka ZeroLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { - assert(readChar(ctx) == U'['); - return std::make_shared(result); -} - -chekushka ZeroLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { - if (peep(ctx) != U']') - call_THROW("lvl 0: missing ]"); - readChar(ctx); - return NULL; -} - -/* ********* FirstLvl_ParseCall ********** */ - -chekushka FirstLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { - return std::make_shared(result); -} - -chekushka FirstLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { - if (got_one) - result = intersect_sets(result, ret_buf_for_new); - else - got_one = true; - if (peep(ctx) == U'&'){ - readChar(ctx); - return std::make_shared(ret_buf_for_new); + chekushka ZeroLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { + assert(readChar(ctx) == U'['); + return std::make_shared(result); } - return NULL; -} -/* ********* SecondLvl_ParseCall ********** */ - -chekushka SecondLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { - repeat: - int32_t ch = peep(ctx); call_ERROR_CHECK; - if (ch == U'^'){ - return std::make_shared(ret_buf_for_new); - } else if (ch == U'!'){ - Command cmd = command_expr_parse(ctx); call_ERROR_CHECK; - if (!is_command_for_charset(cmd)) - call_THROW("second lvl: illegal command"); - interpret_command_as_charset_giving(cc, cmd, ret_buf_for_new); - result = merge_sets(result, ret_buf_for_new); - goto repeat; - } else if (ch == U'['){ - return std::make_shared(ret_buf_for_new); - } else if (soundsLikeCharOrRangeStart(ch)){ + chekushka ZeroLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { + if (peep(ctx) != U']') + call_THROW("lvl 0: missing ]"); readChar(ctx); - bool bs_multicode; - codeset_t bs_stuff; + return NULL; + } - if (ch == '\\'){ - backslash_expression_parsing_try_regular(ctx, cc, bs_multicode, bs_stuff); - if (bs_multicode){ - result = merge_sets(result, bs_stuff); - goto repeat; - } else { - ret_buf_for_new = codeset_of_one_char(bs_stuff[0].first); - } - } else { - ret_buf_for_new = codeset_of_one_char(ch); + /* ********* FirstLvl_ParseCall ********** */ + + chekushka FirstLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { + return std::make_shared(result); + } + + chekushka FirstLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { + if (got_one) + result = intersect_sets(result, ret_buf_for_new); + else + got_one = true; + if (peep(ctx) == U'&'){ + readChar(ctx); + return std::make_shared(ret_buf_for_new); } - int32_t mCh = peep(ctx); call_ERROR_CHECK; - if (mCh == U'-'){ + return NULL; + } + + /* ********* SecondLvl_ParseCall ********** */ + + chekushka SecondLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { + repeat: + int32_t ch = peep(ctx); call_ERROR_CHECK; + if (ch == U'^'){ + return std::make_shared(ret_buf_for_new); + } else if (ch == U'!'){ + Command cmd = command_expr_parse(ctx); call_ERROR_CHECK; + if (!is_command_for_charset(cmd)) + call_THROW("second lvl: illegal command"); + interpret_command_as_charset_giving(cc, cmd, ret_buf_for_new); + result = merge_sets(result, ret_buf_for_new); + goto repeat; + } else if (ch == U'['){ + return std::make_shared(ret_buf_for_new); + } else if (soundsLikeCharOrRangeStart(ch)){ readChar(ctx); - int32_t scnd = peep(ctx); call_ERROR_CHECK; - readChar(ctx); - if (scnd == U'\\'){ + bool bs_multicode; + codeset_t bs_stuff; + + if (ch == '\\'){ backslash_expression_parsing_try_regular(ctx, cc, bs_multicode, bs_stuff); - if (bs_multicode) - call_THROW("second lvl: char range: bad escape expression after hyphen"); - ret_buf_for_new[0].second = bs_stuff[0].first; - } else if (soundsLikeCharOrRangeStart(scnd)){ - ret_buf_for_new[0].second = (uint32_t)scnd; + if (bs_multicode){ + result = merge_sets(result, bs_stuff); + goto repeat; + } else { + ret_buf_for_new = codeset_of_one_char(bs_stuff[0].first); + } } else { - call_THROW("second lvl: char range: bad value after hyphen"); + ret_buf_for_new = codeset_of_one_char(ch); } - if (ret_buf_for_new[0].second < ret_buf_for_new[0].first) - call_THROW("second: lvl: char range: invalid range"); + int32_t mCh = peep(ctx); call_ERROR_CHECK; + if (mCh == U'-'){ + readChar(ctx); + int32_t scnd = peep(ctx); call_ERROR_CHECK; + readChar(ctx); + if (scnd == U'\\'){ + backslash_expression_parsing_try_regular(ctx, cc, bs_multicode, bs_stuff); + if (bs_multicode) + call_THROW("second lvl: char range: bad escape expression after hyphen"); + ret_buf_for_new[0].second = bs_stuff[0].first; + } else if (soundsLikeCharOrRangeStart(scnd)){ + ret_buf_for_new[0].second = (uint32_t)scnd; + } else { + call_THROW("second lvl: char range: bad value after hyphen"); + } + if (ret_buf_for_new[0].second < ret_buf_for_new[0].first) + call_THROW("second: lvl: char range: invalid range"); + } + result = merge_sets(result, ret_buf_for_new); + goto repeat; } + return NULL; + } + + chekushka SecondLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { result = merge_sets(result, ret_buf_for_new); - goto repeat; + return firstTime(ctx, cc); } - return NULL; -} -chekushka SecondLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { - result = merge_sets(result, ret_buf_for_new); - return firstTime(ctx, cc); -} + /* ********* CircumflexLvl_ParseCall ********* */ -/* ********* CircumflexLvl_ParseCall ********* */ + chekushka CircumflexLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { + assert(readChar(ctx) == U'^'); + return std::make_shared(ret_buf_for_new); + } -chekushka CircumflexLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { - assert(readChar(ctx) == U'^'); - return std::make_shared(ret_buf_for_new); -} - -chekushka CircumflexLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { - result = invert_set(ret_buf_for_new); - return NULL; -} + chekushka CircumflexLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { + result = invert_set(ret_buf_for_new); + return NULL; + } -/* Aaaaaaaaand... The function we have all been waiting for so long! */ -codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc) { - std::vector> callStack; - codeset_t res; - callStack.push_back(std::make_shared(res)); - bool first_time = true; - while (!callStack.empty()){ - if (ctx.error) - return {}; - auto nxt = first_time ? callStack.back()->firstTime(ctx, cc) : callStack.back()->afterReceive(ctx, cc); - if (nxt){ - callStack.push_back(nxt); - first_time = true; - } else { - callStack.pop_back(); - first_time = false; + /* Aaaaaaaaand... The function we have all been waiting for so long! */ + codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc) { + std::vector> callStack; + codeset_t res; + callStack.push_back(std::make_shared(res)); + bool first_time = true; + while (!callStack.empty()){ + if (ctx.error) + return {}; + auto nxt = first_time ? callStack.back()->firstTime(ctx, cc) : callStack.back()->afterReceive(ctx, cc); + if (nxt){ + callStack.push_back(nxt); + first_time = true; + } else { + callStack.pop_back(); + first_time = false; + } } + return res; } - return res; } diff --git a/src/libregexis024sol/square_bracket_expression.h b/src/libregexis024sol/square_bracket_expression.h index 463ca10..7fe0945 100644 --- a/src/libregexis024sol/square_bracket_expression.h +++ b/src/libregexis024sol/square_bracket_expression.h @@ -5,6 +5,7 @@ #include #include -codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc); - +namespace regexis024 { + codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc); +} #endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H diff --git a/src/libregexis024sol/subexpr_fa_transformed.cpp b/src/libregexis024sol/subexpr_fa_transformed.cpp index cc83d3b..6e9f052 100644 --- a/src/libregexis024sol/subexpr_fa_transformed.cpp +++ b/src/libregexis024sol/subexpr_fa_transformed.cpp @@ -3,182 +3,184 @@ #include #include -SubExprCompiled subexpr_charset_reading_filter(const codeset_t &codeset, FA_Container &fa) { - return subexpression_from_path(fa.makeOneCharRead(codeset, false)); -} +namespace regexis024 { + SubExprCompiled subexpr_charset_reading_filter(const codeset_t &codeset, FA_Container &fa) { + return subexpression_from_path(fa.makeOneCharRead(codeset, false)); + } -SubExprCompiled join(const SubExprCompiled &A, const SubExprCompiled &B) { - if (!A.start) - return B; - if (!B.start) - return A; - SubExprCompiled res; - res.start = A.start; - for (FA_Node** ptrToptr : A.ends) - reattach_fa_node_edge(ptrToptr, B.start); - res.ends = B.ends; - res.can_be_empty = A.can_be_empty && B.can_be_empty; - return res; -} - -SubExprCompiled subexpression_from_path(FA_NodePathPart *node) { - SubExprCompiled res; - res.start = node; - res.ends.push_back(&(node->nxt_node)); - /* There is only one char reading path node type */ - res.can_be_empty = (node->type != one_char_read); - return res; -} - -SubExprCompiled RobertAngier(const SubExprCompiled& source, FA_Container& fa) { - SubExprCompiled res; - if (!source.start) + SubExprCompiled join(const SubExprCompiled &A, const SubExprCompiled &B) { + if (!A.start) + return B; + if (!B.start) + return A; + SubExprCompiled res; + res.start = A.start; + for (FA_Node** ptrToptr : A.ends) + reattach_fa_node_edge(ptrToptr, B.start); + res.ends = B.ends; + res.can_be_empty = A.can_be_empty && B.can_be_empty; return res; + } - struct Marked{ - FA_Node *original = NULL, *clone = NULL; - explicit Marked(FA_Node *original) : original(original) {} - }; - std::vector searched; - searched.push_back(Marked(source.start)); - source.start->search_mark = 0; + SubExprCompiled subexpression_from_path(FA_NodePathPart *node) { + SubExprCompiled res; + res.start = node; + res.ends.push_back(&(node->nxt_node)); + /* There is only one char reading path node type */ + res.can_be_empty = (node->type != one_char_read); + return res; + } - for (size_t done = 0; done < searched.size(); done++){ - FA_Node& v = *searched[done].original; - searched[done].clone = copy_fa_node(v, fa); - for (FA_Node **nxtN: searched[done].clone->get_all_transitions()){ - if (!(*nxtN)) - res.ends.push_back(nxtN); - else if ((**nxtN).search_mark < 0){ - (**nxtN).search_mark = (int64_t)searched.size(); - searched.emplace_back(*nxtN); + SubExprCompiled RobertAngier(const SubExprCompiled& source, FA_Container& fa) { + SubExprCompiled res; + if (!source.start) + return res; + + struct Marked{ + FA_Node *original = NULL, *clone = NULL; + explicit Marked(FA_Node *original) : original(original) {} + }; + std::vector searched; + searched.push_back(Marked(source.start)); + source.start->search_mark = 0; + + for (size_t done = 0; done < searched.size(); done++){ + FA_Node& v = *searched[done].original; + searched[done].clone = copy_fa_node(v, fa); + for (FA_Node **nxtN: searched[done].clone->get_all_transitions()){ + if (!(*nxtN)) + res.ends.push_back(nxtN); + else if ((**nxtN).search_mark < 0){ + (**nxtN).search_mark = (int64_t)searched.size(); + searched.emplace_back(*nxtN); + } } } - } - res.start = searched[0].clone; - for (Marked& mrkd: searched){ - for (FA_Node **nxtN: mrkd.clone->get_all_transitions()){ - if (*nxtN){ - assert((**nxtN).search_mark >= 0); - Marked& proc_nxt = searched[(**nxtN).search_mark]; - reattach_fa_node_edge(nxtN, proc_nxt.clone); + res.start = searched[0].clone; + for (Marked& mrkd: searched){ + for (FA_Node **nxtN: mrkd.clone->get_all_transitions()){ + if (*nxtN){ + assert((**nxtN).search_mark >= 0); + Marked& proc_nxt = searched[(**nxtN).search_mark]; + reattach_fa_node_edge(nxtN, proc_nxt.clone); + } } } + for (Marked& mrkd: searched) + mrkd.original->search_mark = -1; + return res; } - for (Marked& mrkd: searched) - mrkd.original->search_mark = -1; - return res; -} -void reattach_all_ends_to_one_node(SubExprCompiled& patient, FA_Node* node){ - assert(node); - assert(patient.start); - for (FA_Node** end: patient.ends){ - assert(!(*end)); - printf("DEBUG %lu->->->->->%lu\n", patient.start->nodeId, node->nodeId); - reattach_fa_node_edge(end, node); + void reattach_all_ends_to_one_node(SubExprCompiled& patient, FA_Node* node){ + assert(node); + assert(patient.start); + for (FA_Node** end: patient.ends){ + assert(!(*end)); + printf("DEBUG %lu->->->->->%lu\n", patient.start->nodeId, node->nodeId); + reattach_fa_node_edge(end, node); + } } -} -void apply_repeat_to_subexpression(SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed) { - assert(min_allowed <= max_allowed && min_allowed <= REGEXIS024_MAX_REPEAT); - if (!patient.start) - return; - bool infinite_repeat = max_allowed > REGEXIS024_MAX_REPEAT; - if (min_allowed == 0 && max_allowed == 0){ - patient = {}; - } else if (min_allowed == 1 && max_allowed == 1){ - /* Chill */ - } else if (min_allowed == 0 && infinite_repeat){ - FA_NodeOfForking* fn = fa.makeForking(); - add_option_to_fork_node(fn, patient.start); - for (FA_Node** old_end: patient.ends) - reattach_fa_node_edge(old_end, fn); - add_option_to_fork_node(fn, NULL); - patient.start = fn; - patient.ends = {&(fn->nxt_options[1])}; - } else if (min_allowed == 1 && infinite_repeat) { - FA_NodeOfForking* fn = fa.makeForking(); - reattach_all_ends_to_one_node(patient, fn); - add_option_to_fork_node(fn, patient.start); - add_option_to_fork_node(fn, NULL); - patient.ends = {&(fn->nxt_options[1])}; - } else if (min_allowed == 0 && max_allowed == 1){ - FA_NodeOfForking* fn = fa.makeForking(); - add_option_to_fork_node(fn, patient.start); - add_option_to_fork_node(fn, NULL); - patient.start = fn; - patient.ends.push_back(&(fn->nxt_options[1])); - } else if (infinite_repeat) { - std::vector Colon(min_allowed); - Colon[0] = patient; - for (size_t i = 1; i < min_allowed; i++) - Colon[i] = RobertAngier(patient, fa); - FA_NodeOfForking* fn = fa.makeForking(); - for (size_t i = 0; i + 1 < min_allowed; i++) - reattach_all_ends_to_one_node(Colon[i], Colon[i + 1].start); - reattach_all_ends_to_one_node(Colon[min_allowed - 1], fn); - add_option_to_fork_node(fn, Colon[min_allowed - 1].start); - add_option_to_fork_node(fn, NULL); - /* patient.start is the same (the original is at Colon[0] */ - patient.ends = {&(fn->nxt_options[1])}; - } else { - std::vector Avenue(max_allowed); - Avenue[max_allowed - 1] = patient; - for (size_t i = 0; i < max_allowed - 1; i++) - Avenue[i] = RobertAngier(patient, fa); - for (size_t i = 0; i + 1 < max_allowed; i++) - reattach_all_ends_to_one_node(Avenue[i], Avenue[i + 1].start); - FA_NodeOfForking* fn = fa.makeForking(); - if (min_allowed > 0){ - for (size_t i = 0; i <= max_allowed - min_allowed; i++) - add_option_to_fork_node(fn, Avenue[i].start); - } else { - for (size_t i = 0; i < max_allowed; i++) - add_option_to_fork_node(fn, Avenue[i].start); + void apply_repeat_to_subexpression(SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed) { + assert(min_allowed <= max_allowed && min_allowed <= REGEXIS024_MAX_REPEAT); + if (!patient.start) + return; + bool infinite_repeat = max_allowed > REGEXIS024_MAX_REPEAT; + if (min_allowed == 0 && max_allowed == 0){ + patient = {}; + } else if (min_allowed == 1 && max_allowed == 1){ + /* Chill */ + } else if (min_allowed == 0 && infinite_repeat){ + FA_NodeOfForking* fn = fa.makeForking(); + add_option_to_fork_node(fn, patient.start); + for (FA_Node** old_end: patient.ends) + reattach_fa_node_edge(old_end, fn); add_option_to_fork_node(fn, NULL); - patient.ends.push_back(&(fn->nxt_options[max_allowed])); + patient.start = fn; + patient.ends = {&(fn->nxt_options[1])}; + } else if (min_allowed == 1 && infinite_repeat) { + FA_NodeOfForking* fn = fa.makeForking(); + reattach_all_ends_to_one_node(patient, fn); + add_option_to_fork_node(fn, patient.start); + add_option_to_fork_node(fn, NULL); + patient.ends = {&(fn->nxt_options[1])}; + } else if (min_allowed == 0 && max_allowed == 1){ + FA_NodeOfForking* fn = fa.makeForking(); + add_option_to_fork_node(fn, patient.start); + add_option_to_fork_node(fn, NULL); + patient.start = fn; + patient.ends.push_back(&(fn->nxt_options[1])); + } else if (infinite_repeat) { + std::vector Colon(min_allowed); + Colon[0] = patient; + for (size_t i = 1; i < min_allowed; i++) + Colon[i] = RobertAngier(patient, fa); + FA_NodeOfForking* fn = fa.makeForking(); + for (size_t i = 0; i + 1 < min_allowed; i++) + reattach_all_ends_to_one_node(Colon[i], Colon[i + 1].start); + reattach_all_ends_to_one_node(Colon[min_allowed - 1], fn); + add_option_to_fork_node(fn, Colon[min_allowed - 1].start); + add_option_to_fork_node(fn, NULL); + /* patient.start is the same (the original is at Colon[0] */ + patient.ends = {&(fn->nxt_options[1])}; + } else { + std::vector Avenue(max_allowed); + Avenue[max_allowed - 1] = patient; + for (size_t i = 0; i < max_allowed - 1; i++) + Avenue[i] = RobertAngier(patient, fa); + for (size_t i = 0; i + 1 < max_allowed; i++) + reattach_all_ends_to_one_node(Avenue[i], Avenue[i + 1].start); + FA_NodeOfForking* fn = fa.makeForking(); + if (min_allowed > 0){ + for (size_t i = 0; i <= max_allowed - min_allowed; i++) + add_option_to_fork_node(fn, Avenue[i].start); + } else { + for (size_t i = 0; i < max_allowed; i++) + add_option_to_fork_node(fn, Avenue[i].start); + add_option_to_fork_node(fn, NULL); + patient.ends.push_back(&(fn->nxt_options[max_allowed])); + } + patient.start = fn; + /* patient.ends is the same (the original is Avenue.back()) */ } - patient.start = fn; - /* patient.ends is the same (the original is Avenue.back()) */ + if (min_allowed == 0) + patient.can_be_empty = true; } - if (min_allowed == 0) - patient.can_be_empty = true; -} -SubExprCompiled forkify(const std::vector &options, FA_Container& fa){ - SubExprCompiled result; - size_t non_empty = 0; - result.can_be_empty = false; - for (const SubExprCompiled& opt: options){ - result.can_be_empty |= opt.can_be_empty; - if (opt.start) - non_empty++; - } - if (non_empty == 0){ - result.can_be_empty = true; + SubExprCompiled forkify(const std::vector &options, FA_Container& fa){ + SubExprCompiled result; + size_t non_empty = 0; + result.can_be_empty = false; + for (const SubExprCompiled& opt: options){ + result.can_be_empty |= opt.can_be_empty; + if (opt.start) + non_empty++; + } + if (non_empty == 0){ + result.can_be_empty = true; + return result; + } + if (non_empty == 1){ + for (const SubExprCompiled& opt: options) + if (opt.start){ + result = opt; + break; + } + } else { + FA_NodeOfForking* n1 = fa.makeForking(); + result.start = n1; + n1->nxt_options.reserve(non_empty); + for (const SubExprCompiled& opt: options) + if (opt.start){ + add_option_to_fork_node(n1, opt.start); + for (FA_Node** end: opt.ends) + result.ends.push_back(end); + } + } return result; } - if (non_empty == 1){ - for (const SubExprCompiled& opt: options) - if (opt.start){ - result = opt; - break; - } - } else { - FA_NodeOfForking* n1 = fa.makeForking(); - result.start = n1; - n1->nxt_options.reserve(non_empty); - for (const SubExprCompiled& opt: options) - if (opt.start){ - add_option_to_fork_node(n1, opt.start); - for (FA_Node** end: opt.ends) - result.ends.push_back(end); - } - } - return result; -} -void SubExprCompiled::assertDefault() { - assert(!start && ends.empty() && can_be_empty); + void SubExprCompiled::assertDefault() { + assert(!start && ends.empty() && can_be_empty); + } } diff --git a/src/libregexis024sol/subexpr_fa_transformed.h b/src/libregexis024sol/subexpr_fa_transformed.h index a267468..a872c70 100644 --- a/src/libregexis024sol/subexpr_fa_transformed.h +++ b/src/libregexis024sol/subexpr_fa_transformed.h @@ -3,30 +3,31 @@ #include -struct SubExprCompiled{ - FA_Node* start = NULL; - /* After putting there values from neighbour vectors in nodes, these vectors must not change size */ - std::vector ends; - bool can_be_empty = true; +namespace regexis024 { + struct SubExprCompiled{ + FA_Node* start = NULL; + /* After putting there values from neighbour vectors in nodes, these vectors must not change size */ + std::vector ends; + bool can_be_empty = true; - void assertDefault(); -}; + void assertDefault(); + }; -SubExprCompiled subexpr_charset_reading_filter(const codeset_t& codeset, FA_Container& fa); + SubExprCompiled subexpr_charset_reading_filter(const codeset_t& codeset, FA_Container& fa); -SubExprCompiled join(const SubExprCompiled& A, const SubExprCompiled& B); + SubExprCompiled join(const SubExprCompiled& A, const SubExprCompiled& B); -SubExprCompiled forkify(const std::vector& options, FA_Container& fa); + SubExprCompiled forkify(const std::vector& options, FA_Container& fa); -SubExprCompiled subexpression_from_path(FA_NodePathPart* node); + SubExprCompiled subexpression_from_path(FA_NodePathPart* node); -/* And then Robert Angier said `It's prestige time` and prestiged all over the place. - * If you still don't get it, this function copies section of NFA of regexp */ -SubExprCompiled RobertAngier(const SubExprCompiled& source, FA_Container& fa); + /* And then Robert Angier said `It's prestige time` and prestiged all over the place. + * If you still don't get it, this function copies section of NFA of regexp */ + SubExprCompiled RobertAngier(const SubExprCompiled& source, FA_Container& fa); #define REGEXIS024_MAX_REPEAT 64 -/* pass REGEXIS024_MAX_REPEAT + 1 as max_allowed to allow infinite repeat */ -void apply_repeat_to_subexpression(SubExprCompiled& patient, FA_Container& fa, size_t min_allowed, size_t max_allowed); - + /* pass REGEXIS024_MAX_REPEAT + 1 as max_allowed to allow infinite repeat */ + void apply_repeat_to_subexpression(SubExprCompiled& patient, FA_Container& fa, size_t min_allowed, size_t max_allowed); +} #endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H diff --git a/src/libregexis024test/byte_code_assembler.h b/src/libregexis024test/byte_code_assembler.h index cd586ec..4d9555e 100644 --- a/src/libregexis024test/byte_code_assembler.h +++ b/src/libregexis024test/byte_code_assembler.h @@ -11,15 +11,17 @@ #include #include +using namespace regexis024; + struct assembler_context_bookmark{ - regex_near_ptr_t pos_in_r024program; + near_ptr_t pos_in_r024program; int LINE; }; struct pending_bookmark{ /* Must fill this byte with pos of pos_in_r024program in assembler_context_bookmark * In a sense, this is a pointer to a NULL pointer that is yet to become normal kinda pointer */ - regex_near_ptr_t pos_in_r024program; + near_ptr_t pos_in_r024program; const char* name; /* LINE of the reference is needed in case of error */ int LINE; @@ -46,7 +48,7 @@ struct assembler_context{ } /* pending bookmerk requests should be added only with beg_for_bookmark method, * or else SEGFAULT will be your frequent guest */ - *reinterpret_cast(&result[br.pos_in_r024program]) = bookmarks[br.name].pos_in_r024program; + *reinterpret_cast(&result[br.pos_in_r024program]) = bookmarks[br.name].pos_in_r024program; } } diff --git a/src/libregexis024test/byte_code_disassembler.h b/src/libregexis024test/byte_code_disassembler.h index e1e6dd7..c6cd226 100644 --- a/src/libregexis024test/byte_code_disassembler.h +++ b/src/libregexis024test/byte_code_disassembler.h @@ -11,8 +11,9 @@ #include #include #include +#include -// TODO: apply here my new change in near pointer size +using namespace regexis024; struct landing_place_resolvance{ size_t name_id; @@ -34,12 +35,14 @@ void print_disassembly(size_t prgSize, uint8_t* prg){ }; uint64_t used_names = 0; /* From program position -> to names[ind] & */ - std::map bookmarks; - regex_near_ptr_t IP = 0; + std::map bookmarks; + near_ptr_t IP = 0; auto check_inboundness = [&](int region){ - if (!vmprog_check_inboundness(prgSize, IP, region)) - exitf("This program can't be decomposed into commands in a trivial way"); + if (!vmprog_check_inboundness(prgSize, IP, region)) { + fprintf(stderr, "This program can't be decomposed into commands in a trivial way"); + std::terminate(); + } }; auto extract_b = [&]() -> uint8_t{ check_inboundness(1); @@ -60,19 +63,19 @@ void print_disassembly(size_t prgSize, uint8_t* prg){ auto extract_instruction = [&]() -> uint8_t{ return extract_b(); }; - auto extract_sslot_id = [&]() -> regex_sslot_id_t{ + auto extract_sslot_id = [&]() -> sslot_id_t{ return extract_dw(); }; - auto extract_near_pointer = [&]() -> regex_near_ptr_t{ + auto extract_near_pointer = [&]() -> near_ptr_t{ return extract_qw(); }; - auto extract_track_array_index = [&]() -> regex_tai_t{ + auto extract_track_array_index = [&]() -> tai_t{ return extract_w(); }; bool second_phase = false; - auto fph_register_landing = [&](regex_near_ptr_t pos){ + auto fph_register_landing = [&](near_ptr_t pos){ if (!second_phase){ if (bookmarks.count(pos) == 0){ if (used_names == names.size()) @@ -83,15 +86,17 @@ void print_disassembly(size_t prgSize, uint8_t* prg){ } }; - auto get_bookmark_in_2phase = [&](regex_near_ptr_t pos) -> std::string { - if (bookmarks.count(pos) == 0) - exitf("bruh"); + auto get_bookmark_in_2phase = [&](near_ptr_t pos) -> std::string { + if (bookmarks.count(pos) == 0) { + fprintf(stderr, "Bruh\n"); + std::terminate(); + } return names[bookmarks[pos].name_id]; }; auto one_reading = [&](){ while (IP < prgSize) { - regex_near_ptr_t start_pos = IP; + near_ptr_t start_pos = IP; if (second_phase){ if (bookmarks.count(IP) != 0){ printf("%s:\n", get_bookmark_in_2phase(IP).c_str()); @@ -102,11 +107,11 @@ void print_disassembly(size_t prgSize, uint8_t* prg){ switch (opcode) { #define secPrint(fmt, ...) if (second_phase) {printf("% 3lu) " fmt, start_pos, __VA_ARGS__);} } break; #define secPrintNoArg(str) if (second_phase) {printf("% 3lu) " str, start_pos);} } break; -#define instCase(oper_code) case regex024_opcodes::oper_code: { +#define instCase(oper_code) case opcodes::oper_code: { #define jcMess(cond, sz_uppercase, x_t, extract_method, printf_sign) \ instCase(JC ## cond ## _ ## sz_uppercase) \ x_t x = extract_method(); \ - regex_near_ptr_t dest = extract_near_pointer(); \ + near_ptr_t dest = extract_near_pointer(); \ fph_register_landing(dest); \ secPrint("JC" #cond "_" #sz_uppercase " %" printf_sign " $%s\n", x, get_bookmark_in_2phase(dest).c_str()) #define jcCacaphony(cond) \ @@ -131,22 +136,22 @@ void print_disassembly(size_t prgSize, uint8_t* prg){ instCase(FORK) uint32_t ssid = extract_sslot_id(); - regex_near_ptr_t dest = extract_near_pointer(); + near_ptr_t dest = extract_near_pointer(); fph_register_landing(dest); secPrint("FORK %u $%s\n", ssid, get_bookmark_in_2phase(dest).c_str()) simpleDimple(MATCH) simpleDimple(DIE) instCase(PARAM_READ_SS_NUMBER) - regex_sslot_id_t ssid_max_plus_one = extract_sslot_id(); + sslot_id_t ssid_max_plus_one = extract_sslot_id(); secPrint("PARAM_READ_SS_NUMBER %u\n", ssid_max_plus_one) instCase(PARAM_FORK_SS_NUMBER) - regex_sslot_id_t ssid_max_plus_one = extract_sslot_id(); + sslot_id_t ssid_max_plus_one = extract_sslot_id(); secPrint("PARAM_FORK_SS_NUMBER %u\n", ssid_max_plus_one) instCase(PARAM_SELARR_LEN) - regex_tai_t tai_max_plus_one = extract_track_array_index(); + tai_t tai_max_plus_one = extract_track_array_index(); secPrint("PARAM_SELARR_LEN %hu\n", tai_max_plus_one) instCase(PARAM_COLSIFTFUNC_SET) - regex_near_ptr_t entry = extract_near_pointer(); + near_ptr_t entry = extract_near_pointer(); fph_register_landing(entry); secPrint("PARAM_COLSIFTFUNC_SET $%s\n", get_bookmark_in_2phase(entry).c_str()) simpleDimple(PARAM_COLSIFTFUNC_WIPE) @@ -156,36 +161,37 @@ void print_disassembly(size_t prgSize, uint8_t* prg){ instCase(MSG_FED_INPUT_EXTENDED) uint8_t left = extract_b(); uint8_t right = extract_b(); - regex_sslot_id_t part = extract_sslot_id(); + sslot_id_t part = extract_sslot_id(); secPrint("MSG_FED_INPUT_EXTENDED %hhu %hhu %u\n", left, right, part) instCase(DMOV_RABX_SELARR) - regex_tai_t i = extract_track_array_index(); + tai_t i = extract_track_array_index(); secPrint("DMOV_RABX_SELARR %hu\n", i) instCase(DDIST_RABX_SELARR) - regex_tai_t s = extract_track_array_index(); - regex_tai_t e = extract_track_array_index(); + tai_t s = extract_track_array_index(); + tai_t e = extract_track_array_index(); secPrint("DDIST_RABX_SELARR %hu %hu\n", s, e); simpleDimple(SIFTPRIOR_MIN_RABX) simpleDimple(SIFTPRIOR_MAX_RABX) simpleDimple(SIFT_DONE) instCase(MOV_COLARR_IMM) - regex_tai_t tai = extract_track_array_index(); + tai_t tai = extract_track_array_index(); uint64_t imm = extract_qw(); secPrint("MOV_COLARR_IMM %hu %lu\n", tai, imm); instCase(MOV_COLARR_BTPOS) - regex_tai_t tai = extract_track_array_index(); + tai_t tai = extract_track_array_index(); secPrint("MOV_COLARR_BTPOS %hu\n", tai); instCase(MOV_SELARR_IMM) - regex_tai_t tai = extract_track_array_index(); + tai_t tai = extract_track_array_index(); uint64_t imm = extract_qw(); secPrint("MOV_SELARR_IMM %hu %lu\n", tai, imm); instCase(MOV_SELARR_CHPOS) - regex_tai_t tai = extract_track_array_index(); + tai_t tai = extract_track_array_index(); secPrint("MOV_SELARR_CHPOS %hu\n", tai); simpleDimple(INIT) simpleDimple(THROW) default: - exitf("Bad opcode\n"); + fprintf(stderr, "Bad opcode\n"); + std::terminate(); #undef secPrint #undef secPrintNoArg #undef instCase diff --git a/src/libregexis024test/test0.cpp b/src/libregexis024test/test0.cpp index b95fd4c..3f7ecf5 100644 --- a/src/libregexis024test/test0.cpp +++ b/src/libregexis024test/test0.cpp @@ -2,12 +2,16 @@ #include #include +using namespace regexis024; + void test_ccs_fnc(const codeset_t &got, const codeset_t &expected){ static int id = 1; - if (got == expected) + if (got == expected) { printf("Test %d passed\n", id++); - else - exitf("Test %d failed\n", id); + } else { + printf("Test %d failed\n", id); + std::terminate(); + } } void invert_test(const codeset_t& A, const codeset_t& C){ diff --git a/src/libregexis024test/test1.cpp b/src/libregexis024test/test1.cpp index 0a5f0bd..7e58830 100644 --- a/src/libregexis024test/test1.cpp +++ b/src/libregexis024test/test1.cpp @@ -10,8 +10,8 @@ static int test_id = 0; void do_test(const std::vector& prg, const std::string& str, const std::vector& prefix_matching){ assert(str.size() + 1 == prefix_matching.size()); - REGEX_IS024_CONTEXT ctx{prg.size(), prg.data(), 0, 0, 1000, 1000, 1000000}; - regex024_error_code ret; + VMContext ctx{prg.size(), prg.data(), 0, 0, 1000, 1000, 1000000}; + error_code_t ret; // todo printf("TEST %d passed\n", test_id); test_id++; diff --git a/src/libregexis024test/test2.cpp b/src/libregexis024test/test2.cpp index 5a115c2..f3cd5db 100644 --- a/src/libregexis024test/test2.cpp +++ b/src/libregexis024test/test2.cpp @@ -1,6 +1,8 @@ #include #include +using namespace regexis024; + int main(){ std::string regular_expression = "\\>1*"; REGEX_IS024_MeaningContext regex(regular_expression.size(), regular_expression.c_str()); diff --git a/src/libregexis024test/test3.cpp b/src/libregexis024test/test3.cpp index 051115f..17bae07 100644 --- a/src/libregexis024test/test3.cpp +++ b/src/libregexis024test/test3.cpp @@ -7,6 +7,8 @@ #include #include +using namespace regexis024; + struct test_id_t { int test_id; int subtest_id; @@ -214,4 +216,4 @@ int main() { {{{UINT32_MAX, UINT32_MAX}}, {4, 5}}, }); return 0; -} \ No newline at end of file +} diff --git a/src/libregexis024test/test4.cpp b/src/libregexis024test/test4.cpp index a7f645a..cca5335 100644 --- a/src/libregexis024test/test4.cpp +++ b/src/libregexis024test/test4.cpp @@ -21,19 +21,54 @@ void test(const string& input, const string& pattern, const MatchInfo& right_ans } int main() { + test("11aa", "^!A;\\B!A;\\b!any;\\B!any;$", MatchInfo({}, {})); + test("aa11", "^!A;\\B!A;\\b!any;\\B!any;$", MatchInfo({}, {})); + test("a111", "^!A;\\B!A;\\b!any;\\B!any;$", MatchInfo()); + test("aa11", "^!A;\\B!A;\\B!any;\\B!any;$", MatchInfo()); + test("1a11", "^!A;\\B!A;\\B!any;\\B!any;$", MatchInfo()); + test("11aa", "!dfa;^!A;\\B!A;\\b!any;\\B!any;$", MatchInfo({}, {})); + test("aa11", "!dfa;^!A;\\B!A;\\b!any;\\B!any;$", MatchInfo({}, {})); + test("a111", "!dfa;^!A;\\B!A;\\b!any;\\B!any;$", MatchInfo()); + test("aa11", "!dfa;^!A;\\B!A;\\B!any;\\B!any;$", MatchInfo()); + test("1a11", "!dfa;^!A;\\B!A;\\B!any;\\B!any;$", MatchInfo()); + test("LINE\r\nFirst:Second\r\nThird:12\r\n\r\n", + "!dfa;!select{fieldname{ca}fieldbody{ca}}^^^LINE\r\n(#fieldname([\\u0021-\\u007E&^:]+):#fieldbody([\\u0000-\\u007F&^\r\n]*)\r\n)*\r\n$$$", + MatchInfo({{0, 6}, {1, 11}, {2, 12}, {3, 18}, {0, 20}, {1, 25}, {2, 26}, {3, 28}}, {20, 25, 26, 28})); + test("LINE\r\nFirst:Second\r\nThird:12\r\n\r\n", + "!dfa;!select{fieldname{ca}fieldbody{ca}}^LINE\r\n(#fieldname([\\u0021-\\u007E&^:]+\\>):#fieldbody([\\u0000-\\u007F&^\r\n]*)\r\n)*\r\n", + MatchInfo({{0, 6}, {1, 11}, {2, 12}, {3, 18}, {0, 20}, {1, 25}, {2, 26}, {3, 28}}, {20, 25, 26, 28})); + test("LINE\r\nFirst:Second\r\nThird:12\r\n\r\n", + "!dfa;!select{fieldname{ca}fieldbody{ca}}^LINE\r\n(#fieldname([\\u0021-\\u007E&^:]+):#fieldbody([\\u0000-\\u007F&^\r\n]*)\r\n)*\r\n", + MatchInfo({{0, 6}, {1, 11}, {2, 12}, {3, 18}, {0, 20}, {1, 25}, {2, 26}, {3, 28}}, {20, 25, 26, 28})); + test("LINE\r\nFirst:Second\r\n\r\n", + "!select{fieldname{ca}}LINE\r\n(#fieldname([\\u0021-\\u007E&^:]+):#fieldbody([\\u0000-\\u007F&^\r\n]*)\r\n)*\r\n", + MatchInfo({{0, 6}, {1, 11}, {2, 12}, {3, 18}}, {6, 11})); + test("LINE\r\nFirst:Second\r\n\r\n", + "!select{fieldname}LINE\r\n(#fieldname([\\u0021-\\u007E&^:]+):#fieldbody([\\u0000-\\u007F&^\r\n]*)\r\n)*\r\n", + MatchInfo({{0, 12}, {1, 18}}, {6, 11})); + test("LINE\r\nFirst:Second\r\nThird:12\r\n\r\n", + "!select{fieldname{ca}fieldbody{ca}}LINE\r\n(#fieldname([\\u0021-\\u007E&^:]+):#fieldbody([\\u0000-\\u007F&^\r\n]*)\r\n)*\r\n", + MatchInfo({{0, 6}, {1, 11}, {2, 12}, {3, 18}, {0, 20}, {1, 25}, {2, 26}, {3, 28}}, {20, 25, 26, 28})); + test("абвгд", "абвгд", MatchInfo({}, {})); + test("абвввввввгд", "абв*г+д", MatchInfo({}, {})); + test("абвввввввд", "абв*г+д", MatchInfo()); + test("LINE\r\nFirst:Second\r\nThird:12\r\n\r\n", + "!dfa;^LINE\r\n(#fieldname([\\u0021-\\u007E&^:]+):#fieldbody([\\u0000-\\u007F&^\r\n]*)\r\n)*\r\n", + MatchInfo({{0, 6}, {1, 11}, {2, 12}, {3, 18}, {0, 20}, {1, 25}, {2, 26}, {3, 28}}, {})); + test("LINE\r\nFirst:Second\r\n\r\n", + "LINE\r\n(#fieldname([\\u0021-\\u007E&^:]+):#fieldbody([\\u0000-\\u007F&^\r\n]*)\r\n)*\r\n", + MatchInfo({{0, 6}, {1, 11}, {2, 12}, {3, 18}}, {})); test("C111111111111", "C\\>1*", MatchInfo({}, {})); - // return 0; - test("GET / HTTP/1.1\r\nHost: bibura sosat\r\nLos-es-raus: a\rfaafafdf\r\n\r\n", + test("GET / HTTP/1.1\r\nHost: example.com\r\nAAAAA: a\rfaafafdf\r\n\r\n", "!dfa;(GET|POST) / HTTP/(1.1|1.0|0.9)\r\n([\\u0021-\\u007E&^:]+:([\\u0000-\\u007F&^\r\n])*\r\n)*\r\n", MatchInfo()); test("\r24234\r\n", "[\\u0000-\\u007F&^\r\n]*\r\n", MatchInfo()); test("\n3432\r\n", "[\\u0000-\\u007F&^\r\n]*\r\n", MatchInfo()); test("3:::;;432\r\n", "[\\u0000-\\u007F&^\r\n]*\r\n", MatchInfo({}, {})); test("3:::;;432 \r\n", "[\\u0000-\\u007F&^\r\n]*\r\n", MatchInfo({}, {})); - test("GET / HTTP/0.9\r\nHost: bibura sosat\r\nLos-es-raus: afaafafdf\r\n\r\n", + test("GET / HTTP/0.9\r\nHost: bibur at\r\nContent-type: html\r\n\r\n", "^(GET|POST\\>) / HTTP/(1.1|1.0|0.9)\r\n([\\u0021-\\u007E&^:]+:([\\u0000-\\u007F&^\r\n])*\r\n)*\r\n", MatchInfo({}, {})); - // return 0; test("b", "#boba(b)", MatchInfo({{0, 0}, {1, 1}}, {})); test("abc", "!selarr{boba{ca}}^a#boba(b)c$", MatchInfo({{0, 1}, {1, 2}}, {1, 2})); for (int i = 0; i < 64; i++) { diff --git a/src/libregexis024tools/stringmatching.cpp b/src/libregexis024tools/stringmatching.cpp index f4f9338..9ae81a1 100644 --- a/src/libregexis024tools/stringmatching.cpp +++ b/src/libregexis024tools/stringmatching.cpp @@ -7,102 +7,104 @@ // using namespace regexis024; -void convert(regexis024::TrackingVariableInfo& to, const SubtrackingNameInfo& from) { +namespace regexis024 { + void convert(TrackingVariableInfo& to, const SubtrackingNameInfo& from) { #define plagiat(field) to.field = from.field; - plagiat(type); - plagiat(colarr_first); - plagiat(colarr_second); - plagiat(stored_in_ca); - plagiat(selarr_first); - plagiat(selarr_second); - plagiat(stored_in_sa); + plagiat(type); + plagiat(colarr_first); + plagiat(colarr_second); + plagiat(stored_in_ca); + plagiat(selarr_first); + plagiat(selarr_second); + plagiat(stored_in_sa); #undef plagiat -} + } -int regexis024::matchStrToRegexp(const std::string& input, const std::string& pattern, - MatchInfo& retMatchInfo, track_var_list& retTrackVarList, std::string& retStatus) -{ - retTrackVarList = {}; - retMatchInfo = MatchInfo(); - retStatus = ""; - REGEX_IS024_MeaningContext regexp(pattern.size(), pattern.data()); - if (regexp.error) { - retStatus = "Pattern compilation. " + regexp.error_msg; - return -1; - } - retTrackVarList = {}; - for (auto& iip: regexp.ktr.track_names) { - convert(retTrackVarList[iip.first], regexp.ktr.retrieval_info[iip.second]); - } - REGEX_IS024_VirtualMachine vm(regexp.compiled_program.size(), regexp.compiled_program.data(), - UINT64_MAX, UINT16_MAX, - UINT32_MAX, UINT32_MAX, UINT64_MAX); - auto getVMErrString = [&]() -> std::string { - return std::string(regex024_error_code_tostr(vm.getErrno())); - }; - - if (vm.initialize() != regex024_error_codes::stable) { - retStatus = "Virtual machine initialization. " + getVMErrString(); - return -1; - } - int left_ext_feed = vm.getInputLeftExtensionSize(); - int right_ext_feed = vm.getInputRightExtensionSize(); - if (left_ext_feed > 1 || right_ext_feed > 1) { - retStatus = "Unnatural extended input request."; - return -1; - } - if (vm.addNewMatchingThread() != regex024_error_codes::stable) { - retStatus = "Virtual machine first kick. " + getVMErrString(); - } - if (left_ext_feed) { - if (vm.extendedFeedCharacter('\n') != regex024_error_codes::stable) { - retStatus = "VM left extended input. " + getVMErrString(); + int matchStrToRegexp(const std::string& input, const std::string& pattern, + MatchInfo& retMatchInfo, track_var_list& retTrackVarList, std::string& retStatus) + { + retTrackVarList = {}; + retMatchInfo = MatchInfo(); + retStatus = ""; + REGEX_IS024_MeaningContext regexp(pattern.size(), pattern.data()); + if (regexp.error) { + retStatus = "Pattern compilation. " + regexp.error_msg; return -1; } - } - for (size_t cur_text_pos = 0;cur_text_pos < input.size();) { - int32_t inp_code; - size_t adj; - utf8_string_iterat(inp_code, adj, cur_text_pos, reinterpret_cast(input.data()), input.size()); - if (inp_code < 0) { - retStatus = "Input string encoding error."; + retTrackVarList = {}; + for (auto& iip: regexp.ktr.track_names) { + convert(retTrackVarList[iip.first], regexp.ktr.retrieval_info[iip.second]); + } + VirtualMachine vm(regexp.compiled_program.size(), regexp.compiled_program.data(), + UINT64_MAX, UINT16_MAX, + UINT32_MAX, UINT32_MAX, UINT64_MAX); + auto getVMErrString = [&]() -> std::string { + return std::string(error_code_to_str(vm.getErrno())); + }; + + if (vm.initialize() != error_codes::stable) { + retStatus = "Virtual machine initialization. " + getVMErrString(); return -1; } - if (vm.feedCharacter(static_cast(inp_code), adj) != regex024_error_codes::stable) { - retStatus = "VM input. " + getVMErrString(); + int left_ext_feed = vm.getInputLeftExtensionSize(); + int right_ext_feed = vm.getInputRightExtensionSize(); + if (left_ext_feed > 1 || right_ext_feed > 1) { + retStatus = "Unnatural extended input request."; return -1; } - cur_text_pos += adj; - } - if (right_ext_feed) { - if (vm.extendedFeedCharacter('\n') != regex024_error_codes::stable) { - retStatus = "VM right extended input. " + getVMErrString(); - return -1; + if (vm.addNewMatchingThread() != error_codes::stable) { + retStatus = "Virtual machine first kick. " + getVMErrString(); } + if (left_ext_feed) { + if (vm.extendedFeedCharacter('\n') != error_codes::stable) { + retStatus = "VM left extended input. " + getVMErrString(); + return -1; + } + } + for (size_t cur_text_pos = 0;cur_text_pos < input.size();) { + int32_t inp_code; + size_t adj; + utf8_string_iterat(inp_code, adj, cur_text_pos, input.data(), input.size()); + if (inp_code < 0) { + retStatus = "Input string encoding error."; + return -1; + } + if (vm.feedCharacter(static_cast(inp_code), adj) != error_codes::stable) { + retStatus = "VM input. " + getVMErrString(); + return -1; + } + cur_text_pos += adj; + } + if (right_ext_feed) { + if (vm.extendedFeedCharacter('\n') != error_codes::stable) { + retStatus = "VM right extended input. " + getVMErrString(); + return -1; + } + } + assert(vm.isUsable()); + if (vm.isMatched()) { + retMatchInfo.have_match = true; + size_t SN1 = vm.getSelectionArrayLength(); + retMatchInfo.sa.assign(SN1, 0); + for (size_t i = 0; i < SN1; i++) + retMatchInfo.sa[i] = vm.getMatchedThreadSAValue(i); + retMatchInfo.ca_history = vm.getMatchedThreadCABranchReverse(); + std::reverse(retMatchInfo.ca_history.begin(), retMatchInfo.ca_history.end()); + } + return 0; } - assert(vm.isUsable()); - if (vm.isMatched()) { - retMatchInfo.have_match = true; - size_t SN1 = vm.getSelectionArrayLength(); - retMatchInfo.sa.assign(SN1, 0); - for (size_t i = 0; i < SN1; i++) - retMatchInfo.sa[i] = vm.getMatchedThreadSAValue(i); - retMatchInfo.ca_history = vm.getMatchedThreadCABranchReverse(); - std::reverse(retMatchInfo.ca_history.begin(), retMatchInfo.ca_history.end()); + + bool MatchInfo::operator==(const MatchInfo &other) const { + if (!have_match && !other.have_match) + return true; + return (have_match == other.have_match) && (sa == other.sa) && (ca_history == other.ca_history); } - return 0; -} -bool regexis024::MatchInfo::operator==(const MatchInfo &other) const { - if (!have_match && !other.have_match) - return true; - return (have_match == other.have_match) && (sa == other.sa) && (ca_history == other.ca_history); -} + bool MatchInfo::operator!=(const MatchInfo &other) const { + return !(*this == other); + } -bool regexis024::MatchInfo::operator!=(const MatchInfo &other) const { - return !(*this == other); -} - -regexis024::MatchInfo::MatchInfo(const std::vector &ca_history, const std::vector &sa): - ca_history(ca_history), sa(sa), have_match(true) { -} + MatchInfo::MatchInfo(const std::vector &ca_history, const std::vector &sa): + ca_history(ca_history), sa(sa), have_match(true) { + } +} \ No newline at end of file diff --git a/src/libregexis024tools/stringmatching.h b/src/libregexis024tools/stringmatching.h index 1321144..ac96854 100644 --- a/src/libregexis024tools/stringmatching.h +++ b/src/libregexis024tools/stringmatching.h @@ -11,7 +11,7 @@ namespace regexis024 { bool stored_in_ca = true; bool stored_in_sa = false; - tracking_var_type type; + tracking_var_type_t type; /* These fields will be -1 if unused */ int colarr_first = -1; int colarr_second = -1; @@ -24,7 +24,7 @@ namespace regexis024 { struct MatchInfo { bool have_match = false; - std::vector ca_history; + std::vector ca_history; std::vector sa; bool operator==(const MatchInfo& other) const ; @@ -32,7 +32,7 @@ namespace regexis024 { MatchInfo() = default; - MatchInfo(const std::vector &ca_history, const std::vector &sa); + MatchInfo(const std::vector &ca_history, const std::vector &sa); }; int matchStrToRegexp(const std::string& input, const std::string& pattern, diff --git a/src/libregexis024vm/instruction_implementation.cpp b/src/libregexis024vm/instruction_implementation.cpp index fb41051..f1cd5bf 100644 --- a/src/libregexis024vm/instruction_implementation.cpp +++ b/src/libregexis024vm/instruction_implementation.cpp @@ -1,491 +1,494 @@ #include #include -void swap_old_settled_and_new_active(REGEX_IS024_CONTEXT &ctx, REGEX_IS024_Thread& old_settled){ - ctx_print_debug(ctx); - assert(old_settled.slot_occupation_status == SLOT_OCCUPIED_val); - REGEX_IS024_Thread temp = old_settled; - old_settled = ctx.active_thread; - old_settled.slot_occupation_status = SLOT_NEW_val; - ctx.active_thread = temp; - // slot_occupation_status & SLOT_OCCUPIED of actie thread is true, because it was retrieved from old_settled -} - -void start_noncloning_conflict(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Thread& other){ - ctx_print_debug(ctx); - if (ctx.have_sift_function){ - ctx.sifting_with = &other; - ctx.who_started_sift = regex024_opcode::READ; - ctx.intruder_IP = ctx.active_thread.IP; - ctx.active_thread.IP = ctx.sift_function; - ctx.RAX = ctx.RBX = 0; - } else { - ctx.active_thread.delete_thread(); - ctx.try_to_continue_scheduled(); +namespace regexis024 { + void swap_old_settled_and_new_active(VMContext &ctx, Thread& old_settled){ + ctx_print_debug(ctx); + assert(old_settled.slot_occupation_status == SLOT_OCCUPIED_val); + Thread temp = old_settled; + old_settled = ctx.active_thread; + old_settled.slot_occupation_status = SLOT_NEW_val; + ctx.active_thread = temp; + // slot_occupation_status & SLOT_OCCUPIED of active thread is true, because it was retrieved from old_settled } -} -/* The one that drops as an intruder here is current active.thread.IP */ -void start_cloning_conflict(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Thread& other, regex_near_ptr_t clone_IP){ - ctx_print_debug(ctx); - if (ctx.have_sift_function){ - ctx.sifting_with = &other; - ctx.who_started_sift = regex024_opcode::FORK; - ctx.intruder_IP = ctx.active_thread.IP; - ctx.child_ret_IP = clone_IP; - ctx.active_thread.IP = ctx.sift_function; - ctx.RAX = ctx.RBX = 0; - } else { - ctx.active_thread.IP = clone_IP; + void start_noncloning_conflict(VMContext& ctx, Thread& other){ + ctx_print_debug(ctx); + if (ctx.have_sift_function){ + ctx.sifting_with = &other; + ctx.who_started_sift = opcode_t::READ; + ctx.intruder_IP = ctx.active_thread.IP; + ctx.active_thread.IP = ctx.sift_function; + ctx.RAX = ctx.RBX = 0; + } else { + ctx.active_thread.delete_thread(); + ctx.try_to_continue_scheduled(); + } + } + + /* The one that drops as an intruder here is current active.thread.IP */ + void start_cloning_conflict(VMContext& ctx, Thread& other, near_ptr_t clone_IP){ + ctx_print_debug(ctx); + if (ctx.have_sift_function){ + ctx.sifting_with = &other; + ctx.who_started_sift = opcode_t::FORK; + ctx.intruder_IP = ctx.active_thread.IP; + ctx.child_ret_IP = clone_IP; + ctx.active_thread.IP = ctx.sift_function; + ctx.RAX = ctx.RBX = 0; + } else { + ctx.active_thread.IP = clone_IP; + } } -} #define initialization_phase_check() if (ctx.initialized){ \ - ctx.error = regex024_error_codes::too_late; return; } +ctx.error = error_codes::too_late; return; } #define general_matching_mode_check() if (!ctx.initialized){ \ - ctx.error = regex024_error_codes::too_early; return; } if(ctx.sifting_with){ \ - ctx.error = regex024_error_codes::instruction_not_for_collision_thread; return; } +ctx.error = error_codes::too_early; return; } if(ctx.sifting_with){ \ +ctx.error = error_codes::instruction_not_for_collision_thread; return; } #define sift_mode_check() if (!ctx.sifting_with){ \ - ctx.error = regex024_error_codes::instruction_not_for_collision_thread; return; } +ctx.error = error_codes::instruction_not_for_collision_thread; return; } -/* Can append to both read_halted+new stacks of context */ -void read_halted_new_type_stacks_append(REGEX_IS024_CONTEXT &ctx, regex_sslot_id_t ssid){ - ctx_print_debug(ctx); - if (ssid < ctx.portion_of_FIRST_read_halt_ns){ - ctx.READ_halted_stack_new_first.append(ssid); - } else { - ctx.READ_halted_stack_new_second.append(ssid); - } -} - -void do_i_read(REGEX_IS024_CONTEXT &ctx, regex_sslot_id_t ssid) { - ctx_print_debug(ctx); - general_matching_mode_check() - if (ssid >= ctx.read_slots_number) - smitsya(read_sslot_out_of_range); - REGEX_IS024_Thread& other = ctx.READ_halted_slots[ssid]; - if (other.slot_occupation_status & SLOT_OCCUPIED){ - if (other.slot_occupation_status & SLOT_NEW){ - start_noncloning_conflict(ctx, other); + /* Can append to both read_halted+new stacks of context */ + void read_halted_new_type_stacks_append(VMContext &ctx, sslot_id_t ssid){ + ctx_print_debug(ctx); + if (ssid < ctx.portion_of_FIRST_read_halt_ns){ + ctx.READ_halted_stack_new_first.append(ssid); } else { - swap_old_settled_and_new_active(ctx, other); - /* Even though ssid was registed in stack for elders, now young stack should also track this slot */ - read_halted_new_type_stacks_append(ctx, ssid); + ctx.READ_halted_stack_new_second.append(ssid); } - } else { - other = ctx.active_thread; - other.slot_occupation_status = SLOT_NEW_val; - ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val; - read_halted_new_type_stacks_append(ctx, ssid); - ctx.try_to_continue_scheduled(); } -} -void i_READ(REGEX_IS024_CONTEXT &ctx) { - ctx_print_debug(ctx); - check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ) - regex_sslot_id_t ssid = ctx.extract_sslot_id(); - do_i_read(ctx, ssid); -} - -void i_READZ(REGEX_IS024_CONTEXT &ctx) { - ctx_print_debug(ctx); - do_i_read(ctx, 0); -} - -void i_JUMP(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - check_available_prg(REGEX024_BYTECODE_NEAR_POINTER_SZ) - ctx.active_thread.IP = ctx.extract_near_pointer(); -} - -template -void i_JC(REGEX_IS024_CONTEXT& ctx) -{ - ctx_print_debug(ctx); - check_available_prg(immArgSzT::byte_sz + REGEX024_BYTECODE_NEAR_POINTER_SZ); - uint64_t imm_val_B = immArgSzT::extract(ctx); - regex_near_ptr_t dest = ctx.extract_near_pointer(); - uint64_t imm_val_A = ctx.INP; - if (conditionT::call(imm_val_A, imm_val_B)) - ctx.active_thread.IP = dest; -} - -struct condEqual{static bool call(uint64_t A, uint64_t B){return A == B;}}; -struct condLess{static bool call(uint64_t A, uint64_t B){return A < B;}}; -struct condGrtr{static bool call(uint64_t A, uint64_t B){return A > B;}}; - -struct immArgByte{ - static constexpr int byte_sz = 1; - static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_b();} -}; -struct immArgWord{ - static constexpr int byte_sz = 2; - static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_w();} -}; -struct immArgDoubleWord{ - static constexpr int byte_sz = 4; - static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_dw();} -}; -struct immArgQuadWord{ - static constexpr int byte_sz = 8; - static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_qw();} -}; - -void clone_thread_into_slot(REGEX_IS024_Thread& source, REGEX_IS024_Thread& vessel){ - thread_print_debug(source); - my_assert(!(vessel.slot_occupation_status & SLOT_OCCUPIED)); - my_assert((source.slot_occupation_status & SLOT_OCCUPIED)); - vessel = source; - if (vessel.CAHptr){ - vessel.CAHptr->refs++; + void do_i_read(VMContext &ctx, sslot_id_t ssid) { + ctx_print_debug(ctx); + general_matching_mode_check() + if (ssid >= ctx.read_slots_number) + smitsya(read_sslot_out_of_range); + Thread& other = ctx.READ_halted_slots[ssid]; + if (other.slot_occupation_status & SLOT_OCCUPIED){ + if (other.slot_occupation_status & SLOT_NEW){ + start_noncloning_conflict(ctx, other); + } else { + swap_old_settled_and_new_active(ctx, other); + /* Even though ssid was registed in stack for elders, now young stack should also track this slot */ + read_halted_new_type_stacks_append(ctx, ssid); + } + } else { + other = ctx.active_thread; + other.slot_occupation_status = SLOT_NEW_val; + ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val; + read_halted_new_type_stacks_append(ctx, ssid); + ctx.try_to_continue_scheduled(); + } } - if (vessel.SAptr){ - vessel.SAptr[0]++; + + void i_READ(VMContext &ctx) { + ctx_print_debug(ctx); + check_available_prg(BYTECODE_SSLOT_ID_SZ) + sslot_id_t ssid = ctx.extract_sslot_id(); + do_i_read(ctx, ssid); } -} -/* One FORK-slot governs the one single unique position in program: the next one after the fork */ -void i_FORK(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - general_matching_mode_check() - check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ + REGEX024_BYTECODE_NEAR_POINTER_SZ); - regex_sslot_id_t ssid = ctx.extract_sslot_id(); - regex_near_ptr_t dest = ctx.extract_near_pointer(); - if (ssid >= ctx.fork_slots_number) - smitsya(fork_sslot_out_of_range); - REGEX_IS024_Thread& other = ctx.FORK_halted_slots[ssid]; - if (other.slot_occupation_status & SLOT_OCCUPIED){ - start_cloning_conflict(ctx, other, dest); - } else { - clone_thread_into_slot(ctx.active_thread, other); - ctx.active_thread.IP = dest; - ctx.FORK_halted_stack.append(ssid); + void i_READZ(VMContext &ctx) { + ctx_print_debug(ctx); + do_i_read(ctx, 0); } -} -void i_MATCH(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - general_matching_mode_check() - if (ctx.matched_thread.slot_occupation_status & SLOT_OCCUPIED){ - start_cloning_conflict(ctx, ctx.matched_thread, ctx.active_thread.IP); - } else { - clone_thread_into_slot(ctx.active_thread, ctx.matched_thread); + void i_JUMP(VMContext& ctx){ + ctx_print_debug(ctx); + check_available_prg(BYTECODE_NEAR_POINTER_SZ) + ctx.active_thread.IP = ctx.extract_near_pointer(); } -} -void i_DIE(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - general_matching_mode_check() - ctx.active_thread.delete_thread(); - ctx.try_to_continue_scheduled(); -} + template + void i_JC(VMContext& ctx) + { + ctx_print_debug(ctx); + check_available_prg(immArgSzT::byte_sz + BYTECODE_NEAR_POINTER_SZ); + uint64_t imm_val_B = immArgSzT::extract(ctx); + near_ptr_t dest = ctx.extract_near_pointer(); + uint64_t imm_val_A = ctx.INP; + if (conditionT::call(imm_val_A, imm_val_B)) + ctx.active_thread.IP = dest; + } -void i_PARAM_READ_SS_NUMBER(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - initialization_phase_check() - check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ) - regex_sslot_id_t read_slots_number = ctx.extract_sslot_id(); - ctx.read_slots_number = read_slots_number; -} + struct condEqual{static bool call(uint64_t A, uint64_t B){return A == B;}}; + struct condLess{static bool call(uint64_t A, uint64_t B){return A < B;}}; + struct condGrtr{static bool call(uint64_t A, uint64_t B){return A > B;}}; -void i_PARAM_FORK_SS_NUMBER(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - initialization_phase_check() - check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ) - regex_sslot_id_t fork_slots_number = ctx.extract_sslot_id(); - ctx.fork_slots_number = fork_slots_number; -} + struct immArgByte{ + static constexpr int byte_sz = 1; + static uint64_t extract(VMContext& ctx){return ctx.extract_b();} + }; + struct immArgWord{ + static constexpr int byte_sz = 2; + static uint64_t extract(VMContext& ctx){return ctx.extract_w();} + }; + struct immArgDoubleWord{ + static constexpr int byte_sz = 4; + static uint64_t extract(VMContext& ctx){return ctx.extract_dw();} + }; + struct immArgQuadWord{ + static constexpr int byte_sz = 8; + static uint64_t extract(VMContext& ctx){return ctx.extract_qw();} + }; -void i_PARAM_SELARR_LEN(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - initialization_phase_check() - check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) - regex_tai_t selection_array_len = ctx.extract_track_array_index(); - ctx.selection_array_len = selection_array_len; -} + void clone_thread_into_slot(Thread& source, Thread& vessel){ + thread_print_debug(source); + my_assert(!(vessel.slot_occupation_status & SLOT_OCCUPIED)); + my_assert((source.slot_occupation_status & SLOT_OCCUPIED)); + vessel = source; + if (vessel.CAHptr){ + vessel.CAHptr->refs++; + } + if (vessel.SAptr){ + vessel.SAptr[0]++; + } + } -void i_PARAM_COLSIFTFUNC_SET(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - initialization_phase_check() - check_available_prg(REGEX024_BYTECODE_NEAR_POINTER_SZ) - regex_near_ptr_t sift_function = ctx.extract_near_pointer(); - ctx.have_sift_function = true; - ctx.sift_function = sift_function; -} + /* One FORK-slot governs the one single unique position in program: the next one after the fork */ + void i_FORK(VMContext& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(BYTECODE_SSLOT_ID_SZ + BYTECODE_NEAR_POINTER_SZ); + sslot_id_t ssid = ctx.extract_sslot_id(); + near_ptr_t dest = ctx.extract_near_pointer(); + if (ssid >= ctx.fork_slots_number) + smitsya(fork_sslot_out_of_range); + Thread& other = ctx.FORK_halted_slots[ssid]; + if (other.slot_occupation_status & SLOT_OCCUPIED){ + start_cloning_conflict(ctx, other, dest); + } else { + clone_thread_into_slot(ctx.active_thread, other); + ctx.active_thread.IP = dest; + ctx.FORK_halted_stack.append(ssid); + } + } -void i_PARAM_COLSIFTFUNC_WIPE(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - initialization_phase_check() - ctx.have_sift_function = false; -} + void i_MATCH(VMContext& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + if (ctx.matched_thread.slot_occupation_status & SLOT_OCCUPIED){ + start_cloning_conflict(ctx, ctx.matched_thread, ctx.active_thread.IP); + } else { + clone_thread_into_slot(ctx.active_thread, ctx.matched_thread); + } + } -void i_MSG_MULTISTART_ALLOWED(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - initialization_phase_check() - check_available_prg(1) - ctx.allows_multistart = (bool)ctx.extract_b(); -} - -void i_MSG_FED_INPUT_EXTENDED(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - initialization_phase_check() - check_available_prg(1 + 1 + REGEX024_BYTECODE_SSLOT_ID_SZ) - ctx.fed_input_extends_left = ctx.extract_b(); - ctx.fed_input_extends_right = ctx.extract_b(); - ctx.portion_of_second_read_halt_ns = ctx.extract_sslot_id(); -} - -uint64_t get_el_from_selarr(uint64_t* sa, regex_near_ptr_t ind){ - return sa ? sa[1UL + ind] : 0; -} - -void i_DMOV_RABX_SELARR(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - sift_mode_check() - check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) - regex_tai_t i1 = ctx.extract_track_array_index(); - if (i1 >= ctx.selection_array_len) - smitsya(selection_arr_out_of_range); - ctx.RAX = get_el_from_selarr(ctx.active_thread.SAptr, i1); - ctx.RBX = get_el_from_selarr(ctx.sifting_with->SAptr, i1); -} - -uint64_t get_selarr_el_dist(uint64_t* sa, uint16_t start, uint16_t end){ - uint64_t v_start = get_el_from_selarr(sa, start); - uint64_t v_end = get_el_from_selarr(sa, end); - return v_end > v_start ? v_end - v_start : 0; -} - -void i_DDIST_RABX_SELARR(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - sift_mode_check() - check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ * 2) - regex_tai_t i_start = ctx.extract_track_array_index(); - if (i_start >= ctx.selection_array_len) - smitsya(selection_arr_out_of_range); - regex_tai_t i_end = ctx.extract_track_array_index(); - if (i_end >= ctx.selection_array_len) - smitsya(selection_arr_out_of_range); - ctx.RAX = get_selarr_el_dist(ctx.active_thread.SAptr, i_start, i_end); - ctx.RBX = get_selarr_el_dist(ctx.sifting_with->SAptr, i_start, i_end); -} - -void finish_conflict_homesteader_wins(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - if (ctx.who_started_sift == regex024_opcodes::READ){ + void i_DIE(VMContext& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() ctx.active_thread.delete_thread(); ctx.try_to_continue_scheduled(); - } else { - /* FORK or MATCH (which will also be shown as FORK) */ - /* Cloning conflict ends, active_thread jumps to offsprings IP */ - ctx.active_thread.IP = ctx.child_ret_IP; } - ctx.sifting_with = NULL; -} -void finish_conflict_intruder_wins(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - ctx.sifting_with->delete_thread(); - ctx.active_thread.IP = ctx.intruder_IP; - if (ctx.who_started_sift == regex024_opcodes::READ){ - /* noncloning conflict won by intruder+ */ - *ctx.sifting_with = ctx.active_thread; - ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val; - ctx.try_to_continue_scheduled(); - } else { - /* End of cloning conflict (it involved cloning) */ - clone_thread_into_slot(ctx.active_thread, *ctx.sifting_with); - ctx.active_thread.IP = ctx.child_ret_IP; + void i_PARAM_READ_SS_NUMBER(VMContext& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(BYTECODE_SSLOT_ID_SZ) + sslot_id_t read_slots_number = ctx.extract_sslot_id(); + ctx.read_slots_number = read_slots_number; } - ctx.sifting_with = NULL; -} -void i_SIFTPRIOR_MIN_RABX(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - sift_mode_check() - if (ctx.RAX < ctx.RBX){ - finish_conflict_intruder_wins(ctx); - } else if (ctx.RAX > ctx.RBX){ + void i_PARAM_FORK_SS_NUMBER(VMContext& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(BYTECODE_SSLOT_ID_SZ) + sslot_id_t fork_slots_number = ctx.extract_sslot_id(); + ctx.fork_slots_number = fork_slots_number; + } + + void i_PARAM_SELARR_LEN(VMContext& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) + tai_t selection_array_len = ctx.extract_track_array_index(); + ctx.selection_array_len = selection_array_len; + } + + void i_PARAM_COLSIFTFUNC_SET(VMContext& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(BYTECODE_NEAR_POINTER_SZ) + near_ptr_t sift_function = ctx.extract_near_pointer(); + ctx.have_sift_function = true; + ctx.sift_function = sift_function; + } + + void i_PARAM_COLSIFTFUNC_WIPE(VMContext& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + ctx.have_sift_function = false; + } + + void i_MSG_MULTISTART_ALLOWED(VMContext& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(1) + ctx.allows_multistart = (bool)ctx.extract_b(); + } + + void i_MSG_FED_INPUT_EXTENDED(VMContext& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(1 + 1 + BYTECODE_SSLOT_ID_SZ) + ctx.fed_input_extends_left = ctx.extract_b(); + ctx.fed_input_extends_right = ctx.extract_b(); + ctx.portion_of_second_read_halt_ns = ctx.extract_sslot_id(); + } + + uint64_t get_el_from_selarr(uint64_t* sa, near_ptr_t ind){ + return sa ? sa[1UL + ind] : 0; + } + + void i_DMOV_RABX_SELARR(VMContext& ctx){ + ctx_print_debug(ctx); + sift_mode_check() + check_available_prg(BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) + tai_t i1 = ctx.extract_track_array_index(); + if (i1 >= ctx.selection_array_len) + smitsya(selection_arr_out_of_range); + ctx.RAX = get_el_from_selarr(ctx.active_thread.SAptr, i1); + ctx.RBX = get_el_from_selarr(ctx.sifting_with->SAptr, i1); + } + + uint64_t get_selarr_el_dist(uint64_t* sa, uint16_t start, uint16_t end){ + uint64_t v_start = get_el_from_selarr(sa, start); + uint64_t v_end = get_el_from_selarr(sa, end); + return v_end > v_start ? v_end - v_start : 0; + } + + void i_DDIST_RABX_SELARR(VMContext& ctx){ + ctx_print_debug(ctx); + sift_mode_check() + check_available_prg(BYTECODE_TRACK_ARRAY_INDEX_ID_SZ * 2) + tai_t i_start = ctx.extract_track_array_index(); + if (i_start >= ctx.selection_array_len) + smitsya(selection_arr_out_of_range); + tai_t i_end = ctx.extract_track_array_index(); + if (i_end >= ctx.selection_array_len) + smitsya(selection_arr_out_of_range); + ctx.RAX = get_selarr_el_dist(ctx.active_thread.SAptr, i_start, i_end); + ctx.RBX = get_selarr_el_dist(ctx.sifting_with->SAptr, i_start, i_end); + } + + void finish_conflict_homesteader_wins(VMContext& ctx){ + ctx_print_debug(ctx); + if (ctx.who_started_sift == opcodes::READ){ + ctx.active_thread.delete_thread(); + ctx.try_to_continue_scheduled(); + } else { + /* FORK or MATCH (which will also be shown as FORK) */ + /* Cloning conflict ends, active_thread jumps to offsprings IP */ + ctx.active_thread.IP = ctx.child_ret_IP; + } + ctx.sifting_with = NULL; + } + + void finish_conflict_intruder_wins(VMContext& ctx){ + ctx_print_debug(ctx); + ctx.sifting_with->delete_thread(); + ctx.active_thread.IP = ctx.intruder_IP; + if (ctx.who_started_sift == opcodes::READ){ + /* noncloning conflict won by intruder+ */ + *ctx.sifting_with = ctx.active_thread; + ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val; + ctx.try_to_continue_scheduled(); + } else { + /* End of cloning conflict (it involved cloning) */ + clone_thread_into_slot(ctx.active_thread, *ctx.sifting_with); + ctx.active_thread.IP = ctx.child_ret_IP; + } + ctx.sifting_with = NULL; + } + + void i_SIFTPRIOR_MIN_RABX(VMContext& ctx){ + ctx_print_debug(ctx); + sift_mode_check() + if (ctx.RAX < ctx.RBX){ + finish_conflict_intruder_wins(ctx); + } else if (ctx.RAX > ctx.RBX){ + finish_conflict_homesteader_wins(ctx); + } + } + + void i_SIFTPRIOR_MAX_RABX(VMContext& ctx){ + ctx_print_debug(ctx); + sift_mode_check() + if (ctx.RAX > ctx.RBX){ + finish_conflict_intruder_wins(ctx); + } else if (ctx.RAX < ctx.RBX){ + finish_conflict_homesteader_wins(ctx); + } + } + + void i_SIFT_DONE(VMContext& ctx){ + ctx_print_debug(ctx); + sift_mode_check() finish_conflict_homesteader_wins(ctx); } -} -void i_SIFTPRIOR_MAX_RABX(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - sift_mode_check() - if (ctx.RAX > ctx.RBX){ - finish_conflict_intruder_wins(ctx); - } else if (ctx.RAX < ctx.RBX){ - finish_conflict_homesteader_wins(ctx); - } -} - -void i_SIFT_DONE(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - sift_mode_check() - finish_conflict_homesteader_wins(ctx); -} - -/* Can give errors */ -void ca_branch_new_node(REGEX_IS024_CONTEXT& ctx, regex_tai_t key, uint64_t val){ - ctx_print_debug(ctx); - if (ctx.CAN_total >= ctx.CA_TREE_LIMIT) - smitsya(ca_tree_limit_violation); - REGEX024_CollectionArrayNode* node = new REGEX024_CollectionArrayNode{key, val, ctx.active_thread.CAHptr, 1}; - // if (ctx.active_thread.CAHptr) + /* Can give errors */ + void ca_branch_new_node(VMContext& ctx, tai_t key, uint64_t val){ + ctx_print_debug(ctx); + if (ctx.CAN_total >= ctx.CA_TREE_LIMIT) + smitsya(ca_tree_limit_violation); + CollectionArrayNode* node = new CollectionArrayNode{key, val, ctx.active_thread.CAHptr, 1}; + // if (ctx.active_thread.CAHptr) // (ctx.active_thread.CAHptr->refs)++; - ctx.active_thread.CAHptr = node; - ctx.CAN_total++; -} - -void i_MOV_COLARR_IMM(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - general_matching_mode_check() - check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8) - regex_tai_t ca_ind = ctx.extract_track_array_index(); - uint64_t imm = ctx.extract_qw(); - ca_branch_new_node(ctx, ca_ind, imm); -} - -void i_MOV_COLARR_BTPOS(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - general_matching_mode_check() - check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) - regex_tai_t ca_ind = ctx.extract_track_array_index(); - ca_branch_new_node(ctx, ca_ind, ctx.passed_bytes); -} - -/* Can throw error, should be placed at the end. Call ONLY in general matching mode */ -void edit_selection_array(REGEX_IS024_CONTEXT& ctx, uint64_t key, uint64_t val){ - ctx_print_debug(ctx); - uint64_t N = ctx.selection_array_len; - if (key >= N) - smitsya(selection_arr_out_of_range); - if (!ctx.active_thread.SAptr){ - uint64_t* sa_instance = (uint64_t*)calloc(N + 1, 8); - if (!sa_instance) - throw std::bad_alloc(); - sa_instance[0] = 1; - sa_instance[key + 1] = val; - ctx.active_thread.SAptr = sa_instance; - } else if (ctx.active_thread.SAptr[0] == 1){ - ctx.active_thread.SAptr[key + 1] = val; - } else { - uint64_t* sa_instance = (uint64_t*)calloc(N + 1, 8); - if (!sa_instance) - throw std::bad_alloc(); - sa_instance[0] = 1; - for (uint64_t i = 1; i <= ctx.selection_array_len; i++) - sa_instance[i] = ctx.active_thread.SAptr[i]; - sa_instance[key + 1] = val; - ctx.active_thread.SAptr[0]--; - ctx.active_thread.SAptr = sa_instance; + ctx.active_thread.CAHptr = node; + ctx.CAN_total++; } -} -void i_MOV_SELARR_IMM(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - general_matching_mode_check() - check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8) - regex_tai_t sa_ind = ctx.extract_track_array_index(); - uint64_t imm = ctx.extract_qw(); - edit_selection_array(ctx, sa_ind, imm); -} - -void i_MOV_SELARR_CHPOS(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - general_matching_mode_check() - check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) - regex_tai_t sa_ind = ctx.extract_track_array_index(); - edit_selection_array(ctx, sa_ind, ctx.passed_chars); -} - -void calloc_stack_slots(REGEX_IS024_Stack& stack, regex_sslot_id_t nmemb) { - assert(stack.sz == 0 && !stack.slots); - regex_sslot_id_t* storage = static_cast(calloc(nmemb, sizeof(regex_sslot_id_t))); - if (!storage) - throw std::bad_alloc(); - stack.slots = storage; -} - -REGEX_IS024_Thread* calloc_slots_array(regex_sslot_id_t nmemb) { - REGEX_IS024_Thread* ptr = static_cast(calloc(nmemb, sizeof(REGEX_IS024_Thread))); - if (!ptr) - throw std::bad_alloc(); - return ptr; -} - -void i_INIT(REGEX_IS024_CONTEXT& ctx){ - ctx_print_debug(ctx); - initialization_phase_check() - if (ctx.selection_array_len > ctx.SA_LEN_LIMIT) - smitsya(sa_length_limit_violation); - if (ctx.read_slots_number > ctx.READ_SS_LIMIT) - smitsya(read_sslot_count_limit_violation); - if (ctx.fork_slots_number > ctx.FORK_SS_LIMIT) - smitsya(fork_sslot_count_limit_violation); - if (ctx.portion_of_second_read_halt_ns > ctx.read_slots_number) - smitsya(fork_sslot_out_of_range); - ctx.READ_halted_slots = calloc_slots_array(ctx.read_slots_number); - calloc_stack_slots(ctx.READ_halted_stack_old, ctx.read_slots_number); - - ctx.portion_of_FIRST_read_halt_ns = ctx.read_slots_number - ctx.portion_of_second_read_halt_ns; - calloc_stack_slots(ctx.READ_halted_stack_new_first, ctx.portion_of_FIRST_read_halt_ns); - calloc_stack_slots(ctx.READ_halted_stack_new_second, ctx.portion_of_second_read_halt_ns); - - ctx.FORK_halted_slots = calloc_slots_array(ctx.fork_slots_number); - calloc_stack_slots(ctx.FORK_halted_stack, ctx.fork_slots_number); - - ctx.initialized = true; - ctx.unnatural_started_thread_IP = ctx.active_thread.IP; - ctx.active_thread.delete_thread(); -} - -void i_THROW(REGEX_IS024_CONTEXT& ctx){ - ctx.error = regex024_error_codes::program_throw; -} - -void instruction_table(REGEX_IS024_CONTEXT &ctx) { - ctx_print_debug(ctx); - uint8_t opcode = ctx.extract_instruction(); - -#define rcase(inst) case regex024_opcodes::inst: return i_ ## inst (ctx); -#define jumpC(UN, st) case regex024_opcodes::JC ## UN ## _B: return i_JC(ctx); \ - case regex024_opcodes::JC ## UN ## _W: return i_JC(ctx); \ - case regex024_opcodes::JC ## UN ## _DW: return i_JC(ctx); \ - case regex024_opcodes::JC ## UN ## _QW: return i_JC(ctx); - switch (opcode) { - rcase(READ) - rcase(READZ) - rcase(JUMP) - - jumpC(EQUAL, condEqual) - jumpC(LESS, condLess) - jumpC(GRTR, condGrtr) - - rcase(FORK) - rcase(MATCH) - rcase(DIE) - rcase(PARAM_READ_SS_NUMBER) - rcase(PARAM_FORK_SS_NUMBER) - rcase(PARAM_SELARR_LEN) - rcase(PARAM_COLSIFTFUNC_SET) - rcase(PARAM_COLSIFTFUNC_WIPE) - rcase(MSG_MULTISTART_ALLOWED) - rcase(MSG_FED_INPUT_EXTENDED) - rcase(DMOV_RABX_SELARR) - rcase(DDIST_RABX_SELARR) - rcase(SIFTPRIOR_MIN_RABX) - rcase(SIFTPRIOR_MAX_RABX) - rcase(SIFT_DONE) - rcase(MOV_COLARR_IMM) - rcase(MOV_COLARR_BTPOS) - rcase(MOV_SELARR_IMM) - rcase(MOV_SELARR_CHPOS) - rcase(INIT) - rcase(THROW) - default: - ctx.error = regex024_error_codes::invalid_opcode; + void i_MOV_COLARR_IMM(VMContext& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8) + tai_t ca_ind = ctx.extract_track_array_index(); + uint64_t imm = ctx.extract_qw(); + ca_branch_new_node(ctx, ca_ind, imm); } -} + + void i_MOV_COLARR_BTPOS(VMContext& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) + tai_t ca_ind = ctx.extract_track_array_index(); + ca_branch_new_node(ctx, ca_ind, ctx.passed_bytes); + } + + /* Can throw error, should be placed at the end. Call ONLY in general matching mode */ + void edit_selection_array(VMContext& ctx, uint64_t key, uint64_t val){ + ctx_print_debug(ctx); + uint64_t N = ctx.selection_array_len; + if (key >= N) + smitsya(selection_arr_out_of_range); + if (!ctx.active_thread.SAptr){ + uint64_t* sa_instance = (uint64_t*)calloc(N + 1, 8); + if (!sa_instance) + throw std::bad_alloc(); + sa_instance[0] = 1; + sa_instance[key + 1] = val; + ctx.active_thread.SAptr = sa_instance; + } else if (ctx.active_thread.SAptr[0] == 1){ + ctx.active_thread.SAptr[key + 1] = val; + } else { + uint64_t* sa_instance = (uint64_t*)calloc(N + 1, 8); + if (!sa_instance) + throw std::bad_alloc(); + sa_instance[0] = 1; + for (uint64_t i = 1; i <= ctx.selection_array_len; i++) + sa_instance[i] = ctx.active_thread.SAptr[i]; + sa_instance[key + 1] = val; + ctx.active_thread.SAptr[0]--; + ctx.active_thread.SAptr = sa_instance; + } + } + + void i_MOV_SELARR_IMM(VMContext& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8) + tai_t sa_ind = ctx.extract_track_array_index(); + uint64_t imm = ctx.extract_qw(); + edit_selection_array(ctx, sa_ind, imm); + } + + void i_MOV_SELARR_CHPOS(VMContext& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) + tai_t sa_ind = ctx.extract_track_array_index(); + edit_selection_array(ctx, sa_ind, ctx.passed_chars); + } + + void calloc_stack_slots(SSID_Stack& stack, sslot_id_t nmemb) { + assert(stack.max_size == 0 && stack.sz == 0 && !stack.slots); + sslot_id_t* storage = static_cast(calloc(nmemb, sizeof(sslot_id_t))); + if (!storage) + throw std::bad_alloc(); + stack.slots = storage; + stack.max_size = nmemb; + } + + Thread* calloc_slots_array(sslot_id_t nmemb) { + Thread* ptr = static_cast(calloc(nmemb, sizeof(Thread))); + if (!ptr) + throw std::bad_alloc(); + return ptr; + } + + void i_INIT(VMContext& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + if (ctx.selection_array_len > ctx.SA_LEN_LIMIT) + smitsya(sa_length_limit_violation); + if (ctx.read_slots_number > ctx.READ_SS_LIMIT) + smitsya(read_sslot_count_limit_violation); + if (ctx.fork_slots_number > ctx.FORK_SS_LIMIT) + smitsya(fork_sslot_count_limit_violation); + if (ctx.portion_of_second_read_halt_ns > ctx.read_slots_number) + smitsya(fork_sslot_out_of_range); + ctx.READ_halted_slots = calloc_slots_array(ctx.read_slots_number); + calloc_stack_slots(ctx.READ_halted_stack_old, ctx.read_slots_number); + + ctx.portion_of_FIRST_read_halt_ns = ctx.read_slots_number - ctx.portion_of_second_read_halt_ns; + calloc_stack_slots(ctx.READ_halted_stack_new_first, ctx.portion_of_FIRST_read_halt_ns); + calloc_stack_slots(ctx.READ_halted_stack_new_second, ctx.portion_of_second_read_halt_ns); + + ctx.FORK_halted_slots = calloc_slots_array(ctx.fork_slots_number); + calloc_stack_slots(ctx.FORK_halted_stack, ctx.fork_slots_number); + + ctx.initialized = true; + ctx.unnatural_started_thread_IP = ctx.active_thread.IP; + ctx.active_thread.delete_thread(); + } + + void i_THROW(VMContext& ctx){ + ctx.error = error_codes::program_throw; + } + + void instruction_table(VMContext &ctx) { + ctx_print_debug(ctx); + uint8_t opcode = ctx.extract_instruction(); + +#define rcase(inst) case opcodes::inst: return i_ ## inst (ctx); +#define jumpC(UN, st) case opcodes::JC ## UN ## _B: return i_JC(ctx); \ + case opcodes::JC ## UN ## _W: return i_JC(ctx); \ + case opcodes::JC ## UN ## _DW: return i_JC(ctx); \ + case opcodes::JC ## UN ## _QW: return i_JC(ctx); + switch (opcode) { + rcase(READ) + rcase(READZ) + rcase(JUMP) + + jumpC(EQUAL, condEqual) + jumpC(LESS, condLess) + jumpC(GRTR, condGrtr) + + rcase(FORK) + rcase(MATCH) + rcase(DIE) + rcase(PARAM_READ_SS_NUMBER) + rcase(PARAM_FORK_SS_NUMBER) + rcase(PARAM_SELARR_LEN) + rcase(PARAM_COLSIFTFUNC_SET) + rcase(PARAM_COLSIFTFUNC_WIPE) + rcase(MSG_MULTISTART_ALLOWED) + rcase(MSG_FED_INPUT_EXTENDED) + rcase(DMOV_RABX_SELARR) + rcase(DDIST_RABX_SELARR) + rcase(SIFTPRIOR_MIN_RABX) + rcase(SIFTPRIOR_MAX_RABX) + rcase(SIFT_DONE) + rcase(MOV_COLARR_IMM) + rcase(MOV_COLARR_BTPOS) + rcase(MOV_SELARR_IMM) + rcase(MOV_SELARR_CHPOS) + rcase(INIT) + rcase(THROW) + default: + ctx.error = error_codes::invalid_opcode; + } + } +} \ No newline at end of file diff --git a/src/libregexis024vm/instruction_implementation.h b/src/libregexis024vm/instruction_implementation.h index 50ee2b4..36341ca 100644 --- a/src/libregexis024vm/instruction_implementation.h +++ b/src/libregexis024vm/instruction_implementation.h @@ -7,7 +7,7 @@ #include #include -#define smitsya(error_type) do {ctx.error = regex024_error_codes::error_type; return; } while (0) +#define smitsya(error_type) do {ctx.error = error_codes::error_type; return; } while (0) #define SLOT_EMPTY_val 0 #define SLOT_OCCUPIED 1 @@ -16,7 +16,7 @@ #define SLOT_NEW_val (SLOT_OCCUPIED | SLOT_NEW) #define check_available_prg(regionSz) if (!ctx.check_inboundness(regionSz)){ \ - ctx.error = regex024_error_codes::improper_finish; return; } + ctx.error = error_codes::improper_finish; return; } #if defined(LIBREGEXIS024_DEBUG) && defined(LIBREGEXIS024_ALLOW_LOUD) @@ -30,6 +30,8 @@ #define thread_print_debug(thread) #endif -void instruction_table(REGEX_IS024_CONTEXT& ctx); +namespace regexis024 { + void instruction_table(VMContext& ctx); +} #endif //LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H \ No newline at end of file diff --git a/src/libregexis024vm/libregex024opcodes_stringification.cpp b/src/libregexis024vm/libregex024opcodes_stringification.cpp index ce0e330..0ac869f 100644 --- a/src/libregexis024vm/libregex024opcodes_stringification.cpp +++ b/src/libregexis024vm/libregex024opcodes_stringification.cpp @@ -1,47 +1,48 @@ #include #include -#define rcase(name) case regex024_opcodes::name: return #name; - -const char *regex024_opcode_tostr(regex024_opcode x) { - switch (x) { - rcase(READ) - rcase(READZ) - rcase(JUMP) - rcase(JCEQUAL_B) - rcase(JCEQUAL_W) - rcase(JCEQUAL_DW) - rcase(JCEQUAL_QW) - rcase(JCLESS_B) - rcase(JCLESS_W) - rcase(JCLESS_DW) - rcase(JCLESS_QW) - rcase(JCGRTR_B) - rcase(JCGRTR_W) - rcase(JCGRTR_DW) - rcase(JCGRTR_QW) - rcase(FORK) - rcase(MATCH) - rcase(DIE) - rcase(PARAM_READ_SS_NUMBER) - rcase(PARAM_FORK_SS_NUMBER) - rcase(PARAM_SELARR_LEN) - rcase(PARAM_COLSIFTFUNC_SET) - rcase(PARAM_COLSIFTFUNC_WIPE) - rcase(MSG_MULTISTART_ALLOWED) - rcase(MSG_FED_INPUT_EXTENDED) - rcase(DMOV_RABX_SELARR) - rcase(DDIST_RABX_SELARR) - rcase(SIFTPRIOR_MIN_RABX) - rcase(SIFTPRIOR_MAX_RABX) - rcase(SIFT_DONE) - rcase(MOV_COLARR_IMM) - rcase(MOV_COLARR_BTPOS) - rcase(MOV_SELARR_IMM) - rcase(MOV_SELARR_CHPOS) - rcase(INIT) - rcase(THROW) - default: - return "Invalid opcode"; +namespace regexis024 { + const char *opcode_to_str(opcode_t x) { + switch (x) { +#define rcase(name) case opcodes::name: return #name; + rcase(READ) + rcase(READZ) + rcase(JUMP) + rcase(JCEQUAL_B) + rcase(JCEQUAL_W) + rcase(JCEQUAL_DW) + rcase(JCEQUAL_QW) + rcase(JCLESS_B) + rcase(JCLESS_W) + rcase(JCLESS_DW) + rcase(JCLESS_QW) + rcase(JCGRTR_B) + rcase(JCGRTR_W) + rcase(JCGRTR_DW) + rcase(JCGRTR_QW) + rcase(FORK) + rcase(MATCH) + rcase(DIE) + rcase(PARAM_READ_SS_NUMBER) + rcase(PARAM_FORK_SS_NUMBER) + rcase(PARAM_SELARR_LEN) + rcase(PARAM_COLSIFTFUNC_SET) + rcase(PARAM_COLSIFTFUNC_WIPE) + rcase(MSG_MULTISTART_ALLOWED) + rcase(MSG_FED_INPUT_EXTENDED) + rcase(DMOV_RABX_SELARR) + rcase(DDIST_RABX_SELARR) + rcase(SIFTPRIOR_MIN_RABX) + rcase(SIFTPRIOR_MAX_RABX) + rcase(SIFT_DONE) + rcase(MOV_COLARR_IMM) + rcase(MOV_COLARR_BTPOS) + rcase(MOV_SELARR_IMM) + rcase(MOV_SELARR_CHPOS) + rcase(INIT) + rcase(THROW) + default: + return "Invalid opcode"; + } } } diff --git a/src/libregexis024vm/libregexis024vm.h b/src/libregexis024vm/libregexis024vm.h index 6a33aa8..b66f7dd 100644 --- a/src/libregexis024vm/libregexis024vm.h +++ b/src/libregexis024vm/libregexis024vm.h @@ -2,13 +2,13 @@ #define LIBREGEXIS024_LIBREGEXIS024VM_H /* This thing is bloated. And slow (Because I designed it imperfectly and because it is bloated). - * I could have halven the amount of bloat, but that would require me writing code in headers. - * I am gonna use it for KM, even more bloated project. So I thought that this design is on the spot. - * C++ is such a funny language. Code is divided into .cpp and .h files. But it only makes problems. - * All of my work on this C++ project was not serious from the beginning. It's all funny stuff. */ +* I could have halven the amount of bloat, but that would require me writing code in headers. +* I am gonna use it for KM, even more bloated project. So I thought that this design is on the spot. +* C++ is such a funny language. Code is divided into .cpp and .h files. But it only makes problems. +* All of my work on this C++ project was not serious from the beginning. It's all funny stuff. */ /* Also, please, consider using libregexis024vm/libregexis024vm_interface.h - * Naming in this project is super inconsistent. I don't want it to trash your namespace */ +* Naming in this project is super inconsistent. I don't want it to trash your namespace */ #include #include @@ -16,143 +16,144 @@ #include #include -struct REGEX_IS024_Stack{ - regex_sslot_id_t* slots = NULL; - regex_sslot_id_t sz = 0; +namespace regexis024 { + struct SSID_Stack{ + sslot_id_t* slots = NULL; + sslot_id_t max_size = 0; + sslot_id_t sz = 0; - regex_sslot_id_t pop(); - void append(regex_sslot_id_t x); - bool empty() const; - bool non_empty() const; + sslot_id_t pop(); + void append(sslot_id_t x); + bool empty() const; - REGEX_IS024_Stack(const REGEX_IS024_Stack&) = delete; - REGEX_IS024_Stack& operator=(const REGEX_IS024_Stack&) = delete; - REGEX_IS024_Stack() = default; + SSID_Stack(const SSID_Stack&) = delete; + SSID_Stack& operator=(const SSID_Stack&) = delete; + SSID_Stack() = default; - ~REGEX_IS024_Stack(); -}; + ~SSID_Stack(); + }; -struct REGEX024_CollectionArrayNode{ - /* Key is small for historical reasons I do not rememeber. Who cares anyway */ - regex_tai_t key; - uint64_t value; - /* NULL at the beginning */ - REGEX024_CollectionArrayNode* prev; - /* Reference counting */ - uint64_t refs = 0; -}; + struct CollectionArrayNode{ + /* Key is small for historical reasons I do not rememeber. Who cares anyway */ + tai_t key; + uint64_t value; + /* NULL at the beginning */ + CollectionArrayNode* prev; + /* Reference counting */ + uint64_t refs = 0; + }; -struct REGEX_IS024_Thread{ - /* First byte field is used only when thread is located in slot */ - uint8_t slot_occupation_status = 0; - regex_near_ptr_t IP = 0; - REGEX024_CollectionArrayNode* CAHptr = NULL; - /* Pointer to the seletion array. SA's are reference counted. Because of that every SA - * is elongated by one meta element in the beginning - reference counter. So the actual elements - * are enumerated starting from one. */ - uint64_t* SAptr = NULL; + struct Thread{ + /* First byte field is used only when thread is located in slot */ + uint8_t slot_occupation_status = 0; + near_ptr_t IP = 0; + CollectionArrayNode* CAHptr = NULL; + /* Pointer to the seletion array. SA's are reference counted. Because of that every SA + * is elongated by one meta element in the beginning - reference counter. So the actual elements + * are enumerated starting from one. */ + uint64_t* SAptr = NULL; - void delete_thread() noexcept; - void debug_print(const char* place); -}; + void delete_thread() noexcept; + void debug_print(const char* place); + }; -struct REGEX_IS024_CONTEXT{ - REGEX_IS024_CONTEXT(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, regex_tai_t saLenLimit, - regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit, uint64_t timeTickLimit); + struct VMContext{ + VMContext(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, tai_t saLenLimit, + sslot_id_t readSsLimit, sslot_id_t forkSsLimit, uint64_t timeTickLimit); - regex024_error_code feedSOF(); - /* You can safely pile up calls to this command, nothing bad will happen */ - regex024_error_code startThread(); - regex024_error_code extendedFeedCharacter(uint64_t input); - regex024_error_code feedCharacter(uint64_t INP, uint64_t corresponding_byte_amount); + error_code_t feedSOF(); + /* You can safely pile up calls to this command, nothing bad will happen */ + error_code_t startThread(); + error_code_t extendedFeedCharacter(uint64_t input); + error_code_t feedCharacter(uint64_t INP, uint64_t corresponding_byte_amount); - ~REGEX_IS024_CONTEXT(); + ~VMContext(); - /* Program size larger than 2^62 is forbidden */ - size_t program_size = 0; - const uint8_t* prg = NULL; + /* Program size larger than 2^62 is forbidden */ + size_t program_size = 0; + const uint8_t* prg = NULL; - /* Max allowed index of CA is 2^16 - 1 - * Max allowed index of SA is 2^16 - 1. VM can be configured to allow even less */ - /* CA = Collecton array. */ - uint64_t CA_TREE_LIMIT; - /* SA = Selection array */ - regex_tai_t SA_LEN_LIMIT; - regex_sslot_id_t READ_SS_LIMIT; - regex_sslot_id_t FORK_SS_LIMIT; + /* Max allowed index of CA is 2^16 - 1 + * Max allowed index of SA is 2^16 - 1. VM can be configured to allow even less */ + /* CA = Collecton array. */ + uint64_t CA_TREE_LIMIT; + /* SA = Selection array */ + tai_t SA_LEN_LIMIT; + sslot_id_t READ_SS_LIMIT; + sslot_id_t FORK_SS_LIMIT; - /* If time_tick_limit is non-zero, regex virtual machine will stop with error - * after this many ticks. This parameter set's the timeout.*/ - uint64_t time_tick_limit; + /* If time_tick_limit is non-zero, regex virtual machine will stop with error + * after this many ticks. This parameter set's the timeout.*/ + uint64_t time_tick_limit; - /* This context is used only for one FA match session. This field measures each tick - * timer <= time_tick_limit */ - uint64_t timer = 0; - /* CAN_total <= CA_TREE_LIMIT */ - uint64_t CAN_total = 0; + /* This context is used only for one FA match session. This field measures each tick + * timer <= time_tick_limit */ + uint64_t timer = 0; + /* CAN_total <= CA_TREE_LIMIT */ + uint64_t CAN_total = 0; - /* Program selects it */ - regex_tai_t selection_array_len = 0; - regex_sslot_id_t read_slots_number = 0; - regex_sslot_id_t fork_slots_number = 0; + /* Program selects it */ + tai_t selection_array_len = 0; + sslot_id_t read_slots_number = 0; + sslot_id_t fork_slots_number = 0; - bool have_sift_function = false; - regex_near_ptr_t sift_function; + bool have_sift_function = false; + near_ptr_t sift_function; - bool allows_multistart = false; - uint8_t fed_input_extends_left = 0, fed_input_extends_right = 0; - regex_sslot_id_t portion_of_second_read_halt_ns = 0, portion_of_FIRST_read_halt_ns = 0; + bool allows_multistart = false; + uint8_t fed_input_extends_left = 0, fed_input_extends_right = 0; + sslot_id_t portion_of_second_read_halt_ns = 0, portion_of_FIRST_read_halt_ns = 0; - bool initialized = false; - regex_near_ptr_t unnatural_started_thread_IP = 1337; - regex024_error_code error = regex024_error_codes::stable; + bool initialized = false; + near_ptr_t unnatural_started_thread_IP = 1337; + error_code_t error = error_codes::stable; - REGEX_IS024_Thread* READ_halted_slots; - REGEX_IS024_Stack READ_halted_stack_old; - REGEX_IS024_Stack READ_halted_stack_new_first; - REGEX_IS024_Stack READ_halted_stack_new_second; - REGEX_IS024_Thread* FORK_halted_slots; - REGEX_IS024_Stack FORK_halted_stack; + Thread* READ_halted_slots; + SSID_Stack READ_halted_stack_old; + SSID_Stack READ_halted_stack_new_first; + SSID_Stack READ_halted_stack_new_second; + Thread* FORK_halted_slots; + SSID_Stack FORK_halted_stack; - REGEX_IS024_Thread active_thread; + Thread active_thread; - /* Environment for sifting stuff */ - REGEX_IS024_Thread* sifting_with = NULL; - /* specifies the type of operation vm should do after shift (there are only two distinct options) */ - uint8_t who_started_sift; - /* Sifting process uses IP field of active thread. Other data of thread is not modified or used during collision - * procudure. Old IP is stored there, if needed */ - regex_near_ptr_t child_ret_IP; - regex_near_ptr_t intruder_IP; - /* RAX corresponds to intruder. Its data is stored in active thread field*/ - uint64_t RAX; - /* RBX corresponds to homesteader. Its data is accessible by `REGEX_IS024_Thread* sifting_with` pointer*/ - uint64_t RBX; + /* Environment for sifting stuff */ + Thread* sifting_with = NULL; + /* specifies the type of operation vm should do after shift (there are only two distinct options) */ + uint8_t who_started_sift; + /* Sifting process uses IP field of active thread. Other data of thread is not modified or used during collision + * procudure. Old IP is stored there, if needed */ + near_ptr_t child_ret_IP; + near_ptr_t intruder_IP; + /* RAX corresponds to intruder. Its data is stored in active thread field*/ + uint64_t RAX; + /* RBX corresponds to homesteader. Its data is accessible by `REGEX_IS024_Thread* sifting_with` pointer*/ + uint64_t RBX; - /* Will be unoccupied if no threads matched. After each feed of character this field will be wiped - * User should take care of intermediate success himself */ - REGEX_IS024_Thread matched_thread; + /* Will be unoccupied if no threads matched. After each feed of character this field will be wiped + * User should take care of intermediate success himself */ + Thread matched_thread; - uint64_t INP = 0; - uint64_t passed_chars = 0; - uint64_t passed_bytes = 0; + uint64_t INP = 0; + uint64_t passed_chars = 0; + uint64_t passed_bytes = 0; - void try_to_continue_scheduled(); + void try_to_continue_scheduled(); - bool check_inboundness(int region); + bool check_inboundness(int region); - uint8_t extract_b(); - uint16_t extract_w(); - uint32_t extract_dw(); - uint64_t extract_qw(); + uint8_t extract_b(); + uint16_t extract_w(); + uint32_t extract_dw(); + uint64_t extract_qw(); - uint8_t extract_instruction(); - regex_sslot_id_t extract_sslot_id(); - regex_near_ptr_t extract_near_pointer(); - regex_tai_t extract_track_array_index(); - - void debug_print(const char* place); -}; + uint8_t extract_instruction(); + sslot_id_t extract_sslot_id(); + near_ptr_t extract_near_pointer(); + tai_t extract_track_array_index(); + void debug_print(const char* place); + }; +} #endif //LIBREGEXIS024_LIBREGEXIS024VM_H diff --git a/src/libregexis024vm/libregexis024vm_context.cpp b/src/libregexis024vm/libregexis024vm_context.cpp index 81c5e41..c289e16 100644 --- a/src/libregexis024vm/libregexis024vm_context.cpp +++ b/src/libregexis024vm/libregexis024vm_context.cpp @@ -1,197 +1,189 @@ +#include #include #include #include -regex_sslot_id_t REGEX_IS024_Stack::pop() { - assert(sz != 0); - return slots[--sz]; -} - -void REGEX_IS024_Stack::append(regex_sslot_id_t x) { - assert(slots); - slots[sz] = x; - sz++; -} - -bool REGEX_IS024_Stack::empty() const { - return !non_empty(); -} - -bool REGEX_IS024_Stack::non_empty() const { - return sz; -} - -REGEX_IS024_Stack::~REGEX_IS024_Stack() { - assert(empty()); - free(slots); -} - -REGEX_IS024_CONTEXT::REGEX_IS024_CONTEXT(size_t programSize, const uint8_t *data, - uint64_t caTreeLimit, regex_tai_t saLenLimit, - regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit, - uint64_t timeTickLimit) : - program_size(programSize), prg(data), CA_TREE_LIMIT(caTreeLimit), SA_LEN_LIMIT(saLenLimit), - READ_SS_LIMIT(readSsLimit), FORK_SS_LIMIT(forkSsLimit), time_tick_limit(timeTickLimit) -{ - if (program_size > (1UL << 62)) - exitf("Program is too huge\n"); - active_thread.slot_occupation_status = SLOT_OCCUPIED; -} - -/* No only will it launch a wave of deallocation in CA tree, but as a nice bonus it's - * gonna deoccupy slot_occupation_status*/ -void REGEX_IS024_Thread::delete_thread() noexcept { - thread_print_debug(*this); - my_assert(slot_occupation_status & SLOT_OCCUPIED); - slot_occupation_status = SLOT_EMPTY_val; - REGEX024_CollectionArrayNode* cur_CAptr = CAHptr; - while (cur_CAptr){ - assert(cur_CAptr->refs > 0); - if (--(cur_CAptr->refs) == 0){ - REGEX024_CollectionArrayNode* next_CAptr = cur_CAptr->prev; - delete cur_CAptr; - cur_CAptr = next_CAptr; - } else - break; +namespace regexis024 { + sslot_id_t SSID_Stack::pop() { + assert(sz != 0); + return slots[--sz]; } - if (SAptr){ - if (--(SAptr[0]) == 0) - free(SAptr); - } -} -void emptify_one_of_new_read_halted_stacks(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Stack& type_new_stack){ - while (type_new_stack.non_empty()){ - REGEX_IS024_Thread& thread = ctx.READ_halted_slots[type_new_stack.pop()]; - assert(thread.slot_occupation_status & SLOT_OCCUPIED); - thread.delete_thread(); + void SSID_Stack::append(sslot_id_t x) { + assert(max_size > 0); + assert(slots); + assert(sz < max_size); + slots[sz] = x; + sz++; } -} -/* First it will try to pop pending thread from FORK_halted_stack - * Then it will try popping thread from READ_halted_stack_old (checking if top - * thread here is not actually SLOT_NEW). If something succeded, corresponding slot will be deoccupied, and - * active slot will be occupied with it. - * - * try_to_continue_scheduled() assumes that active thread is unoccupied.*/ -void REGEX_IS024_CONTEXT::try_to_continue_scheduled(){ - ctx_print_debug(*this); - my_assert(!(active_thread.slot_occupation_status & SLOT_OCCUPIED)); - if (FORK_halted_stack.sz){ - regex_sslot_id_t ssid = FORK_halted_stack.pop(); - active_thread = FORK_halted_slots[ssid]; - FORK_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val; - return; + bool SSID_Stack::empty() const { + return sz == 0; } - while (READ_halted_stack_old.sz){ - regex_sslot_id_t ssid = READ_halted_stack_old.pop(); - if (READ_halted_slots[ssid].slot_occupation_status & SLOT_NEW){ - /* This is the case when old thread was silently replaced by settled new thread */ - continue; + + SSID_Stack::~SSID_Stack() { + assert(empty()); + free(slots); + } + + VMContext::VMContext(size_t programSize, const uint8_t *data, + uint64_t caTreeLimit, tai_t saLenLimit, + sslot_id_t readSsLimit, sslot_id_t forkSsLimit, + uint64_t timeTickLimit) : + program_size(programSize), prg(data), CA_TREE_LIMIT(caTreeLimit), SA_LEN_LIMIT(saLenLimit), + READ_SS_LIMIT(readSsLimit), FORK_SS_LIMIT(forkSsLimit), time_tick_limit(timeTickLimit) + { + if (program_size > (1UL << 62)) + throw std::runtime_error("Program is too big"); + active_thread.slot_occupation_status = SLOT_OCCUPIED; + } + + /* No only will it launch a wave of deallocation in CA tree, but as a nice bonus it's + * gonna deoccupy slot_occupation_status*/ + void Thread::delete_thread() noexcept { + thread_print_debug(*this); + my_assert(slot_occupation_status & SLOT_OCCUPIED); + slot_occupation_status = SLOT_EMPTY_val; + CollectionArrayNode* cur_CAptr = CAHptr; + while (cur_CAptr){ + assert(cur_CAptr->refs > 0); + if (--(cur_CAptr->refs) == 0){ + CollectionArrayNode* next_CAptr = cur_CAptr->prev; + delete cur_CAptr; + cur_CAptr = next_CAptr; + } else + break; + } + if (SAptr){ + if (--(SAptr[0]) == 0) + free(SAptr); } - active_thread = READ_halted_slots[ssid]; - READ_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val; - return; } - /* Failure here will be detected. We started with unoccupied active thread. iterator inside kick will see it */ -} -void kick(REGEX_IS024_CONTEXT& ctx) { - ctx_print_debug(ctx); - while ((ctx.active_thread.slot_occupation_status & SLOT_OCCUPIED) - && ctx.error == regex024_error_codes::stable){ - if (ctx.timer >= ctx.time_tick_limit) - smitsya(timeout); - ctx.timer++; - - check_available_prg(REGEX024_BYTECODE_INSTRUCTION_SZ) // May return from kick(ctx) - // smivanie from those instructions will be immediately detected. Everything is OK - instruction_table(ctx); - } -} - - -regex024_error_code REGEX_IS024_CONTEXT::feedSOF() { - ctx_print_debug(*this); - kick(*this); - return error; -} - -regex024_error_code REGEX_IS024_CONTEXT::startThread() { - ctx_print_debug(*this); - active_thread.slot_occupation_status = SLOT_OCCUPIED; - active_thread.IP = unnatural_started_thread_IP; - active_thread.SAptr = NULL; - active_thread.CAHptr = NULL; - kick(*this); - return error; -} - -/* I hate C++ (aka antichrist), won't use move sementic (aka drink cornsyrup) */ -void swap_stacks(REGEX_IS024_Stack& A, REGEX_IS024_Stack& B) { - std::swap(A.sz, B.sz); - std::swap(A.slots, B.slots); -} - -void fill_empty_old_read_halted_stack(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Stack& read_halted_stack_new){ - ctx_print_debug(ctx); - my_assert(!ctx.READ_halted_stack_old.non_empty()); - - // Actually, READ_halted_stack_old is always empty in this case - assert(ctx.READ_halted_stack_old.empty()); - swap_stacks(ctx.READ_halted_stack_old, read_halted_stack_new); - for (uint32_t i = 0; i < ctx.READ_halted_stack_old.sz; i++){ - REGEX_IS024_Thread& slot = ctx.READ_halted_slots[ctx.READ_halted_stack_old.slots[i]]; - /* Should get rid of 'NEW' qualifier */ - assert(slot.slot_occupation_status & SLOT_OCCUPIED); - if (slot.slot_occupation_status & SLOT_OCCUPIED) - slot.slot_occupation_status = SLOT_OCCUPIED; - } -} - -regex024_error_code REGEX_IS024_CONTEXT::feedCharacter(uint64_t input, uint64_t corresponding_byte_amount) { - ctx_print_debug(*this); - if (matched_thread.slot_occupation_status & SLOT_OCCUPIED) - matched_thread.delete_thread(); - emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second); - fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_first); - INP = input; - passed_bytes += corresponding_byte_amount; - passed_chars++; - try_to_continue_scheduled(); - kick(*this); - return error; -} - -regex024_error_code REGEX_IS024_CONTEXT::extendedFeedCharacter(uint64_t input) { - ctx_print_debug(*this); - if (matched_thread.slot_occupation_status & SLOT_OCCUPIED) - matched_thread.delete_thread(); - fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_second); - INP = input; - try_to_continue_scheduled(); - kick(*this); - return error; -} - -REGEX_IS024_CONTEXT::~REGEX_IS024_CONTEXT() { - ctx_print_debug(*this); - if (initialized){ - emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_first); - emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second); - while (READ_halted_stack_old.non_empty()){ - REGEX_IS024_Thread& thread = READ_halted_slots[READ_halted_stack_old.pop()]; + void emptify_one_of_new_read_halted_stacks(VMContext& ctx, SSID_Stack& type_new_stack){ + while (!type_new_stack.empty()){ + Thread& thread = ctx.READ_halted_slots[type_new_stack.pop()]; assert(thread.slot_occupation_status & SLOT_OCCUPIED); - if (!(thread.slot_occupation_status & SLOT_NEW)) - thread.delete_thread(); + thread.delete_thread(); } - free(READ_halted_slots); - while (FORK_halted_stack.non_empty()) - FORK_halted_slots[FORK_halted_stack.pop()].delete_thread(); - free(FORK_halted_slots); + } - if (matched_thread.slot_occupation_status & SLOT_OCCUPIED){ + /* First it will try to pop pending thread from FORK_halted_stack + * Then it will try popping thread from READ_halted_stack_old (checking if top + * thread here is not actually SLOT_NEW). If something succeded, corresponding slot will be deoccupied, and + * active slot will be occupied with it. + * + * try_to_continue_scheduled() assumes that active thread is unoccupied.*/ + void VMContext::try_to_continue_scheduled(){ + ctx_print_debug(*this); + my_assert(!(active_thread.slot_occupation_status & SLOT_OCCUPIED)); + if (FORK_halted_stack.sz){ + sslot_id_t ssid = FORK_halted_stack.pop(); + active_thread = FORK_halted_slots[ssid]; + FORK_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val; + return; + } + while (READ_halted_stack_old.sz){ + sslot_id_t ssid = READ_halted_stack_old.pop(); + if (READ_halted_slots[ssid].slot_occupation_status & SLOT_NEW){ + /* This is the case when old thread was silently replaced by settled new thread */ + continue; + } + active_thread = READ_halted_slots[ssid]; + READ_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val; + return; + } + /* Failure here will be detected. We started with unoccupied active thread. iterator inside kick will see it */ + } + + void kick(VMContext& ctx) { + ctx_print_debug(ctx); + while ((ctx.active_thread.slot_occupation_status & SLOT_OCCUPIED) + && ctx.error == error_codes::stable){ + if (ctx.timer >= ctx.time_tick_limit) + smitsya(timeout); + ctx.timer++; + + check_available_prg(BYTECODE_INSTRUCTION_SZ) // May return from kick(ctx) + // smivanie from those instructions will be immediately detected. Everything is OK + instruction_table(ctx); + } + } + + + error_code_t VMContext::feedSOF() { + ctx_print_debug(*this); + kick(*this); + return error; + } + + error_code_t VMContext::startThread() { + ctx_print_debug(*this); + active_thread.slot_occupation_status = SLOT_OCCUPIED; + active_thread.IP = unnatural_started_thread_IP; + active_thread.SAptr = NULL; + active_thread.CAHptr = NULL; + kick(*this); + return error; + } + + void fill_empty_old_read_halted_stack(VMContext& ctx, SSID_Stack& read_halted_stack_new){ + ctx_print_debug(ctx); + // Actually, READ_halted_stack_old is always empty in this case + assert(ctx.READ_halted_stack_old.empty()); + while (!read_halted_stack_new.empty()) { + sslot_id_t sr = read_halted_stack_new.pop(); + Thread& slot = ctx.READ_halted_slots[sr]; + assert(slot.slot_occupation_status & SLOT_NEW_val); + slot.slot_occupation_status = SLOT_OCCUPIED_val; + ctx.READ_halted_stack_old.append(sr); + } + } + + error_code_t VMContext::feedCharacter(uint64_t input, uint64_t corresponding_byte_amount) { + ctx_print_debug(*this); + if (matched_thread.slot_occupation_status & SLOT_OCCUPIED) matched_thread.delete_thread(); + emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second); + fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_first); + INP = input; + passed_bytes += corresponding_byte_amount; + passed_chars++; + try_to_continue_scheduled(); + kick(*this); + return error; + } + + error_code_t VMContext::extendedFeedCharacter(uint64_t input) { + ctx_print_debug(*this); + if (matched_thread.slot_occupation_status & SLOT_OCCUPIED) + matched_thread.delete_thread(); + fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_second); + INP = input; + try_to_continue_scheduled(); + kick(*this); + return error; + } + + VMContext::~VMContext() { + ctx_print_debug(*this); + if (initialized){ + emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_first); + emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second); + while (!READ_halted_stack_old.empty()){ + Thread& thread = READ_halted_slots[READ_halted_stack_old.pop()]; + assert(thread.slot_occupation_status & SLOT_OCCUPIED); + if (!(thread.slot_occupation_status & SLOT_NEW)) + thread.delete_thread(); + } + free(READ_halted_slots); + while (!FORK_halted_stack.empty()) + FORK_halted_slots[FORK_halted_stack.pop()].delete_thread(); + free(FORK_halted_slots); + + if (matched_thread.slot_occupation_status & SLOT_OCCUPIED){ + matched_thread.delete_thread(); + } } } } diff --git a/src/libregexis024vm/libregexis024vm_disassembly.cpp b/src/libregexis024vm/libregexis024vm_disassembly.cpp index 8d94165..fcdea12 100644 --- a/src/libregexis024vm/libregexis024vm_disassembly.cpp +++ b/src/libregexis024vm/libregexis024vm_disassembly.cpp @@ -1,38 +1,40 @@ #include #include -bool REGEX_IS024_CONTEXT::check_inboundness(int region){ - return vmprog_check_inboundness(program_size, active_thread.IP, region); -} +namespace regexis024 { + bool VMContext::check_inboundness(int region){ + return vmprog_check_inboundness(program_size, active_thread.IP, region); + } -uint8_t REGEX_IS024_CONTEXT::extract_b() { - return vmprog_extract_b(&active_thread.IP, prg); -} + uint8_t VMContext::extract_b() { + return vmprog_extract_b(&active_thread.IP, prg); + } -uint16_t REGEX_IS024_CONTEXT::extract_w() { - return vmprog_extract_w(&active_thread.IP, prg); -} + uint16_t VMContext::extract_w() { + return vmprog_extract_w(&active_thread.IP, prg); + } -uint32_t REGEX_IS024_CONTEXT::extract_dw() { - return vmprog_extract_dw(&active_thread.IP, prg); -} + uint32_t VMContext::extract_dw() { + return vmprog_extract_dw(&active_thread.IP, prg); + } -uint64_t REGEX_IS024_CONTEXT::extract_qw() { - return vmprog_extract_qw(&active_thread.IP, prg); -} + uint64_t VMContext::extract_qw() { + return vmprog_extract_qw(&active_thread.IP, prg); + } -uint8_t REGEX_IS024_CONTEXT::extract_instruction() { - return extract_b(); -} + uint8_t VMContext::extract_instruction() { + return extract_b(); + } -regex_sslot_id_t REGEX_IS024_CONTEXT::extract_sslot_id() { - return extract_dw(); -} + sslot_id_t VMContext::extract_sslot_id() { + return extract_dw(); + } -regex_near_ptr_t REGEX_IS024_CONTEXT::extract_near_pointer() { - return extract_qw(); -} + near_ptr_t VMContext::extract_near_pointer() { + return extract_qw(); + } -regex_tai_t REGEX_IS024_CONTEXT::extract_track_array_index() { - return extract_w(); + tai_t VMContext::extract_track_array_index() { + return extract_w(); + } } diff --git a/src/libregexis024vm/libregexis024vm_interface.cpp b/src/libregexis024vm/libregexis024vm_interface.cpp index 371cce9..0594e0f 100644 --- a/src/libregexis024vm/libregexis024vm_interface.cpp +++ b/src/libregexis024vm/libregexis024vm_interface.cpp @@ -1,105 +1,106 @@ +#include #include #include #include -bool REGEX_IS024_CAEvent::operator==(const REGEX_IS024_CAEvent &other) const { - return (key == other.key) && (value == other.value); -} - -#define reveal ((REGEX_IS024_CONTEXT*)opaque) - -REGEX_IS024_VirtualMachine::REGEX_IS024_VirtualMachine(size_t programSize, const uint8_t *data, - uint64_t caTreeLimit, regex_tai_t saLenLimit, - regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit, - uint64_t timeTickLimit) { - opaque = new REGEX_IS024_CONTEXT(programSize, data, caTreeLimit, saLenLimit, - readSsLimit, forkSsLimit, timeTickLimit); -} - -regex024_error_code REGEX_IS024_VirtualMachine::initialize() { - if (gave_SOF) - exitf("double feedSOF\n"); - gave_SOF = true; - return reveal->feedSOF(); -} - -bool REGEX_IS024_VirtualMachine::isInitialized() { - return reveal->initialized; -} - -bool REGEX_IS024_VirtualMachine::isUsable() { - return isInitialized() && reveal->error == regex024_error_codes::stable; -} - -REGEX_IS024_VirtualMachine::~REGEX_IS024_VirtualMachine() { - delete reveal; -} - -regex_tai_t REGEX_IS024_VirtualMachine::getSelectionArrayLength() { - return isUsable() ? reveal->selection_array_len : 0; -} - -bool REGEX_IS024_VirtualMachine::isAllowMultistart() { - return isUsable() ? reveal->allows_multistart : false; -} - -uint8_t REGEX_IS024_VirtualMachine::getInputLeftExtensionSize() { - return isUsable() ? reveal->fed_input_extends_left : 0; -} - -uint8_t REGEX_IS024_VirtualMachine::getInputRightExtensionSize() { - return isUsable() ? reveal->fed_input_extends_right : 0; -} - -regex024_error_code REGEX_IS024_VirtualMachine::getErrno() { - return reveal->error; -} - -/* Stupid kinda function. Checks if somebody is ready to continue reading the actual string */ -bool REGEX_IS024_VirtualMachine::haveSurvivors() { - return isUsable() && (reveal->READ_halted_stack_new_first.non_empty()); -} - -bool REGEX_IS024_VirtualMachine::isMatched() { - return isUsable() && static_cast((reveal->matched_thread.slot_occupation_status & SLOT_OCCUPIED)); -} - -std::vector REGEX_IS024_VirtualMachine::getMatchedThreadCABranchReverse() { - if (!isMatched()) - return {}; - std::vector res; - REGEX024_CollectionArrayNode* cur = reveal->matched_thread.CAHptr; - while (cur != NULL){ - res.push_back({cur->key, cur->value}); - cur = cur->prev; +namespace regexis024 { + bool CAEvent::operator==(const CAEvent &other) const { + return (key == other.key) && (value == other.value); } - return res; -} -uint64_t REGEX_IS024_VirtualMachine::getMatchedThreadSAValue(uint16_t key) { - if (key >= getSelectionArrayLength()) - return 0; - if (!isMatched()) - return 0; - return reveal->matched_thread.SAptr ? reveal->matched_thread.SAptr[key + 1] : 0; -} +#define reveal ((VMContext*)opaque) -regex024_error_code REGEX_IS024_VirtualMachine::addNewMatchingThread() { - if (!isUsable()) - exitf("unusable\n"); - // if (started_first_thread && !isAllowMultistart()) - // exitf("Multistart is forbidden, bad usage of program\n"); - return reveal->startThread(); -} + VirtualMachine::VirtualMachine(size_t programSize, const uint8_t *data, + uint64_t caTreeLimit, tai_t saLenLimit, + sslot_id_t readSsLimit, sslot_id_t forkSsLimit, + uint64_t timeTickLimit) { + opaque = new VMContext(programSize, data, caTreeLimit, saLenLimit, + readSsLimit, forkSsLimit, timeTickLimit); + } -regex024_error_code REGEX_IS024_VirtualMachine::extendedFeedCharacter(uint64_t input) { - if (!isUsable()) - exitf("unusable\n"); - return reveal->extendedFeedCharacter(input); -} + error_code_t VirtualMachine::initialize() { + if (gave_SOF) + throw std::runtime_error("double feedSOF\n"); + gave_SOF = true; + return reveal->feedSOF(); + } -regex024_error_code REGEX_IS024_VirtualMachine::feedCharacter(uint64_t input, uint64_t bytesResembled) { - if (!isUsable()) - exitf("unusable\n"); - return reveal->feedCharacter(input, bytesResembled); -} + bool VirtualMachine::isInitialized() { + return reveal->initialized; + } + + bool VirtualMachine::isUsable() { + return isInitialized() && reveal->error == error_codes::stable; + } + + VirtualMachine::~VirtualMachine() { + delete reveal; + } + + tai_t VirtualMachine::getSelectionArrayLength() { + return isUsable() ? reveal->selection_array_len : 0; + } + + bool VirtualMachine::isAllowMultistart() { + return isUsable() ? reveal->allows_multistart : false; + } + + uint8_t VirtualMachine::getInputLeftExtensionSize() { + return isUsable() ? reveal->fed_input_extends_left : 0; + } + + uint8_t VirtualMachine::getInputRightExtensionSize() { + return isUsable() ? reveal->fed_input_extends_right : 0; + } + + error_code_t VirtualMachine::getErrno() { + return reveal->error; + } + + /* Stupid kinda function. Checks if somebody is ready to continue reading the actual string or extended l-r input */ + bool VirtualMachine::haveSurvivors() { + return isUsable() && (!reveal->READ_halted_stack_new_first.empty() || !reveal->READ_halted_stack_new_second.empty()); + } + + bool VirtualMachine::isMatched() { + return isUsable() && static_cast((reveal->matched_thread.slot_occupation_status & SLOT_OCCUPIED)); + } + + std::vector VirtualMachine::getMatchedThreadCABranchReverse() { + if (!isMatched()) + return {}; + std::vector res; + CollectionArrayNode* cur = reveal->matched_thread.CAHptr; + while (cur != NULL){ + res.push_back({cur->key, cur->value}); + cur = cur->prev; + } + return res; + } + + uint64_t VirtualMachine::getMatchedThreadSAValue(uint16_t key) { + if (key >= getSelectionArrayLength()) + return 0; + if (!isMatched()) + return 0; + return reveal->matched_thread.SAptr ? reveal->matched_thread.SAptr[key + 1] : 0; + } + + error_code_t VirtualMachine::addNewMatchingThread() { + if (!isUsable()) + throw std::runtime_error("unusable"); + return reveal->startThread(); + } + + error_code_t VirtualMachine::extendedFeedCharacter(uint64_t input) { + if (!isUsable()) + throw std::runtime_error("unusable\n"); + return reveal->extendedFeedCharacter(input); + } + + error_code_t VirtualMachine::feedCharacter(uint64_t input, uint64_t bytesResembled) { + if (!isUsable()) + throw std::runtime_error("unusable\n"); + return reveal->feedCharacter(input, bytesResembled); + } +} \ No newline at end of file diff --git a/src/libregexis024vm/libregexis024vm_interface.h b/src/libregexis024vm/libregexis024vm_interface.h index a0d1583..b8bbf9a 100644 --- a/src/libregexis024vm/libregexis024vm_interface.h +++ b/src/libregexis024vm/libregexis024vm_interface.h @@ -6,41 +6,42 @@ #include #include -struct REGEX_IS024_CAEvent{ - regex_tai_t key; - uint64_t value; - bool operator==(const REGEX_IS024_CAEvent& other) const; -}; +namespace regexis024 { + struct CAEvent{ + tai_t key; + uint64_t value; + bool operator==(const CAEvent& other) const; + }; -class REGEX_IS024_VirtualMachine{ -public: - REGEX_IS024_VirtualMachine(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, uint16_t saLenLimit, - uint32_t readSsLimit, uint32_t forkSsLimit, uint64_t timeTickLimit); + struct VirtualMachine{ + VirtualMachine(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, uint16_t saLenLimit, + uint32_t readSsLimit, uint32_t forkSsLimit, uint64_t timeTickLimit); - REGEX_IS024_VirtualMachine(const REGEX_IS024_VirtualMachine& ) = delete; - REGEX_IS024_VirtualMachine& operator=(const REGEX_IS024_VirtualMachine&) = delete; + VirtualMachine(const VirtualMachine& ) = delete; + VirtualMachine& operator=(const VirtualMachine&) = delete; - regex024_error_code initialize(); - bool isInitialized(); - bool isUsable(); - virtual ~REGEX_IS024_VirtualMachine(); - regex_tai_t getSelectionArrayLength(); - bool isAllowMultistart(); - uint8_t getInputLeftExtensionSize(); - uint8_t getInputRightExtensionSize(); - regex024_error_code getErrno(); - bool haveSurvivors(); - bool isMatched(); - std::vector getMatchedThreadCABranchReverse(); - uint64_t getMatchedThreadSAValue(uint16_t key); + error_code_t initialize(); + bool isInitialized(); + bool isUsable(); + virtual ~VirtualMachine(); + tai_t getSelectionArrayLength(); + bool isAllowMultistart(); + uint8_t getInputLeftExtensionSize(); + uint8_t getInputRightExtensionSize(); + error_code_t getErrno(); + bool haveSurvivors(); + bool isMatched(); + std::vector getMatchedThreadCABranchReverse(); + uint64_t getMatchedThreadSAValue(uint16_t key); - regex024_error_code addNewMatchingThread(); - regex024_error_code extendedFeedCharacter(uint64_t input); - regex024_error_code feedCharacter(uint64_t input, uint64_t bytesResembled); + error_code_t addNewMatchingThread(); + error_code_t extendedFeedCharacter(uint64_t input); + error_code_t feedCharacter(uint64_t input, uint64_t bytesResembled); -private: - bool gave_SOF = false; - void* opaque; -}; + private: + bool gave_SOF = false; + void* opaque; + }; +} #endif //LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H diff --git a/src/libregexis024vm/utils.cpp b/src/libregexis024vm/utils.cpp index 1b2abe2..ef42dfc 100644 --- a/src/libregexis024vm/utils.cpp +++ b/src/libregexis024vm/utils.cpp @@ -10,60 +10,52 @@ #error "Big endian is currently unsupported" #endif -void exitf(const char *fmt, ...) { - va_list va; - va_start(va, fmt); - vfprintf(stderr, fmt, va); - va_end(va); - exit(1); -} - -int utf8_retrieve_size(uint8_t firstByte) { - if (!(firstByte & 0b10000000)) - return 1; - uint8_t a = 0b11000000; - uint8_t b = 0b00100000; - for (int i = 2; i <= 4; i++){ - if ((firstByte & (a | b)) == a) - return i; - a |= b; - b >>= 1; +namespace regexis024 { + int utf8_retrieve_size(char firstByte) { + if (!((uint8_t)firstByte & 0b10000000)) + return 1; + uint8_t a = 0b11000000; + uint8_t b = 0b00100000; + for (int i = 2; i <= 4; i++){ + if (((uint8_t)firstByte & (a | b)) == a) + return i; + a |= b; + b >>= 1; + } + return -1; } - return -1; -} -int32_t utf8_retrieve_character(int sz, size_t pos, const uint8_t *string) { - if (sz == 1) - return string[pos]; - uint32_t v = string[pos] & (0b01111111 >> sz); - pos++; - for (int i = 1; i < sz; i++){ - uint32_t th = string[pos]; - if ((th & 0b11000000) != 0b10000000) - return -1; - v <<= 6; - v |= (th & 0b00111111); + int32_t utf8_retrieve_character(int sz, size_t pos, const char *string) { + if (sz == 1) + return (uint8_t)string[pos]; + uint32_t v = (uint8_t)string[pos] & (0b01111111 >> sz); pos++; + for (int i = 1; i < sz; i++){ + uint32_t th = (uint8_t)string[pos]; + if ((th & 0b11000000) != 0b10000000) + return -1; + v <<= 6; + v |= (th & 0b00111111); + pos++; + } + assert(v <= INT32_MAX); + return static_cast(v); } - assert(v <= INT32_MAX); - return static_cast(v); -} -#define AAAAAA {cp = -1; return;} - -void utf8_string_iterat(int32_t &cp, size_t &adj, size_t pos, const uint8_t *string, size_t string_size) { - if (pos >= string_size) AAAAAA - adj = utf8_retrieve_size(string[pos]); - if (adj < 0 || pos + adj > string_size) AAAAAA - if ((cp = utf8_retrieve_character(adj, pos, string)) < 0) AAAAAA -} - -bool is_string_in_stringset(const char *strSample, const char **strSet) { - const char** cmpSubject = strSet; - while ((*cmpSubject) != NULL){ - if (strcmp(strSample, *cmpSubject) == 0) - return true; - cmpSubject++; // += 8 bytes + void utf8_string_iterat(int32_t &cp, size_t &adj, size_t pos, const char *string, size_t string_size) { + if (pos >= string_size) {cp = -1; return;} + adj = utf8_retrieve_size(string[pos]); + if (adj < 0 || pos + adj > string_size) {cp = -1; return;} + if ((cp = utf8_retrieve_character(adj, pos, string)) < 0) {cp = -1;} } - return false; -} + + bool is_string_in_stringset(const char *strSample, const char **strSet) { + const char** cmpSubject = strSet; + while ((*cmpSubject) != NULL){ + if (strcmp(strSample, *cmpSubject) == 0) + return true; + cmpSubject++; // += 8 bytes + } + return false; + } +} \ No newline at end of file diff --git a/src/libregexis024vm/utils.h b/src/libregexis024vm/utils.h index 3650f19..dc683dc 100644 --- a/src/libregexis024vm/utils.h +++ b/src/libregexis024vm/utils.h @@ -4,18 +4,19 @@ #include #include -void exitf(const char* fmt, ...); +// todo: move this file out from my eyes. +namespace regexis024 { + /* 1, 2, 3, 4 on success; -1 on error */ + int utf8_retrieve_size(char firstByte); -/* 1, 2, 3, 4 on success; -1 on error */ -int utf8_retrieve_size(uint8_t firstByte); + /* sz is a positive value returned by utf8_retrieve_size. Returns negative on error */ + int32_t utf8_retrieve_character(int sz, size_t pos, const char* string); -/* sz is a positive value returned by utf8_retrieve_size. Returns negative on error */ -int32_t utf8_retrieve_character(int sz, size_t pos, const uint8_t* string); + /* cp is negative on error. adj is the size of letter in bytes. Can be used to adjust pos. + * All safety checks will be performed */ + void utf8_string_iterat(int32_t& cp, size_t& adj, size_t pos, const char* string, size_t string_size); -/* cp is negative on error. adj is the size of letter in bytes. Can be used to adjust pos. - * All safety checks will be performed */ -void utf8_string_iterat(int32_t& cp, size_t& adj, size_t pos, const uint8_t* string, size_t string_size); - -bool is_string_in_stringset(const char* strSample, const char* strSet[]); + bool is_string_in_stringset(const char* strSample, const char* strSet[]); +} #endif //LIBREGEXIS024_UTILS_H diff --git a/src/libregexis024vm/vm_errno.cpp b/src/libregexis024vm/vm_errno.cpp index 78dbcb0..5a7310b 100644 --- a/src/libregexis024vm/vm_errno.cpp +++ b/src/libregexis024vm/vm_errno.cpp @@ -1,26 +1,28 @@ #include -const char *regex024_error_code_tostr(regex024_error_code x) { -#define rcase(name) case regex024_error_codes::name: return #name; - switch (x) { - rcase(stable) - rcase(ca_tree_limit_violation) - rcase(sa_length_limit_violation) - rcase(read_sslot_count_limit_violation) - rcase(fork_sslot_count_limit_violation) - rcase(timeout) - rcase(improper_finish) - rcase(too_early) - rcase(too_late) - rcase(selection_arr_out_of_range) - rcase(read_sslot_out_of_range) - rcase(fork_sslot_out_of_range) - rcase(invalid_opcode) - rcase(invalid_register_code) - rcase(instruction_not_for_general_thread) - rcase(instruction_not_for_collision_thread) - rcase(bad_alloc) - default: - return "unknown_error_code"; +namespace regexis024 { + const char *error_code_to_str(error_code_t x) { +#define rcase(name) case error_codes::name: return #name; + switch (x) { + rcase(stable) + rcase(ca_tree_limit_violation) + rcase(sa_length_limit_violation) + rcase(read_sslot_count_limit_violation) + rcase(fork_sslot_count_limit_violation) + rcase(timeout) + rcase(improper_finish) + rcase(too_early) + rcase(too_late) + rcase(selection_arr_out_of_range) + rcase(read_sslot_out_of_range) + rcase(fork_sslot_out_of_range) + rcase(invalid_opcode) + rcase(invalid_register_code) + rcase(instruction_not_for_general_thread) + rcase(instruction_not_for_collision_thread) + rcase(bad_alloc) + default: + return "unknown_error_code"; + } } -} +} \ No newline at end of file diff --git a/src/libregexis024vm/vm_errno.h b/src/libregexis024vm/vm_errno.h index cdfa1cd..cd3162e 100644 --- a/src/libregexis024vm/vm_errno.h +++ b/src/libregexis024vm/vm_errno.h @@ -3,43 +3,45 @@ #include -namespace regex024_error_codes { - enum regex024_error_code_I: int { - stable = 0, - ca_tree_limit_violation = -1, - sa_length_limit_violation = -2, - read_sslot_count_limit_violation = -3, - fork_sslot_count_limit_violation = -4, - timeout = -5, - /* Threads should be either abandoned by user of virtual machine after MATCH, - * ot be stopped by DIE instruction. Out of bound jump is disallowed */ - improper_finish = -6, - /* Operation for general phase is executed in init phase */ - too_early = -7, - /* Operation for init phase is executed in general phase */ - too_late = -8, - /* Used selection array index is out of range */ - selection_arr_out_of_range = -9, - /* Used read slot is out of range */ - read_sslot_out_of_range = -10, - /* Used fork slot is out of range */ - fork_sslot_out_of_range = -11, +namespace regexis024 { + namespace error_codes { + enum regex024_error_code_I: int { + stable = 0, + ca_tree_limit_violation = -1, + sa_length_limit_violation = -2, + read_sslot_count_limit_violation = -3, + fork_sslot_count_limit_violation = -4, + timeout = -5, + /* Threads should be either abandoned by user of virtual machine after MATCH, + * ot be stopped by DIE instruction. Out of bound jump is disallowed */ + improper_finish = -6, + /* Operation for general phase is executed in init phase */ + too_early = -7, + /* Operation for init phase is executed in general phase */ + too_late = -8, + /* Used selection array index is out of range */ + selection_arr_out_of_range = -9, + /* Used read slot is out of range */ + read_sslot_out_of_range = -10, + /* Used fork slot is out of range */ + fork_sslot_out_of_range = -11, - invalid_opcode = -12, - invalid_register_code = -13, - /* Next operation scheduled for execution is forbidden in general thread */ - instruction_not_for_general_thread = -14, - /* Next operation scheduled for execution is forbidden in collision thread */ - instruction_not_for_collision_thread = -15, - /* Program willingly threw exception */ - program_throw = -16, - /* O_o */ - bad_alloc = -17, - }; + invalid_opcode = -12, + invalid_register_code = -13, + /* Next operation scheduled for execution is forbidden in general thread */ + instruction_not_for_general_thread = -14, + /* Next operation scheduled for execution is forbidden in collision thread */ + instruction_not_for_collision_thread = -15, + /* Program willingly threw exception */ + program_throw = -16, + /* O_o */ + bad_alloc = -17, + }; + } + + typedef error_codes::regex024_error_code_I error_code_t; + + const char* error_code_to_str(error_code_t x); } -typedef regex024_error_codes::regex024_error_code_I regex024_error_code; - -const char* regex024_error_code_tostr(regex024_error_code x); - #endif //LIBREGEXIS024_VM_ERRNO_H diff --git a/src/libregexis024vm/vm_opcodes.h b/src/libregexis024vm/vm_opcodes.h index e76e5af..c3aa7b2 100644 --- a/src/libregexis024vm/vm_opcodes.h +++ b/src/libregexis024vm/vm_opcodes.h @@ -3,97 +3,97 @@ #include -namespace regex024_opcodes { - enum regex024_opcode_I: uint8_t{ - /* READ */ - READ = 0, - /* READZ = READ 0 */ - READZ = 1, - /* JUMP */ - JUMP = 2, +namespace regexis024 { + namespace opcodes { + enum regex024_opcode_I: uint8_t{ + /* READ */ + READ = 0, + /* READZ = READ 0 */ + READZ = 1, + /* JUMP */ + JUMP = 2, - /* JCEQUAL - jump conditional (equal): JCEQUAL */ - JCEQUAL_B = 3, - JCEQUAL_W = 4, - JCEQUAL_DW = 5, - JCEQUAL_QW = 6, - /* JCLESS - jump conditional (less): JCLESS */ - JCLESS_B = 7, - JCLESS_W = 8, - JCLESS_DW = 9, - JCLESS_QW = 10, - /* JCGRTR - jump conditional (greater): JCGRTR */ - JCGRTR_B = 11, - JCGRTR_W = 12, - JCGRTR_DW = 13, - JCGRTR_QW = 14, + /* JCEQUAL - jump conditional (equal): JCEQUAL */ + JCEQUAL_B = 3, + JCEQUAL_W = 4, + JCEQUAL_DW = 5, + JCEQUAL_QW = 6, + /* JCLESS - jump conditional (less): JCLESS */ + JCLESS_B = 7, + JCLESS_W = 8, + JCLESS_DW = 9, + JCLESS_QW = 10, + /* JCGRTR - jump conditional (greater): JCGRTR */ + JCGRTR_B = 11, + JCGRTR_W = 12, + JCGRTR_DW = 13, + JCGRTR_QW = 14, - /* FORK */ - FORK = 15, - /* MATCH | */ - MATCH = 16, - /* DIE | */ - DIE = 17, - /* PARAM_READ_SS_NUMBER */ - PARAM_READ_SS_NUMBER = 18, - /* PARAM_FORK_SS_NUMBER */ - PARAM_FORK_SS_NUMBER = 19, - /* PARAM_SELARR_LEN */ - PARAM_SELARR_LEN = 20, - /* PARAM_COLSIFTFUNC_SET */ - PARAM_COLSIFTFUNC_SET = 21, - /* PARAM_COLSIFTFUNC_WIPE */ - PARAM_COLSIFTFUNC_WIPE = 22, - /* MSG_MULTISTART_ALLOWED <1B> */ - MSG_MULTISTART_ALLOWED = 23, - /* MSG_FED_INPUT_EXTENDED <1B> <1B> */ - MSG_FED_INPUT_EXTENDED = 24, - /* DMOVRABXSELARR */ - DMOV_RABX_SELARR = 25, - /* DDISTRABXSELARR */ - DDIST_RABX_SELARR = 26, - /* SIFTPRIOR_MIN_RABX */ - SIFTPRIOR_MIN_RABX = 27, - /* SIFTPRIOR_MAX_RABX */ - SIFTPRIOR_MAX_RABX = 28, - /* SIFT_DONE */ - SIFT_DONE = 29, - /* MOV_COLARR_IMM <8B> */ - MOV_COLARR_IMM = 30, - /* MOV_COLARR_BTPOS */ - MOV_COLARR_BTPOS = 31, - /* MOV_SELARR_IMM <8B> */ - MOV_SELARR_IMM = 32, - /* MOV_SELARR_CHPOS */ - MOV_SELARR_CHPOS = 33, - /* INIT */ - INIT = 34, - /* THROW */ - THROW = 35, - regex024_opcode_greaterMax = 36 - }; + /* FORK */ + FORK = 15, + /* MATCH | */ + MATCH = 16, + /* DIE | */ + DIE = 17, + /* PARAM_READ_SS_NUMBER */ + PARAM_READ_SS_NUMBER = 18, + /* PARAM_FORK_SS_NUMBER */ + PARAM_FORK_SS_NUMBER = 19, + /* PARAM_SELARR_LEN */ + PARAM_SELARR_LEN = 20, + /* PARAM_COLSIFTFUNC_SET */ + PARAM_COLSIFTFUNC_SET = 21, + /* PARAM_COLSIFTFUNC_WIPE */ + PARAM_COLSIFTFUNC_WIPE = 22, + /* MSG_MULTISTART_ALLOWED <1B> */ + MSG_MULTISTART_ALLOWED = 23, + /* MSG_FED_INPUT_EXTENDED <1B> <1B> */ + MSG_FED_INPUT_EXTENDED = 24, + /* DMOVRABXSELARR */ + DMOV_RABX_SELARR = 25, + /* DDISTRABXSELARR */ + DDIST_RABX_SELARR = 26, + /* SIFTPRIOR_MIN_RABX */ + SIFTPRIOR_MIN_RABX = 27, + /* SIFTPRIOR_MAX_RABX */ + SIFTPRIOR_MAX_RABX = 28, + /* SIFT_DONE */ + SIFT_DONE = 29, + /* MOV_COLARR_IMM <8B> */ + MOV_COLARR_IMM = 30, + /* MOV_COLARR_BTPOS */ + MOV_COLARR_BTPOS = 31, + /* MOV_SELARR_IMM <8B> */ + MOV_SELARR_IMM = 32, + /* MOV_SELARR_CHPOS */ + MOV_SELARR_CHPOS = 33, + /* INIT */ + INIT = 34, + /* THROW */ + THROW = 35, + regex024_opcode_greaterMax = 36 + }; + } + + typedef opcodes::regex024_opcode_I opcode_t; + + const char* opcode_to_str(opcode_t x); + + constexpr uint64_t BYTECODE_INSTRUCTION_SZ = 1; + constexpr uint64_t BYTECODE_SSLOT_ID_SZ = 4; + constexpr uint64_t BYTECODE_TRACK_ARRAY_INDEX_ID_SZ = 2; + constexpr uint64_t BYTECODE_NEAR_POINTER_SZ = 8; + + bool vmprog_check_inboundness(near_ptr_t prgSize, near_ptr_t IP, near_ptr_t region); + + uint8_t vmprog_extract_b(near_ptr_t* IPptr, const uint8_t* prg); + uint16_t vmprog_extract_w(near_ptr_t* IPptr, const uint8_t* prg); + uint32_t vmprog_extract_dw(near_ptr_t* IPptr, const uint8_t* prg); + uint64_t vmprog_extract_qw(near_ptr_t* IPptr, const uint8_t* prg); + + uint8_t vmprog_extract_instruction(near_ptr_t* IPptr, const uint8_t* prg); + sslot_id_t vmprog_extract_sslot_id(near_ptr_t* IPptr, const uint8_t* prg); + near_ptr_t vmprog_extract_near_pointer(near_ptr_t* IPptr, const uint8_t* prg); + tai_t vmprog_extrack_track_array_index(near_ptr_t* IPptr, const uint8_t* prg); } - -typedef regex024_opcodes::regex024_opcode_I regex024_opcode; - -const char* regex024_opcode_tostr(regex024_opcode x); - - -constexpr uint64_t REGEX024_BYTECODE_INSTRUCTION_SZ = 1; -constexpr uint64_t REGEX024_BYTECODE_SSLOT_ID_SZ = 4; -constexpr uint64_t REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ = 2; -constexpr uint64_t REGEX024_BYTECODE_NEAR_POINTER_SZ = 8; - -bool vmprog_check_inboundness(regex_near_ptr_t prgSize, regex_near_ptr_t IP, regex_near_ptr_t region); - -uint8_t vmprog_extract_b(regex_near_ptr_t* IPptr, const uint8_t* prg); -uint16_t vmprog_extract_w(regex_near_ptr_t* IPptr, const uint8_t* prg); -uint32_t vmprog_extract_dw(regex_near_ptr_t* IPptr, const uint8_t* prg); -uint64_t vmprog_extract_qw(regex_near_ptr_t* IPptr, const uint8_t* prg); - -uint8_t vmprog_extract_instruction(regex_near_ptr_t* IPptr, const uint8_t* prg); -regex_sslot_id_t vmprog_extract_sslot_id(regex_near_ptr_t* IPptr, const uint8_t* prg); -regex_near_ptr_t vmprog_extract_near_pointer(regex_near_ptr_t* IPptr, const uint8_t* prg); -regex_tai_t vmprog_extrack_track_array_index(regex_near_ptr_t* IPptr, const uint8_t* prg); - #endif //LIBREGEXIS024_VM_OPCODES_H diff --git a/src/libregexis024vm/vm_opcodes_disassembly.cpp b/src/libregexis024vm/vm_opcodes_disassembly.cpp index b0d9ffa..0cf651a 100644 --- a/src/libregexis024vm/vm_opcodes_disassembly.cpp +++ b/src/libregexis024vm/vm_opcodes_disassembly.cpp @@ -1,47 +1,54 @@ #include -#ifndef __ORDER_LITTLE_ENDIAN__ -#error "Big endian is currently unsupported" -#endif +namespace regexis024 { + bool vmprog_check_inboundness(near_ptr_t prgSz, near_ptr_t IP, near_ptr_t region) { + return IP + region <= prgSz; + } -bool vmprog_check_inboundness(regex_near_ptr_t prgSz, regex_near_ptr_t IP, regex_near_ptr_t region) { - return IP + region <= prgSz; -} + uint8_t vmprog_extract_b(near_ptr_t *IPptr, const uint8_t *prg) { + return prg[(*IPptr)++]; + } -uint8_t vmprog_extract_b(regex_near_ptr_t *IPptr, const uint8_t *prg) { - return prg[(*IPptr)++]; -} + uint16_t vmprog_extract_w(near_ptr_t *IPptr, const uint8_t *prg) { + uint16_t answer = 0; + (*IPptr) += 2; + for (int i = 1; i < 3; i++) { + answer <<= 8; answer |= prg[(*IPptr) - i]; + } + return answer; + } -uint16_t vmprog_extract_w(regex_near_ptr_t *IPptr, const uint8_t *prg) { - uint16_t answer = *(uint16_t*)(&prg[*IPptr]); - *IPptr += 2; - return answer; -} + uint32_t vmprog_extract_dw(near_ptr_t *IPptr, const uint8_t *prg) { + uint32_t answer = 0; + (*IPptr) += 4; + for (int i = 1; i < 5; i++) { + answer <<= 8; answer |= prg[(*IPptr) - i]; + } + return answer; + } -uint32_t vmprog_extract_dw(regex_near_ptr_t *IPptr, const uint8_t *prg) { - uint32_t answer = *(uint32_t *)(&prg[*IPptr]); - *IPptr += 4; - return answer; -} + uint64_t vmprog_extract_qw(near_ptr_t *IPptr, const uint8_t *prg) { + uint64_t answer = 0; + (*IPptr) += 8; + for (int i = 1; i < 9; i++) { + answer <<= 8; answer |= prg[(*IPptr) - i]; + } + return answer; + } -uint64_t vmprog_extract_qw(regex_near_ptr_t *IPptr, const uint8_t *prg) { - uint64_t answer = *(uint64_t *)(&prg[*IPptr]); - *IPptr += 8; - return answer; -} + uint8_t vmprog_extract_instruction(near_ptr_t *IPptr, const uint8_t *prg) { + return vmprog_extract_b(IPptr, prg); + } -uint8_t vmprog_extract_instruction(regex_near_ptr_t *IPptr, const uint8_t *prg) { - return vmprog_extract_b(IPptr, prg); -} + sslot_id_t vmprog_extract_sslot_id(near_ptr_t *IPptr, const uint8_t *prg) { + return vmprog_extract_dw(IPptr, prg); + } -regex_sslot_id_t vmprog_extract_sslot_id(regex_near_ptr_t *IPptr, const uint8_t *prg) { - return vmprog_extract_dw(IPptr, prg); -} + near_ptr_t vmprog_extract_near_pointer(near_ptr_t *IPptr, const uint8_t *prg) { + return vmprog_extract_qw(IPptr, prg); + } -regex_near_ptr_t vmprog_extract_near_pointer(regex_near_ptr_t *IPptr, const uint8_t *prg) { - return vmprog_extract_qw(IPptr, prg); -} - -regex_tai_t vmprog_extrack_track_array_index(regex_near_ptr_t *IPptr, const uint8_t *prg) { - return vmprog_extract_w(IPptr, prg); + tai_t vmprog_extrack_track_array_index(near_ptr_t *IPptr, const uint8_t *prg) { + return vmprog_extract_w(IPptr, prg); + } } diff --git a/src/libregexis024vm/vm_opcodes_types.h b/src/libregexis024vm/vm_opcodes_types.h index 0707402..b303e51 100644 --- a/src/libregexis024vm/vm_opcodes_types.h +++ b/src/libregexis024vm/vm_opcodes_types.h @@ -3,9 +3,10 @@ #include -typedef uint32_t regex_sslot_id_t; -typedef uint64_t regex_near_ptr_t; -typedef uint16_t regex_tai_t; - +namespace regexis024 { + typedef uint32_t sslot_id_t; + typedef uint64_t near_ptr_t; + typedef uint16_t tai_t; +} #endif //VM_OPCODES_TYPES_H