commit b4381e92387df39dd53dcf4230aa69883bf04f1d Author: Andreev Gregory Date: Sun Jul 28 19:54:57 2024 +0300 First version diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7d14e76 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +# Never use CMAKE in production +CMakeLists.txt +cmake-build-debug/ +# Output of build system +built/ +# This is a compiled build system script +building/main +building/*.png +building/*.svg + +.idea/ \ No newline at end of file diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..7dcc4db --- /dev/null +++ b/README.txt @@ -0,0 +1,8 @@ +libregexis024 +Library for Regular Expressions, implementation of summer 2024 + +libregexis024vm +Provides only means of configuration and running my regexp virtual machine bytecode + +libregexis024sol +Provides functions to compile regular expression into libregexis024 virtual machine bytecode \ No newline at end of file diff --git a/building/build_build_system.sh b/building/build_build_system.sh new file mode 100755 index 0000000..aeb562d --- /dev/null +++ b/building/build_build_system.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +BUILDING_DIR="./building" +[ -d "$BUILDING_DIR" ] || exit 1 +MAIN_FILE="$BUILDING_DIR/main.cpp" +[ -f "$MAIN_FILE" ] || exit 1 +COOL_FLAGS="$(pkg-config --cflags regexis024-build-system)" + +g++ $COOL_FLAGS -o "$BUILDING_DIR/main" "$MAIN_FILE" || exit 1 diff --git a/building/main.cpp b/building/main.cpp new file mode 100644 index 0000000..edcaa54 --- /dev/null +++ b/building/main.cpp @@ -0,0 +1,156 @@ +#include + +/* + * LIBREGEXIS024 SPECIFIC BUILD COMMANDS BEGIN + */ + +struct Libregexis024BuildSystem { + /* Building runlevel */ + BuildUnitsArray runlevel_1; + /* Installation runlevel */ + BuildUnitsArray runlevel_2; + + /* "debug" or "release" */ + std::string build_type; + bool build_tests = false; + + std::vector warning_flags = {"-Wall", "-Wno-unused-variable", "-Werror=return-type","-pedantic", + "-Wno-unused-but-set-variable", 
"-Wno-reorder"}; + std::vector version_flags = {"--std", "c++14", "-D", "_POSIX_C_SOURCE=200809L"}; + + std::vector debug_defines_release = {"_GLIBCXX_DEBUG"}; + std::vector debug_defines_debug = {"_GLIBCXX_DEBUG", "LIBREGEXIS024_DEBUG", "LIBREGEXIS024_ALLOW_LOUD"}; + std::vector opt_flags_release = {"-g", "-O2"}; + std::vector opt_flags_debug = {"-g", "-ggdb", "-O0"}; + + std::vector getSomeRadFlags() { + std::vector my_flag_collection; + gxx_add_cli_options(my_flag_collection, warning_flags); + gxx_add_cli_options(my_flag_collection, version_flags); + if (build_type == "release") { + gxx_add_cli_defines(my_flag_collection, debug_defines_release); + gxx_add_cli_options(my_flag_collection, opt_flags_release); + } else if (build_type == "debug") { + gxx_add_cli_defines(my_flag_collection, debug_defines_debug); + gxx_add_cli_options(my_flag_collection, opt_flags_debug); + } + return my_flag_collection; + } + + Libregexis024BuildSystem(const std::string& build_type, const NormalCBuildSystemCommandMeaning& cmd) + :build_type(build_type) + { + ASSERT(build_type == "release" || build_type == "debug", "Unknown build type"); + + std::vector ext_targets; + + std::vector my_targets; + { + std::vector compilation_units_release = { + "libregexis024vm/utils.cpp", + "libregexis024vm/vm_errno.cpp", + "libregexis024vm/vm_opcodes_disassembly.cpp", + "libregexis024vm/libregexis024vm_interface.cpp", + "libregexis024vm/libregexis024vm_disassembly.cpp", + "libregexis024vm/libregexis024vm_context.cpp", + "libregexis024vm/instruction_implementation.cpp", + "libregexis024vm/libregex024opcodes_stringification.cpp", + + "libregexis024fa/codeset.cpp", + "libregexis024fa/colored_codeset.cpp", + "libregexis024fa/fa_first_stage_fix.cpp", + "libregexis024fa/finite_automaton.cpp", + "libregexis024fa/misc_fa_funcs.cpp", + "libregexis024fa/selarr_priority_table.cpp", + "libregexis024fa/tracking_fa_nodes.cpp", + "libregexis024fa/fa_make_deterministic.cpp", + + 
"libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp", + "libregexis024fa/graph_to_bytecode/writing_commands.cpp", + "libregexis024fa/graph_to_bytecode/filter.cpp", + "libregexis024fa/graph_to_bytecode/fa_compiler.cpp", + "libregexis024fa/graph_to_bytecode/core.cpp", + + "libregexis024sol/common_codesets.cpp", + "libregexis024sol/part_of_expr_that_tracks.cpp", + "libregexis024sol/expr_compiler.cpp", + "libregexis024sol/square_bracket_expression.cpp", + "libregexis024sol/sol_misc_base.cpp", + "libregexis024sol/command_expression.cpp", + "libregexis024sol/backslash_expression.cpp", + "libregexis024sol/subexpr_fa_transformed.cpp", + "libregexis024sol/expr_parse_functions/tracking_units.cpp", + "libregexis024sol/expr_parse_functions/ep_sequence.cpp", + "libregexis024sol/expr_parse_functions/command_recognition.cpp", + + "libregexis024tools/stringmatching.cpp", + }; + + /* These are added to compilation_units_of_release */ + std::vector additional_compilation_units_debug = { + "debugging_regexis024/prettyprint/prettyprint_util.cpp", + "debugging_regexis024/vm/libregexis024vm_debug.cpp", + "debugging_regexis024/debug_through_graphviz.cpp", + }; + + /* Suitable forr both release and debug (even though you will pretty much never need to export headers of build of + * debug build type */ + std::vector exported_headers = { + "libregexis024vm/vm_errno.h", + "libregexis024vm/vm_opcodes_types.h", + "libregexis024vm/vm_opcodes.h", + "libregexis024vm/libregexis024vm_interface.h", + + "libregexis024fa/tracking_variables.h", + + "libregexis024sol/part_of_expr_that_tracks.h", + "libregexis024sol/expr_compiler.h", + + "libregexis024tools/stringmatching.h", + }; + + CTarget T("libregexis024", "shared_library"); + T.additional_compilation_flags = getSomeRadFlags(); + array_concat(T.units, compilation_units_release); + if (build_type == "debug") + array_concat(T.units, additional_compilation_units_debug); + T.include_pr = ""; + T.include_ir = ""; + T.exported_headers = 
exported_headers; + T.installation_dir = ""; + T.pc_output_path = "libregexis024.pc"; + my_targets.push_back(T); + } + if (build_tests) { + CTarget T("libregexis024_test4", "executable"); + T.additional_compilation_flags = getSomeRadFlags(); + T.proj_deps = {CTargetDependenceOnProjectsLibrary("libregexis024")}; + T.units = {"libregexis024test/test4.cpp"}; + my_targets.push_back(T); + } + + regular_ctargets_to_2bus_conversion(ext_targets, my_targets, runlevel_1, runlevel_2, + cmd.project_root, cmd.installation_root); + } +}; + +/* Entry point of the build script. Passes argv[1..] to regular_bs_cli_cmd_interpret(), constructs the build + * description (build type is currently always "debug"), then completes the tasks of runlevel_1 when + * cmd.need_to_build is set and of runlevel_2 when cmd.need_to_install is set. + * Returns 0 on success and 1 when a buildSystemFailure was caught. */ +int main(int argc, char** argv) { + try { + assert(argc > 0); + std::vector args(argc - 1); + for (int i = 0; i + 1 < argc; i++) { + args[i] = argv[i + 1]; + } + NormalCBuildSystemCommandMeaning cmd; + regular_bs_cli_cmd_interpret(args, cmd); + Libregexis024BuildSystem bs("debug", cmd); + show_build_units_array_with_image_viewer(bs.runlevel_1, "true"); + show_build_units_array_with_image_viewer(bs.runlevel_2, "true"); + if (cmd.need_to_build) + complete_tasks_of_build_units(bs.runlevel_1); + if (cmd.need_to_install) + complete_tasks_of_build_units(bs.runlevel_2); + } catch (const buildSystemFailure& e) { + printf("Build system failure\n""%s\n", e.toString().c_str()); + /* Report the failure through the exit status so that the calling shell can detect it */ + return 1; + } + return 0; +} \ No newline at end of file diff --git a/src/debugging_regexis024/debug_through_graphviz.cpp b/src/debugging_regexis024/debug_through_graphviz.cpp new file mode 100644 index 0000000..29595fc --- /dev/null +++ b/src/debugging_regexis024/debug_through_graphviz.cpp @@ -0,0 +1,325 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const char* one_char_read_color = "black"; +const char* forking_color = "darkorchid1"; +const char* look_one_behind_color = "darkslateblue"; +const char* look_one_ahead_color = "coral1"; +const char* track_array_mov_imm_color = "lightblue2"; +const char* track_array_mov_halfinvariant_color = "lightseagreen"; +const char* match_pending_lob_color = "darkgoldenrod2"; +const char* 
match_color = "gold"; +const char* det_char_crossroads_color = "navy"; +const char* error_color = "crimson"; +const char* STAR = "★"; + +const char* get_associated_color(FA_Node* node){ + switch (node->type) { +#define ccase(tn) case tn: return tn##_color; + ccase(one_char_read) + ccase(forking) + ccase(look_one_behind) + ccase(look_one_ahead) + ccase(track_array_mov_imm) + ccase(track_array_mov_halfinvariant) + ccase(det_char_crossroads) + case match: + return dynamic_cast(node)->ext_filter_added ? match_pending_lob_color : match_color; + default: + return "black"; +#undef ccase + } +} + +struct NodesProblems{ + size_t actual_refcount = 0; + bool refcount_problem = false; + size_t edges_point_to_null = 0; +}; + +struct EdgesProblems { + bool points_to_null = false; + explicit EdgesProblems(bool points_to_null): points_to_null(points_to_null) {} +}; + +std::string get_applied_edge_attributes(FA_Node* node, const NodesProblems& np, const EdgesProblems& ep){ + std::string res = "color="; + if (ep.points_to_null) { + res += error_color; + } else { + res += get_associated_color(node); + if (node->type == one_char_read || node->type == det_char_crossroads) + res += " style=bold"; + } + return res; +} + +std::string get_applied_node_attributes(FA_Node* node, const NodesProblems& bd){ + std::string res = "color="; + res += get_associated_color(node); + if (bd.refcount_problem) + res += " fontcolor=crimson"; + if ((node->type == match) || + (node->type == det_char_crossroads && dynamic_cast(node)->matching)) + res += " shape=doublecircle"; + return res; +} + +void append_reverse_hex(std::string& res, uint32_t num){ + if (num == 0){ + res += "0"; + } else { + while (num){ + uint32_t r = num & 0x0F; + res += static_cast((r < 10) ? 
(r + '0') : (r - 10 + 'a')); + num >>= 4; + } + } +} + +std::string stringify_codeset(const codeset_t& cs){ + std::string res; + for (long i = static_cast(cs.size()) - 1; i >= 0; i--) { + uint64_t start = cs[i].first, end = cs[i].second; + if (start == end) { + append_reverse_hex(res, start); + } else { + append_reverse_hex(res, end); + res += '-'; + append_reverse_hex(res, start); + } + if (i != 0) + res += ','; + } + std::reverse(res.begin(), res.end()); /* ascii works wonders */ + return res; +} + +std::string get_extended_node_lable(FA_Node* node){ + if ((node->type == one_char_read && dynamic_cast(node)->second_ns) || + (node->type == det_char_crossroads && dynamic_cast(node)->second_ns)) { + return std::string(" ") + STAR; + } + if (node->type == match) { + FA_NodeOfMatch* mn = static_cast(node); + if (mn->ext_filter_added) + return std::string(" pending loa ") + stringify_codeset(mn->pending_filter); + } + return ""; +} + +std::string get_node_lable(FA_Node* node, const NodesProblems& bd){ + std::string res; + switch (node->type) { +#define tcase(tn, str) case tn: res = str; break; + tcase(one_char_read, "ocr") + tcase(match, "m") + tcase(forking, "f") + tcase(look_one_behind, "lob") + tcase(look_one_ahead, "loa") + tcase(track_array_mov_imm, "tami") + tcase(track_array_mov_halfinvariant, "tamh") + tcase(det_char_crossroads, "dcc") + } + res += ("[" + std::to_string(node->nodeId) + "]"); + res += get_extended_node_lable(node); + if (bd.refcount_problem) + res += ("!refcount: " + std::to_string(node->refs) + "!"); + return res; +} + +void print_edge(FA_Node* start, const FA_Node* dest, const std::string& label, FILE* fd, NodesProblems& np){ + if (!dest){ + fprintf(stderr, "NULL transition going from node %lu\n", start->nodeId); + fprintf(fd, "%lu->NULL_%lu_%lu [label=\"%s\" color=crimson]", start->nodeId, + start->nodeId, np.edges_point_to_null++, label.c_str()); + return; + } + fprintf(fd, "%lu->%lu [label=\"%s\" %s]\n", start->nodeId, dest->nodeId, 
label.c_str(), + get_applied_edge_attributes(start, np, EdgesProblems(false)).c_str()); +} + +void print_fa(const FA_Container& fa, FILE* fd, const KnownTrackingTools& ktr, + const RegexPriorityTable& priority_table){ + assert(fa.start); + assert(fd); + fprintf(fd, "digraph finite_automaton {\ngraph [" + "fontname = \"Helvetica\" charset = \"UTF-8\" label = \"Finite Automaton\" labelloc = \"t\" labeljust = \"c\" " + "bgcolor = \"#FFFAF4\" fontcolor = black fontsize = 18 style = \"filled\" rankdir = LR margin = 0.2 " + "splines = spline nodesep = 0.9 ranksep = 1.2 ]\n node [ style = \"solid,filled\" fontsize = 15 " + "fontcolor = black fontname = \"Helvetica\" color = black fillcolor = white margin = \"0.2,0.2\" shape=circle " + "]\n edge [ style = solid fontsize = 16 fontcolor = black fontname = \"Helvetica\" color = black " + "labelfloat = false labeldistance = 2.5 labelangle = 70 arrowhead = normal ]\n" + "start_state [label = \"start\\nfrom\\nhere\" shape=none style=\"\" ]\n"); + + size_t n = fa.all.size(); + std::vector breakdown; + breakdown.resize(n); + breakdown[fa.start->nodeId].actual_refcount++; + for (size_t i = 0; i < n; i++){ + assert(fa.all[i]->nodeId == static_cast(i)); + for (FA_Node** nxtN: fa.all[i]->get_all_transitions()) + if ((*nxtN) != NULL) + breakdown[(**nxtN).nodeId].actual_refcount++; + } + for (size_t i = 0; i < n; i++){ + if (fa.all[i]->refs != breakdown[i].actual_refcount){ + breakdown[i].refcount_problem = true; + fprintf(stderr, "Corrupted FA: wrong refcount on node %lu\n", fa.all[i]->nodeId); + } + } + for (size_t i = 0; i < n; i++){ + fprintf(fd, "%lu [label=\"%s\" %s]\n", i, get_node_lable(fa.all[i], breakdown[i]).c_str(), + get_applied_node_attributes(fa.all[i], breakdown[i]).c_str()); + } + + /* Two Infoboxes */ + + auto stringifyTrackingVarType = [](tracking_var_type type) -> std::string { + switch (type) { + case tracking_var_types::range: + return "range"; + case tracking_var_types::dot_cur_pos: + return "dot of cur pos"; + 
default: + return "dot of immediate"; + } + }; + + std::string infoText; + for (auto& p: ktr.track_names){ + const SubtrackingNameInfo& tu = ktr.retrieval_info[p.second]; + + auto getRole = [](bool presence, tracking_var_type type, int first, int second, + const std::string& ARR_NAME) -> std::string { + if (!presence) { + assert(first == -1 && second == -1); + return "Not involved in " + ARR_NAME; + } + if (type == tracking_var_types::range){ + assert(first != -1 && second != -1); + return "In " + ARR_NAME + ": " + std::to_string(first) + " <−> " + std::to_string(second); + } + assert(first != -1 && second == -1); + return "In " + ARR_NAME + ": ( " + std::to_string(first) + " )"; + }; + char buf[2048] = {0}; + snprintf(buf, 2048, "Tracking unit name: %s\\n" "Discovered: %s\\n" "Type: %s\\n" "%s\\n%s", + p.first.c_str(), tu.discovered ? "ofcourse" : "no", + stringifyTrackingVarType(tu.type).c_str(), + getRole(tu.stored_in_ca, tu.type, tu.colarr_first, tu.colarr_second, "colarr").c_str(), + getRole(tu.stored_in_sa, tu.type, tu.selarr_first, tu.selarr_second, "selarr").c_str()); + if (!infoText.empty()) + infoText += "|"; + infoText += buf; + } + fprintf(fd, "infoBoard1 [label=\"%s\" shape = record]\n", infoText.c_str()); + infoText = ""; + for (size_t i = 0; i < priority_table.size(); i++){ + const RegexPriorityTableAction& tu = priority_table[i]; + if (!infoText.empty()) + infoText += "|"; + infoText += tu.minimize ? 
"Minimize " : "Maximize "; + if (tu.pos.isForRange()){ + infoText += "[" + std::to_string(tu.pos.second) + "] - [" + std::to_string(tu.pos.first) + "]"; + } else { + infoText += "[" + std::to_string(tu.pos.first) + "]"; + } + } + fprintf(fd, "infoBoard2 [label=\"%s\" shape = record]\n", infoText.c_str()); + + assert(fa.start); + fprintf(fd, "start_state->%lu [color=gray style=dotted]\n", fa.start->nodeId); + + + for (FA_Node* node: fa.all){ + NodesProblems& bd = breakdown[node->nodeId]; + if (node->type == one_char_read){ + FA_NodeOfOneCharRead* cn = dynamic_cast(node); + std::string str = stringify_codeset(cn->filter); + print_edge(node, cn->nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""), fd, bd); + } else if (node->type == forking){ + FA_NodeOfForking* cn = dynamic_cast(node); + for (FA_Node* nxt: cn->nxt_options){ + print_edge(node, nxt, "", fd, bd); + } + } else if (node->type == look_one_behind){ + FA_NodeOfLookOneBehind* cn = dynamic_cast(node); + print_edge(node, cn->nxt_node, stringify_codeset(cn->filter), fd, bd); + } else if (node->type == look_one_ahead){ + FA_NodeOfLookOneAhead* cn = dynamic_cast(node); + print_edge(node, cn->nxt_node, stringify_codeset(cn->restriction), fd, bd); + } else if (node->type == track_array_mov_imm){ + FA_NodeOfTrackArrayMovImm* cn = dynamic_cast(node); + char buf[1024]; + if (!isImmMovOpcode(cn->operation)) + fprintf(stderr, "bad operation in node %lu\n", node->nodeId); + snprintf(buf, 1024, "%s %hu %lu", + regex024_opcode_tostr(cn->operation), cn->key, cn->imm_value); + print_edge(node, cn->nxt_node,std::string(buf), fd, bd); + } else if (node->type == track_array_mov_halfinvariant){ + FA_NodeOfTrackArrayMovHalfinvariant* cn = dynamic_cast(node); + char buf[1024]; + if (!isCurPosMovOpcode(cn->operation)) + fprintf(stderr, "bad operation in node %lu\n", node->nodeId); + snprintf(buf, 1024, "%s %hu", + regex024_opcode_tostr(cn->operation), cn->key); + print_edge(node, cn->nxt_node,std::string(buf), fd, bd); + 
} else if (node->type == det_char_crossroads){ + FA_NodeOfDetCharCrossroads* cn = dynamic_cast(node); + for (const auto& transition: cn->crossroads){ + std::string str = stringify_codeset(transition.input); + print_edge(node, transition.nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""), + fd, bd); + } + } + } + fprintf(fd, "}\n"); +} + +/* Opens apath for writing and returns the stream. Mode "w" already creates the file or truncates it to zero + * length, so no separate ftruncate()/reopen step is needed (the old reopen also leaked the first stream and + * dereferenced a NULL stream when fopen failed). On failure the reason is printed with perror() and NULL + * is returned. */ +FILE* get_fd(const char* apath){ + errno = 0; + FILE *fd = fopen(apath, "w"); + if (!fd) + perror("fopen w"); + return fd; +} + +/* Writes the automaton as Graphviz source into FAGraph.gv, renders it into FAGraph.png with the dot command and + * shows the picture with sxiv. Both temporary files are unlinked after the viewer exits. */ +void show_fa_with_sxiv_after_dot(const FA_Container& fa, const KnownTrackingTools& ktr, + const RegexPriorityTable& priority_table) { + const char* temp_gv = "FAGraph.gv"; + const char* temp_png = "FAGraph.png"; + int temp_descriptor = open(temp_gv, O_CLOEXEC | O_APPEND | O_CREAT | O_WRONLY, S_IRWXU | S_IRWXG); + assert(temp_descriptor >= 0); + /* Nothing is written through this descriptor (get_fd() opens its own stream), so close it instead of leaking it */ + close(temp_descriptor); + assert(fa.start); + FILE* fd = get_fd(temp_gv); + /* get_fd() has already reported the reason; there is nothing to render without an output stream */ + if (!fd) + return; + print_fa(fa, fd, ktr, priority_table); + fclose(fd); + char cmdBuf[1024]; + // todo: get rid of temporary dot file and shell usage + snprintf(cmdBuf, 1024, "dot %s -Tpng >%s", temp_gv, temp_png); + int chw = system(cmdBuf); + assert(WIFEXITED(chw)); + assert(WEXITSTATUS(chw) == 0); + snprintf(cmdBuf, 1024, "sxiv %s", temp_png); + chw = system(cmdBuf); + assert(WIFEXITED(chw)); + assert(WEXITSTATUS(chw) == 0); + assert(chw >= 0); + unlink(temp_gv); + unlink(temp_png); +} diff --git a/src/debugging_regexis024/debug_through_graphviz.h b/src/debugging_regexis024/debug_through_graphviz.h new file mode 100644 index 0000000..e341248 --- /dev/null +++ b/src/debugging_regexis024/debug_through_graphviz.h @@ -0,0 +1,12 @@ +#ifndef DEBUGGING_REGEXIS024_DEBUG_THROUGH_GRAPHVIZ_H +#define DEBUGGING_REGEXIS024_DEBUG_THROUGH_GRAPHVIZ_H + +#include +#include +#include + +/* Uses temporary file FAGraph.gv,png, dot command and sxiv */ +void show_fa_with_sxiv_after_dot(const FA_Container& fa, const 
KnownTrackingTools& ktr, + const RegexPriorityTable& priority_table); + +#endif diff --git a/src/debugging_regexis024/prettyprint/prettyprint_util.cpp b/src/debugging_regexis024/prettyprint/prettyprint_util.cpp new file mode 100644 index 0000000..878ff9e --- /dev/null +++ b/src/debugging_regexis024/prettyprint/prettyprint_util.cpp @@ -0,0 +1,89 @@ +#include +#include +#include + +TreeWithStringsNode::TreeWithStringsNode(const std::string &val): val(val) { +} + +static const char* ch_empty = " "; +static const char* ch_passing_by = "\u2502 "; +static const char* ch_connect_right_and_forward = "\u251c\u2500\u2500\u2500"; +static const char* ch_connect_right_last = "\u2514\u2500\u2500\u2500"; + +static const char* ch_box_left_side = "\u2551"; +static const char* ch_box_right_side = "\u2551"; +static const char* ch_box_top_side = "\u2550"; +static const char* ch_box_bottom_side = "\u2550"; +static const char* ch_box_crn_top_left = "\u2554"; +static const char* ch_box_crn_top_right = "\u2557"; +static const char* ch_box_crn_bottom_left = "\u255A"; +static const char* ch_box_crn_bottom_right = "\u255D"; + +size_t length_of_line(const std::string& str) { + size_t ch = 0; + size_t pos = 0; + while (pos < str.size()) { + int32_t code; + size_t adj; + utf8_string_iterat(code, adj, pos, reinterpret_cast(str.data()), str.size()); + if (code < 0) + return ch; + ch++; + pos += adj; + } + return ch; +} + +/* Warning: recursion used */ +void toLines_dfs(const TreeWithStringsNode& node, lines& out, std::vector& prefix) { + out.push_back(""); + size_t n = prefix.size(); + for (size_t i = 0; i < n; i++) { + if (i + 1 < n) { + out.back() += prefix[i] ? ch_passing_by : ch_empty; + } else { + out.back() += prefix[i] ? 
ch_connect_right_and_forward : ch_connect_right_last; + } + } + out.back() += node.val; + prefix.push_back(true); + size_t m = node.childeren.size(); + for (size_t i = 0; i < m; i++) { + if (i + 1 == m) + prefix[n] = false; + toLines_dfs(node.childeren[i], out, prefix); + } + prefix.pop_back(); +} + +void TreeWithStringsNode::toLines(lines &out) const { + std::vector prefix; + toLines_dfs(*this, out, prefix); +} + +std::string strMul(size_t n, const char* str) { + std::string res; + for (size_t i = 0; i < n; i++) + res += str; + return res; +} + +lines wrapWithBox(const lines &in) { + lines out; + size_t max_width = 0; + for (auto& l: in) + max_width = std::max(max_width, length_of_line(l)); + out.push_back(ch_box_crn_top_left + strMul(max_width, ch_box_top_side) + ch_box_crn_top_right); + for (auto& line: in) { + size_t s = length_of_line(line); + out.push_back(ch_box_left_side + line + strMul(max_width - s, " ") + ch_box_right_side); + } + out.push_back(ch_box_crn_bottom_left + strMul(max_width, ch_box_bottom_side) + ch_box_crn_bottom_right); + return out; +} + +void printLines(const lines &in) { + for (auto& l: in) + printf("%s\n", l.c_str()); +} + diff --git a/src/debugging_regexis024/prettyprint/prettyprint_util.h b/src/debugging_regexis024/prettyprint/prettyprint_util.h new file mode 100644 index 0000000..72ecb94 --- /dev/null +++ b/src/debugging_regexis024/prettyprint/prettyprint_util.h @@ -0,0 +1,25 @@ +#ifndef DEBUGGING_REGEXIS024_PRETTYPRINT_UTIL_H +#define DEBUGGING_REGEXIS024_PRETTYPRINT_UTIL_H + +/* Used for debug. 
Do not give to user */ + +#include +#include + +typedef std::vector lines; + +struct TreeWithStringsNode { + std::string val; + std::vector childeren; + + explicit TreeWithStringsNode(const std::string &val); + TreeWithStringsNode() = default; + + void toLines(lines& out) const; +}; + +lines wrapWithBox(const lines& in); + +void printLines(const lines& in); + +#endif diff --git a/src/debugging_regexis024/vm/libregexis024vm_debug.cpp b/src/debugging_regexis024/vm/libregexis024vm_debug.cpp new file mode 100644 index 0000000..068d014 --- /dev/null +++ b/src/debugging_regexis024/vm/libregexis024vm_debug.cpp @@ -0,0 +1,58 @@ +#include +#include +#include + +std::string thread_to_str(const REGEX_IS024_Thread& thread){ + if (!(thread.slot_occupation_status & SLOT_OCCUPIED)) + return "{ unoccupied }"; + char buf[1024]; + snprintf(buf, 1024, "{ IP = %lu }", thread.IP); + return buf; +} + +std::string stack_to_str(const REGEX_IS024_Stack& stack){ + std::string res = "{ "; + for (uint32_t i = 0; i < stack.sz; i++){ + if (i != 0) + res += ", "; + res += std::to_string(stack.slots[i]); + } + res += " }"; + return res; +} + +std::string slots_to_str(const REGEX_IS024_CONTEXT& ctx){ + if (!ctx.initialized) + return "uninitialized"; + std::string READ_slots; + for (size_t i = 0; i < ctx.read_slots_number; i++){ + uint8_t stat = ctx.READ_halted_slots[i].slot_occupation_status; + READ_slots += (stat & SLOT_OCCUPIED) ? ((stat & SLOT_NEW) ? "N" : "O") : "x"; + } + std::string FORK_slots; + for (size_t i = 0; i < ctx.fork_slots_number; i++){ + uint8_t stat = ctx.FORK_halted_slots[i].slot_occupation_status; + FORK_slots += (stat & SLOT_OCCUPIED) ? 
"O" : "x"; + } + char buf[4096]; + snprintf(buf, 4096, "READ_slots: %s ; FORK_slots: %s ; READ_stack_new_main: %s ; " + "READ_stack_new_second: %s ; READ_stack_old: %s ; FORK_stack: %s", + READ_slots.c_str(), FORK_slots.c_str(), stack_to_str(ctx.READ_halted_stack_new_first).c_str(), + stack_to_str(ctx.READ_halted_stack_new_second).c_str(), + stack_to_str(ctx.READ_halted_stack_old).c_str(), stack_to_str(ctx.FORK_halted_stack).c_str()); + return buf; +} + +void debug_print_context(const REGEX_IS024_CONTEXT& ctx, const char* place) { + printf("== DEBUG `%s` ==\n", place); + + printf("Active thread: %s, sifting_with: %s, match: %s\n%s\n", + thread_to_str(ctx.active_thread).c_str(), + ctx.sifting_with ? thread_to_str(*ctx.sifting_with).c_str() : "NO", thread_to_str(ctx.matched_thread).c_str(), + slots_to_str(ctx).c_str()); +} + +void debug_print_thread(const REGEX_IS024_Thread& thr, const char *place) { + printf("== DEBUG `%s` ==\n", place); + printf("This thread: %s\n", thread_to_str(thr).c_str()); +} diff --git a/src/debugging_regexis024/vm/libregexis024vm_debug.h b/src/debugging_regexis024/vm/libregexis024vm_debug.h new file mode 100644 index 0000000..9bf2265 --- /dev/null +++ b/src/debugging_regexis024/vm/libregexis024vm_debug.h @@ -0,0 +1,11 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024VM_DEBUG_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024VM_DEBUG_H + +#include +#include + +void debug_print_context(const REGEX_IS024_CONTEXT& ctx, const char* place); + +void debug_print_thread(const REGEX_IS024_Thread& thr, const char *place); + +#endif diff --git a/src/libregexis024fa/codeset.cpp b/src/libregexis024fa/codeset.cpp new file mode 100644 index 0000000..a70c831 --- /dev/null +++ b/src/libregexis024fa/codeset.cpp @@ -0,0 +1,120 @@ +#include +#include + +codeset_t invert_set(const codeset_t &X) { + if (X.empty()) + return {{0, UINT32_MAX}}; + codeset_t res; + if (X[0].first != 0) + res.emplace_back(0, X[0].first - 1); + for (size_t i = 0; i + 1 < X.size(); i++){ + 
res.emplace_back(X[i].second + 1, X[i + 1].first - 1); + } + if (X.back().second != UINT32_MAX) + res.emplace_back(X.back().second + 1, UINT32_MAX); + return res; +} + +#define elA (A[i]) +#define elB (B[j]) +#define Ainc i++ +#define Binc j++ +#define prepare size_t An = A.size(); size_t Bn = B.size(); size_t i = 0; size_t j = 0; +#define Aended (i == An) +#define Bended (j == Bn) + +codeset_t merge_sets(const codeset_t &A, const codeset_t &B) { + codeset_t res; + prepare + std::pair cur; + while (true){ + if (Aended && Bended) + break; + if (i == An){ + cur = elB; + Binc; + } else if (j == Bn){ + cur = elA; + Ainc; + } else { + if (elA.first < elB.first) { + cur = elA; + Ainc; + } else { + cur = elB; + Binc; + } + } + while (true){ + if (Aended && Bended){ + res.push_back(cur); + break; + } + if (i < An && (cur.second == UINT32_MAX || elA.first <= cur.second + 1)){ + cur.second = std::max(elA.second, cur.second); + Ainc; + } else if (j < Bn && (cur.second == UINT32_MAX || elB.first <= cur.second + 1)){ + cur.second = std::max(elB.second, cur.second); + Binc; + } else { + res.push_back(cur); + break; + } + } + } + return res; +} + +codeset_t intersect_sets(const codeset_t &A, const codeset_t &B) { + codeset_t res; + prepare + while (true){ + if (Aended || Bended) + break; + if (elB.first <= elA.first && elA.first <= elB.second) + res.emplace_back(elA.first, std::min(elA.second, elB.second)); + else if (elA.first <= elB.first && elB.first <= elA.second) + res.emplace_back(elB.first, std::min(elA.second, elB.second)); + + if (elA.second <= elB.second) + Ainc; + else + Binc; + } + return res; +} + +codeset_t subtract_sets(const codeset_t &A, const codeset_t &B) { + return intersect_sets(A, invert_set(B)); +} + +bool is_inside(uint32_t start, uint32_t end, codeset_t &X) { + for (auto& p: X){ + if (p.first <= start && end <= p.second) + return true; + assert(end < p.first || p.second < start); + } + return false; +} + +codeset_t set_add_char(const codeset_t& X, 
uint32_t cp) { + return merge_sets(X, {{cp, cp}}); +} + +codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end) { + return merge_sets(X, {{start, end}}); +} + +codeset_t codeset_of_one_char(uint32_t ch) { + return codeset_t({{ch, ch}}); +} + +std::string stringifyCodesetBase10(const codeset_t& CS) { + std::string cs; + for (auto p: CS) { + if (!cs.empty()) + cs += "; "; + cs += std::to_string(p.first) + "-" + std::to_string(p.second); + } + return cs; +} diff --git a/src/libregexis024fa/codeset.h b/src/libregexis024fa/codeset.h new file mode 100644 index 0000000..b936589 --- /dev/null +++ b/src/libregexis024fa/codeset.h @@ -0,0 +1,27 @@ +#ifndef LIBREGEXIS024_CODESET_H +#define LIBREGEXIS024_CODESET_H + +#include +#include +#include +#include + +typedef std::vector> codeset_t; + +codeset_t invert_set(const codeset_t& X); +codeset_t merge_sets(const codeset_t& A, const codeset_t& B); +codeset_t intersect_sets(const codeset_t& A, const codeset_t& B); +codeset_t subtract_sets(const codeset_t& A, const codeset_t& B); + +/* Aborts if segment in question hit the edge (unsafe function) */ +bool is_inside(uint32_t start, uint32_t end, codeset_t& X); + +codeset_t set_add_char(const codeset_t& X, uint32_t cp); +codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end); + +codeset_t codeset_of_one_char(uint32_t ch); +#define codeset_of_all codeset_t({{0, UINT32_MAX}}) + +std::string stringifyCodesetBase10(const codeset_t& CS); + +#endif //LIBREGEXIS024_CODESET_H \ No newline at end of file diff --git a/src/libregexis024fa/colored_codeset.cpp b/src/libregexis024fa/colored_codeset.cpp new file mode 100644 index 0000000..267e7a8 --- /dev/null +++ b/src/libregexis024fa/colored_codeset.cpp @@ -0,0 +1,183 @@ +#include + +#include + +ColoredCodesetSegment::ColoredCodesetSegment(uint32_t color, uint32_t right_code): color(color), right_code(right_code) {} + +ColoredCodesetSegmentList::ColoredCodesetSegmentList() { + first = new 
ColoredCodesetSegment(0, UINT32_MAX); +} + +void ColoredCodesetSegmentList::replace_myself(const ColoredCodesetSegmentList &other) { + assert(other.first); + ColoredCodesetSegment** in_cur = &first; + ColoredCodesetSegment* in_other = other.first; + while (in_other) { + *in_cur = new ColoredCodesetSegment(*in_other); + in_cur = &((**in_cur).next); + in_other = in_other->next; + } +} + +ColoredCodesetSegmentList::ColoredCodesetSegmentList(const ColoredCodesetSegmentList &other) { + replace_myself(other); +} + +/* Deletes every segment of the list. first is reset before the walk so that the list never keeps a pointer + * to a deleted segment. */ +void ColoredCodesetSegmentList::free_myself() { + ColoredCodesetSegment* cur = first; + first = NULL; + while (cur) { + ColoredCodesetSegment* nxt = cur->next; + delete cur; + cur = nxt; + } +} + +ColoredCodesetSegmentList::~ColoredCodesetSegmentList() { + free_myself(); +} + +/* Copy assignment: replaces the segments of this list with a deep copy of the segments of other. */ +ColoredCodesetSegmentList& ColoredCodesetSegmentList::operator=(const ColoredCodesetSegmentList &other) { + /* Self-assignment guard: freeing our own segments first would make replace_myself() copy from deleted nodes */ + if (this != &other) { + free_myself(); + replace_myself(other); + } + return *this; +} + +ColoredCodeset::ColoredCodeset(uint64_t dummy_n): DummyN(dummy_n) { + requests = {{}}; +} + +void ColoredCodeset::split_phase(const codeset_t &X) { + + uint32_t cA = 0; + ColoredCodesetSegment* cur_seg = list.first; + + uint32_t pi = 0; + + auto advance_old = [&]()->void{ + cA = cur_seg->right_code + 1; + cur_seg = cur_seg->next; + }; + + /* How to use: splits are made from left to right. After each split cur_seg + * points to the rightest among sub-segments of cur_segment. 
*/ + auto SPLIT = [&](uint32_t code_before_split)->void { + assert(code_before_split < cur_seg->right_code); + ColoredCodesetSegment* new_next = new ColoredCodesetSegment(cur_seg->color, cur_seg->right_code); + new_next->divisor_on_left = true; + cur_seg->right_code = code_before_split; + new_next->next = cur_seg->next; + cur_seg->next = new_next; + advance_old(); + }; + + while (cur_seg && pi < X.size()) { + uint32_t cB = cur_seg->right_code; + uint32_t L = X[pi].first, R = X[pi].second; + + if (L < cA) { + if (R != UINT32_MAX && R + 1 < cA) { + pi++; + } else if (R != UINT32_MAX && R + 1 == cA) { + cur_seg->divisor_on_left = true; + pi++; + } else if (R < cB) { + SPLIT(R); + pi++; + } else { + advance_old(); + } + } else if (L == cA) { + cur_seg->divisor_on_left = true; + if (R < cB) { + SPLIT(R); + pi++; + } else { + advance_old(); + } + } else if (L <= cB) { + SPLIT(L - 1); + if (R < cB) { + SPLIT(R); + pi++; + } else { + advance_old(); + } + } else { + advance_old(); + } + } +} + +void ColoredCodeset::apply_divisor(const codeset_t &X) { + split_phase(X); + size_t X_id = nxt_request_id++; + size_t m = requests.size(); + size_t bm = m; + std::vector skipped(bm, false); + std::vector overlapped(bm, false); + { + bool inside = false; + ColoredCodesetSegment* cur = list.first; + while (cur) { + inside = (inside != cur->divisor_on_left); + if (inside) { + overlapped[cur->color] = true; + } else { + skipped[cur->color] = true; + } + cur = cur->next; + } + } + std::vector alt_color(bm, 0); + for (size_t i = 0; i < bm; i++) { + if (skipped[i] && overlapped[i]) { + alt_color[i] = m++; + requests.push_back(requests[i]); + if (X_id >= DummyN) + requests.back().push_back(X_id - DummyN); + } else if (overlapped[i]) { + if (X_id >= DummyN) + requests[i].push_back(X_id - DummyN); + } else + assert(skipped[i]); + } + { + bool inside = false; + ColoredCodesetSegment* cur = list.first; + while (cur) { + inside = (inside != cur->divisor_on_left); + cur->divisor_on_left = false; + 
uint32_t c = cur->color; + if (inside && skipped[c] && overlapped[c]) { + cur->color = alt_color[c]; + } + cur = cur->next; + } + } +} + +void ColoredCodeset::get_splits_of_non_dummy(std::vector &res_input, + std::vector> &res_color_to_requests) { + size_t n = requests.size(); + std::vector nonclean_to_clean(n, -1); + res_color_to_requests = {}; + + for (size_t i = 0; i < n; i++) { + if (!requests[i].empty()) { + nonclean_to_clean[i] = res_color_to_requests.size(); + res_color_to_requests.push_back(requests[i]); + } + } + + ColoredCodesetSegment* cur = list.first; + uint32_t L = 0; + res_input.assign(res_color_to_requests.size(), {}); + while (cur) { + size_t Sc = cur->color; + if (nonclean_to_clean[Sc] >= 0) { + res_input[nonclean_to_clean[Sc]].emplace_back(L, cur->right_code); + } + L = cur->right_code + 1; + cur = cur->next; + } +} diff --git a/src/libregexis024fa/colored_codeset.h b/src/libregexis024fa/colored_codeset.h new file mode 100644 index 0000000..9adec03 --- /dev/null +++ b/src/libregexis024fa/colored_codeset.h @@ -0,0 +1,66 @@ +#ifndef LIBREGEXIS024_COLORED_CODESET_H +#define LIBREGEXIS024_COLORED_CODESET_H + +#include +#include +#include + +#include + +/* Used for determinizer. Nowhere else */ + +struct ColoredCodesetSegment { + uint32_t color; + uint32_t right_code; + ColoredCodesetSegment* next = NULL; + + /* Temporary varaible (used by apply_divisor() method) */ + bool divisor_on_left = false; + + ColoredCodesetSegment(uint32_t color, uint32_t right_code); +}; + +/* Warning!!! This stupid class is OOM-unsafe!!! 
+ * This is not an issue as far as you don't show any of it's instance to the user of libregexis024 */ +struct ColoredCodesetSegmentList { + ColoredCodesetSegment* first = NULL; + + ColoredCodesetSegmentList(); + + + void replace_myself(const ColoredCodesetSegmentList& other); + + ColoredCodesetSegmentList(const ColoredCodesetSegmentList& other); + + /* Use only internally */ + void free_myself(); + + ~ColoredCodesetSegmentList(); + + ColoredCodesetSegmentList& operator=(const ColoredCodesetSegmentList& other); +}; + +/* Highly unoptimized algorithm on this data structure O(C^2) time*/ +class ColoredCodeset { + ColoredCodesetSegmentList list; + /* Size of this vector is equal to the number of colors */ + std::vector> requests; + uint64_t DummyN; + size_t nxt_request_id = 0; + + void split_phase(const codeset_t& X); +public: + /* First dummy_n split requests will be viewed as 'dummy requests', when complete map of splits is requested, + * colors that are registed indide only dummy requests won't be returned. 
*/ + ColoredCodeset(uint64_t dummy_n); + + /* O(C, which is bad, but my library's compiler is already slow by itself, so who cares) */ + void apply_divisor(const codeset_t& X); + + /* Returned 'requests' mapping will feature request ids with DummyN subtracted from them */ + void get_splits_of_non_dummy(std::vector& res_input, + std::vector>& res_color_to_requests); +}; + + +#endif diff --git a/src/libregexis024fa/fa_first_stage_fix.cpp b/src/libregexis024fa/fa_first_stage_fix.cpp new file mode 100644 index 0000000..a37ee8b --- /dev/null +++ b/src/libregexis024fa/fa_first_stage_fix.cpp @@ -0,0 +1,191 @@ +#include +#include +#include +#include + +// #ifdef LIBREGEXIS024_DEBUG +// #include +// #endif + +/* For every look_one_ahead node of sourceFa: collects the nodes reachable from its nxt_node through empty transitions, copies the ones that are also referenced from outside of that set, and applies the look-ahead restriction to the collected nodes. Afterwards resultFa is filled with copies of the nodes, skipping look_one_ahead nodes. sourceFa is modified in the process. */ REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa) { + assert(sourceFa.start); + REGEX_IS024_FA_FirstStageFixInfo info; + + for (size_t I_scans = 0; I_scans < sourceFa.all.size(); I_scans++){ + FA_Node* beg = sourceFa.all[I_scans]; + if (beg->type != look_one_ahead) + continue; + FA_NodeOfLookOneAhead& loa = (*(FA_NodeOfLookOneAhead*)beg); + codeset_t& restriction = loa.restriction; + assert(loa.nxt_node); + + /* Bookkeeping for one node found by the search below */ struct Marked{ + FA_Node* node; + /* Number of references to the node that come from inside the searched set */ size_t refs_from_my = 1; + bool making_copy = false; + FA_Node* copy = NULL; + + explicit Marked(FA_Node *node) : node(node) {} + }; + + std::vector searched; + searched.emplace_back(loa.nxt_node); + /* NOTE(review): beg is the look-ahead node itself (loa refers to *beg), while searched[0] holds loa.nxt_node, whose search_mark is left at its previous value here. Only the nodes stored in `searched` get their mark reset at the end of this iteration, so beg keeps mark 0. Presumably loa.nxt_node->search_mark = 0 was intended - TODO confirm. */ beg->search_mark = 0; + + /* Breadth-first walk over empty transitions; search_mark of a visited node is its index in `searched` */ for (size_t done = 0; done < searched.size(); done++){ + FA_Node& cur = *searched[done].node; + for (FA_Node** nxtN : cur.get_all_empty_valid_transitions()){ + if ((**nxtN).search_mark == -1){ + assert((**nxtN).nodeId != loa.nodeId); + (**nxtN).search_mark = (int64_t)searched.size(); + searched.emplace_back(*nxtN); + } else { + searched[(**nxtN).search_mark].refs_from_my++; + } + } + } + std::vector s2s; + /* A node with more references than the searched set accounts for is also used from elsewhere, so it has to be copied */ for (auto& v_sete: searched){ + if (v_sete.refs_from_my < v_sete.node->refs){ + v_sete.making_copy = true; + s2s.push_back(v_sete.node); + } + } + while
(!s2s.empty()){ + FA_Node& m = *s2s.back(); s2s.pop_back(); + assert(searched[m.search_mark].making_copy); + /* Beacuse of this operation source Fa is not read-only. It becomes useless after renerating resultFa */ + searched[m.search_mark].copy = copy_fa_node(m, sourceFa); + + for (FA_Node** nxtN: m.get_all_empty_valid_transitions()){ + Marked& nxtNaux = searched[(**nxtN).search_mark]; + if (!nxtNaux.making_copy){ + nxtNaux.making_copy = true; + s2s.push_back(*nxtN); + } + } + } + + for (auto& v_sete : searched){ + FA_Node* my = v_sete.making_copy ? v_sete.copy : v_sete.node; + for (FA_Node** nxtN: my->get_all_empty_valid_transitions()){ + Marked& nxtNaux = searched[(**nxtN).search_mark]; + if (nxtNaux.making_copy) + reattach_fa_node_edge(nxtN, nxtNaux.copy); + } + my->apply_lookahead_restriction(restriction); + if (my->type == match) + info.fed_chars_extend_one_right = true; + } + + for (auto& v_sete: searched) + v_sete.node->search_mark = -1; + } + + // show_fa_with_sxiv_after_dot(sourceFa, {{}, {}}, {}); + + { + /* Now it's time to fill resultFa. 
Skipping all look one ahead's */ + auto skip_useless = [&](FA_Node* v) -> FA_Node* { + while (v->type == look_one_ahead){ + v = ((FA_NodeOfLookOneAhead*)v)->nxt_node; + } + return v; + }; + + resultFa.start = sourceFa.start; + std::vector homework = {&(resultFa.start)}; + std::vector sourceIdToResNode(sourceFa.all.size(), NULL); + + while (!homework.empty()) { + FA_Node** vPtr = homework.back(); homework.pop_back(); + FA_Node* right_source_v = skip_useless(*vPtr); + size_t vid = right_source_v->nodeId; + if (!sourceIdToResNode[vid]) { + sourceIdToResNode[vid] = copy_fa_node_to_another_fa(*right_source_v, resultFa); + for (FA_Node** uuPtr: sourceIdToResNode[vid]->get_all_transitions()) + homework.push_back(uuPtr); + } + *vPtr = sourceIdToResNode[vid]; + sourceIdToResNode[vid]->refs++; + } + } + + + { + /* Guessing info.fed_chars_extend_one_left */ + size_t done = 0; + std::vector searched; + searched.push_back(resultFa.start); + resultFa.start->search_mark = 0; + while (done < searched.size()){ + if (searched[done]->type == look_one_behind){ + info.fed_chars_extend_one_left = true; + break; + } + for (FA_Node** nxtN: searched[done]->get_all_empty_valid_transitions()){ + if ((**nxtN).search_mark < 0){ + (**nxtN).search_mark = 0; + searched.push_back(*nxtN); + } + } + done++; + } + for (FA_Node* d: searched) + d->search_mark = -1; + } + return info; +} + +FA_NodeOfOneCharRead* generate_alt_ending(const codeset_t& restriction, FA_Container& fa){ + FA_NodeOfOneCharRead* n1 = fa.makeOneCharRead(restriction, true); + FA_NodeOfMatch* n2 = fa.makeMatch(); + n2->ext_filter_added = true; // Won't actually be used + reattach_fa_node_edge(&(n1->nxt_node), n2); + return n1; +} + +void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa, + const REGEX_IS024_FA_FirstStageFixInfo &info1) +{ + assert(resultFa.all.empty() && !resultFa.start); + if (!sourceFa.start) + return; + // todo: rewrite first stage using that cool technique I just invented + 
resultFa.start = sourceFa.start; + // A vector of pointers in resutFa to nodes that belong to sourceFa. They should undergo a little bit of copying. + std::vector homework = {&(resultFa.start)}; + // source node id s index. Element is NULL if no copy (in resultFa) exists and resFa node if copying was performed + std::vector sourceIdToResNode(sourceFa.all.size(), NULL); + while (!homework.empty()) { + FA_Node** vPtr = homework.back(); homework.pop_back(); + FA_Node* sourceV = *vPtr; assert(sourceV); + size_t sourceVId = sourceV->nodeId; + if (!sourceIdToResNode[sourceVId]) { + if (sourceV->type == match) { + FA_NodeOfMatch& mn = dynamic_cast(*sourceV); + FA_NodeOfMatch* res_mn = resultFa.makeMatch(); + if (mn.ext_filter_added && mn.pending_filter != codeset_of_all) { + assert(info1.fed_chars_extend_one_right); + FA_NodeOfOneCharRead* res_ocr2n = resultFa.makeOneCharRead(mn.pending_filter, true); + reattach_nxt_node(res_ocr2n, res_mn); + sourceIdToResNode[sourceVId] = res_ocr2n; + } else { + sourceIdToResNode[sourceVId] = res_mn; + } + } else { + sourceIdToResNode[sourceVId] = copy_fa_node_to_another_fa(*sourceV, resultFa); + /* O_o */ + for (FA_Node** uuPtr: sourceIdToResNode[sourceVId]->get_all_transitions()) + homework.push_back(uuPtr); + } + } + *vPtr = sourceIdToResNode[sourceVId]; + sourceIdToResNode[sourceVId]->refs++; + } + + if (info1.fed_chars_extend_one_left) { + FA_NodeOfOneCharRead* ns = resultFa.makeOneCharRead(codeset_of_all, true); + yay_new_start(resultFa, ns); + } +} diff --git a/src/libregexis024fa/fa_first_stage_fix.h b/src/libregexis024fa/fa_first_stage_fix.h new file mode 100644 index 0000000..0be0a4d --- /dev/null +++ b/src/libregexis024fa/fa_first_stage_fix.h @@ -0,0 +1,18 @@ +#ifndef LIBREGEXIS024_FA_FIRST_STAGE_FIX_H +#define LIBREGEXIS024_FA_FIRST_STAGE_FIX_H + +#include "finite_automaton.h" + +struct REGEX_IS024_FA_FirstStageFixInfo{ + bool fed_chars_extend_one_left = false; + bool fed_chars_extend_one_right = false; +}; + +/* Will look 
for look_one_ahead nodes and apply their filter to reading filters ahead * + * sourceFa will be ruined. The output will be in resultFa */ +REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa); + +void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa, + const REGEX_IS024_FA_FirstStageFixInfo &info1); + +#endif //LIBREGEXIS024_FA_FIRST_STAGE_FIX_H diff --git a/src/libregexis024fa/fa_make_deterministic.cpp b/src/libregexis024fa/fa_make_deterministic.cpp new file mode 100644 index 0000000..9f86c32 --- /dev/null +++ b/src/libregexis024fa/fa_make_deterministic.cpp @@ -0,0 +1,665 @@ +#include +#include +#include /* to get exitf */ +#include +#include +#include +#include +#include +#include +#include + +#if defined(LIBREGEXIS024_DEBUG) && defined(LIBREGEXIS024_ALLOW_LOUD) +#include +#include +#include +#include +#define PR_DEB +#endif + +/* debug nonsence */ +void input_fa_assert(const FA_Container& fa){ + assert(fa.start); + for (FA_Node* node: fa.all){ + if (node->type == one_char_read){ + assert(!dynamic_cast(node)->second_ns); + } else if (node->type == look_one_ahead || + node->type == det_char_crossroads){ + exitf("not allowed at this stage\n"); + } + } +} + +struct OperHistoryNodeTransition { + TrackingOperationInFa op; + size_t u; + + OperHistoryNodeTransition(const TrackingOperationInFa &op, size_t u): op(op), u(u) {} +}; + +struct OperHistoryNode { + std::vector next; + /* When it is part of clean history, this */ + std::vector compressed_selarr; + std::vector raisin; + + OperHistoryNode() = default; +}; + +/* This object can describe an empty superstate (needed to describe clean history nodes without raisin) + * If det_stops is empty, interpret it as empty superstate */ +struct SuperState { + std::vector sorted_raisin; + std::vector double_compressed_selarr; + + bool empty() const { + return sorted_raisin.empty(); + } + +#ifdef PR_DEB + std::string toString() const { + std::string 
f1_raisin; + for (uint64_t el: sorted_raisin) { + if (!f1_raisin.empty()) + f1_raisin += ", "; + f1_raisin += std::to_string(el); + } + std::string f2_selarr; + for (uint64_t el: double_compressed_selarr) { + if (!f2_selarr.empty()) + f2_selarr += ", "; + f2_selarr += std::to_string(el); + } + + return "sorted_raisin: {" + f1_raisin + "}, double_comp_selarr: {" + f2_selarr + "}"; + } +#endif +}; + +struct CleanOperHistoryNode { + std::vector next; + SuperState exit; +}; + +struct SelarrCompressionScheme { + size_t SN1, SN2 = 0, SN3 = 0; + std::vector S1_to_S2; + std::vector S2_to_sifter; + std::vector S3_to_sifter; + const RegexPriorityTable& sifter; + + SelarrCompressionScheme(size_t sn1, const RegexPriorityTable &sifter) : SN1(sn1), sifter(sifter) { + assert(sifter.size() <= UINT32_MAX); + S1_to_S2.assign(SN1, -1); + for (regex_tai_t i = 0; i < sifter.size(); i++) { + auto& act = sifter[i].pos; + regex_tai_t first_on_s2 = S2_to_sifter.size(); + S2_to_sifter.push_back(i); + S1_to_S2[act.first] = first_on_s2; + if (act.type != tracking_var_types::dot_cur_pos) { + S3_to_sifter.push_back(i); + } + if (act.type == tracking_var_types::range) { + regex_tai_t second_on_s2 = S2_to_sifter.size(); + S2_to_sifter.push_back(i); + S1_to_S2[act.second] = second_on_s2; + } + } + SN2 = S2_to_sifter.size(); + SN3 = S3_to_sifter.size(); + assert(SN3 <= SN2 && SN2 <= SN1 && SN1 <= UINT16_MAX); + + } +}; + +std::vector compress_compressed_selarr(const std::vector& S2, + const SelarrCompressionScheme& cmp) { + std::vector S3(cmp.SN3); + for (size_t i = 0; i < cmp.SN3; i++) { + const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos; + if (act.type == tracking_var_types::dot_immediate) { + S3[i] = S2[cmp.S1_to_S2[act.first]]; + } else { + assert(act.type == tracking_var_types::range); // It must be range type + uint64_t onFirstBorder = S2[cmp.S1_to_S2[act.first]]; + uint64_t onSecondBorder = S2[cmp.S1_to_S2[act.second]]; + S3[i] = (onFirstBorder > onSecondBorder) 
? 1 : 0; + } + } + return S3; +} + +/* Tells whether selarr A has priority over selarr B. The actions of cmp.sifter are checked in order; for a range variable the compared value is (second border - first border), taken as 0 when the second border is not greater. The first action whose values differ decides: A wins with the smaller value when act.minimize is set and with the greater value otherwise. Returns false if no action tells A and B apart. */ +bool compressed_selarr_A_outranks_B(const std::vector& A, const std::vector& B, + const SelarrCompressionScheme& cmp) { + for (const RegexPriorityTableAction& act: cmp.sifter) { + uint64_t valA = A[cmp.S1_to_S2[act.pos.first]]; + uint64_t valB = B[cmp.S1_to_S2[act.pos.first]]; + if (act.pos.type == tracking_var_types::range) { + uint64_t valAsecond = A[cmp.S1_to_S2[act.pos.second]]; + /* The second border of B has to be read from B, not from A */ + uint64_t valBsecond = B[cmp.S1_to_S2[act.pos.second]]; + valA = valAsecond > valA ? valAsecond - valA : 0; + valB = valBsecond > valB ? valBsecond - valB : 0; + } + if (valA == valB) + continue; + return (valA < valB) == act.minimize; + } + return false; +} + +/* Because of the way wash_history_bush builds this structure, root is the last node. + * rankdir is from left to right (guaranteed). Can be empty if original history contained no raisin */ +struct RaisinBush { + std::vector clean_history; + ssize_t start = -1; + + bool empty() const { + return start < 0; + } + +#ifdef PR_DEB + void print() { + lines text; + text.push_back("Raisin bush"); + if (start >= 0) { + size_t n = clean_history.size(); + std::vector m(n, false); + TreeWithStringsNode e{""}; + std::function dfs = [&] + (TreeWithStringsNode& fill, size_t nodeId) + { + if (m[nodeId]) { + fill.val = "PARADOX"; + return; + } + m[nodeId] = true; + const CleanOperHistoryNode& node = clean_history[nodeId]; + fill.val = "[" + std::to_string(nodeId) + "]"; + if (!node.exit.empty()) + fill.val += (" EXIT: " + node.exit.toString()); + size_t CN = node.next.size(); + fill.childeren.resize(CN); + for (size_t i = 0; i < CN; i++) { + fill.childeren[i].val = node.next[i].op.toString(); + fill.childeren[i].childeren = {{}}; + dfs(fill.childeren[i].childeren[0], node.next[i].u); + } + }; + dfs(e, start); + size_t am = 0; + for (bool el: m) + am += static_cast(el); + if (am < n) + text[0] += ": " + std::to_string(n - am) + " nodes are unreachable by detour"; + e.toLines(text); + } else { + if
(clean_history.empty()) + text[0] = "Empty Raisin Bush"; + else + /* Concatenate the node count into the message (an assignment here would discard both the prefix and the count) */ + text [0] = "Raisin bush with no root and " + std::to_string(clean_history.size()) + " nodes missed"; + } + printLines(wrapWithBox(text)); + } +#endif +}; + +/* Copies into answer.clean_history only those nodes of `history` that carry raisin themselves or have a transition to a node that was already kept (this is checked after the subtree of that transition has been visited). The walk is an iterative depth-first detour starting at history[0]; each callStack entry is (node id, index of the next transition to visit). For a node with raisin the exit superstate receives the sorted raisin and the selarr produced by compress_compressed_selarr. answer.start is set only if history[0] itself was kept. */ +void wash_history_bush(const std::vector& history, RaisinBush& answer, + const SelarrCompressionScheme& cmp) { + assert(!history.empty()); + std::vector has_raisin(history.size()); + std::vector dirty_to_clean(history.size(), -1); + std::vector > callStack = {{0, 0}}; + + auto hist_clean_detour_init_clean = [&](uint64_t v) -> uint64_t { + if (!has_raisin[v]) { + has_raisin[v] = true; + dirty_to_clean[v] = answer.clean_history.size(); + answer.clean_history.emplace_back(); + } + return dirty_to_clean[v]; + }; + + while (!callStack.empty()) { + size_t v = callStack.back().first; + size_t od = callStack.back().second; + if (od == 0) { + if (!history[v].raisin.empty()) { + size_t cleanVId = hist_clean_detour_init_clean(v); + std::vector& sr = answer.clean_history[cleanVId].exit.sorted_raisin; + sr = history[v].raisin; + std::sort(sr.begin(), sr.end()); + answer.clean_history[cleanVId].exit.double_compressed_selarr = compress_compressed_selarr(history[v].compressed_selarr, cmp); + } + } else { + const OperHistoryNodeTransition& old_hist_tr = history[v].next[od - 1]; + uint64_t ou = old_hist_tr.u; + if (has_raisin[ou]) { + size_t cleanVId = hist_clean_detour_init_clean(v); + answer.clean_history[cleanVId].next.emplace_back(old_hist_tr.op, dirty_to_clean[ou]); + } + } + + if (od == history[v].next.size()) { + callStack.pop_back(); + } else { + callStack.back().second++; + callStack.emplace_back(history[v].next[od].u, 0); + } + } + + if (has_raisin[0]) { + assert(dirty_to_clean[0] >= 0); + answer.start = dirty_to_clean[0]; + } + +} + +/* If is_it_after_read is false, unknown selarr range variable border and cur pos are evaluated to 0.
+ * Otherwise, cur pos considered to be greater than previous values of selarr ange variable boundaries */ +void building_detour(const SelarrCompressionScheme& cmp, + const std::vector& outer_selarr, const std::vector& zeroeps, const codeset_t& I, + RaisinBush& answer, bool is_it_after_read) +{ +#ifdef PR_DEB + printf("Det Debug: build_detour started with zeroeps:{"); + for (FA_Node* node: zeroeps) + printf("%lu,", node->nodeId); + printf("}, I: {%s}\n", stringifyCodesetBase10(I).c_str()); +#endif + assert(cmp.SN3 == outer_selarr.size()); + if (!is_it_after_read) + for (uint64_t val: outer_selarr) + assert(val == 0); + + struct SearchMark { + FA_Node* domain_node; + uint64_t epsilon_refs = 0; + uint64_t detour_sat = 0; + /* id of corresponding history node */ + size_t Hv = 0; + + explicit SearchMark(FA_Node *domain_node) : domain_node(domain_node) {} + }; + + /* Default values are good for me */ + std::vector marks; + for (size_t i = 0; i < zeroeps.size(); i++) { + marks.emplace_back(zeroeps[i]); + zeroeps[i]->search_mark = i; + } + + auto lob_allows_to_pass = [&](FA_NodeOfLookOneBehind* lob) -> bool { + if (!intersect_sets(lob->filter, I).empty()) { + assert(merge_sets(lob->filter, I) == lob->filter); + return true; + } + return false; + }; + + { /* First i need to know exacly how many of MINE epsilon transitions are referencing each NODE */ + std::vector domain_detour = zeroeps; + while (!domain_detour.empty()) { + FA_Node* v = domain_detour.back(); domain_detour.pop_back(); + if (v->type == look_one_behind && !lob_allows_to_pass(dynamic_cast(v))) + continue; + for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) { + assert(*uPtr); + int64_t &rds = (**uPtr).search_mark; + if (rds == -1) { + rds = marks.size(); + domain_detour.push_back(*uPtr); + marks.emplace_back(*uPtr); + } + marks[rds].epsilon_refs++; + } + } + } + std::vector history = {OperHistoryNode()}; + history[0].compressed_selarr.assign(cmp.SN2, 0); + for (size_t i = 0; i < cmp.SN3; i++) { + 
const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos; + if (act.type == tracking_var_types::range) { + if (outer_selarr[i]) { + history[0].compressed_selarr[cmp.S1_to_S2[act.second]] = 1; + } + } else { + assert(act.type == tracking_var_types::dot_immediate); + history[0].compressed_selarr[cmp.S1_to_S2[act.first]] = outer_selarr[i]; + } + } + /* As a result, dot_cur_pos variables will be initialized as zero (always) */ + + /* In my second detour, I will pass each vertex here only one time: after hitting the total epsilon refcount */ + std::vector can_process = zeroeps; + /* + auto increase_sat_refcount = [&](SearchMark& mark) { + mark.detour_sat++; + if (mark.detour_sat == mark.epsilon_refs && mark.ever_walked_in) { + can_process.push_back(mark.domain_node); + } + }; + */ + + auto add_history_update = [&](TrackingOperationInFa how, uint64_t where, uint64_t from_where) { + history[from_where].next.emplace_back(how, where); + }; + + while (!can_process.empty()) { + FA_Node* v = can_process.back(); can_process.pop_back(); + SearchMark& Vmark = marks[v->search_mark]; + assert(Vmark.detour_sat == Vmark.epsilon_refs); + uint64_t Hv = Vmark.Hv; + uint64_t Hop = Hv; + if (v->type == look_one_behind) { + FA_NodeOfLookOneBehind* tv = dynamic_cast(v); + if (!lob_allows_to_pass(tv)) + continue; + } else if (isTrackingFaNode(v)) { + Hop = history.size(); + history.emplace_back(); + std::vector& val2 = history.back().compressed_selarr; + val2 = history[Hv].compressed_selarr; + if (v->type == track_array_mov_imm) { + FA_NodeOfTrackArrayMovImm* tv = dynamic_cast(v); + if (isSelarrOpcode(tv->operation)) { + int key_s2 = cmp.S1_to_S2[tv->key]; + if (key_s2 >= 0){ + assert(cmp.sifter[cmp.S2_to_sifter[key_s2]].pos.type == tracking_var_types::dot_immediate); + val2[key_s2] = tv->imm_value; + } + } + add_history_update(TrackingOperationInFa(tv->operation, tv->key, tv->imm_value), Hop, Hv); + } else if (v->type == track_array_mov_halfinvariant) { + 
FA_NodeOfTrackArrayMovHalfinvariant* tv = dynamic_cast(v); + if (isSelarrOpcode(tv->operation)) { + int key_s2 = cmp.S1_to_S2[tv->key]; + if (key_s2 >= 0){ + const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S2_to_sifter[key_s2]].pos; + assert(act.type != tracking_var_types::dot_immediate); + if (act.type == tracking_var_types::dot_cur_pos) { + val2[key_s2] = is_it_after_read ? 1 : 0; + } else { + val2[key_s2] = is_it_after_read ? 2 : 0; + } + } + } + add_history_update(TrackingOperationInFa(tv->operation, tv->key), Hop, Hv); + } + } else if (v->type == match || v->type == one_char_read) { + // Determinization stop + history[Hv].raisin.push_back(v->nodeId); + } + for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) { + assert(*uPtr); + SearchMark& Umark = marks[(**uPtr).search_mark]; + /* Here I use Hop to determine Hv value of u */ + if (Umark.detour_sat == 0) { + Umark.Hv = Hop; + } else if (Umark.Hv != Hop) { + if (compressed_selarr_A_outranks_B( + history[Hop].compressed_selarr, history[Umark.Hv].compressed_selarr, cmp)){ + Umark.Hv = Hop; + } + } + /* Collision calculation finished */ + Umark.detour_sat++; + if (Umark.detour_sat == Umark.epsilon_refs) { + can_process.push_back(Umark.domain_node); + } + } + } + /* Cleaning this mess */ + for (auto& m: marks) { + m.domain_node->search_mark = -1; + } + /* Packaging the answer (we do a little bit of dfs here) */ + wash_history_bush(history, answer, cmp); +} + +void update_had_to_fork_status(const RaisinBush& bush, int& had_to_fork) { + for (const CleanOperHistoryNode& node: bush.clean_history) { + if (node.next.size() > 1 || (!node.next.empty() && !node.exit.empty())) { + had_to_fork = 1; + return; + } + } +} + +typedef size_t superstate_id_t; + +typedef std::vector> homework_t; + +struct LessSuperState { + bool operator()(const SuperState& A, const SuperState& B) const { + std::less> f1L; + if (f1L(A.sorted_raisin, B.sorted_raisin)) + return true; + if (f1L(B.sorted_raisin, A.sorted_raisin)) + return 
false; + return f1L(A.double_compressed_selarr, B.double_compressed_selarr); + } +}; + +struct GlobalDetourProgress { + std::map superstates; + /* Each element is a root of some megabush in resFa */ + std::vector superstate_megabush_constructed; + std::vector todo_superstaes; +}; + +/* Returns the id assigned to superstate x. + * If x was not previously achieved, it gets the next free id and is also added to t o d o list of global detour + * (together with a NULL placeholder for its megabush) */ +superstate_id_t convertSuperstateToId(const SuperState& x, GlobalDetourProgress& gdp) { + /* One lookup instead of count() followed by operator[] */ + auto known = gdp.superstates.find(x); + if (known != gdp.superstates.end()) { + return known->second; + } + size_t n = gdp.superstates.size(); + gdp.superstates.insert({x, n}); + gdp.todo_superstaes.push_back(x); + gdp.superstate_megabush_constructed.push_back(NULL); + return n; +} + +FA_Node* build_dead_end(FA_Container& resFa) { + return resFa.makeForking(); +} + +void build_bush(const RaisinBush& alpha, FA_Node** sowing_location, FA_Container& resFa, + homework_t& homework, GlobalDetourProgress& gdp) { + size_t n = alpha.clean_history.size(); + if (n == 0) { + FA_Node* dead_end = build_dead_end(resFa); + reattach_fa_node_edge(sowing_location, dead_end); + return; + } + std::vector> todo = {{sowing_location, alpha.start}}; + + while (!todo.empty()) { + FA_Node** sl = todo.back().first; + const CleanOperHistoryNode& hnode = alpha.clean_history[todo.back().second]; + todo.pop_back(); + auto history_transition = [&](size_t i, FA_Node** of_sl) { + FA_NodePathPart* pn = convert_to_node(hnode.next[i].op, resFa); + reattach_fa_node_edge(of_sl, pn); + todo.emplace_back(&(pn->nxt_node), hnode.next[i].u); + }; + + if (hnode.next.empty()) { + assert(!hnode.exit.empty()); + superstate_id_t w = convertSuperstateToId(hnode.exit, gdp); + homework.emplace_back(sl, w); + } else if (hnode.next.size() == 1 && hnode.exit.empty()) { + history_transition(0, sl); + } else { + FA_NodeOfForking* forker = resFa.makeForking(); + bool raisin = !hnode.exit.empty(); + size_t k = hnode.next.size(); + forker->nxt_options.assign(k + static_cast(raisin), NULL); + for (size_t i
= 0; i < k; i++) { + history_transition(i, &(forker->nxt_options[i])); + } + if (raisin) { + superstate_id_t w = convertSuperstateToId(hnode.exit, gdp); + homework.emplace_back(&(forker->nxt_options[k]), w); + } + reattach_fa_node_edge(sl, forker); + } + } +} + +ColoredCodeset get_pretreated_cc(FA_Container& sourceFa) { + std::set little_insects; + for (FA_Node* v: sourceFa.all) { + if (v->type == look_one_behind) { + little_insects.insert(static_cast(v)->filter); + } + } + ColoredCodeset pretreated_cc(little_insects.size()); + for (const codeset_t& cs: little_insects) { + pretreated_cc.apply_divisor(cs); + } + return pretreated_cc; +} + +// todo add a check on size of dfa +void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, regex_tai_t selarr_sz, + const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork) +{ + /* During execuion, i will create pointers to field res.start and store them (inside the scope of this function) + * Luckily res argument is already immovable in this scope. 
*/ + error = 0; + had_to_fork = 0; + assert(resFa.start == NULL && resFa.all.empty()); + input_fa_assert(sourceFa); + SelarrCompressionScheme cmp(selarr_sz, sifter); + + GlobalDetourProgress gdp; + homework_t homework; + + ColoredCodeset pretreated_cc = get_pretreated_cc(sourceFa); + + FA_Node** res_start_ptr = &(resFa.start); + if (info1.fed_chars_extend_one_left) { + ColoredCodeset inp_distinction = pretreated_cc; + inp_distinction.apply_divisor(codeset_of_all); + std::vector starting_Is; + std::vector> starting_Cids; /* Filler variable */ + inp_distinction.get_splits_of_non_dummy(starting_Is, starting_Cids); + size_t R = starting_Is.size(); + for (auto& rdh: starting_Cids) { + assert(rdh.size() == 1 && rdh[0] == 0); + } + FA_NodeOfDetCharCrossroads* very_first_cr = resFa.makeDetCharCrossroads(); + very_first_cr->second_ns = true; + reattach_fa_node_edge(res_start_ptr, very_first_cr); + very_first_cr->crossroads.resize(R); /* After that, nobody has right to resize crossroads array */ + for (size_t i = 0; i < R; i++) { + very_first_cr->crossroads[i].input = starting_Is[i]; + FA_Node** sowing_place = &(very_first_cr->crossroads[i].nxt_node); + RaisinBush alpha; + building_detour(cmp, std::vector(cmp.SN3, 0), {sourceFa.start}, starting_Is[i], alpha, false); +#ifdef PR_DEB + printf("Initialization hard %ld/%ld\n", i + 1, R); + alpha.print(); +#endif + update_had_to_fork_status(alpha, had_to_fork); + build_bush(alpha, sowing_place, resFa, homework, gdp); + } + } else { + RaisinBush alpha; + building_detour(cmp, std::vector(cmp.SN3, 0), {sourceFa.start}, codeset_of_all, alpha, false); +#ifdef PR_DEB + printf("Initialization easy\n"); + alpha.print(); +#endif + update_had_to_fork_status(alpha, had_to_fork); + build_bush(alpha, res_start_ptr, resFa, homework, gdp); + } + /* Now we start the actual detour. 
*/ + while (!gdp.todo_superstaes.empty()) { + SuperState SS = gdp.todo_superstaes.back(); gdp.todo_superstaes.pop_back(); + // printf("Global detour turn: %s\n", SS.toString().c_str()); + std::vector reading_stops; + codeset_t how_can_i_finish = {}; + for (size_t v: SS.sorted_raisin) { + FA_Node* node = sourceFa.all[v]; + if (node->type == one_char_read) { + reading_stops.push_back(static_cast(node)); + } else if (node->type == match) { + auto fn = static_cast(node); + assert(!fn->ext_filter_added || info1.fed_chars_extend_one_right); + if (fn->ext_filter_added) { + how_can_i_finish = merge_sets(how_can_i_finish, fn->pending_filter); + } else { + how_can_i_finish = codeset_of_all; + } + } else + assert(false); + } + // Determinization stop: one char read (input) + ColoredCodeset inp_distinction = pretreated_cc; + size_t pr = reading_stops.size(); + for (size_t i = 0; i < pr; i++) { + inp_distinction.apply_divisor(reading_stops[i]->filter); + } + std::vector Is; + std::vector> Cids; + inp_distinction.get_splits_of_non_dummy(Is, Cids); + size_t R = Is.size(); + FA_NodeOfDetCharCrossroads* my_cr = NULL; + if (R > 0) { + my_cr = resFa.makeDetCharCrossroads(); + if (!info1.fed_chars_extend_one_right && !how_can_i_finish.empty()) { + assert(how_can_i_finish == codeset_of_all); + my_cr->matching = true; + } + my_cr->crossroads.resize(R); + } + for (size_t i = 0; i < R; i++) { + my_cr->crossroads[i].input = Is[i]; + my_cr->crossroads[i].nxt_node = NULL; + std::vector fl_passed_filters; + for (size_t j: Cids[i]) { + fl_passed_filters.push_back(reading_stops[j]->nxt_node); + } + // todo: make a function out of next 6 lines of code + RaisinBush alpha; + building_detour(cmp, SS.double_compressed_selarr, fl_passed_filters, Is[i], alpha, true); +#ifdef PR_DEB + printf("That same turn, subbush %ld/%ld\n", i + 1, R); + alpha.print(); +#endif + update_had_to_fork_status(alpha, had_to_fork); + build_bush(alpha, &(my_cr->crossroads[i].nxt_node), resFa, homework, gdp); + } + // 
Determinization stop: match (finish) + FA_Node* finish_route = NULL; + if (!how_can_i_finish.empty() && (info1.fed_chars_extend_one_right || R == 0)) { + FA_NodeOfMatch* matcher = resFa.makeMatch(); + finish_route = matcher; + if (info1.fed_chars_extend_one_right) { + FA_NodeOfOneCharRead* right_ext_read = resFa.makeOneCharRead(how_can_i_finish, true); + reattach_nxt_node(right_ext_read, matcher); + finish_route = right_ext_read; + } + } + // Combining these two cases + assert(finish_route || my_cr); + FA_Node*& endsUp = gdp.superstate_megabush_constructed[gdp.superstates[SS]]; + if (!finish_route) { + endsUp = my_cr; + } else if (!my_cr) { + endsUp = finish_route; + } else { + FA_NodeOfForking* F = resFa.makeForking(); + F->nxt_options = {NULL, NULL}; + reattach_fa_node_edge(&(F->nxt_options[0]), my_cr); + reattach_fa_node_edge(&(F->nxt_options[1]), finish_route); + endsUp = F; + } + } + /* Now it's time to do the homework: link all megabushes */ + for (auto& p: homework) { + reattach_fa_node_edge(p.first, gdp.superstate_megabush_constructed[p.second]); + } +} + diff --git a/src/libregexis024fa/fa_make_deterministic.h b/src/libregexis024fa/fa_make_deterministic.h new file mode 100644 index 0000000..6c08469 --- /dev/null +++ b/src/libregexis024fa/fa_make_deterministic.h @@ -0,0 +1,10 @@ +#ifndef LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H +#define LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H + +#include +#include + +void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, regex_tai_t selarr_sz, + const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork); + +#endif //LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H diff --git a/src/libregexis024fa/finite_automaton.cpp b/src/libregexis024fa/finite_automaton.cpp new file mode 100644 index 0000000..727ce3d --- /dev/null +++ b/src/libregexis024fa/finite_automaton.cpp @@ -0,0 +1,141 @@ +#include +#include +#include + +bool FA_Node::empty() { + return type != one_char_read && type 
!= det_char_crossroads; +} + +void FA_Node::apply_lookahead_restriction(const codeset_t &restriction) {} + +void FA_Node::reAdd_references() { + for (FA_Node** nxtPtr: get_all_transitions()){ + if (*nxtPtr) + (**nxtPtr).refs++; + } +} + +std::vector FA_Node::get_all_transitions() { + return {}; +} + +std::vector FA_Node::get_all_empty_valid_transitions() { + return {}; +} + +std::vector FA_NodePathPart::get_all_transitions() { + return {&nxt_node}; +} + +std::vector FA_NodePathPart::get_all_empty_valid_transitions() { + if (nxt_node) + return {&nxt_node}; + return {}; +} + +FA_NodeOfMatch::FA_NodeOfMatch() {type = match;} + +void FA_NodeOfMatch::apply_lookahead_restriction(const codeset_t &restriction) { + ext_filter_added = true; + pending_filter = restriction; +} + +FA_NodeOfOneCharRead::FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace) : filter(filter), + second_ns(second_namespace) { type = one_char_read;} + +void FA_NodeOfOneCharRead::apply_lookahead_restriction(const codeset_t &restriction) { + filter = intersect_sets(filter, restriction); +} + +std::vector FA_NodeOfOneCharRead::get_all_empty_valid_transitions() { + return {}; +} + +FA_NodeOfForking::FA_NodeOfForking() {type = forking;} + +std::vector FA_NodeOfForking::get_all_empty_valid_transitions() { + std::vector res; + for (size_t i = 0; i < nxt_options.size(); i++) + if (nxt_options[i]) + res.push_back(&nxt_options[i]); + return res; +} + +std::vector FA_NodeOfForking::get_all_transitions() { + std::vector res; + for (size_t i = 0; i < nxt_options.size(); i++) + res.push_back(&nxt_options[i]); + return res; +} + +FA_NodeOfLookOneBehind::FA_NodeOfLookOneBehind(const codeset_t &filter) : filter(filter) {type = look_one_behind;} + +FA_NodeOfLookOneAhead::FA_NodeOfLookOneAhead(const codeset_t &restriction) : restriction(restriction) { + type = look_one_ahead; +} + +FA_NodeOfTrackArrayMovImm::FA_NodeOfTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue) : + 
operation(operation), key(key), imm_value(immValue) {type = track_array_mov_imm;} +// + +FA_NodeOfTrackArrayMovHalfinvariant::FA_NodeOfTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key): + operation(operation), key(key){type = track_array_mov_halfinvariant;} +// + +void FA_NodeOfDetCharCrossroads::apply_lookahead_restriction(const codeset_t &restriction) { + exitf("What?? Oh, no, no. I am NOT doing it"); +} + +FA_NodeOfDetCharCrossroads::FA_NodeOfDetCharCrossroads(const std::vector &crossroads) + : crossroads(crossroads) {type = det_char_crossroads;} + +std::vector FA_NodeOfDetCharCrossroads::get_all_empty_valid_transitions() { + return {}; +} + +std::vector FA_NodeOfDetCharCrossroads::get_all_transitions() { + std::vector res; + for (auto& tr: crossroads) + res.push_back(&tr.nxt_node); + return res; +} + +/* If transferring ownership of node to container has failed, node is freed (which means it is ivalidated) + * If this semi-ownership transfer succeded (no std::bad_alloc), then node is still valid to use, and at the end + * of FA_Container lifetime it is guaranteed to be deleted + */ +void FA_Container::registerNew(FA_Node *node) { + try { + node->nodeId = (int64_t)all.size(); + all.push_back(node); + } catch (const std::bad_alloc& ba) { + delete node; + throw; + } +} + +DFA_CrossroadPath::DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node): input(input),nxt_node(nxt_node) {} +// + +FA_Container::~FA_Container() { + for (FA_Node* n: all) + delete n; +} + +#define bs(name, args, params) \ + FA_NodeOf ## name *FA_Container::make ## name(args) { \ + FA_NodeOf ## name *node = new FA_NodeOf ## name(params); \ + registerNew(node); \ + return node; \ + } +#define COMMA , + +bs(Match, , ) +bs(OneCharRead, const codeset_t& filter COMMA bool second_namespace, filter COMMA second_namespace) +bs(Forking, , ) +bs(LookOneBehind, const codeset_t& filter, filter) +bs(LookOneAhead, const codeset_t& filter, filter) +bs(TrackArrayMovImm, regex024_opcode 
operation COMMA uint16_t key COMMA uint64_t immValue, + operation COMMA key COMMA immValue) +bs(TrackArrayMovHalfinvariant, regex024_opcode operation COMMA uint16_t key, operation COMMA key) +bs(DetCharCrossroads, ,{}) diff --git a/src/libregexis024fa/finite_automaton.h b/src/libregexis024fa/finite_automaton.h new file mode 100644 index 0000000..b12e1d4 --- /dev/null +++ b/src/libregexis024fa/finite_automaton.h @@ -0,0 +1,149 @@ +#ifndef LIBREGEXIS024_FINITE_AUTOMATON_H +#define LIBREGEXIS024_FINITE_AUTOMATON_H + +#include +#include +#include +#include + +enum FA_Node_type: uint8_t { + match, + one_char_read, + forking, + look_one_behind, + look_one_ahead, + track_array_mov_imm, + track_array_mov_halfinvariant, + /* Used for DFA */ + det_char_crossroads, +}; + +struct FA_Node{ + size_t refs = 0; + /* If node is not in searched subset (at least yet), `search mark == -1`, otherwise + * it is an index (for that particular node) in the vector that captures all nodes in + * searched subset*/ + int64_t search_mark = -1; + FA_Node_type type; + int64_t nodeId; + + bool empty(); + virtual std::vector get_all_empty_valid_transitions(); + virtual void apply_lookahead_restriction(const codeset_t &restriction); + void reAdd_references(); + virtual ~FA_Node() = default; + virtual std::vector get_all_transitions(); +}; + +struct FA_NodePathPart: public FA_Node{ + FA_Node* nxt_node = NULL; + + std::vector get_all_empty_valid_transitions() override; + std::vector get_all_transitions() override; +}; + +struct FA_NodeOfMatch: public FA_Node{ + bool ext_filter_added = false; + codeset_t pending_filter; + + explicit FA_NodeOfMatch(); + void apply_lookahead_restriction(const codeset_t &restriction) override; +}; + +/* .type == one_char_read */ +struct FA_NodeOfOneCharRead: public FA_NodePathPart{ + codeset_t filter; + bool second_ns = false; + + FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace); + void apply_lookahead_restriction(const codeset_t &restriction) 
override; + std::vector get_all_empty_valid_transitions() override; +}; + +/* .type == forking */ +struct FA_NodeOfForking: public FA_Node{ + /* Won't be modified after init (in regexp compilation into NFA) */ + std::vector nxt_options; + int64_t stopId = -1; + + explicit FA_NodeOfForking(); + std::vector get_all_empty_valid_transitions() override; + std::vector get_all_transitions() override; +}; + +/* .type == look_one_behind */ +struct FA_NodeOfLookOneBehind: public FA_NodePathPart{ + /* [0; UINT32_MAX] is equivalent to no filter */ + codeset_t filter; + + explicit FA_NodeOfLookOneBehind(const codeset_t &filter); +}; + +/* .type == look_one_ahead */ +struct FA_NodeOfLookOneAhead: public FA_NodePathPart{ + /* [0; UINT32_MAX] is equivalent to no restriction */ + codeset_t restriction; + + explicit FA_NodeOfLookOneAhead(const codeset_t &restriction); +}; + +/* .type == track_array_mov_imm */ +struct FA_NodeOfTrackArrayMovImm: public FA_NodePathPart{ + regex024_opcode operation; + uint16_t key; + uint64_t imm_value; + + FA_NodeOfTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue); +}; + +/* .type == track_array_mov_halfinvariant */ +struct FA_NodeOfTrackArrayMovHalfinvariant: public FA_NodePathPart{ + regex024_opcode operation; + uint16_t key; + + FA_NodeOfTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key); +}; + +struct DFA_CrossroadPath{ + codeset_t input; + FA_Node* nxt_node = NULL; + + DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node); + DFA_CrossroadPath() = default; +}; + +/* .type == det_char_crossroads */ +struct FA_NodeOfDetCharCrossroads: public FA_Node{ + std::vector crossroads; + bool matching = false; + bool second_ns = false; + + explicit FA_NodeOfDetCharCrossroads(const std::vector &crossroads); + void apply_lookahead_restriction(const codeset_t &restriction) override; + std::vector get_all_empty_valid_transitions() override; + std::vector get_all_transitions() override; +}; + +struct FA_Container{ 
+ FA_Container(const FA_Container&) = delete; + FA_Container& operator=(const FA_Container&) = delete; + FA_Container() = default; + + std::vector all; + FA_Node* start = NULL; + + void registerNew(FA_Node* node); + + FA_NodeOfMatch* makeMatch(); + FA_NodeOfOneCharRead* makeOneCharRead(const codeset_t& filter, bool second_namespace); + FA_NodeOfForking* makeForking(); + FA_NodeOfLookOneBehind* makeLookOneBehind(const codeset_t& filter); + FA_NodeOfLookOneAhead* makeLookOneAhead(const codeset_t& filter); + FA_NodeOfTrackArrayMovImm* makeTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue); + FA_NodeOfTrackArrayMovHalfinvariant* makeTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key); + FA_NodeOfDetCharCrossroads* makeDetCharCrossroads(); + + ~FA_Container(); +}; + +#endif //LIBREGEXIS024_FINITE_AUTOMATON_H diff --git a/src/libregexis024fa/graph_to_bytecode/core.cpp b/src/libregexis024fa/graph_to_bytecode/core.cpp new file mode 100644 index 0000000..df2f9bc --- /dev/null +++ b/src/libregexis024fa/graph_to_bytecode/core.cpp @@ -0,0 +1,117 @@ +#include + +#include +#include + +#include + +#define nonthrowing_assert(expr) if (!(expr)) {error = -1; return; } + +void compilation_core(std::vector& result, FA_Container& fa, explicit_bookmarks& bookmark_manager, + size_t& first_read_ns, size_t& second_read_ns, size_t& fork_ss_ns, int& error) +{ + bookmark_id_t node_start_bm_offset = bookmark_manager.new_range_of_bookmarks(fa.all.size()); + std::vector not_yet_dedicated_second_read_ns_ssids; + first_read_ns = 0; + second_read_ns = 0; + fork_ss_ns = 0; + assert(fa.start); + std::vector todo = {fa.start}; + // std::vector promised(fa.all.size(), false); + // promised[fa.start->nodeId] = true; + + auto nodesBookmark = [&](FA_Node* node) -> bookmark_id_t { + assert(node); + return node_start_bm_offset + node->nodeId; + }; + + auto addBranching = [&](FA_Node* node) { + todo.push_back(node); + }; + + auto reading_head = [&](bool 
is_in_second_ns) { + if (is_in_second_ns) { + cmd_READ_second_ns(result, not_yet_dedicated_second_read_ns_ssids); + second_read_ns++; + } else { + cmd_READ_first_ns(result, first_read_ns++); + } + }; + + while (!todo.empty()) { + FA_Node* node = todo.back(); todo.pop_back(); + if (bookmark_manager.has_landed(nodesBookmark(node))) { + continue; + } + while (true) { + if (bookmark_manager.has_landed(nodesBookmark(node))) { + cmd_JUMP(result, bookmark_manager, nodesBookmark(node)); + break; + } + bookmark_manager.land_bookmark(result, nodesBookmark(node)); + if (node->type == match) { + cmd_MATCH(result); + cmd_DIE(result); + break; + } else if (node->type == one_char_read) { + FA_NodeOfOneCharRead* ocr = dynamic_cast(node); + nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX); + reading_head(ocr->second_ns); + write_filter(result, bookmark_manager, {ocr->filter},{nodesBookmark(ocr->nxt_node)}); + node = ocr->nxt_node; + } else if (node->type == look_one_behind) { + FA_NodeOfLookOneBehind* lob = dynamic_cast(node); + write_filter(result, bookmark_manager, {lob->filter}, {nodesBookmark(lob->nxt_node)}); + node = lob->nxt_node; + } else if (node->type == forking) { + FA_NodeOfForking* fn = dynamic_cast(node); + std::vector& nxt_options = fn->nxt_options; + if (nxt_options.empty()) { + cmd_DIE(result); + break; + } + if (nxt_options.size() >= 2) { + nonthrowing_assert(fork_ss_ns < UINT32_MAX); + regex_sslot_id_t sslot = fork_ss_ns++; + for (size_t i = 0; i + 1 < nxt_options.size(); i++) { + cmd_FORK(result, bookmark_manager, sslot, nodesBookmark(nxt_options[i])); + addBranching(nxt_options[i]); + } + } + node = nxt_options.back(); + } else if (node->type == track_array_mov_imm) { + FA_NodeOfTrackArrayMovImm* tami = dynamic_cast(node); + write_byte(result, tami->operation); + write_tai(result, tami->key); + write_quadword(result, tami->imm_value); + node = tami->nxt_node; + } else if (node->type == track_array_mov_halfinvariant) { + 
FA_NodeOfTrackArrayMovHalfinvariant* tamh = dynamic_cast(node); + write_byte(result, tamh->operation); + write_tai(result, tamh->key); + node = tamh->nxt_node; + } else if (node->type == det_char_crossroads) { + FA_NodeOfDetCharCrossroads* dcc = dynamic_cast(node); + nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX); + if (dcc->matching) + cmd_MATCH(result); + reading_head(dcc->second_ns); + std::vector codesets; + std::vector branches; + for (const DFA_CrossroadPath& p: dcc->crossroads) { + codesets.push_back(p.input); + branches.push_back(nodesBookmark(p.nxt_node)); + addBranching(p.nxt_node); + } + write_filter(result, bookmark_manager, codesets, branches); + if (dcc->crossroads.empty()) + break; + node = dcc->crossroads[0].nxt_node; + } else + assert(false); + } + } + for (size_t j = 0; j < not_yet_dedicated_second_read_ns_ssids.size(); j++) { + belated_sslot_id(result, not_yet_dedicated_second_read_ns_ssids[j], j + first_read_ns); + } +} diff --git a/src/libregexis024fa/graph_to_bytecode/core.h b/src/libregexis024fa/graph_to_bytecode/core.h new file mode 100644 index 0000000..ef71883 --- /dev/null +++ b/src/libregexis024fa/graph_to_bytecode/core.h @@ -0,0 +1,10 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_CORE_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_CORE_H + +#include +#include + +void compilation_core(std::vector& result, FA_Container& fa, explicit_bookmarks& bookmark_manager, + size_t& first_read_ns, size_t& second_read_ns, size_t& fork_ss_ns, int& error); + +#endif diff --git a/src/libregexis024fa/graph_to_bytecode/fa_compiler.cpp b/src/libregexis024fa/graph_to_bytecode/fa_compiler.cpp new file mode 100644 index 0000000..79f3e62 --- /dev/null +++ b/src/libregexis024fa/graph_to_bytecode/fa_compiler.cpp @@ -0,0 +1,102 @@ +#include + +#include +#include +#include +#include + +#include + +void write_priority_table_actions(std::vector& result, RegexPriorityTable &priority_table) { + for 
(RegexPriorityTableAction& act: priority_table) { + if (act.pos.isForRange()) { + write_byte(result, regex024_opcodes::DDIST_RABX_SELARR); + write_tai(result, act.pos.first); + write_tai(result, act.pos.second); + } else { + write_byte(result, regex024_opcodes::DMOV_RABX_SELARR); + write_tai(result, act.pos.first); + } + write_byte(result, act.minimize ? + regex024_opcodes::SIFTPRIOR_MIN_RABX : + regex024_opcodes::SIFTPRIOR_MAX_RABX); + } + write_byte(result, regex024_opcodes::SIFT_DONE); +} + +struct belate_initialization_parameters { + size_t todo_pos_read_ss_n; + size_t todo_pos_fork_ss_n; + size_t todo_pos_second_ns_size; + + void complete_it(std::vector& result, + regex_sslot_id_t first_read_ns, regex_sslot_id_t second_read_ns, regex_sslot_id_t fork_ss_ns) + { + assert((uint64_t)first_read_ns + (uint64_t)second_read_ns <= UINT32_MAX); + belated_sslot_id(result, todo_pos_read_ss_n , first_read_ns + second_read_ns); + belated_sslot_id(result, todo_pos_fork_ss_n, fork_ss_ns); + belated_sslot_id(result, todo_pos_second_ns_size, second_read_ns); + } +}; + +/* when I compile initializational part of program, I don't yet know what to put in + * PARAM_READ_SS_NUMBER, PARAM_FORK_SS_NUMBER and MSG_FED_INPUT_EXTENDED (second namespace size). + * These values are belate. 
*/ +belate_initialization_parameters write_some_normal_initialization(std::vector& result, + size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1) +{ + belate_initialization_parameters todo; + + write_byte(result, regex024_opcodes::PARAM_READ_SS_NUMBER); + todo.todo_pos_read_ss_n = result.size(); + write_sslot_id(result, 0); // Belate + + write_byte(result, regex024_opcodes::PARAM_FORK_SS_NUMBER); + todo.todo_pos_fork_ss_n = result.size(); + write_sslot_id(result, 0); // Belate + + write_byte(result, regex024_opcodes::PARAM_SELARR_LEN); + write_tai(result, selarr_size); + + write_byte(result, regex024_opcodes::MSG_MULTISTART_ALLOWED); + write_byte(result, 1); + + write_byte(result, regex024_opcodes::MSG_FED_INPUT_EXTENDED); + write_byte(result, info1.fed_chars_extend_one_left ? 1 : 0); + write_byte(result, info1.fed_chars_extend_one_right ? 1 : 0); + todo.todo_pos_second_ns_size = result.size(); + write_sslot_id(result, 0); // Belate + + write_byte(result, regex024_opcodes::INIT); + return todo; +} + +void compile_fa_to_regexis024_bytecode(std::vector& result, + FA_Container &fa, RegexPriorityTable &priority_table, + size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error) +{ + error = 0; + explicit_bookmarks bookmark_manager; + + if (!priority_table.empty()) { + bookmark_id_t BM_sift_function = bookmark_manager.new_bookmark(); + bookmark_id_t BM_after_sift = bookmark_manager.new_bookmark(); + + cmd_JUMP(result, bookmark_manager, BM_after_sift); + bookmark_manager.land_bookmark(result, BM_sift_function); + write_priority_table_actions(result, priority_table); + bookmark_manager.land_bookmark(result, BM_after_sift); + + write_byte(result, regex024_opcodes::PARAM_COLSIFTFUNC_SET); + bookmark_manager.write_unresolved_reference(result, BM_sift_function); + } + + belate_initialization_parameters init_param_todo = write_some_normal_initialization(result, selarr_size, info1); + + size_t first_read_ns, second_read_ns, fork_ss_ns; + 
compilation_core(result, fa, bookmark_manager, first_read_ns, second_read_ns, fork_ss_ns, error); + if (error < 0) + return; + init_param_todo.complete_it(result, first_read_ns, second_read_ns, fork_ss_ns); + bookmark_manager.finish(result); +} diff --git a/src/libregexis024fa/graph_to_bytecode/fa_compiler.h b/src/libregexis024fa/graph_to_bytecode/fa_compiler.h new file mode 100644 index 0000000..96f340d --- /dev/null +++ b/src/libregexis024fa/graph_to_bytecode/fa_compiler.h @@ -0,0 +1,14 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_FA_COMPILER_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_FA_COMPILER_H + +#include +#include +#include +#include +#include + +void compile_fa_to_regexis024_bytecode(std::vector& result, FA_Container& fa, RegexPriorityTable& priority_table, + size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error); + +#endif + diff --git a/src/libregexis024fa/graph_to_bytecode/filter.cpp b/src/libregexis024fa/graph_to_bytecode/filter.cpp new file mode 100644 index 0000000..74d9600 --- /dev/null +++ b/src/libregexis024fa/graph_to_bytecode/filter.cpp @@ -0,0 +1,120 @@ +#include + +#include +#include +#include + +std::vector convert_to_compSeg(const std::vector& crossroad_codesets) +{ + std::vector compSeg; + std::vector seg; + for (size_t i = 0; i < crossroad_codesets.size(); i++) { + for (auto& p: crossroad_codesets[i]) { + seg.emplace_back(i, p.first, p.second); + } + } + std::sort(seg.begin(), seg.end(), + [](const FilterSegment& a, const FilterSegment& b)->bool{return a.L < b.L;}); + if (seg.empty()) { + compSeg.emplace_back(-1, 0, UINT32_MAX); + } else { + if (seg[0].L > 0) + compSeg.emplace_back(-1, 0, seg[0].L - 1); + size_t N = seg.size(); + for (size_t i = 0; i + 1 < N; i++) { + compSeg.push_back(seg[i]); + assert(seg[i].R < seg[i + 1].L); + if (seg[i].R + 1 < seg[i + 1].L) + compSeg.emplace_back(-1, seg[i].R + 1, seg[i + 1].L - 1); + } + compSeg.push_back(seg.back()); + if (seg.back().R 
< UINT32_MAX) + compSeg.emplace_back(-1, seg[N - 1].R + 1, UINT32_MAX); + } + assert(!compSeg.empty()); + return compSeg; +} + +/* Return whether the resulting bytecode relies on me placing [0]'th node at the end */ +void write_filter_exit(std::vector& result, explicit_bookmarks& bookmark_manager, + const std::vector& crossroad_marks, + ssize_t color, bool at_the_end, bool& relies_on_proper_ending) +{ + if (color < 0) { + cmd_DIE(result); + } else if (color != 0 || !at_the_end) { + cmd_JUMP(result, bookmark_manager, crossroad_marks[color]); + } else { + relies_on_proper_ending = true; + } +} + +// todo: use return value of this function +bool write_filter(std::vector& result, explicit_bookmarks& bookmark_manager, + const std::vector& crossroad_codesets, const std::vector& crossroad_marks) +{ + bool relies_on_proper_ending = false; + + std::vector compSeg = convert_to_compSeg(crossroad_codesets); + size_t N = compSeg.size(); + struct RecFrame { + size_t Li; + size_t Ri; + bool second_part = false; + bookmark_id_t to_the_right_part; + + RecFrame(size_t li, size_t ri): Li(li),Ri(ri) {} + }; + + std::vector call_stack = {RecFrame(0, N - 1)}; + + auto is_sandwich = [&](size_t Li, size_t Ri) -> bool { + return Li + 2 == Ri && compSeg[Li].color == compSeg[Ri].color && compSeg[Li + 1].L == compSeg[Li + 1].R; + }; + + while (!call_stack.empty()) { + RecFrame& cur_frame = call_stack.back(); + size_t Li = cur_frame.Li; + size_t Ri = cur_frame.Ri; + if (Li == Ri) { + write_filter_exit(result, bookmark_manager, crossroad_marks, compSeg[Li].color, + Ri + 1 == N, relies_on_proper_ending); + call_stack.pop_back(); + } else if (is_sandwich(Li, Ri)){ + ssize_t A = compSeg[Li].color; + ssize_t B = compSeg[Li + 1].color; + size_t midVal = compSeg[Li + 1].L; + if (B < 0) { + assert(A >= 0); + bookmark_id_t b_to_end = bookmark_manager.new_bookmark(); + cmd_JCEQUAL(result, bookmark_manager, midVal, b_to_end); + cmd_JUMP(result, bookmark_manager, crossroad_marks[A]); + 
bookmark_manager.land_bookmark(result, b_to_end); + cmd_DIE(result); + } else { + cmd_JCEQUAL(result, bookmark_manager, midVal, crossroad_marks[B]); + write_filter_exit(result, bookmark_manager, crossroad_marks, A, + Ri + 1 == N, relies_on_proper_ending); + } + call_stack.pop_back(); + } else { + size_t m = (Li + Ri) / 2; + if (!cur_frame.second_part) { + cur_frame.to_the_right_part = bookmark_manager.new_bookmark(); + cmd_JCGRTR(result, bookmark_manager, compSeg[m].R, cur_frame.to_the_right_part); + cur_frame.second_part = true; + /* cur_frame was just invalidated */ + call_stack.emplace_back(Li, m); + } else { + bookmark_manager.land_bookmark(result, cur_frame.to_the_right_part); + /* cur_frame was invalidated */ + call_stack.pop_back(); + call_stack.emplace_back(m + 1, Ri); + } + } + } + return relies_on_proper_ending; +} + +FilterSegment::FilterSegment(ssize_t color, uint32_t l, uint32_t r): color(color), L(l), R(r) {} +// diff --git a/src/libregexis024fa/graph_to_bytecode/filter.h b/src/libregexis024fa/graph_to_bytecode/filter.h new file mode 100644 index 0000000..5284526 --- /dev/null +++ b/src/libregexis024fa/graph_to_bytecode/filter.h @@ -0,0 +1,21 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_FILTER_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_FILTER_H + +#include +#include +#include +#include + +struct FilterSegment { + ssize_t color; + uint32_t L, R; + + FilterSegment(ssize_t color, uint32_t l, uint32_t r); +}; + +/* Return whether user of function must place [0]'th option after the filter + * The filter can end up being written in such a way that the end will never be reached */ +bool write_filter(std::vector& result, explicit_bookmarks& bookmark_manager, + const std::vector& crossroad_codesets, const std::vector& crossroad_marks); + +#endif diff --git a/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp b/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp new file mode 100644 index 0000000..16fa2ca --- /dev/null +++ 
b/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp @@ -0,0 +1,115 @@ +#include +#include +#include + +#define push_to_res_least_signif result.push_back(x & 0xffLU); x >>= 8 + +void write_byte(std::vector& result, uint8_t x) { + result.push_back(x); +} + +void write_word(std::vector& result, uint16_t x) { + push_to_res_least_signif; push_to_res_least_signif; +} + +void write_doubleword(std::vector& result, uint32_t x) { + push_to_res_least_signif; push_to_res_least_signif; push_to_res_least_signif; push_to_res_least_signif; +} + +void write_quadword(std::vector& result, uint64_t x) { + for (int i = 0; i < 8; i++) { + push_to_res_least_signif; + } +} +#undef push_to_res_least_signif + +#define put_belated_to_res assert(result[pos] == 0); result[pos++] = value & 0xffLU; value >>= 8 +void belated_byte(std::vector& result, size_t pos, uint8_t value) { + assert(pos < result.size()); + result[pos] = value; +} + +void belated_word(std::vector& result, size_t pos, uint16_t value) { + assert(pos + 2 <= result.size()); + put_belated_to_res; put_belated_to_res; +} + +void belated_doubleword(std::vector& result, size_t pos, uint32_t value) { + assert(pos + 4 <= result.size()); + put_belated_to_res; put_belated_to_res; put_belated_to_res; put_belated_to_res; +} + +void belated_quadword(std::vector& result, size_t pos, uint64_t value) { + assert(pos + 8 <= result.size()); + for (int i = 0; i < 8; i++) { + put_belated_to_res; + } +} +#undef put_belated_to_res + +void write_sslot_id(std::vector& result, regex_sslot_id_t x) { + write_doubleword(result, x); +} + +void write_tai(std::vector& result, regex_tai_t x) { + write_word(result, x); +} + +void write_near_ptr(std::vector& result, regex_near_ptr_t x) { + write_quadword(result, x); +} + +void belated_sslot_id(std::vector& result, size_t pos, regex_sslot_id_t value) { + belated_doubleword(result, pos, value); +} + +void belated_tai(std::vector& result, size_t pos, regex_tai_t value) { + belated_word(result, pos, 
value); +} + +void belated_near_ptr(std::vector& result, size_t pos, regex_near_ptr_t value) { + belated_quadword(result, pos, value); +} + +bookmark_id_t explicit_bookmarks::new_bookmark() { + pile.emplace_back(); + return free_bid++; +} + +void explicit_bookmarks::write_unresolved_reference(std::vector &result, bookmark_id_t bm) { + size_t where_to_fill_later = result.size(); + write_near_ptr(result, 0); + pile[bm].positions_of_belated_refs.push_back(where_to_fill_later); +} + +void explicit_bookmarks::land_bookmark(std::vector &result, bookmark_id_t bm) { + assert(!pile[bm].placed_somewhere); + pile[bm].placed_somewhere = true; + pile[bm].actual_position = result.size(); +} + +void explicit_bookmarks::finish(std::vector &result) { + for (explicit_bookmark_info& bmi: pile) { + assert(bmi.positions_of_belated_refs.empty() || bmi.placed_somewhere); + if (bmi.placed_somewhere) { + for (size_t ref_to_mine_belate: bmi.positions_of_belated_refs) { + belated_near_ptr(result, ref_to_mine_belate, bmi.actual_position); + } + } + } +} + +bookmark_id_t explicit_bookmarks::new_range_of_bookmarks(size_t n) { + bookmark_id_t offset = free_bid; + free_bid += n; + for (size_t i = 0; i < n; i++) { + pile.emplace_back(); + } + return offset; +} + +bool explicit_bookmarks::has_landed(bookmark_id_t bm) { + return pile[bm].placed_somewhere; +} + +#undef put_belated_to_res diff --git a/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.h b/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.h new file mode 100644 index 0000000..b5a4b96 --- /dev/null +++ b/src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.h @@ -0,0 +1,63 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_NATURAL_COMPILER_UTILS_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_NATURAL_COMPILER_UTILS_H + +#include +#include +#include + +void write_byte(std::vector& result, uint8_t x); +void write_word(std::vector& result, uint16_t x); +void write_doubleword(std::vector& result, uint32_t x); +void 
write_quadword(std::vector& result, uint64_t x); + +void belated_byte(std::vector& result, size_t pos, uint8_t value); +void belated_word(std::vector& result, size_t pos, uint16_t value); +void belated_doubleword(std::vector& result, size_t pos, uint32_t value); +void belated_quadword(std::vector& result, size_t pos, uint64_t value); + + +void write_sslot_id(std::vector& result, regex_sslot_id_t x); +void write_tai(std::vector& result, regex_tai_t x); +void write_near_ptr(std::vector& result, regex_near_ptr_t x); + +void belated_sslot_id(std::vector& result, size_t pos, regex_sslot_id_t value); +void belated_tai(std::vector& result, size_t pos, regex_tai_t value); +void belated_near_ptr(std::vector& result, size_t pos, regex_near_ptr_t value); + +// constexpr uint64_t INSTRUCTION_SZ = REGEX024_BYTECODE_INSTRUCTION_SZ; +// constexpr uint64_t SSLOT_ID_SZ = REGEX024_BYTECODE_SSLOT_ID_SZ; +// constexpr uint64_t TRACK_ARRAY_INDEX_ID_SZ = REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ; +// constexpr uint64_t NEAR_POINTER_SZ = REGEX024_BYTECODE_NEAR_POINTER_SZ; + +typedef size_t bookmark_id_t; + +struct explicit_bookmark_info { + std::vector positions_of_belated_refs; + bool placed_somewhere = false; + size_t actual_position; +}; + +struct explicit_bookmarks { + bookmark_id_t free_bid = 0; + /* For each named explicit bookmark there is an element in PILE */ + std::vector pile; + + bookmark_id_t new_bookmark(); + + /* bm is the bookmark I refer to. Each bookmark has an id. 
It is like a name, but fits in 8 bytes */ + void write_unresolved_reference(std::vector& result, bookmark_id_t bm); + + /* bm is the bookmark I place into program `result` */ + void land_bookmark(std::vector& result, bookmark_id_t bm); + + /* call it at the very end of bytecode-building */ + void finish(std::vector& result); + + /* Returns offset of range of bookmark id's */ + bookmark_id_t new_range_of_bookmarks(size_t n); + + bool has_landed(bookmark_id_t bm); +}; + + +#endif diff --git a/src/libregexis024fa/graph_to_bytecode/writing_commands.cpp b/src/libregexis024fa/graph_to_bytecode/writing_commands.cpp new file mode 100644 index 0000000..ffc42e3 --- /dev/null +++ b/src/libregexis024fa/graph_to_bytecode/writing_commands.cpp @@ -0,0 +1,75 @@ +#include +#include +#include + +void cmd_JUMP(std::vector& result, explicit_bookmarks& bookmark_manager, bookmark_id_t dest) { + write_byte(result, regex024_opcodes::JUMP); + bookmark_manager.write_unresolved_reference(result, dest); +} + +constexpr regex024_opcode cmp_EQUAL[4] = {regex024_opcodes::JCEQUAL_B, regex024_opcodes::JCEQUAL_W, + regex024_opcodes::JCEQUAL_DW, regex024_opcodes::JCEQUAL_QW}; +constexpr regex024_opcode cmp_LESS[4] = {regex024_opcodes::JCLESS_B, regex024_opcodes::JCLESS_W, + regex024_opcodes::JCLESS_DW, regex024_opcodes::JCLESS_QW}; +constexpr regex024_opcode cmp_GRTR[4] = {regex024_opcodes::JCGRTR_B, regex024_opcodes::JCGRTR_W, + regex024_opcodes::JCGRTR_DW, regex024_opcodes::JCGRTR_QW}; + + +void cmd_JC(const regex024_opcode cmpT[4], + std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) +{ + if (val <= UINT8_MAX) { + write_byte(result, cmpT[0]); + write_byte(result, static_cast(val)); + } else if (val <= UINT16_MAX) { + write_byte(result, cmpT[1]); + write_word(result, static_cast(val)); + } else if (val <= UINT32_MAX) { + write_byte(result, cmpT[2]); + write_doubleword(result, static_cast(val)); + } else { + write_byte(result, cmpT[3]); + 
write_quadword(result, val); + } + bookmark_manager.write_unresolved_reference(result, dest); +} + + +void cmd_JCEQUAL(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) { + cmd_JC(cmp_EQUAL, result, bookmark_manager, val, dest); +} + +void cmd_JCLESS(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) { + cmd_JC(cmp_LESS, result, bookmark_manager, val, dest); +} + +void cmd_JCGRTR(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) { + cmd_JC(cmp_GRTR, result, bookmark_manager, val, dest); +} + +void cmd_DIE(std::vector &result) { + write_byte(result, regex024_opcodes::DIE); +} + +void cmd_MATCH(std::vector &result) { + write_byte(result, regex024_opcodes::MATCH); +} + +void cmd_READ_first_ns(std::vector& result, size_t slot) { + assert(slot <= UINT32_MAX); + write_byte(result, regex024_opcodes::READ); + write_sslot_id(result, slot); +} + +void cmd_FORK(std::vector &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest) { + assert(slot <= UINT32_MAX); + write_byte(result, regex024_opcodes::FORK); + write_sslot_id(result, slot); + bookmark_manager.write_unresolved_reference(result, dest); +} + +void cmd_READ_second_ns(std::vector& result, std::vector& belate_second_read_ns_slot_args) { + write_byte(result, regex024_opcodes::READ); + belate_second_read_ns_slot_args.push_back(result.size()); + write_sslot_id(result, 0); +} diff --git a/src/libregexis024fa/graph_to_bytecode/writing_commands.h b/src/libregexis024fa/graph_to_bytecode/writing_commands.h new file mode 100644 index 0000000..43ed9e1 --- /dev/null +++ b/src/libregexis024fa/graph_to_bytecode/writing_commands.h @@ -0,0 +1,20 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_WRITING_COMMANDS_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_WRITING_COMMANDS_H + +#include +#include + +void cmd_JUMP(std::vector& result, explicit_bookmarks& bookmark_manager, bookmark_id_t 
dest); + +void cmd_JCEQUAL(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest); +void cmd_JCLESS(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest); +void cmd_JCGRTR(std::vector& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest); + +void cmd_DIE(std::vector& result); +void cmd_MATCH(std::vector& result); + +void cmd_READ_first_ns(std::vector& result, size_t slot); +void cmd_READ_second_ns(std::vector& result, std::vector& belate_second_read_ns_slot_args); +void cmd_FORK(std::vector &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest); + +#endif diff --git a/src/libregexis024fa/misc_fa_funcs.cpp b/src/libregexis024fa/misc_fa_funcs.cpp new file mode 100644 index 0000000..3b0496e --- /dev/null +++ b/src/libregexis024fa/misc_fa_funcs.cpp @@ -0,0 +1,71 @@ +#include +#include +#include +#include + +void reattach_fa_node_edge(FA_Node **old_node_ptr, FA_Node *new_node) { + assert(old_node_ptr); + if (*old_node_ptr){ + assert((**old_node_ptr).refs); + (**old_node_ptr).refs--; + } + if (new_node) + new_node->refs++; + *old_node_ptr = new_node; +} + +/* We basically reattch fa.start to node */ +void yay_new_start(FA_Container &fa, FA_NodePathPart *node) { + assert(node); + node->refs++; + node->nxt_node = fa.start; + fa.start = node; +} + +void add_option_to_fork_node(FA_NodeOfForking *fnode, FA_Node *transition_dest) { + fnode->nxt_options.push_back(transition_dest); + if(transition_dest) + transition_dest->refs++; +} + +void reattach_nxt_node(FA_NodePathPart *node, FA_Node *dest) { + reattach_fa_node_edge(&(node->nxt_node), dest); +} + +// todo: get rid of exitf in the whole project +FA_Node* copy_node_no_container_adjustments(FA_Node& node){ + FA_Node* res; + /* Using implicitly defined copy constructors */ +#define typeCase(etype, ctype) case etype: res = new ctype((ctype&)node); break; + switch (node.type) { + typeCase(match, 
FA_NodeOfMatch) + typeCase(one_char_read, FA_NodeOfOneCharRead) + typeCase(forking, FA_NodeOfForking) + typeCase(look_one_behind, FA_NodeOfLookOneBehind) + typeCase(look_one_ahead, FA_NodeOfLookOneAhead) + typeCase(track_array_mov_imm, FA_NodeOfTrackArrayMovImm) + typeCase(track_array_mov_halfinvariant, FA_NodeOfTrackArrayMovHalfinvariant) + typeCase(det_char_crossroads, FA_NodeOfDetCharCrossroads) + default: + assert(false); + } +#undef typeCase + res->refs = 0; + res->search_mark = -1; + return res; +} + +/* If transferring ownership of this new raw pointer to the container fails, the node is destroyed and an exception is thrown */ +FA_Node *copy_fa_node(FA_Node& node, FA_Container &fa) { + FA_Node* res = copy_node_no_container_adjustments(node); + /* Can invalidate pointer res (in which case it also throws an exception, so none of this matters in the end) */ + fa.registerNew(res); + res->reAdd_references(); + return res; +} + +FA_Node *copy_fa_node_to_another_fa(FA_Node& node, FA_Container &resultFa) { + FA_Node* res = copy_node_no_container_adjustments(node); + resultFa.registerNew(res); + return res; +} diff --git a/src/libregexis024fa/misc_fa_funcs.h b/src/libregexis024fa/misc_fa_funcs.h new file mode 100644 index 0000000..7510dd6 --- /dev/null +++ b/src/libregexis024fa/misc_fa_funcs.h @@ -0,0 +1,17 @@ +#ifndef LIBREGEXIS024_MISC_FA_FUNCS_H +#define LIBREGEXIS024_MISC_FA_FUNCS_H + +#include "finite_automaton.h" +#include "fa_first_stage_fix.h" + +FA_Node* copy_fa_node(FA_Node& node, FA_Container& fa); +void yay_new_start(FA_Container& fa, FA_NodePathPart* node); +void reattach_fa_node_edge(FA_Node** old_node_ptr, FA_Node* new_node); +void add_option_to_fork_node(FA_NodeOfForking* fnode, FA_Node* transition_dest); +void reattach_nxt_node(FA_NodePathPart* node, FA_Node* dest); + +/* This is one weird operation. New node in resultFa will still point to nodes in sourceFa, + * without increasing refcount of those nodes. 
YOU HAVE TO FIX IT ASAP */ +FA_Node* copy_fa_node_to_another_fa(FA_Node& node, FA_Container& resultFa); + +#endif //LIBREGEXIS024_MISC_FA_FUNCS_H diff --git a/src/libregexis024fa/selarr_priority_table.cpp b/src/libregexis024fa/selarr_priority_table.cpp new file mode 100644 index 0000000..21f07a9 --- /dev/null +++ b/src/libregexis024fa/selarr_priority_table.cpp @@ -0,0 +1,15 @@ +#include +#include + + +bool RegexPriorityTableAction_Pos::isForRange() const { + return second >= 0; +} + +RegexPriorityTableAction_Pos::RegexPriorityTableAction_Pos(int first, int second, tracking_var_type type): + first(first),second(second), type(type) {} +// + +RegexPriorityTableAction::RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type type): + minimize(minimize), pos(first, second, type) {} +// diff --git a/src/libregexis024fa/selarr_priority_table.h b/src/libregexis024fa/selarr_priority_table.h new file mode 100644 index 0000000..bf1de74 --- /dev/null +++ b/src/libregexis024fa/selarr_priority_table.h @@ -0,0 +1,26 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H + +#include +#include +#include + +struct RegexPriorityTableAction_Pos{ + /* first and second are indexes in selarr (but second can be -1 if it is unused) */ + int first; + int second; + tracking_var_type type; + bool isForRange() const; + + RegexPriorityTableAction_Pos(int first, int second, tracking_var_type type); +}; + +struct RegexPriorityTableAction{ + bool minimize; + RegexPriorityTableAction_Pos pos; + RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type type); +}; + +typedef std::vector RegexPriorityTable; + +#endif //LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H diff --git a/src/libregexis024fa/tracking_fa_nodes.cpp b/src/libregexis024fa/tracking_fa_nodes.cpp new file mode 100644 index 0000000..12a92d0 --- /dev/null +++ 
b/src/libregexis024fa/tracking_fa_nodes.cpp @@ -0,0 +1,53 @@ +#include +#include + +bool isImmMovOpcode(regex024_opcode inst) { + return inst == regex024_opcodes::MOV_COLARR_IMM || inst == regex024_opcodes::MOV_SELARR_IMM; +} + +bool isCurPosMovOpcode(regex024_opcode inst) { + return inst == regex024_opcodes::MOV_COLARR_BTPOS || inst == regex024_opcodes::MOV_SELARR_CHPOS; +} + +bool isColarrOpcode(regex024_opcode inst) { + return inst == regex024_opcodes::MOV_COLARR_IMM || inst == regex024_opcodes::MOV_COLARR_BTPOS; +} + +bool isSelarrOpcode(regex024_opcode inst) { + return inst == regex024_opcodes::MOV_SELARR_IMM || inst == regex024_opcodes::MOV_SELARR_CHPOS; +} + +bool isTrackingFaNode(const FA_Node *n) { + return n->type == track_array_mov_imm || n->type == track_array_mov_halfinvariant; +} + +TrackingOperationInFa::TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key, uint64_t imm_value) + : opcode(opcode), key(key), immValue(imm_value) {} + +TrackingOperationInFa::TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key) + : opcode(opcode), key(key), immValue(0) /* not used by cur-position moves; zeroed so copies never carry an indeterminate value */ {} + +std::string TrackingOperationInFa::toString() const { + switch (opcode){ + case regex024_opcodes::MOV_COLARR_IMM: + return "colarr[" + std::to_string(key) + "] := " + std::to_string(immValue); + case regex024_opcodes::MOV_SELARR_IMM: + return "selarr[" + std::to_string(key) + "] := " + std::to_string(immValue); + case regex024_opcodes::MOV_COLARR_BTPOS: + return "colarr[" + std::to_string(key) + "] := cur byte position"; + case regex024_opcodes::MOV_SELARR_CHPOS: + return "selarr[" + std::to_string(key) + "] := cur char position"; + default: + return "wrong collection operation"; + } +} + +FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa) { + if (isImmMovOpcode(op.opcode)) { + return fa.makeTrackArrayMovImm(op.opcode, op.key, op.immValue); + } + assert(isCurPosMovOpcode(op.opcode)); + return fa.makeTrackArrayMovHalfinvariant(op.opcode, op.key); + +} + +diff --git 
a/src/libregexis024fa/tracking_fa_nodes.h b/src/libregexis024fa/tracking_fa_nodes.h new file mode 100644 index 0000000..618869a --- /dev/null +++ b/src/libregexis024fa/tracking_fa_nodes.h @@ -0,0 +1,31 @@ +#ifndef LIBREGEXIS024_TRACKING_FA_NODES_H +#define LIBREGEXIS024_TRACKING_FA_NODES_H + +#include +#include +#include + +bool isImmMovOpcode(regex024_opcode inst); +bool isCurPosMovOpcode(regex024_opcode inst); +bool isColarrOpcode(regex024_opcode inst); +bool isSelarrOpcode(regex024_opcode inst); + +bool isTrackingFaNode(const FA_Node* n); + +struct TrackingOperationInFa { + regex024_opcode opcode; + regex_tai_t key; + /* Not needed for halfinvariant operations */ + uint64_t immValue; + + TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key, uint64_t imm_value); + + TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key); + + std::string toString() const; +}; + +FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa); + + +#endif diff --git a/src/libregexis024fa/tracking_variables.h b/src/libregexis024fa/tracking_variables.h new file mode 100644 index 0000000..b80f4ec --- /dev/null +++ b/src/libregexis024fa/tracking_variables.h @@ -0,0 +1,14 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_TRACKING_VARIABLES_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_TRACKING_VARIABLES_H + +namespace tracking_var_types { + enum tracking_var_type_I { + range, + dot_cur_pos, + dot_immediate + }; +} + +typedef tracking_var_types::tracking_var_type_I tracking_var_type; + +#endif diff --git a/src/libregexis024sol/backslash_expression.cpp b/src/libregexis024sol/backslash_expression.cpp new file mode 100644 index 0000000..0f97d2b --- /dev/null +++ b/src/libregexis024sol/backslash_expression.cpp @@ -0,0 +1,62 @@ +#include +#include +#include + +uint32_t read_hex(REGEX_IS024_MeaningContext& ctx, int sz){ + uint32_t res = 0; + for (int i = 0; i < sz; i++){ + int32_t ch = peep(ctx); + if ('0' <= ch && ch <= '9') + res = ((res << 4) | ((uint32_t)ch 
- '0')); + else if ('a' <= ch && ch <= 'z') + res = ((res << 4) | ((uint32_t)ch - 'a' + 10)); + else if ('A' <= ch && ch <= 'Z') + res = ((res << 4) | ((uint32_t)ch - 'A' + 10)); + else{ + report(ctx, "escape backslash expression: bad unicode code"); + return 0; + } + readChar(ctx); + } + return res; +} + +void unicode_in_bs_case(REGEX_IS024_MeaningContext &ctx, bool &ret_is_multicode, codeset_t &ret_set, int sz){ + ret_is_multicode = false; + readChar(ctx); + uint32_t hc = read_hex(ctx, sz); // Might create an error + ret_set = codeset_of_one_char(hc); +} + +void +backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, + bool &ret_is_multicode, codeset_t &ret_set) +{ + int32_t leader = peep(ctx); + if (ctx.error) + return; +#define block(l, b, E) case l: ret_is_multicode = b; ret_set = E; readChar(ctx); break; + switch (leader) { + block('s', false, codeset_of_one_char(U' ')) + block('t', false, codeset_of_one_char(U'\t')) + block('n', false, codeset_of_one_char(U'\n')) + block('r', false, codeset_of_one_char(U'\r')) + block('e', true, cc.spaces); + block('E', true, invert_set(cc.spaces)) + block('w', true, cc.word_constituents); + block('W', true, invert_set(cc.word_constituents)); + case 'u': + unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 4); + break; + case 'U': + unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 8); + break; + default: + if (leader >= 0){ + ret_is_multicode = false; + ret_set = codeset_of_one_char(leader); + } else { + report(ctx, "backslash in the wrong place"); + } + } +} \ No newline at end of file diff --git a/src/libregexis024sol/command_expression.cpp b/src/libregexis024sol/command_expression.cpp new file mode 100644 index 0000000..fb61eba --- /dev/null +++ b/src/libregexis024sol/command_expression.cpp @@ -0,0 +1,143 @@ +#include + +#include +#include +#include +#include + +struct ParseCall{ + virtual ~ParseCall() = default; + virtual std::unique_ptr 
afterReceive(REGEX_IS024_MeaningContext& ctx) { assert(false); } + virtual std::unique_ptr firstTime(REGEX_IS024_MeaningContext& ctx) { assert(false); } +}; + +struct Top_ParseCall: public ParseCall{ + Command& res; + explicit Top_ParseCall(Command &res) : res(res) {} + std::unique_ptr firstTime(REGEX_IS024_MeaningContext &ctx) override; + std::unique_ptr afterReceive(REGEX_IS024_MeaningContext &ctx) override; +}; + +struct Bracker_ParseCall: public ParseCall{ + std::vector& res; + bool closingBraceEnded = false; + explicit Bracker_ParseCall(std::vector &res) : res(res) {} + std::unique_ptr argReadProc(REGEX_IS024_MeaningContext& ctx); + std::unique_ptr firstTime(REGEX_IS024_MeaningContext &ctx) override; + std::unique_ptr afterReceive(REGEX_IS024_MeaningContext &ctx) override; +}; + +#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0) +#define call_THROW(str) do { report(ctx, "command expression: " str); return NULL; } while (0) + +std::unique_ptr Top_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) { + assert(readChar(ctx) == U'!'); + int32_t ch = peep(ctx); call_ERROR_CHECK; + if (ch == U'~'){ + /* I assume during construction I received reference to newly initialized struct */ + res.tilda = true; + return NULL; + } + res.name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK; + if (res.name.empty()) + call_THROW("top lvl: no command name specified"); + ch = peep(ctx); call_ERROR_CHECK; + if (ch == U';'){ + readChar(ctx); + return NULL; + } + if (ch == U'{'){ + return std::make_unique(res.arguments); + } + call_THROW("top lvl: command call should be ended with ';' or '{...}'"); +} + +std::unique_ptr Top_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) { + return NULL; +} + +std::unique_ptr Bracker_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) { + assert(readChar(ctx) == U'{'); + return argReadProc(ctx); +} + +std::unique_ptr Bracker_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) { + closingBraceEnded = true; + 
return argReadProc(ctx); +} + +std::unique_ptr Bracker_ParseCall::argReadProc(REGEX_IS024_MeaningContext &ctx) { + repeat: + int32_t ch = peep(ctx); call_ERROR_CHECK; + if (ch == U';'){ + res.emplace_back(); + readChar(ctx); + closingBraceEnded = false; + goto repeat; + } else if (ch == U'}'){ + readChar(ctx); + if (!closingBraceEnded){ + res.emplace_back(); + } + return NULL; + } else if (is_REGEX024_nameConstituent(ch)){ + res.emplace_back(); + res.back().is_empty = false; + res.back().name = tryRead_REGEX024_name(ctx); + int32_t eCh = peep(ctx); call_ERROR_CHECK; + if (eCh == U';'){ + readChar(ctx); + closingBraceEnded = false; + goto repeat; + } else if (eCh == U'{'){ + return std::make_unique(res.back().arguments); + } else if (eCh == U'}'){ + readChar(ctx); + return NULL; + } + call_THROW("brace lvl: argument ends with ';' or {...}"); + } + call_THROW("brace lvl: argument starts with ';' or it's name"); +} + +Command command_expr_parse(REGEX_IS024_MeaningContext &ctx) { + std::vector> callStack; + Command res; + callStack.push_back(std::make_unique(res)); + bool first_time = true; + while (!callStack.empty()){ + if (ctx.error) + return {}; + auto nxt = first_time ? 
callStack.back()->firstTime(ctx) : callStack.back()->afterReceive(ctx); + if (nxt){ + callStack.push_back(std::move(nxt)); + first_time = true; + } else { + callStack.pop_back(); + first_time = false; + } + } + return res; +} + +const char* commands_for_codesets[] = {"word", "space", "digit", "variable", "any", "A", NULL}; + +bool is_command_for_charset(const Command &cmd) { + return !cmd.tilda && cmd.arguments.empty() && is_string_in_stringset(cmd.name.c_str(), commands_for_codesets); +} + +void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command &cmd, codeset_t& ret) +{ + if (cmd.name == "word") + ret = cc.word_constituents; + else if (cmd.name == "space") + ret = cc.spaces; + else if (cmd.name == "digit") + ret = cc.digits; + else if (cmd.name == "variable") + ret = cc.variable_constituents; + else if (cmd.name == "any" || cmd.name == "A") + ret = codeset_of_all; + else + assert(false); +} diff --git a/src/libregexis024sol/common_codesets.cpp b/src/libregexis024sol/common_codesets.cpp new file mode 100644 index 0000000..791c6c7 --- /dev/null +++ b/src/libregexis024sol/common_codesets.cpp @@ -0,0 +1,13 @@ +#include + +CommonCodesets::CommonCodesets() { + spaces = set_add_char(spaces, U'\n'); + spaces = set_add_char(spaces, U' '); + spaces = set_add_char(spaces, U'\t'); + spaces = set_add_char(spaces, U'\r'); + word_constituents = set_add_range(word_constituents, U'a', U'z'); + word_constituents = set_add_range(word_constituents, U'A', U'Z'); + digits = codeset_t({{'0', '9'}}); + variable_constituents = set_add_char(word_constituents, U'-'); + variable_constituents = merge_sets(variable_constituents, digits); +} diff --git a/src/libregexis024sol/common_codesets.h b/src/libregexis024sol/common_codesets.h new file mode 100644 index 0000000..f2d9ee6 --- /dev/null +++ b/src/libregexis024sol/common_codesets.h @@ -0,0 +1,14 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_COMMON_CODESETS_H +#define 
LIBREGEXIS024_SRC_LIBREGEXIS024SOL_COMMON_CODESETS_H + +#include + +struct CommonCodesets { + codeset_t spaces; + codeset_t word_constituents; + codeset_t digits; + codeset_t variable_constituents; + CommonCodesets(); +}; + +#endif diff --git a/src/libregexis024sol/expr_compiler.cpp b/src/libregexis024sol/expr_compiler.cpp new file mode 100644 index 0000000..382154d --- /dev/null +++ b/src/libregexis024sol/expr_compiler.cpp @@ -0,0 +1,280 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* Temporary debug measures */ +#include + +#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0) +#define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0) +#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0) +#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0) + +/* ****************************** Top */ + +const char* dfa_arg_aliases_condone[] = {"forgive", "condone", "okay", "optional", "nonimportant", "ifpossible", NULL}; +const char* dfa_arg_aliases_acerbic[] = {"acerbic", "angry", "pedantic", "nofork", "pure", "important", "fierce", NULL}; + +void dfa_command_processing(REGEX_IS024_MeaningContext &ctx, ParsingContext& pctx, const Command& cmdBuf){ + if (pctx.dfa_cmd_activated){ + report(ctx, "repeating !dfa command"); + return; + } + pctx.dfa_cmd_activated = true; + if (cmdBuf.arguments.empty()) + return; + if (cmdBuf.arguments.size() == 1 && cmdBuf.arguments[0].arguments.empty()){ + const std::string& arg_name = cmdBuf.arguments[0].name; + if (is_string_in_stringset(arg_name.c_str(), dfa_arg_aliases_acerbic)) { + pctx.dfa_cmd_unforgiving = true; + return; + } + if (is_string_in_stringset(arg_name.c_str(), dfa_arg_aliases_condone)) { + pctx.dfa_cmd_nonimportant = true; + return; + } + } + report(ctx, "wrong arguments in !dfa command"); +} + +void 
select_command_processing(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, const Command& cmdBuf){ + if (pctx.select_cmd_encountered) + aux_THROW("repeating !select command"); + pctx.select_cmd_encountered = true; + for (const CommandArgument& arg: cmdBuf.arguments){ + if (arg.is_empty) + aux_THROW("wrong arguments in !select command"); + if (ctx.ktr.track_names.count(arg.name) != 0) + aux_THROW("repeated names in !select command"); + int64_t namedThingId = static_cast(ctx.ktr.track_names.size()); + ctx.ktr.track_names.insert({arg.name, namedThingId}); + ctx.ktr.retrieval_info.emplace_back(); + ctx.ktr.retrieval_info.back().stored_in_sa = true; + ctx.ktr.retrieval_info.back().stored_in_ca = false; + bool mm = false, coll = false; + for (const CommandArgument& argarg: arg.arguments){ +#define mm_shenanigans if (mm) {aux_THROW("bad argument to !select command");} mm = true; + if (argarg.name == "ca" || argarg.name == "col") { + if (coll) + aux_THROW("bad argument to !select command"); + coll = true; + ctx.ktr.retrieval_info.back().stored_in_ca = true; + } else if (argarg.name == "min") { + mm_shenanigans + ctx.ktr.retrieval_info.back().used_in_sifting = true; + ctx.ktr.retrieval_info.back().minimizing = true; + } else if (argarg.name == "max"){ + mm_shenanigans + ctx.ktr.retrieval_info.back().used_in_sifting = true; + } else if (argarg.name == "ign") { + mm_shenanigans + } else { + aux_THROW("wrong parameter for prioritized parameter in !select command"); + } +#undef mm_shenanigans + } + pctx.is_inside_of_these_sa_subexpressions.assign(ctx.ktr.retrieval_info.size(), false); + /* Other info will be filled once a tracking-unit with such name will be actually found in regex */ + } +} + +void jump_into_madness(ctx_t& ctx, ParsingContext& pctx, FA_Container &fa, int hn){ + while (true){ + int32_t pch = peep(ctx); aux_ERROR_CHECK; + if (pch != U'!'){ + return; + } + size_t before_it = ctx.pos; + Command cmd = command_expr_parse(ctx); aux_ERROR_CHECK; + if 
(cmd.tilda){ + ctx.have_comment_tail = true; + ctx.comment_tail_start = ctx.pos; + ctx.pos = ctx.input_size; + } else if (is_header_dfa_cmd(cmd)){ + dfa_command_processing(ctx, pctx, cmd); + } else if (is_header_select_cmd(cmd)){ + if (hn != 1) + aux_THROW("!select command at the wrong place"); + select_command_processing(ctx, pctx, cmd); + } else { + assert(!is_header_cmd(cmd)); + ctx.pos = before_it; + break; + } + } +} + +chekushka TopLvl_ParseCall::firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) { + result.assertDefault(); + jump_into_madness(ctx, pctx, fa, 1); + if (ctx.have_comment_tail) + return NULL; + return std::make_unique(result); +} + +chekushka TopLvl_ParseCall::afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) { + jump_into_madness(ctx, pctx, fa, 2); + if (!isEnd(ctx)) + call_THROW("top lvl: EOF expected"); + return NULL; +} + +/* ********************************* Bracket */ + +chekushka BracketLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { + result.assertDefault(); + { const auto opener = readChar(ctx); assert(opener == U'('); (void)opener; } /* read outside assert(): the char must be consumed even when NDEBUG is defined */ + /* sequence lvl already took care about resolving name and configuring SubtrackingNameInfo */ + if (namedSubexpressionId >= 0){ + assert(ctx.ktr.retrieval_info[namedSubexpressionId].type == tracking_var_types::range); + if (ctx.ktr.retrieval_info[namedSubexpressionId].stored_in_sa){ + assert(namedSubexpressionId < (int64_t)pctx.is_inside_of_these_sa_subexpressions.size()); + if (pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId]) + call_THROW("subexpression that selection array tracks is nested"); + pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = true; + } + } + return std::make_unique(tmp_ret_buff); +} + +chekushka BracketLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { + if (peep(ctx) != U')') + call_THROW("missing ')'"); + readChar(ctx); + result = tmp_ret_buff; + if 
(namedSubexpressionId >= 0) { + SubtrackingNameInfo& tai_slots = ctx.ktr.retrieval_info[namedSubexpressionId]; + if (tai_slots.stored_in_ca){ + assert(tai_slots.colarr_first >= 0 && tai_slots.colarr_first < UINT16_MAX); + assert(tai_slots.colarr_second >= 0 && tai_slots.colarr_second < UINT16_MAX); + result = join(subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( + regex024_opcodes::MOV_COLARR_BTPOS, tai_slots.colarr_first)), result); + result = join(result, subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( + regex024_opcodes::MOV_COLARR_BTPOS, tai_slots.colarr_second))); + } + if (tai_slots.stored_in_sa){ + assert(tai_slots.selarr_first >= 0 && tai_slots.selarr_first < UINT16_MAX); + assert(tai_slots.selarr_second >= 0 && tai_slots.selarr_second < UINT16_MAX); + result = join(subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( + regex024_opcodes::MOV_SELARR_CHPOS, tai_slots.selarr_first)), result); + result = join(result, subexpression_from_path(fa.makeTrackArrayMovHalfinvariant( + regex024_opcodes::MOV_SELARR_CHPOS, tai_slots.selarr_second))); + pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = false; + } + } + return NULL; +} + +/* ******************************* Fork */ + +chekushka ForkLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { + result.assertDefault(); + options.emplace_back(); // Default one contains nothing. 
It will be overwritten + return std::make_unique(options.back()); +} + +chekushka ForkLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) { + int32_t end_reason = peep(ctx); call_ERROR_CHECK; + if (end_reason == U'|'){ + readChar(ctx); + return firstTime(ctx, pctx, fa); + } + result = forkify(options, fa); + return NULL; +} + +void parseBody(REGEX_IS024_MeaningContext& ctx, FA_Container& fa, SubExprCompiled& result, ParsingContext& pctx){ + std::vector> callStack; + callStack.push_back(std::make_unique(result)); + bool first_time = true; + while (!callStack.empty()){ + aux_ERROR_CHECK; + auto nxt = first_time ? callStack.back()->firstTime(ctx, pctx, fa) : \ + callStack.back()->afterReceive(ctx, pctx, fa); + if (nxt){ + callStack.push_back(std::move(nxt)); + first_time = true; + } else { + callStack.pop_back(); + first_time = false; + } + } + /* Generating priority table (sifting program) */ + for (const SubtrackingNameInfo& sni: ctx.ktr.retrieval_info) { + if (!sni.discovered) + aux_THROW("tracking tool named in !select is not used anywhere"); + if (sni.used_in_sifting) { + assert(sni.selarr_first >= 0); + assert((sni.type == tracking_var_types::range) == (sni.selarr_second != -1)); + pctx.priority_table.emplace_back(sni.minimizing, sni.selarr_first, sni.selarr_second, sni.type); + } + } +} + +REGEX_IS024_MeaningContext::REGEX_IS024_MeaningContext(size_t inputSize, const char *input) : input_size(inputSize), + input(reinterpret_cast(input)) { + CommonCodesets codeset_collection; + FA_Container fa; + FA_Container fa_1f; + FA_Container fa_2f; + SubExprCompiled result; + ParsingContext pctx(codeset_collection); + parseBody(*this, fa, result, pctx); + /* CLion gone crazy here. 
It thinks error is always false (It doesn't know about such thing as macros) */ + if (error) + return; + + FA_NodeOfMatch* matcher = fa.makeMatch(); + if (!result.start){ + fa.start = matcher; + } else { + fa.start = result.start; + for (FA_Node** ending: result.ends) + reattach_fa_node_edge(ending, matcher); + } + fa.start->refs++; + + // show_fa_with_sxiv_after_dot(fa, ktr, pctx.priority_table); // todo debug + + REGEX_IS024_FA_FirstStageFixInfo info1 = first_stage_fix_fa(fa, fa_1f); + + // show_fa_with_sxiv_after_dot(fa_1f, ktr, pctx.priority_table); // todo debug + + if (pctx.dfa_cmd_activated) { + int det_err; + int had_to_fork; + try_determinize_fa(fa_1f, pctx.priority_table, free_selarr_tai, info1, fa_2f, det_err, had_to_fork); + if (det_err < 0 && !pctx.dfa_cmd_nonimportant) { + report(*this, "Unable to determinize dfa"); + return; + } + if (pctx.dfa_cmd_unforgiving && had_to_fork < 0) { + report(*this, "Attempt to determinize dfa was not good enough"); + return; + } + } else { + regular_second_stage_fix(fa_1f, fa_2f, info1); + } + + // show_fa_with_sxiv_after_dot(fa_2f, ktr, pctx.priority_table); // todo debug + + int compilation_error; + compile_fa_to_regexis024_bytecode(compiled_program, fa_2f, pctx.priority_table, free_selarr_tai, info1, compilation_error); + if (compilation_error) { + report(*this, "Failed to compile graph representation to bytecode representation"); + return; + } +} diff --git a/src/libregexis024sol/expr_compiler.h b/src/libregexis024sol/expr_compiler.h new file mode 100644 index 0000000..62dcb20 --- /dev/null +++ b/src/libregexis024sol/expr_compiler.h @@ -0,0 +1,34 @@ +#ifndef LIBREGEXIS024_EXPR_COMPILER_H +#define LIBREGEXIS024_EXPR_COMPILER_H + +#include +#include +#include + + +// todo: SUPER HIGHT PRIORITY: MOVE all this spaces digits variable_constituents junk out of this class +// todo: also PLEEEASE, write static before literally nearly every single one little stupid function in this library +#include + +struct 
REGEX_IS024_MeaningContext{ + size_t input_size; + const uint8_t* input; + + bool error = false; + std::string error_msg; + + size_t pos = 0; + + bool have_comment_tail = false; + size_t comment_tail_start; + std::vector compiled_program; + + KnownTrackingTools ktr; + + uint16_t free_selarr_tai = 0; + uint16_t free_colarr_tai = 0; + + REGEX_IS024_MeaningContext(size_t inputSize, const char *input); +}; + +#endif //LIBREGEXIS024_EXPR_COMPILER_H diff --git a/src/libregexis024sol/expr_parse_functions/command_recognition.cpp b/src/libregexis024sol/expr_parse_functions/command_recognition.cpp new file mode 100644 index 0000000..15cae70 --- /dev/null +++ b/src/libregexis024sol/expr_parse_functions/command_recognition.cpp @@ -0,0 +1,34 @@ +#include +#include +#include +#include + +#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0) +#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0) + +const char* header_command_dfa_names[] = {"dfa", "determinize", NULL}; + +const char* header_command_select_names[] = {"s", "select", "selarr", "selectional", NULL}; + +bool is_header_cmd(const Command &cmd) { + return cmd.tilda || is_header_dfa_cmd(cmd), is_header_dfa_cmd(cmd); +} + +bool is_header_dfa_cmd(const Command &cmd) { + return is_string_in_stringset(cmd.name.c_str(), header_command_dfa_names); +} + +bool is_header_select_cmd(const Command &cmd) { + return is_string_in_stringset(cmd.name.c_str(), header_command_select_names); +} + +void int_parse_with_limit_concern(const std::string &str, REGEX_IS024_MeaningContext &ctx, size_t &res, int lim) { + res = 0; + for (char ch: str){ + if (!('0' <= ch && ch <= '9')) + aux_THROW("bad integer argument"); + res = res * 10 + (ch - '0'); + if (res > (size_t)lim) + aux_THROW("integer is too big"); + } +} diff --git a/src/libregexis024sol/expr_parse_functions/command_recognition.h b/src/libregexis024sol/expr_parse_functions/command_recognition.h new file mode 100644 index 0000000..9be9ace --- /dev/null 
+++ b/src/libregexis024sol/expr_parse_functions/command_recognition.h @@ -0,0 +1,13 @@ +/* Internal use only */ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H + +#include + +bool is_header_cmd(const Command& cmd); +bool is_header_dfa_cmd(const Command& cmd); +bool is_header_select_cmd(const Command& cmd); +void int_parse_with_limit_concern(const std::string& str, REGEX_IS024_MeaningContext &ctx, size_t& res, int lim); + + +#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H diff --git a/src/libregexis024sol/expr_parse_functions/ep_sequence.cpp b/src/libregexis024sol/expr_parse_functions/ep_sequence.cpp new file mode 100644 index 0000000..1e4b04f --- /dev/null +++ b/src/libregexis024sol/expr_parse_functions/ep_sequence.cpp @@ -0,0 +1,222 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0) +#define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0) +#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0) +#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0) + +/* **************************** Sequence */ + +void in_case_of_backslash(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, FA_Container &fa, SubExprCompiled& backPart) { + assert(readChar(ctx) == U'\\'); + int32_t leader = peep(ctx); aux_ERROR_CHECK; + if (leader == U'b'){ + FA_NodeOfForking* n1 = fa.makeForking(); + FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(invert_set(cc.word_constituents)); + FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents); + reattach_nxt_node(n1a, n2a); + FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(cc.word_constituents); + FA_NodeOfLookOneAhead* n2b = 
fa.makeLookOneAhead(invert_set(cc.word_constituents)); + reattach_nxt_node(n1b, n2b); + add_option_to_fork_node(n1, n1a); + add_option_to_fork_node(n1, n1b); + backPart.start = n1; + backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)}; + } else if (leader == U'B'){ + FA_NodeOfForking* n1 = fa.makeForking(); + FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(cc.word_constituents); + FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents); + reattach_nxt_node(n1a, n2a); + FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(invert_set(cc.word_constituents)); + FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents)); + reattach_nxt_node(n1b, n2b); + add_option_to_fork_node(n1, n1a); + add_option_to_fork_node(n1, n1b); + backPart.start = n1; + backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)}; + } else if (leader == U'<'){ + FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(invert_set(cc.word_constituents)); + FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(cc.word_constituents); + reattach_nxt_node(n1, n2); + backPart.start = n1; + backPart.ends = {&(n2->nxt_node)}; + } else if (leader == U'>'){ + FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(cc.word_constituents); + FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(invert_set(cc.word_constituents)); + reattach_nxt_node(n1, n2); + backPart.start = n1; + backPart.ends = {&(n2->nxt_node)}; + } else { + bool ret_is_multicode; codeset_t res_codeset; + backslash_expression_parsing_try_regular(ctx, cc, ret_is_multicode, res_codeset); + backPart = subexpr_charset_reading_filter(res_codeset, fa); + return; // To avoid reading leader again (it gets read in the end) + } + readChar(ctx); +} + +void repeat_stuff_with_check(REGEX_IS024_MeaningContext& ctx, + SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed){ + if (min_allowed > max_allowed) + aux_THROW("repeat operation: min > max"); + if (min_allowed > REGEXIS024_MAX_REPEAT) + aux_THROW("minimum 
repeat factor is too high"); + if (max_allowed > REGEXIS024_MAX_REPEAT && patient.can_be_empty) + aux_THROW("safety abortion: possible infinite loop. Если вы считаете, что ваше регулярное " + "выражение корректно и не вызвает бесконечного цикла, напишите об этом в жалобную книгу: " + "По ссылке: file:///dev/null Ваши предложения по улучшению libregexis024 обязательно будут рассмотрены."); + apply_repeat_to_subexpression(patient, fa, min_allowed, max_allowed); +} + +void repeat_command_processing(REGEX_IS024_MeaningContext &ctx, FA_Container &fa, std::vector& parts, + const Command& cmd){ + if (parts.empty()) + aux_THROW("no subexpression before !repeat command"); + if (cmd.arguments.empty() || (cmd.arguments.size() == 1 && cmd.arguments[0].is_empty)) { + repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); aux_ERROR_CHECK; + } else if (cmd.arguments.size() == 1){ + size_t mm; + int_parse_with_limit_concern(cmd.arguments[0].name, ctx, mm, REGEXIS024_MAX_REPEAT); aux_ERROR_CHECK; + repeat_stuff_with_check(ctx, parts.back(), fa, mm, mm); aux_ERROR_CHECK; + } else if (cmd.arguments.size() > 2){ + aux_THROW("too many arguments in !repeat command"); + } else { + size_t min_allowed, max_allowed; + if (cmd.arguments[0].is_empty){ + min_allowed = 0; + } else { + int_parse_with_limit_concern(cmd.arguments[0].name, ctx, min_allowed, REGEXIS024_MAX_REPEAT); + aux_ERROR_CHECK; + } + if (cmd.arguments[1].is_empty){ + max_allowed = REGEXIS024_MAX_REPEAT + 1; + } else { + int_parse_with_limit_concern(cmd.arguments[1].name, ctx, max_allowed, REGEXIS024_MAX_REPEAT); + aux_ERROR_CHECK; + } + if (min_allowed > max_allowed) + aux_THROW("!repeat: min > max"); + repeat_stuff_with_check(ctx, parts.back(), fa, min_allowed, max_allowed); aux_ERROR_CHECK; + } +} + + +chekushka Sequence_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) { + while (true) { + int32_t fst = peep(ctx); + call_ERROR_CHECK; + if (fst == U'!') { + 
Command cmdBuf; + size_t before_cmd = ctx.pos; + cmdBuf = command_expr_parse(ctx); + call_ERROR_CHECK; + if (is_header_cmd(cmdBuf)){ + ctx.pos = before_cmd; + break; + } else if (cmdBuf.name == "r" || cmdBuf.name == "repeat"){ + repeat_command_processing(ctx, fa, parts, cmdBuf); call_ERROR_CHECK; + } else if (is_command_for_charset(cmdBuf)){ + codeset_t cs; + interpret_command_as_charset_giving(pctx.cc, cmdBuf, cs); call_ERROR_CHECK; + parts.push_back(subexpr_charset_reading_filter(cs, fa)); + } else { + call_THROW("unknown command"); + } + } else if (fst == U'\\') { + parts.emplace_back(); + in_case_of_backslash(ctx, pctx.cc, fa, parts.back()); + call_ERROR_CHECK; + } else if (fst == U'^'){ + readChar(ctx); + parts.push_back(subexpression_from_path(fa.makeLookOneBehind(codeset_of_one_char(U'\n')))); + } else if (fst == U'$'){ + readChar(ctx); + parts.push_back(subexpression_from_path(fa.makeLookOneAhead(codeset_of_one_char(U'\n')))); + } else if (fst == U'*'){ +#define vibe_check(sn) if (parts.empty()) { call_THROW("no subexpression before `" sn "` operator"); } readChar(ctx); + vibe_check("*") + repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK; + } else if (fst == U'+'){ + vibe_check("+") + repeat_stuff_with_check(ctx, parts.back(), fa, 1, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK; + } else if (fst == U'?'){ + vibe_check("?") + repeat_stuff_with_check(ctx, parts.back(), fa, 0, 1); call_ERROR_CHECK; +#undef vibe_check + } else if (fst == U'#'){ + readChar(ctx); + std::string name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK; + if (name.empty()) + call_THROW("No name provided after #"); + if (ctx.ktr.track_names.count(name) == 0){ + ctx.ktr.track_names[name] = static_cast(ctx.ktr.retrieval_info.size()); + ctx.ktr.retrieval_info.emplace_back(); + } + int64_t id = ctx.ktr.track_names[name]; + int32_t typeDet = peep(ctx); + if (typeDet == U'('){ + ensure_space_for_track_unit(ctx, name, tracking_var_types::range); 
call_ERROR_CHECK; + parts.emplace_back(); + return std::make_unique(parts.back(), id); + } else if (typeDet == U':'){ + ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_immediate); call_ERROR_CHECK; + readChar(ctx); + std::string value_str = tryRead_REGEX024_name(ctx); + size_t value; + int_parse_with_limit_concern(value_str, ctx, value, UINT16_MAX); + int32_t cl = peep(ctx); + if (cl != U';') + call_THROW("Missing ; after dot track unit operator"); + readChar(ctx); + if (ctx.ktr.retrieval_info[id].stored_in_sa) + parts.emplace_back(subexpression_from_path( + fa.makeTrackArrayMovImm(regex024_opcodes::MOV_SELARR_IMM, + ctx.ktr.retrieval_info[id].selarr_first, value))); + if (ctx.ktr.retrieval_info[id].stored_in_ca) + parts.emplace_back(subexpression_from_path( + fa.makeTrackArrayMovImm(regex024_opcodes::MOV_COLARR_IMM, + ctx.ktr.retrieval_info[id].colarr_first, value))); + } else if (typeDet == U';'){ + ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_cur_pos); call_ERROR_CHECK; + readChar(ctx); + if (ctx.ktr.retrieval_info[id].stored_in_sa) + parts.emplace_back(subexpression_from_path( + fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS, + ctx.ktr.retrieval_info[id].selarr_first))); + if (ctx.ktr.retrieval_info[id].stored_in_ca) + parts.emplace_back(subexpression_from_path( + fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS, + ctx.ktr.retrieval_info[id].colarr_first))); + } else + call_THROW("Missing ; or ( in the beginning of tracking unit"); + } else if (fst == U'(') { + parts.emplace_back(); + return std::make_unique(parts.back(), -1); + } else if (fst == U'[') { + codeset_t filter = sq_bracket_expr_parse(ctx, pctx.cc); call_ERROR_CHECK; + parts.push_back(subexpr_charset_reading_filter(filter, fa)); + } else if (fst >= 0 && fst != U')' && fst != U'|' && fst != U']'){ + readChar(ctx); + parts.push_back(subexpr_charset_reading_filter(codeset_of_one_char(fst), fa)); + } else { + break; + } + } + for 
(SubExprCompiled& part: parts) + result = join(result, part); + return NULL; +} + +chekushka Sequence_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) { + // This is possible only if I received a bracket expression + return firstTime(ctx, pctx, fa); +} diff --git a/src/libregexis024sol/expr_parse_functions/epf.h b/src/libregexis024sol/expr_parse_functions/epf.h new file mode 100644 index 0000000..8b04132 --- /dev/null +++ b/src/libregexis024sol/expr_parse_functions/epf.h @@ -0,0 +1,74 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H +/* For internal usage only */ + +#include +#include +#include +#include +#include +#include +#include + +struct ParsingContext{ + /* Those subexpressions, that are tracket by s`a are forbidden from nesting inside themselves */ + std::vector is_inside_of_these_sa_subexpressions; + bool select_cmd_encountered = false; + RegexPriorityTable priority_table; + bool dfa_cmd_activated = false; + /* Completely failing to build dfa with this flag on will result in no error */ + bool dfa_cmd_nonimportant = false; + /* With this flag, your dfa should be absolutely pure, no forks are allowed. */ + bool dfa_cmd_unforgiving = false; + + /* Reference to active cc set (actually, there is only one cc, but who cares, I placed + * it here to lower the number of arguments in ParseCall methods, again WHO CARES?) 
*/ + const CommonCodesets& cc; + explicit ParsingContext(const CommonCodesets& cc_): cc(cc_){} + }; + +typedef REGEX_IS024_MeaningContext ctx_t; +struct ParseCall; +typedef std::unique_ptr chekushka; +struct ParseCall{ + SubExprCompiled& result; + explicit ParseCall(SubExprCompiled &result) : result(result) {} + virtual ~ParseCall() = default; + virtual chekushka afterReceive(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); } + virtual chekushka firstTime(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); } +}; + +struct TopLvl_ParseCall: public ParseCall{ + explicit TopLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override; + chekushka firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override; +}; + +struct BracketLvl_ParseCall: public ParseCall{ + /* -1 if this is a normal bracket expression. Otherwise, it is an index in ctx.retrieval_info vector */ + int64_t namedSubexpressionId; + SubExprCompiled tmp_ret_buff; + explicit BracketLvl_ParseCall(SubExprCompiled& result, int64_t namedSubexpressionId) : + ParseCall(result), namedSubexpressionId(namedSubexpressionId) {} + chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override; + chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override; +}; + +struct ForkLvl_ParseCall: public ParseCall{ + std::vector options; + explicit ForkLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {} + chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); + chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); +}; + +struct Sequence_ParseCall: public ParseCall{ + std::vector parts; + explicit Sequence_ParseCall(SubExprCompiled &result) :ParseCall(result) {} + chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, 
ParsingContext& pctx, FA_Container& fa); + chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa); +}; + +/* Some auxilary functions */ + + +#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H diff --git a/src/libregexis024sol/expr_parse_functions/tracking_units.cpp b/src/libregexis024sol/expr_parse_functions/tracking_units.cpp new file mode 100644 index 0000000..546edba --- /dev/null +++ b/src/libregexis024sol/expr_parse_functions/tracking_units.cpp @@ -0,0 +1,38 @@ +#include +#include + +#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0) +#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0) + + +void for_one_type(REGEX_IS024_MeaningContext &ctx, uint16_t& free_ARR_tai, int& ARR_first, int& ARR_second, + const std::string& ARR_NAME, tracking_var_type type){ +#define check_is_available() if (free_ARR_tai == UINT16_MAX) { \ + report(ctx, ("regex: " + ARR_NAME + ": key namespace overflow").c_str()); return;} + check_is_available() + ARR_first = free_ARR_tai++; + if (type == tracking_var_types::range){ + check_is_available() + ARR_second = free_ARR_tai++; + } +} + +void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type type) { + size_t id = ctx.ktr.track_names[name]; + /* Size of this verctor won't be changed. 
THis is a safe reference */ + SubtrackingNameInfo& info = ctx.ktr.retrieval_info[id]; + if (!info.discovered){ + info.type = type; + if (info.stored_in_ca) { + for_one_type(ctx, ctx.free_colarr_tai, info.colarr_first, info.colarr_second, "collection array", type); + aux_ERROR_CHECK; + } + if (info.stored_in_sa) { + for_one_type(ctx, ctx.free_selarr_tai, info.selarr_first, info.selarr_second, "selection array", type); + aux_ERROR_CHECK; + } + info.discovered = true; + } else if (info.type != type){ + aux_THROW("tracking tool unit type mismatch"); + } +} diff --git a/src/libregexis024sol/expr_parse_functions/tracking_units.h b/src/libregexis024sol/expr_parse_functions/tracking_units.h new file mode 100644 index 0000000..74e5f22 --- /dev/null +++ b/src/libregexis024sol/expr_parse_functions/tracking_units.h @@ -0,0 +1,10 @@ +/* For internal use only */ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H + +#include + +void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type type); + + +#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H diff --git a/src/libregexis024sol/part_of_expr_that_tracks.cpp b/src/libregexis024sol/part_of_expr_that_tracks.cpp new file mode 100644 index 0000000..db5928f --- /dev/null +++ b/src/libregexis024sol/part_of_expr_that_tracks.cpp @@ -0,0 +1,2 @@ +// #include + diff --git a/src/libregexis024sol/part_of_expr_that_tracks.h b/src/libregexis024sol/part_of_expr_that_tracks.h new file mode 100644 index 0000000..9aaf8a0 --- /dev/null +++ b/src/libregexis024sol/part_of_expr_that_tracks.h @@ -0,0 +1,31 @@ +#ifndef PART_OF_EXPR_THAT_TRACKS_H +#define PART_OF_EXPR_THAT_TRACKS_H + +#include +#include +#include +#include + +struct SubtrackingNameInfo{ + bool stored_in_ca = true; + bool stored_in_sa = false; + + bool discovered = false; + tracking_var_type type; + /* These fields will be -1 if unused */ + int colarr_first = 
-1; + int colarr_second = -1; + + bool used_in_sifting = false; + bool minimizing = false; + int selarr_first = -1; + int selarr_second = -1; +}; + +struct KnownTrackingTools { + std::map track_names; + std::vector retrieval_info; +}; + + +#endif //PART_OF_EXPR_THAT_TRACKS_H diff --git a/src/libregexis024sol/sol_misc_base.cpp b/src/libregexis024sol/sol_misc_base.cpp new file mode 100644 index 0000000..a0177cc --- /dev/null +++ b/src/libregexis024sol/sol_misc_base.cpp @@ -0,0 +1,55 @@ +#include +#include + +void report(REGEX_IS024_MeaningContext &ctx, const char *error) { + if (!ctx.error){ + ctx.error = true; + ctx.error_msg = error; + } +} + +bool isEnd(REGEX_IS024_MeaningContext &ctx) { + return ctx.pos == ctx.input_size; +} + +int32_t peep(REGEX_IS024_MeaningContext &ctx) { +// printf("pos = %lu\n", ctx.pos); + if (isEnd(ctx)) + return -1; // This is probably the only place where getting negative return does not generate error + int32_t cp; size_t sz; + utf8_string_iterat(cp, sz, ctx.pos, ctx.input, ctx.input_size); + if (cp < 0) + report(ctx, "encoding error"); + return cp; +} + +int32_t readChar(REGEX_IS024_MeaningContext &ctx) { +// printf("READ pos = %lu\n", ctx.pos); + int32_t cp; size_t sz; + utf8_string_iterat(cp, sz, ctx.pos, ctx.input, ctx.input_size); + if (cp >= 0) + ctx.pos += sz; + else + report(ctx, "bruh what?? 
How this even happened"); + return cp; +} + +bool is_REGEX024_nameConstituent(int32_t ch) { + return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'); +} + +std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext &ctx) { + std::string res; + while (true){ + int32_t ch = peep(ctx); + if (is_REGEX024_nameConstituent(ch)){ + res += (char)ch; + readChar(ctx); + } else { + break; + } + } + return res; +} + + diff --git a/src/libregexis024sol/sol_misc_base.h b/src/libregexis024sol/sol_misc_base.h new file mode 100644 index 0000000..8ecb3c1 --- /dev/null +++ b/src/libregexis024sol/sol_misc_base.h @@ -0,0 +1,20 @@ +/* DO NOT INCLUDE THIS FILE OUTSIDE libregexis024sol implementation */ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H + +#include +#include + +void report(REGEX_IS024_MeaningContext& ctx, const char* error); + +bool isEnd(REGEX_IS024_MeaningContext& ctx); +int32_t peep(REGEX_IS024_MeaningContext& ctx); +int32_t readChar(REGEX_IS024_MeaningContext& ctx); + + +bool is_REGEX024_nameConstituent(int32_t ch); +/* Name in my library consists of [0-9a-zA-Z]. If the first peeped letter is not name constituent, + * empty string is returned */ +std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext& ctx); + +#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H diff --git a/src/libregexis024sol/special_terminals.h b/src/libregexis024sol/special_terminals.h new file mode 100644 index 0000000..ab10fcb --- /dev/null +++ b/src/libregexis024sol/special_terminals.h @@ -0,0 +1,36 @@ +/* DO NOT INCLUDE THIS FILE OUTSIDE libregexis024sol implementation */ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H + +#include +#include + +/* This option of backslash usage should be checked last. + * Function can generate error. 
Always check the error first */ +void +backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc, + bool& ret_is_multicode, codeset_t& ret_set); + +struct CommandEntity; +struct Command; +struct CommandArgument; + +struct CommandEntity{ + std::string name; + std::vector arguments; +}; + +struct CommandArgument: CommandEntity{ + bool is_empty = true; +}; + +struct Command: CommandEntity{ + bool tilda = false; +}; + +/* Zlaya sobaka. Kidaet oshibki v context */ +Command command_expr_parse(REGEX_IS024_MeaningContext& ctx); +bool is_command_for_charset(const Command& cmd); +void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command& cmd, codeset_t& ret); + +#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H diff --git a/src/libregexis024sol/square_bracket_expression.cpp b/src/libregexis024sol/square_bracket_expression.cpp new file mode 100644 index 0000000..f2aaad2 --- /dev/null +++ b/src/libregexis024sol/square_bracket_expression.cpp @@ -0,0 +1,189 @@ +#include +#include +#include + +#include +#include +#include + +/* Can allow backslash (later should check that backslash expression is not multicharar or empty */ +bool soundsLikeCharOrRangeStart(int32_t peeped) { + return peeped >= 0 && (peeped != U'[' && peeped != U']' && peeped != U'!' 
&& \ + peeped != '^' && peeped != '&' && peeped != '-'); +} + +typedef REGEX_IS024_MeaningContext ctx_t; + +struct ParseCall; +typedef std::shared_ptr chekushka; + +struct ParseCall{ + codeset_t& result; + + explicit ParseCall(codeset_t &result) : result(result) {} + virtual ~ParseCall() = default; + virtual chekushka afterReceive(ctx_t& ctx, const CommonCodesets& cc) { assert(false); } + virtual chekushka firstTime(ctx_t& ctx, const CommonCodesets& cc) { assert(false); } +}; + +#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0) +#define call_THROW(str) do { report(ctx, "square bracket expression: " str); return NULL; } while (0) + +/* [...] */ +struct ZeroLvl_ParseCall: public ParseCall{ + explicit ZeroLvl_ParseCall(codeset_t &result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; + chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; +}; + +/* ...&...&... */ +struct FirstLvl_ParseCall: public ParseCall{ + codeset_t ret_buf_for_new; + bool got_one = false; + explicit FirstLvl_ParseCall(codeset_t& result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; + chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; +}; + +/* ab[]vgd[]eyo[]zhz */ +struct SecondLvl_ParseCall: public ParseCall{ + codeset_t ret_buf_for_new; + explicit SecondLvl_ParseCall(codeset_t& result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; + chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; +}; + +/* ^... 
*/ +struct CircumflexLvl_ParseCall: public ParseCall{ + codeset_t ret_buf_for_new; + explicit CircumflexLvl_ParseCall(codeset_t& result) : ParseCall(result) {} + chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override; + chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override; +}; + +/* ********* ZeroLvl_ParseCall ********** */ + +chekushka ZeroLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { + assert(readChar(ctx) == U'['); + return std::make_shared(result); +} + +chekushka ZeroLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { + if (peep(ctx) != U']') + call_THROW("lvl 0: missing ]"); + readChar(ctx); + return NULL; +} + +/* ********* FirstLvl_ParseCall ********** */ + +chekushka FirstLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { + return std::make_shared(result); +} + +chekushka FirstLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { + if (got_one) + result = intersect_sets(result, ret_buf_for_new); + else + got_one = true; + if (peep(ctx) == U'&'){ + readChar(ctx); + return std::make_shared(ret_buf_for_new); + } + return NULL; +} + +/* ********* SecondLvl_ParseCall ********** */ + +chekushka SecondLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { + repeat: + int32_t ch = peep(ctx); call_ERROR_CHECK; + if (ch == U'^'){ + return std::make_shared(ret_buf_for_new); + } else if (ch == U'!'){ + Command cmd = command_expr_parse(ctx); call_ERROR_CHECK; + if (!is_command_for_charset(cmd)) + call_THROW("second lvl: illegal command"); + interpret_command_as_charset_giving(cc, cmd, ret_buf_for_new); + result = merge_sets(result, ret_buf_for_new); + goto repeat; + } else if (ch == U'['){ + return std::make_shared(ret_buf_for_new); + } else if (soundsLikeCharOrRangeStart(ch)){ + readChar(ctx); + bool bs_multicode; + codeset_t bs_stuff; + + if (ch == '\\'){ + backslash_expression_parsing_try_regular(ctx, cc, bs_multicode, bs_stuff); + if (bs_multicode){ + result = 
merge_sets(result, bs_stuff); + goto repeat; + } else { + ret_buf_for_new = codeset_of_one_char(bs_stuff[0].first); + } + } else { + ret_buf_for_new = codeset_of_one_char(ch); + } + int32_t mCh = peep(ctx); call_ERROR_CHECK; + if (mCh == U'-'){ + readChar(ctx); + int32_t scnd = peep(ctx); call_ERROR_CHECK; + readChar(ctx); + if (scnd == U'\\'){ + backslash_expression_parsing_try_regular(ctx, cc, bs_multicode, bs_stuff); + if (bs_multicode) + call_THROW("second lvl: char range: bad escape expression after hyphen"); + ret_buf_for_new[0].second = bs_stuff[0].first; + } else if (soundsLikeCharOrRangeStart(scnd)){ + ret_buf_for_new[0].second = (uint32_t)scnd; + } else { + call_THROW("second lvl: char range: bad value after hyphen"); + } + if (ret_buf_for_new[0].second < ret_buf_for_new[0].first) + call_THROW("second: lvl: char range: invalid range"); + } + result = merge_sets(result, ret_buf_for_new); + goto repeat; + } + return NULL; +} + +chekushka SecondLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { + result = merge_sets(result, ret_buf_for_new); + return firstTime(ctx, cc); +} + +/* ********* CircumflexLvl_ParseCall ********* */ + +chekushka CircumflexLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) { + assert(readChar(ctx) == U'^'); + return std::make_shared(ret_buf_for_new); +} + +chekushka CircumflexLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) { + result = invert_set(ret_buf_for_new); + return NULL; +} + + +/* Aaaaaaaaand... The function we have all been waiting for so long! */ +codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc) { + std::vector> callStack; + codeset_t res; + callStack.push_back(std::make_shared(res)); + bool first_time = true; + while (!callStack.empty()){ + if (ctx.error) + return {}; + auto nxt = first_time ? 
callStack.back()->firstTime(ctx, cc) : callStack.back()->afterReceive(ctx, cc); + if (nxt){ + callStack.push_back(nxt); + first_time = true; + } else { + callStack.pop_back(); + first_time = false; + } + } + return res; +} diff --git a/src/libregexis024sol/square_bracket_expression.h b/src/libregexis024sol/square_bracket_expression.h new file mode 100644 index 0000000..463ca10 --- /dev/null +++ b/src/libregexis024sol/square_bracket_expression.h @@ -0,0 +1,10 @@ +/* DO NOT INCLUDE THIS FILE OUTSIDE libregexis024sol implementation */ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H + +#include +#include + +codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc); + +#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H diff --git a/src/libregexis024sol/subexpr_fa_transformed.cpp b/src/libregexis024sol/subexpr_fa_transformed.cpp new file mode 100644 index 0000000..cc83d3b --- /dev/null +++ b/src/libregexis024sol/subexpr_fa_transformed.cpp @@ -0,0 +1,184 @@ +#include +#include +#include +#include + +SubExprCompiled subexpr_charset_reading_filter(const codeset_t &codeset, FA_Container &fa) { + return subexpression_from_path(fa.makeOneCharRead(codeset, false)); +} + +SubExprCompiled join(const SubExprCompiled &A, const SubExprCompiled &B) { + if (!A.start) + return B; + if (!B.start) + return A; + SubExprCompiled res; + res.start = A.start; + for (FA_Node** ptrToptr : A.ends) + reattach_fa_node_edge(ptrToptr, B.start); + res.ends = B.ends; + res.can_be_empty = A.can_be_empty && B.can_be_empty; + return res; +} + +SubExprCompiled subexpression_from_path(FA_NodePathPart *node) { + SubExprCompiled res; + res.start = node; + res.ends.push_back(&(node->nxt_node)); + /* There is only one char reading path node type */ + res.can_be_empty = (node->type != one_char_read); + return res; +} + +SubExprCompiled RobertAngier(const 
SubExprCompiled& source, FA_Container& fa) { + SubExprCompiled res; + if (!source.start) + return res; + + struct Marked{ + FA_Node *original = NULL, *clone = NULL; + explicit Marked(FA_Node *original) : original(original) {} + }; + std::vector searched; + searched.push_back(Marked(source.start)); + source.start->search_mark = 0; + + for (size_t done = 0; done < searched.size(); done++){ + FA_Node& v = *searched[done].original; + searched[done].clone = copy_fa_node(v, fa); + for (FA_Node **nxtN: searched[done].clone->get_all_transitions()){ + if (!(*nxtN)) + res.ends.push_back(nxtN); + else if ((**nxtN).search_mark < 0){ + (**nxtN).search_mark = (int64_t)searched.size(); + searched.emplace_back(*nxtN); + } + } + } + res.start = searched[0].clone; + for (Marked& mrkd: searched){ + for (FA_Node **nxtN: mrkd.clone->get_all_transitions()){ + if (*nxtN){ + assert((**nxtN).search_mark >= 0); + Marked& proc_nxt = searched[(**nxtN).search_mark]; + reattach_fa_node_edge(nxtN, proc_nxt.clone); + } + } + } + for (Marked& mrkd: searched) + mrkd.original->search_mark = -1; + return res; +} + +void reattach_all_ends_to_one_node(SubExprCompiled& patient, FA_Node* node){ + assert(node); + assert(patient.start); + for (FA_Node** end: patient.ends){ + assert(!(*end)); + printf("DEBUG %lu->->->->->%lu\n", patient.start->nodeId, node->nodeId); + reattach_fa_node_edge(end, node); + } +} + +void apply_repeat_to_subexpression(SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed) { + assert(min_allowed <= max_allowed && min_allowed <= REGEXIS024_MAX_REPEAT); + if (!patient.start) + return; + bool infinite_repeat = max_allowed > REGEXIS024_MAX_REPEAT; + if (min_allowed == 0 && max_allowed == 0){ + patient = {}; + } else if (min_allowed == 1 && max_allowed == 1){ + /* Chill */ + } else if (min_allowed == 0 && infinite_repeat){ + FA_NodeOfForking* fn = fa.makeForking(); + add_option_to_fork_node(fn, patient.start); + for (FA_Node** old_end: patient.ends) + 
reattach_fa_node_edge(old_end, fn); + add_option_to_fork_node(fn, NULL); + patient.start = fn; + patient.ends = {&(fn->nxt_options[1])}; + } else if (min_allowed == 1 && infinite_repeat) { + FA_NodeOfForking* fn = fa.makeForking(); + reattach_all_ends_to_one_node(patient, fn); + add_option_to_fork_node(fn, patient.start); + add_option_to_fork_node(fn, NULL); + patient.ends = {&(fn->nxt_options[1])}; + } else if (min_allowed == 0 && max_allowed == 1){ + FA_NodeOfForking* fn = fa.makeForking(); + add_option_to_fork_node(fn, patient.start); + add_option_to_fork_node(fn, NULL); + patient.start = fn; + patient.ends.push_back(&(fn->nxt_options[1])); + } else if (infinite_repeat) { + std::vector Colon(min_allowed); + Colon[0] = patient; + for (size_t i = 1; i < min_allowed; i++) + Colon[i] = RobertAngier(patient, fa); + FA_NodeOfForking* fn = fa.makeForking(); + for (size_t i = 0; i + 1 < min_allowed; i++) + reattach_all_ends_to_one_node(Colon[i], Colon[i + 1].start); + reattach_all_ends_to_one_node(Colon[min_allowed - 1], fn); + add_option_to_fork_node(fn, Colon[min_allowed - 1].start); + add_option_to_fork_node(fn, NULL); + /* patient.start is the same (the original is at Colon[0] */ + patient.ends = {&(fn->nxt_options[1])}; + } else { + std::vector Avenue(max_allowed); + Avenue[max_allowed - 1] = patient; + for (size_t i = 0; i < max_allowed - 1; i++) + Avenue[i] = RobertAngier(patient, fa); + for (size_t i = 0; i + 1 < max_allowed; i++) + reattach_all_ends_to_one_node(Avenue[i], Avenue[i + 1].start); + FA_NodeOfForking* fn = fa.makeForking(); + if (min_allowed > 0){ + for (size_t i = 0; i <= max_allowed - min_allowed; i++) + add_option_to_fork_node(fn, Avenue[i].start); + } else { + for (size_t i = 0; i < max_allowed; i++) + add_option_to_fork_node(fn, Avenue[i].start); + add_option_to_fork_node(fn, NULL); + patient.ends.push_back(&(fn->nxt_options[max_allowed])); + } + patient.start = fn; + /* patient.ends is the same (the original is Avenue.back()) */ + } + if 
(min_allowed == 0) + patient.can_be_empty = true; +} + +SubExprCompiled forkify(const std::vector &options, FA_Container& fa){ + SubExprCompiled result; + size_t non_empty = 0; + result.can_be_empty = false; + for (const SubExprCompiled& opt: options){ + result.can_be_empty |= opt.can_be_empty; + if (opt.start) + non_empty++; + } + if (non_empty == 0){ + result.can_be_empty = true; + return result; + } + if (non_empty == 1){ + for (const SubExprCompiled& opt: options) + if (opt.start){ + result = opt; + break; + } + } else { + FA_NodeOfForking* n1 = fa.makeForking(); + result.start = n1; + n1->nxt_options.reserve(non_empty); + for (const SubExprCompiled& opt: options) + if (opt.start){ + add_option_to_fork_node(n1, opt.start); + for (FA_Node** end: opt.ends) + result.ends.push_back(end); + } + } + return result; +} + +void SubExprCompiled::assertDefault() { + assert(!start && ends.empty() && can_be_empty); +} diff --git a/src/libregexis024sol/subexpr_fa_transformed.h b/src/libregexis024sol/subexpr_fa_transformed.h new file mode 100644 index 0000000..a267468 --- /dev/null +++ b/src/libregexis024sol/subexpr_fa_transformed.h @@ -0,0 +1,32 @@ +#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H + +#include + +struct SubExprCompiled{ + FA_Node* start = NULL; + /* After putting there values from neighbour vectors in nodes, these vectors must not change size */ + std::vector ends; + bool can_be_empty = true; + + void assertDefault(); +}; + +SubExprCompiled subexpr_charset_reading_filter(const codeset_t& codeset, FA_Container& fa); + +SubExprCompiled join(const SubExprCompiled& A, const SubExprCompiled& B); + +SubExprCompiled forkify(const std::vector& options, FA_Container& fa); + +SubExprCompiled subexpression_from_path(FA_NodePathPart* node); + +/* And then Robert Angier said `It's prestige time` and prestiged all over the place. 
+ * If you still don't get it, this function copies section of NFA of regexp */ +SubExprCompiled RobertAngier(const SubExprCompiled& source, FA_Container& fa); + +#define REGEXIS024_MAX_REPEAT 64 + +/* pass REGEXIS024_MAX_REPEAT + 1 as max_allowed to allow infinite repeat */ +void apply_repeat_to_subexpression(SubExprCompiled& patient, FA_Container& fa, size_t min_allowed, size_t max_allowed); + +#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H diff --git a/src/libregexis024test/byte_code_assembler.h b/src/libregexis024test/byte_code_assembler.h new file mode 100644 index 0000000..cd586ec --- /dev/null +++ b/src/libregexis024test/byte_code_assembler.h @@ -0,0 +1,141 @@ +/* This file is used for testing purposes only. Do not copy this file to installation prefix. + * This tehnique exploits C compiler capabilities to get regex024 assembler for free*/ +#ifndef LIBREGEXIS024_BYTE_CODE_ASSEMBLER_H +#define LIBREGEXIS024_BYTE_CODE_ASSEMBLER_H + +#include "vibe_check.h" + +#include +#include +#include +#include +#include + +struct assembler_context_bookmark{ + regex_near_ptr_t pos_in_r024program; + int LINE; +}; + +struct pending_bookmark{ + /* Must fill this byte with pos of pos_in_r024program in assembler_context_bookmark + * In a sense, this is a pointer to a NULL pointer that is yet to become normal kinda pointer */ + regex_near_ptr_t pos_in_r024program; + const char* name; + /* LINE of the reference is needed in case of error */ + int LINE; +}; + +struct assembler_context{ + std::map bookmarks; + std::vector unresolved_references; + std::vector result; + + void declare_bookmark(const char* name, int LINE_of_this){ + if (bookmarks.count(name)){ + fprintf(stderr, "Double bookmark '%s' definition in lines %d and %d\n", name, bookmarks[name].LINE, LINE_of_this); + exit(1); + } + bookmarks[name] = {result.size(), LINE_of_this}; + } + + void resolve_references(){ + for (pending_bookmark& br: unresolved_references){ + if (bookmarks.count(br.name) == 0){ 
+ fprintf(stderr, "Unknown bookmark '%s' is requested on line %d\n", br.name, br.LINE); + exit(1); + } + /* pending bookmerk requests should be added only with beg_for_bookmark method, + * or else SEGFAULT will be your frequent guest */ + *reinterpret_cast(&result[br.pos_in_r024program]) = bookmarks[br.name].pos_in_r024program; + } + } + + void put_byte(uint8_t x){ + result.push_back(x); + } + + void put_word(uint16_t x){ + put_byte(x & UINT8_MAX); + put_byte(x >> 8); + } + + void put_doubleword(uint32_t x){ + put_word(x & UINT16_MAX); + put_word(x >> 16); + } + + void put_quadword(uint64_t x){ + put_doubleword(x & UINT32_MAX); + put_doubleword(x >> 32); + } + + void beg_for_bookmark(const char* name, int LINE_of_this){ + unresolved_references.push_back({result.size(), name, LINE_of_this}); + put_quadword(0); + } +}; + +#define msh_put_instr(ename) daCtx.put_byte(regex024_opcodes::ename); +#define msh_put_sslot(ssid) daCtx.put_doubleword(ssid); +#define msh_put_track_arr_ind(i) daCtx.put_word(i); +#define msh_put_x(meth, x) daCtx.meth(x); +/* Here my assembler begs for bookmark to jump on */ +#define msh_bookmark_reference(name) daCtx.beg_for_bookmark(name, __LINE__); + +#define s_BEGIN_ASSEMBLER_CONTEXT() { assembler_context daCtx{}; +#define s_END_ASSEMBLER_CONTEXT(pass_vec) daCtx.resolve_references(); (pass_vec) = std::move(daCtx.result); } + +/* Here user declares a bookmark */ +#define c_BOOKMARK(name) daCtx.declare_bookmark((name), __LINE__); + +#define i_READ(ssid) msh_put_instr(READ) msh_put_sslot(ssid) +#define i_READZ() msh_put_instr(READZ) +#define i_JUMP(bookmark) msh_put_instr(JUMP) msh_bookmark_reference(bookmark) + +#define msh_conditional_jump(condition, size_postfix, meth, x, bookmark) \ + msh_put_instr(JC ## condition ## _ ## size_postfix) \ + msh_put_x(meth, x) msh_bookmark_reference(bookmark) + +#define i_JCEQUAL_B(x, bookmark) msh_conditional_jump(EQUAL, B, put_byte, x, bookmark) +#define i_JCEQUAL_W(x, bookmark) msh_conditional_jump(EQUAL, W, 
put_word, x, bookmark) +#define i_JCEQUAL_DW(x, bookmark) msh_conditional_jump(EQUAL, DW, put_doubleword, x, bookmark) +#define i_JCEQUAL_QW(x, bookmark) msh_conditional_jump(EQUAL, QW, put_quadword, x, bookmark) + +#define i_JCLESS_B(x, bookmark) msh_conditional_jump(LESS, B, put_byte, x, bookmark) +#define i_JCLESS_W(x, bookmark) msh_conditional_jump(LESS, W, put_word, x, bookmark) +#define i_JCLESS_DW(x, bookmark) msh_conditional_jump(LESS, DW, put_doubleword, x, bookmark) +#define i_JCLESS_QW(x, bookmark) msh_conditional_jump(LESS, QW, put_quadword, x, bookmark) + +#define i_JCGRTR_B(x, bookmark) msh_conditional_jump(GRTR, B, put_byte, x, bookmark) +#define i_JCGRTR_W(x, bookmark) msh_conditional_jump(GRTR, W, put_word, x, bookmark) +#define i_JCGRTR_DW(x, bookmark) msh_conditional_jump(GRTR, DW, put_doubleword, x, bookmark) +#define i_JCGRTR_QW(x, bookmark) msh_conditional_jump(GRTR, QW, put_quadword, x, bookmark) + +#define i_FORK(ssid, bookmark) msh_put_instr(FORK) msh_put_sslot(ssid) msh_bookmark_reference(bookmark) +#define i_MATCH() msh_put_instr(MATCH) +#define i_DIE() msh_put_instr(DIE) +#define i_PARAM_READ_SS_NUMBER(ssid) msh_put_instr(PARAM_READ_SS_NUMBER) msh_put_sslot(ssid) +#define i_PARAM_FORK_SS_NUMBER(ssid) msh_put_instr(PARAM_FORK_SS_NUMBER) msh_put_sslot(ssid) +#define i_PARAM_SELARR_LEN(tai) msh_put_instr(PARAM_SELARR_LEN) msh_put_track_arr_ind(tai) +#define i_PARAM_COLSIFTFUNC_SET(bookmark) msh_put_instr(PARAM_COLSIFTFUNC_SET) msh_bookmark_reference(bookmark) +#define i_PARAM_COLSIFTFUNC_WIPE() msh_put_instr(PARAM_COLSIFTFUNC_WIPE) +#define i_MSG_MULTISTART_ALLOWED(is_allowed) msh_put_instr(MSG_MULTISTART_ALLOWED) daCtx.put_byte(is_allowed); +#define i_MSG_FED_INPUT_EXTENDED(left, right, part) msh_put_instr(MSG_FED_INPUT_EXTENDED) \ + daCtx.put_byte(left); daCtx.put_byte(right); msh_put_sslot(part) +#define i_DMOV_RABX_SELARR(tai) msh_put_instr(DMOV_RABX_SELARR) msh_put_track_arr_ind(tai) +#define DDIST_RABX_SELARR(tai_beg, tai_end) 
msh_put_instr(DDIST_RABX_SELARR) \ + msh_put_track_arr_ind(tai_beg) msh_put_track_arr_ind(tai_end) +#define i_SIFTPRIOR_MIN_RABX() msh_put_instr(SIFTPRIOR_MIN_RABX) +#define i_SIFTPRIOR_MAX_RABX() msh_put_instr(SIFTPRIOR_MAX_RABX) +#define i_SIFT_DONE() msh_put_instr(SIFT_DONE) +#define i_MOV_COLARR_IMM(tai, qw_x) msh_put_instr(MOV_COLARR_IMM) msh_put_track_arr_ind(tai) \ + daCtx.put_quadword(qw_x); +#define i_MOV_COLARR_BTPOS(tai) msh_put_instr(MOV_COLARR_BTPOS) msh_put_track_arr_ind(tai) +#define i_MOV_SELARR_IMM(tai, qw_x) msh_put_instr(MOV_SELARR_IMM) msh_put_track_arr_ind(tai) \ + daCtx.put_quadword(qw_x); +#define i_MOV_SELARR_CHPOS(tai) msh_put_instr(MOV_SELARR_CHPOS) +#define i_INIT() msh_put_instr(INIT) +#define i_THROW() msh_put_instr(THROW) +#define i_DEBUG() msh_put_instr(DEBUG) + +#endif //LIBREGEXIS024_BYTE_CODE_ASSEMBLER_H diff --git a/src/libregexis024test/byte_code_disassembler.h b/src/libregexis024test/byte_code_disassembler.h new file mode 100644 index 0000000..e1e6dd7 --- /dev/null +++ b/src/libregexis024test/byte_code_disassembler.h @@ -0,0 +1,205 @@ +/* This file is used for testing purposes only. Do not copy this file to installation prefix. 
*/ +#ifndef LIBREGEXIS024_BYTE_CODE_DISASSEMBLER_H +#define LIBREGEXIS024_BYTE_CODE_DISASSEMBLER_H + +#include "vibe_check.h" + +#include +#include +#include +#include +#include +#include +#include + +// TODO: apply here my new change in near pointer size + +struct landing_place_resolvance{ + size_t name_id; + bool visited = false; + landing_place_resolvance() = default; + landing_place_resolvance(size_t nameId, bool visited) : name_id(nameId), visited(visited) {} +}; + +void print_disassembly(size_t prgSize, uint8_t* prg){ + std::vector names = { + "Александр", "Мария", "Иван", "Анна", "Дмитрий", "Екатерина", "Алексей", + "Ольга", "Михаил", "София", "Сергей", "Анастасия", "Артем", "Виктория", + "Андрей", "Елена", "Максим", "Алиса", "Павел", "Наталья", "Денис", "Юлия", + "Владимир", "Маргарита", "Никита", "Дарья", "Илья", "Алина", "Роман", "Евгения", + "Кирилл", "Елизавета", "Антон", "Татьяна", "Владислав", "Валерия", "Георгий", + "Ксения", "Арсений", "Милана", "Даниил", "Вероника", "Тимофей", "Арина", + "Николай", "Кристина", "Степан", "Алёна", "Игорь", "Алла", "Григорий", "Ева", + "Олег", "Яна", "Семен", "Марина", "Федор", "Светлана", "Василий", "Людмила" + }; + uint64_t used_names = 0; + /* From program position -> to names[ind] & */ + std::map bookmarks; + regex_near_ptr_t IP = 0; + + auto check_inboundness = [&](int region){ + if (!vmprog_check_inboundness(prgSize, IP, region)) + exitf("This program can't be decomposed into commands in a trivial way"); + }; + auto extract_b = [&]() -> uint8_t{ + check_inboundness(1); + return vmprog_extract_b(&IP, prg); + }; + auto extract_w = [&]() -> uint16_t { + check_inboundness(2); + return vmprog_extract_w(&IP, prg); + }; + auto extract_dw = [&]() -> uint32_t { + check_inboundness(4); + return vmprog_extract_dw(&IP, prg); + }; + auto extract_qw = [&]() -> uint64_t { + check_inboundness(8); + return vmprog_extract_qw(&IP, prg); + }; + auto extract_instruction = [&]() -> uint8_t{ + return extract_b(); + }; + auto 
extract_sslot_id = [&]() -> regex_sslot_id_t{ + return extract_dw(); + }; + auto extract_near_pointer = [&]() -> regex_near_ptr_t{ + return extract_qw(); + }; + auto extract_track_array_index = [&]() -> regex_tai_t{ + return extract_w(); + }; + + bool second_phase = false; + + auto fph_register_landing = [&](regex_near_ptr_t pos){ + if (!second_phase){ + if (bookmarks.count(pos) == 0){ + if (used_names == names.size()) + names.push_back("Закладка_" + std::to_string(used_names)); + bookmarks.insert({pos, {used_names, false}}); + used_names++; + } + } + }; + + auto get_bookmark_in_2phase = [&](regex_near_ptr_t pos) -> std::string { + if (bookmarks.count(pos) == 0) + exitf("bruh"); + return names[bookmarks[pos].name_id]; + }; + + auto one_reading = [&](){ + while (IP < prgSize) { + regex_near_ptr_t start_pos = IP; + if (second_phase){ + if (bookmarks.count(IP) != 0){ + printf("%s:\n", get_bookmark_in_2phase(IP).c_str()); + bookmarks[IP].visited = true; + } + } + uint8_t opcode = extract_instruction(); + switch (opcode) { +#define secPrint(fmt, ...) 
if (second_phase) {printf("% 3lu) " fmt, start_pos, __VA_ARGS__);} } break; +#define secPrintNoArg(str) if (second_phase) {printf("% 3lu) " str, start_pos);} } break; +#define instCase(oper_code) case regex024_opcodes::oper_code: { +#define jcMess(cond, sz_uppercase, x_t, extract_method, printf_sign) \ + instCase(JC ## cond ## _ ## sz_uppercase) \ + x_t x = extract_method(); \ + regex_near_ptr_t dest = extract_near_pointer(); \ + fph_register_landing(dest); \ + secPrint("JC" #cond "_" #sz_uppercase " %" printf_sign " $%s\n", x, get_bookmark_in_2phase(dest).c_str()) +#define jcCacaphony(cond) \ + jcMess(cond, B, uint8_t, extract_b, PRIu8) \ + jcMess(cond, W, uint16_t, extract_w, PRIu16) \ + jcMess(cond, DW, uint32_t, extract_dw, PRIu32) \ + jcMess(cond, QW, uint64_t, extract_qw, PRIu64) +#define simpleDimple(name) instCase(name) secPrintNoArg(#name "\n") + + instCase(READ) + uint32_t ssid = extract_sslot_id(); + secPrint("READ %u\n", ssid) + simpleDimple(READZ) + instCase(JUMP) + uint32_t dest = extract_near_pointer(); + fph_register_landing(dest); + secPrint("JUMP $%s\n", get_bookmark_in_2phase(dest).c_str()) + + jcCacaphony(EQUAL) + jcCacaphony(LESS) + jcCacaphony(GRTR) + + instCase(FORK) + uint32_t ssid = extract_sslot_id(); + regex_near_ptr_t dest = extract_near_pointer(); + fph_register_landing(dest); + secPrint("FORK %u $%s\n", ssid, get_bookmark_in_2phase(dest).c_str()) + simpleDimple(MATCH) + simpleDimple(DIE) + instCase(PARAM_READ_SS_NUMBER) + regex_sslot_id_t ssid_max_plus_one = extract_sslot_id(); + secPrint("PARAM_READ_SS_NUMBER %u\n", ssid_max_plus_one) + instCase(PARAM_FORK_SS_NUMBER) + regex_sslot_id_t ssid_max_plus_one = extract_sslot_id(); + secPrint("PARAM_FORK_SS_NUMBER %u\n", ssid_max_plus_one) + instCase(PARAM_SELARR_LEN) + regex_tai_t tai_max_plus_one = extract_track_array_index(); + secPrint("PARAM_SELARR_LEN %hu\n", tai_max_plus_one) + instCase(PARAM_COLSIFTFUNC_SET) + regex_near_ptr_t entry = extract_near_pointer(); + 
fph_register_landing(entry); + secPrint("PARAM_COLSIFTFUNC_SET $%s\n", get_bookmark_in_2phase(entry).c_str()) + simpleDimple(PARAM_COLSIFTFUNC_WIPE) + instCase(MSG_MULTISTART_ALLOWED) + uint8_t is_allowed = extract_b(); + secPrint("MSG_MULTISTART_ALLOWED %hhu\n", is_allowed) + instCase(MSG_FED_INPUT_EXTENDED) + uint8_t left = extract_b(); + uint8_t right = extract_b(); + regex_sslot_id_t part = extract_sslot_id(); + secPrint("MSG_FED_INPUT_EXTENDED %hhu %hhu %u\n", left, right, part) + instCase(DMOV_RABX_SELARR) + regex_tai_t i = extract_track_array_index(); + secPrint("DMOV_RABX_SELARR %hu\n", i) + instCase(DDIST_RABX_SELARR) + regex_tai_t s = extract_track_array_index(); + regex_tai_t e = extract_track_array_index(); + secPrint("DDIST_RABX_SELARR %hu %hu\n", s, e); + simpleDimple(SIFTPRIOR_MIN_RABX) + simpleDimple(SIFTPRIOR_MAX_RABX) + simpleDimple(SIFT_DONE) + instCase(MOV_COLARR_IMM) + regex_tai_t tai = extract_track_array_index(); + uint64_t imm = extract_qw(); + secPrint("MOV_COLARR_IMM %hu %lu\n", tai, imm); + instCase(MOV_COLARR_BTPOS) + regex_tai_t tai = extract_track_array_index(); + secPrint("MOV_COLARR_BTPOS %hu\n", tai); + instCase(MOV_SELARR_IMM) + regex_tai_t tai = extract_track_array_index(); + uint64_t imm = extract_qw(); + secPrint("MOV_SELARR_IMM %hu %lu\n", tai, imm); + instCase(MOV_SELARR_CHPOS) + regex_tai_t tai = extract_track_array_index(); + secPrint("MOV_SELARR_CHPOS %hu\n", tai); + simpleDimple(INIT) + simpleDimple(THROW) + default: + exitf("Bad opcode\n"); +#undef secPrint +#undef secPrintNoArg +#undef instCase +#undef jcMess +#undef jcCacaphony +#undef simpleDimple + } + } + }; + + one_reading(); + second_phase = true; + IP = 0; + one_reading(); +} + +#endif //LIBREGEXIS024_BYTE_CODE_DISASSEMBLER_H diff --git a/src/libregexis024test/test0.cpp b/src/libregexis024test/test0.cpp new file mode 100644 index 0000000..b95fd4c --- /dev/null +++ b/src/libregexis024test/test0.cpp @@ -0,0 +1,52 @@ +#include +#include +#include + +void 
test_ccs_fnc(const codeset_t &got, const codeset_t &expected){ + static int id = 1; + if (got == expected) + printf("Test %d passed\n", id++); + else + exitf("Test %d failed\n", id); +} + +void invert_test(const codeset_t& A, const codeset_t& C){ + test_ccs_fnc(invert_set(A), C); + test_ccs_fnc(invert_set(C), A); +} + +void merge_test(const codeset_t& A, const codeset_t& B, const codeset_t& C){ + test_ccs_fnc(merge_sets(A, A), A); + test_ccs_fnc(merge_sets(B, B), B); + test_ccs_fnc(merge_sets(A, B), C); + test_ccs_fnc(merge_sets(B, A), C); +} + +void intersect_test(const codeset_t& A, const codeset_t& B, const codeset_t& C){ + test_ccs_fnc(intersect_sets(A, A), A); + test_ccs_fnc(intersect_sets(B, B), B); + test_ccs_fnc(intersect_sets(A, B), C); + test_ccs_fnc(intersect_sets(B, A), C); +} + +int main(){ + merge_test({{34, 111}}, {}, {{34, 111}}); + merge_test({{1, 1}}, {{3, 3}}, {{1, 1}, {3, 3}}); + invert_test({{0, 1}}, {{2, UINT32_MAX}}); + invert_test({{32, 34}}, {{0, 31}, {35, UINT32_MAX}}); + merge_test({{10, 10}, {20, 20}}, {{19, 19}}, {{10, 10}, {19, 20}}); + merge_test({{0, 5}, {7, 10}}, {{4, 6}}, {{0, 10}}); + merge_test({{1, 10}, {50, 60}}, {{11, 70}}, {{1, 70}}); + merge_test({{23, 23}, {56, 100}}, {{30, 55}}, {{23, 23}, {30, 100}}); + intersect_test({{100, 200}, {300, 400}}, {}, {}); + intersect_test({{2, 30}}, {{15, 50}}, {{15, 30}}); + intersect_test({{10, 30}}, {{15, 25}}, {{15, 25}}); + intersect_test({{10, 20}}, {{21, 30}}, {}); + intersect_test({{1, 100}, {150, 200}}, {{50, 175}}, {{50, 100}, {150, 175}}); + intersect_test({{1, 100}}, {}, {}); + intersect_test({{50, 50}}, {{50, 50}}, {{50, 50}}); + intersect_test({{49, 49}}, {{50, 50}}, {}); + intersect_test({{1, 20}, {50, 80}}, {{10, 55}, {60, 100}}, {{10, 20}, {50, 55}, {60, 80}}); + merge_test({{2, 3}, {5, 5}, {7, 7}}, {{1, 1}, {3, 7}}, {{1, 7}}); + return 0; +} \ No newline at end of file diff --git a/src/libregexis024test/test1.cpp b/src/libregexis024test/test1.cpp new file mode 100644 index 
0000000..0a5f0bd --- /dev/null +++ b/src/libregexis024test/test1.cpp @@ -0,0 +1,136 @@ +#include + +#include +#include +#include +#include +#include + +static int test_id = 0; + +void do_test(const std::vector& prg, const std::string& str, const std::vector& prefix_matching){ + assert(str.size() + 1 == prefix_matching.size()); + REGEX_IS024_CONTEXT ctx{prg.size(), prg.data(), 0, 0, 1000, 1000, 1000000}; + regex024_error_code ret; + // todo + printf("TEST %d passed\n", test_id); + test_id++; +} + +std::vector tfff(size_t n){ + std::vector res = std::vector(n + 1, false); + res[0] = true; + return res; +} + +std::vector ffft(size_t n){ + std::vector res = std::vector(n + 1, false); + res.back() = true; + return res; +} + +std::vector program_0(){ + std::vector res; + s_BEGIN_ASSEMBLER_CONTEXT() + i_PARAM_READ_SS_NUMBER(2) + i_PARAM_FORK_SS_NUMBER(1) + i_INIT() + i_FORK(0, "wb") + i_READ(0) + i_JCEQUAL_B('a', "finish") + i_DIE() + + c_BOOKMARK("wb") + i_READ(1) + i_JCEQUAL_B('b', "finish") + i_DIE() + + c_BOOKMARK("finish") + i_MATCH() + i_DIE() + s_END_ASSEMBLER_CONTEXT(res) + return res; +} + +std::vector program_1(){ + std::vector res; + s_BEGIN_ASSEMBLER_CONTEXT() + i_PARAM_READ_SS_NUMBER(4) + i_PARAM_FORK_SS_NUMBER(2) +// i_PARAM_SELARR_LEN(0) + i_INIT() + i_FORK(0, "wb") + c_BOOKMARK("wa") + i_READ(0) + i_JCEQUAL_B('a', "razvilka") + i_DIE() + + c_BOOKMARK("wb") + i_READ(1) + i_JCEQUAL_B('b', "razvilka") + i_DIE() + + c_BOOKMARK("razvilka") + i_FORK(1, "wd") + c_BOOKMARK("wc") + i_READ(2) + i_JCEQUAL_B('c', "finish") + i_DIE() + + c_BOOKMARK("wd") + i_READ(3) + i_JCEQUAL_B('d', "finish") + i_DIE() + + c_BOOKMARK("finish") + i_MATCH() + i_DIE() + s_END_ASSEMBLER_CONTEXT(res) + return res; +} + +/* +int main(){ + auto prg0 = program_0(); + auto prg1 = program_1(); +// printf("Disassembled program:\n"); +// print_disassembly(prg.size(), prg.data()); + + printf("Testing starts\n"); + test_id = 0; + do_test(prg0, "a", {false, true}); + do_test(prg0, "b", {false, 
true}); + do_test(prg0, "c", {false, false}); + do_test(prg0, "a4", {false, true, false}); + do_test(prg0, "b4", {false, true, false}); + do_test(prg1, "aa", {false, false, false}); + do_test(prg1, "db", {false, false, false}); + do_test(prg1, "ac", {false, false, true}); + do_test(prg1, "bc", {false, false, true}); + do_test(prg1, "ad", {false, false, true}); + do_test(prg1, "bd", {false, false, true}); + do_test(prg1, "bd12", {false, false, true, false, false}); + + return 0; +} +*/ + +int main() { + std::vector prg; + s_BEGIN_ASSEMBLER_CONTEXT() + c_BOOKMARK("111") + i_READ(0) + i_READ(1) + i_FORK(12, "vdv") + c_BOOKMARK("vdv") + i_READ(2) + i_READ(10000000) + i_THROW() + i_THROW() + i_THROW() + i_JUMP("111") + + s_END_ASSEMBLER_CONTEXT(prg) + + print_disassembly(prg.size(), prg.data()); +} \ No newline at end of file diff --git a/src/libregexis024test/test2.cpp b/src/libregexis024test/test2.cpp new file mode 100644 index 0000000..cddbfac --- /dev/null +++ b/src/libregexis024test/test2.cpp @@ -0,0 +1,12 @@ +#include +#include + +int main(){ + std::string regular_expression = "!selarr{boba{ca}}^a#boba(b)c$"; + REGEX_IS024_MeaningContext regex(regular_expression.size(), regular_expression.c_str()); + if (regex.error) + fprintf(stderr, "%s\n", regex.error_msg.c_str()); + std::vector res = regex.compiled_program; + print_disassembly(res.size(), res.data()); + return 0; +} diff --git a/src/libregexis024test/test3.cpp b/src/libregexis024test/test3.cpp new file mode 100644 index 0000000..051115f --- /dev/null +++ b/src/libregexis024test/test3.cpp @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +struct test_id_t { + int test_id; + int subtest_id; + + test_id_t(int test_id, int subtest_id) : test_id(test_id),subtest_id(subtest_id) {} + + std::string toString() const { + char buf[128]; + snprintf(buf, 128, "#%d::%d", test_id + 1, subtest_id + 1); + return buf; + } +}; + +std::string stringifyCodeset(const codeset_t& CS) 
{ + std::string cs; + for (auto p: CS) { + if (!cs.empty()) + cs += "; "; + cs += std::to_string(p.first) + "-" + std::to_string(p.second); + } + return cs; +} + +std::string stringifyRequestList(const std::vector& arr) { + std::string rl; + for (auto r: arr) { + if (!rl.empty()) + rl += ", "; + rl += std::to_string(r); + } + return rl; +} + +void print_obj(const std::vector>>& answer) { + size_t R = answer.size(); + for (int i = 0; i < R; i++) { + std::string cs = stringifyCodeset(answer[i].first); + std::string rl = stringifyRequestList(answer[i].second); + printf("{%s} -> {%s}\n", cs.c_str(), rl.c_str()); + } +} + +void print_answer_canonic(const std::map, codeset_t>& answer) { + for (auto& p: answer) { + printf("{%s} -> {%s}\n", stringifyCodeset(p.second).c_str(), stringifyRequestList(p.first).c_str()); + } +} + +void print_test_details(int dummyN, const std::vector& requests) { + for (int i = 0; i < requests.size(); i++) { + printf("%d) %s\n", i - dummyN, stringifyCodeset(requests[i]).c_str()); + } +} + +void fail_test(test_id_t my_test_id) { + char buf[1024]; + snprintf(buf, 1024, "Test %s failed", my_test_id.toString().c_str()); + throw std::runtime_error(buf); +} + +std::map, codeset_t> safe_canonificate_answer(test_id_t my_test_id, + const std::vector>>& answer) +{ + std::map, codeset_t> answer_canonic; + for (auto& p: answer) { + if (answer_canonic.count(p.second) == 0) + answer_canonic[p.second] = {}; + if (!intersect_sets(answer_canonic[p.second], p.first).empty()) + fail_test(my_test_id); + answer_canonic[p.second] = merge_sets(answer_canonic[p.second], p.first); + } + return answer_canonic; +} + +std::vector>> safe_zip_answer(test_id_t my_test_id, + const std::vector& ti, const std::vector>& to) +{ + size_t R = ti.size(); + std::vector>> res(R); + if (to.size() != R) + fail_test(my_test_id); + for (size_t i = 0; i < R; i++) { + res[i].first = ti[i]; + res[i].second = to[i]; + } + return res; +} + +void perform_test(test_id_t my_test_id, int InpDummyN, 
const std::vector& rqInpCs, const std::vector>>& answer_right) +{ + std::map, codeset_t> answer_canonic_right = safe_canonificate_answer(my_test_id, answer_right); + ColoredCodeset cc(InpDummyN); + + for (auto& c: rqInpCs) + cc.apply_divisor(c); + std::vector ti; + std::vector> to; + cc.get_splits_of_non_dummy(ti, to); + std::vector>> answer_returned = safe_zip_answer(my_test_id, ti, to); + std::map, codeset_t> answer_canonic_returned = safe_canonificate_answer(my_test_id, + answer_returned); + if (answer_canonic_right != answer_canonic_returned) { + printf("Test failed!!\n"); + printf("Test details:\n"); + print_test_details(InpDummyN, rqInpCs); + printf("Right answer:\n"); + print_obj(answer_right); + printf("Given answer:\n"); + print_obj(answer_returned); + fail_test(my_test_id); + } + printf("Test %s passed\n", my_test_id.toString().c_str()); +} + +void perform_test_with_shuffle(const std::vector& rqInpCs, + const std::vector>>& answer) +{ + static int cur_test_id = 0; + int my_test_id = cur_test_id++; + int dn = rqInpCs.size(); + std::vector shuf(dn); + for (int i = 0; i < dn; i++) + shuf[i] = i; + for (int stid = 0; stid < 14; stid++) { + std::random_device d; + std::mt19937 r_gen(d()); + std::shuffle(shuf.begin(), shuf.end(), r_gen); + int InpDummyN = std::uniform_int_distribution(0, dn - 1)(r_gen); + // for (int e: shuf) + // printf("%d ", e); + // printf("\n"); + std::vector new_rqInpCs(dn); + for (int i = 0; i < dn; i++) + new_rqInpCs[shuf[i]] = rqInpCs[i]; + std::vector>> new_right_answer; + for (const std::pair>& p: answer) { + std::vector new_request_list; + for (size_t oldRid: p.second) { + size_t unabridged_new_rid = shuf[oldRid]; + if (unabridged_new_rid >= InpDummyN) { + new_request_list.push_back(unabridged_new_rid - InpDummyN); + } + } + if (!new_request_list.empty()) { + std::sort(new_request_list.begin(), new_request_list.end()); + new_right_answer.emplace_back(p.first, new_request_list); + } + } + perform_test(test_id_t(my_test_id, stid), 
InpDummyN, new_rqInpCs, new_right_answer); + } +} + +int main() { + perform_test_with_shuffle( + { + {{5, 500}}, + {{10, 20}}, + {{5, 10}}, + }, + { + {{{21, 500}}, {0}}, + {{{11, 20}}, {0, 1}}, + {{{5, 9}}, {0, 2}}, + {{{10, 10}}, {0, 1, 2}}, + }); + + perform_test_with_shuffle({ + {{10, 19}}, + {{10, 15}}, + {{5, 9}}, + {{20, 40}}, + {{16, UINT32_MAX}}, + }, + { + {{{5, 9}}, {2}}, + {{{10, 15}}, {0, 1}}, + {{{16, 19}}, {0, 4}}, + {{{20, 40}}, {3, 4}}, + {{{41, UINT32_MAX}}, {4}} + + }); + + perform_test_with_shuffle( + { + {{10, 19}, {30, 39}, {50, 69}}, + {{20, 29}, {40, 59}, }, + {{20, 39}, {70, 79}}, + codeset_of_one_char(UINT32_MAX - 1), + codeset_of_one_char(UINT32_MAX), + {{UINT32_MAX - 1, UINT32_MAX}}, + codeset_of_one_char(0), + codeset_of_one_char(1), + {{0, 1}}, + }, + { + {{{10, 19}, {60, 69}}, {0}}, + {{{40, 49}}, {1}}, + {{{70, 79}}, {2}}, + {{{50, 59}}, {0, 1}}, + {{{20, 29}}, {1, 2}}, + {{{30, 39}}, {0, 2}}, + {{{0, 0}}, {6, 8}}, + {{{1, 1}}, {7, 8}}, + {{{UINT32_MAX - 1, UINT32_MAX - 1}}, {3, 5}}, + {{{UINT32_MAX, UINT32_MAX}}, {4, 5}}, + }); + return 0; +} \ No newline at end of file diff --git a/src/libregexis024test/test4.cpp b/src/libregexis024test/test4.cpp new file mode 100644 index 0000000..4fd5ec0 --- /dev/null +++ b/src/libregexis024test/test4.cpp @@ -0,0 +1,43 @@ +#include +#include +#include +#include + +using namespace regexis024; +using namespace std; + +void test(const string& input, const string& pattern, const MatchInfo& right_answer) { + MatchInfo given_answer; + track_var_list retTrackVarList; + string retStatus; + matchStrToRegexp(input, pattern, given_answer, retTrackVarList, retStatus); + if (given_answer != right_answer) { + throw runtime_error("Test failed"); + } + printf("Test passed\n"); +} + +int main() { + test("b", "#boba(b)", MatchInfo({{0, 0}, {1, 1}}, {})); + test("abc", "!selarr{boba{ca}}^a#boba(b)c$", MatchInfo({{0, 1}, {1, 2}}, {1, 2})); + for (int i = 0; i < 64; i++) { + std::string T; + T += ('a' + (i >> 3)); + 
T+= ('a' + (i % 8)); + test(T, "(((a|b)|(c|d))|((e|f)|(g|h)))!r{2}", MatchInfo({}, {})); + } + test("abba", "!select{M{max}}a#M(b*)a", MatchInfo({}, {1, 3})); + test("abba", "!dfa;!select{M{max}}a#M(b*)a", MatchInfo({}, {1, 3})); + test("abba", "!select{M{max}}a#M(!any;*)a", MatchInfo({}, {1, 3})); + test("abba", "!dfa;!select{M{max}}a#M(!any;*)a", MatchInfo({}, {1, 3})); + test("", "", MatchInfo({}, {})); + test("a", "a", MatchInfo({}, {})); + test("a3", "[abc]3", MatchInfo({}, {})); + test("b3", "[abc]3", MatchInfo({}, {})); + test("c3", "[abc]3", MatchInfo({}, {})); + test("aa", "aa", MatchInfo({}, {})); + test("aaaaa", "a*", MatchInfo({}, {})); + test("bababbaa", "[ab]*", MatchInfo({}, {})); + test("bababbaa", "!dfa;[ab]*", MatchInfo({}, {})); + return 0; +} diff --git a/src/libregexis024test/vibe_check.h b/src/libregexis024test/vibe_check.h new file mode 100644 index 0000000..a83de68 --- /dev/null +++ b/src/libregexis024test/vibe_check.h @@ -0,0 +1,14 @@ +#ifndef VIBE_CHECK_H +#define VIBE_CHECK_H + +#ifndef __ORDER_LITTLE_ENDIAN__ +#error "All the cool kids use little endian. Get lost, you are forbidden from entering this party" +#endif + +#ifndef _GLIBCXX_DEBUG +#error "Kinda stupid to test without _GLIBCXX_DEBIG. Or... Don't tell me you are using this header in non-testing environment. OH MY \ +GOD! THIS LUNATIC USES TESTING HEADER IN PRODUCTION CODE. I-I-I am calling 911, COMON, SOMEBODY CATCH HIM AND PUT HIM IN LOONEYBLOCK!!!" 
+#endif + + +#endif //VIBE_CHECK_H diff --git a/src/libregexis024tools/stringmatching.cpp b/src/libregexis024tools/stringmatching.cpp new file mode 100644 index 0000000..41f10b3 --- /dev/null +++ b/src/libregexis024tools/stringmatching.cpp @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include + +// using namespace regexis024; + +void convert(regexis024::TrackingVariableInfo& to, const SubtrackingNameInfo& from) { +#define plagiat(field) to.field = from.field; + plagiat(type); + plagiat(colarr_first); + plagiat(colarr_second); + plagiat(stored_in_ca); + plagiat(selarr_first); + plagiat(selarr_second); + plagiat(stored_in_sa); +#undef plagiat +} + +int regexis024::matchStrToRegexp(const std::string& input, const std::string& pattern, + MatchInfo& retMatchInfo, track_var_list& retTrackVarList, std::string& retStatus) +{ + retTrackVarList = {}; + retMatchInfo = MatchInfo(); + retStatus = ""; + REGEX_IS024_MeaningContext regexp(pattern.size(), pattern.data()); + if (regexp.error) { + retStatus = "Pattern compilation. " + regexp.error_msg; + return -1; + } + retTrackVarList = {}; + for (auto& iip: regexp.ktr.track_names) { + convert(retTrackVarList[iip.first], regexp.ktr.retrieval_info[iip.second]); + } + REGEX_IS024_VirtualMachine vm(regexp.compiled_program.size(), regexp.compiled_program.data(), + UINT64_MAX, UINT16_MAX, + UINT32_MAX, UINT32_MAX, UINT64_MAX); + auto getVMErrString = [&]() -> std::string { + return std::string(regex024_error_code_tostr(vm.getErrno())); + }; + + if (vm.initialize() != regex024_error_codes::stable) { + retStatus = "Virtual machine initialization. " + getVMErrString(); + return -1; + } + int left_ext_feed = vm.getInputLeftExtensionSize(); + int right_ext_feed = vm.getInputRightExtensionSize(); + if (left_ext_feed > 1 || right_ext_feed > 1) { + retStatus = "Unnatural extended input request."; + return -1; + } + if (vm.addNewMatchingThread() != regex024_error_codes::stable) { + retStatus = "Virtual machine first kick. 
" + getVMErrString(); + } + if (left_ext_feed) { + if (vm.extendedFeedCharacter('\n') != regex024_error_codes::stable) { + retStatus = "VM left extended input. " + getVMErrString(); + return -1; + } + } + for (size_t cur_text_pos = 0;cur_text_pos < input.size();) { + int32_t inp_code; + size_t adj; + utf8_string_iterat(inp_code, adj, cur_text_pos, reinterpret_cast(input.data()), input.size()); + if (inp_code < 0) { + retStatus = "Input string encoding error."; + return -1; + } + if (vm.feedCharacter(static_cast(inp_code), adj) != regex024_error_codes::stable) { + retStatus = "VM input. " + getVMErrString(); + return -1; + } + cur_text_pos += adj; + } + if (right_ext_feed) { + if (vm.extendedFeedCharacter('\n') != regex024_error_codes::stable) { + retStatus = "VM right extended input. " + getVMErrString(); + return -1; + } + } + assert(vm.isUsable()); + if (vm.isMatched()) { + retMatchInfo.have_match = true; + size_t SN1 = vm.getSelectionArrayLength(); + retMatchInfo.sa.assign(SN1, 0); + for (size_t i = 0; i < SN1; i++) + retMatchInfo.sa[i] = vm.getMatchedThreadSAValue(i); + retMatchInfo.ca_history = vm.getMatchedThreadCABranchReverse(); + std::reverse(retMatchInfo.ca_history.begin(), retMatchInfo.ca_history.end()); + return 0; + } + return -1; +} + +bool regexis024::MatchInfo::operator==(const MatchInfo &other) const { + if (!have_match && !other.have_match) + return true; + return (have_match == other.have_match) && (sa == other.sa) && (ca_history == other.ca_history); +} + +bool regexis024::MatchInfo::operator!=(const MatchInfo &other) const { + return !(*this == other); +} + +regexis024::MatchInfo::MatchInfo(const std::vector &ca_history, const std::vector &sa): + ca_history(ca_history), sa(sa), have_match(true) { +} diff --git a/src/libregexis024tools/stringmatching.h b/src/libregexis024tools/stringmatching.h new file mode 100644 index 0000000..1321144 --- /dev/null +++ b/src/libregexis024tools/stringmatching.h @@ -0,0 +1,42 @@ +#ifndef 
LIBREGEXIS024_SRC_LIBREGEXIS024TOOLS_STRINGMATCHING_H +#define LIBREGEXIS024_SRC_LIBREGEXIS024TOOLS_STRINGMATCHING_H + +#include +#include +#include +#include + +namespace regexis024 { + struct TrackingVariableInfo { + bool stored_in_ca = true; + bool stored_in_sa = false; + + tracking_var_type type; + /* These fields will be -1 if unused */ + int colarr_first = -1; + int colarr_second = -1; + + int selarr_first = -1; + int selarr_second = -1; + }; + + typedef std::map track_var_list; + + struct MatchInfo { + bool have_match = false; + std::vector ca_history; + std::vector sa; + + bool operator==(const MatchInfo& other) const ; + bool operator!=(const MatchInfo& other) const ; + + MatchInfo() = default; + + MatchInfo(const std::vector &ca_history, const std::vector &sa); + }; + + int matchStrToRegexp(const std::string& input, const std::string& pattern, + MatchInfo& retMatchInfo, track_var_list& retTrackVarList, std::string& retStatus); +} + +#endif diff --git a/src/libregexis024vm/instruction_implementation.cpp b/src/libregexis024vm/instruction_implementation.cpp new file mode 100644 index 0000000..fb41051 --- /dev/null +++ b/src/libregexis024vm/instruction_implementation.cpp @@ -0,0 +1,491 @@ +#include +#include + +void swap_old_settled_and_new_active(REGEX_IS024_CONTEXT &ctx, REGEX_IS024_Thread& old_settled){ + ctx_print_debug(ctx); + assert(old_settled.slot_occupation_status == SLOT_OCCUPIED_val); + REGEX_IS024_Thread temp = old_settled; + old_settled = ctx.active_thread; + old_settled.slot_occupation_status = SLOT_NEW_val; + ctx.active_thread = temp; + // slot_occupation_status & SLOT_OCCUPIED of actie thread is true, because it was retrieved from old_settled +} + +void start_noncloning_conflict(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Thread& other){ + ctx_print_debug(ctx); + if (ctx.have_sift_function){ + ctx.sifting_with = &other; + ctx.who_started_sift = regex024_opcode::READ; + ctx.intruder_IP = ctx.active_thread.IP; + ctx.active_thread.IP = 
ctx.sift_function; + ctx.RAX = ctx.RBX = 0; + } else { + ctx.active_thread.delete_thread(); + ctx.try_to_continue_scheduled(); + } +} + +/* The one that drops as an intruder here is current active.thread.IP */ +void start_cloning_conflict(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Thread& other, regex_near_ptr_t clone_IP){ + ctx_print_debug(ctx); + if (ctx.have_sift_function){ + ctx.sifting_with = &other; + ctx.who_started_sift = regex024_opcode::FORK; + ctx.intruder_IP = ctx.active_thread.IP; + ctx.child_ret_IP = clone_IP; + ctx.active_thread.IP = ctx.sift_function; + ctx.RAX = ctx.RBX = 0; + } else { + ctx.active_thread.IP = clone_IP; + } +} + +#define initialization_phase_check() if (ctx.initialized){ \ + ctx.error = regex024_error_codes::too_late; return; } +#define general_matching_mode_check() if (!ctx.initialized){ \ + ctx.error = regex024_error_codes::too_early; return; } if(ctx.sifting_with){ \ + ctx.error = regex024_error_codes::instruction_not_for_collision_thread; return; } +#define sift_mode_check() if (!ctx.sifting_with){ \ + ctx.error = regex024_error_codes::instruction_not_for_collision_thread; return; } + +/* Can append to both read_halted+new stacks of context */ +void read_halted_new_type_stacks_append(REGEX_IS024_CONTEXT &ctx, regex_sslot_id_t ssid){ + ctx_print_debug(ctx); + if (ssid < ctx.portion_of_FIRST_read_halt_ns){ + ctx.READ_halted_stack_new_first.append(ssid); + } else { + ctx.READ_halted_stack_new_second.append(ssid); + } +} + +void do_i_read(REGEX_IS024_CONTEXT &ctx, regex_sslot_id_t ssid) { + ctx_print_debug(ctx); + general_matching_mode_check() + if (ssid >= ctx.read_slots_number) + smitsya(read_sslot_out_of_range); + REGEX_IS024_Thread& other = ctx.READ_halted_slots[ssid]; + if (other.slot_occupation_status & SLOT_OCCUPIED){ + if (other.slot_occupation_status & SLOT_NEW){ + start_noncloning_conflict(ctx, other); + } else { + swap_old_settled_and_new_active(ctx, other); + /* Even though ssid was registed in stack for elders, now young 
stack should also track this slot */ + read_halted_new_type_stacks_append(ctx, ssid); + } + } else { + other = ctx.active_thread; + other.slot_occupation_status = SLOT_NEW_val; + ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val; + read_halted_new_type_stacks_append(ctx, ssid); + ctx.try_to_continue_scheduled(); + } +} + +void i_READ(REGEX_IS024_CONTEXT &ctx) { + ctx_print_debug(ctx); + check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ) + regex_sslot_id_t ssid = ctx.extract_sslot_id(); + do_i_read(ctx, ssid); +} + +void i_READZ(REGEX_IS024_CONTEXT &ctx) { + ctx_print_debug(ctx); + do_i_read(ctx, 0); +} + +void i_JUMP(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + check_available_prg(REGEX024_BYTECODE_NEAR_POINTER_SZ) + ctx.active_thread.IP = ctx.extract_near_pointer(); +} + +template +void i_JC(REGEX_IS024_CONTEXT& ctx) +{ + ctx_print_debug(ctx); + check_available_prg(immArgSzT::byte_sz + REGEX024_BYTECODE_NEAR_POINTER_SZ); + uint64_t imm_val_B = immArgSzT::extract(ctx); + regex_near_ptr_t dest = ctx.extract_near_pointer(); + uint64_t imm_val_A = ctx.INP; + if (conditionT::call(imm_val_A, imm_val_B)) + ctx.active_thread.IP = dest; +} + +struct condEqual{static bool call(uint64_t A, uint64_t B){return A == B;}}; +struct condLess{static bool call(uint64_t A, uint64_t B){return A < B;}}; +struct condGrtr{static bool call(uint64_t A, uint64_t B){return A > B;}}; + +struct immArgByte{ + static constexpr int byte_sz = 1; + static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_b();} +}; +struct immArgWord{ + static constexpr int byte_sz = 2; + static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_w();} +}; +struct immArgDoubleWord{ + static constexpr int byte_sz = 4; + static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_dw();} +}; +struct immArgQuadWord{ + static constexpr int byte_sz = 8; + static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_qw();} +}; + +void 
clone_thread_into_slot(REGEX_IS024_Thread& source, REGEX_IS024_Thread& vessel){ + thread_print_debug(source); + my_assert(!(vessel.slot_occupation_status & SLOT_OCCUPIED)); + my_assert((source.slot_occupation_status & SLOT_OCCUPIED)); + vessel = source; + if (vessel.CAHptr){ + vessel.CAHptr->refs++; + } + if (vessel.SAptr){ + vessel.SAptr[0]++; + } +} + +/* One FORK-slot governs the one single unique position in program: the next one after the fork */ +void i_FORK(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ + REGEX024_BYTECODE_NEAR_POINTER_SZ); + regex_sslot_id_t ssid = ctx.extract_sslot_id(); + regex_near_ptr_t dest = ctx.extract_near_pointer(); + if (ssid >= ctx.fork_slots_number) + smitsya(fork_sslot_out_of_range); + REGEX_IS024_Thread& other = ctx.FORK_halted_slots[ssid]; + if (other.slot_occupation_status & SLOT_OCCUPIED){ + start_cloning_conflict(ctx, other, dest); + } else { + clone_thread_into_slot(ctx.active_thread, other); + ctx.active_thread.IP = dest; + ctx.FORK_halted_stack.append(ssid); + } +} + +void i_MATCH(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + if (ctx.matched_thread.slot_occupation_status & SLOT_OCCUPIED){ + start_cloning_conflict(ctx, ctx.matched_thread, ctx.active_thread.IP); + } else { + clone_thread_into_slot(ctx.active_thread, ctx.matched_thread); + } +} + +void i_DIE(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + ctx.active_thread.delete_thread(); + ctx.try_to_continue_scheduled(); +} + +void i_PARAM_READ_SS_NUMBER(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ) + regex_sslot_id_t read_slots_number = ctx.extract_sslot_id(); + ctx.read_slots_number = read_slots_number; +} + +void i_PARAM_FORK_SS_NUMBER(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + 
check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ) + regex_sslot_id_t fork_slots_number = ctx.extract_sslot_id(); + ctx.fork_slots_number = fork_slots_number; +} + +void i_PARAM_SELARR_LEN(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) + regex_tai_t selection_array_len = ctx.extract_track_array_index(); + ctx.selection_array_len = selection_array_len; +} + +void i_PARAM_COLSIFTFUNC_SET(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(REGEX024_BYTECODE_NEAR_POINTER_SZ) + regex_near_ptr_t sift_function = ctx.extract_near_pointer(); + ctx.have_sift_function = true; + ctx.sift_function = sift_function; +} + +void i_PARAM_COLSIFTFUNC_WIPE(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + ctx.have_sift_function = false; +} + +void i_MSG_MULTISTART_ALLOWED(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(1) + ctx.allows_multistart = (bool)ctx.extract_b(); +} + +void i_MSG_FED_INPUT_EXTENDED(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + check_available_prg(1 + 1 + REGEX024_BYTECODE_SSLOT_ID_SZ) + ctx.fed_input_extends_left = ctx.extract_b(); + ctx.fed_input_extends_right = ctx.extract_b(); + ctx.portion_of_second_read_halt_ns = ctx.extract_sslot_id(); +} + +uint64_t get_el_from_selarr(uint64_t* sa, regex_near_ptr_t ind){ + return sa ? 
sa[1UL + ind] : 0; +} + +void i_DMOV_RABX_SELARR(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + sift_mode_check() + check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) + regex_tai_t i1 = ctx.extract_track_array_index(); + if (i1 >= ctx.selection_array_len) + smitsya(selection_arr_out_of_range); + ctx.RAX = get_el_from_selarr(ctx.active_thread.SAptr, i1); + ctx.RBX = get_el_from_selarr(ctx.sifting_with->SAptr, i1); +} + +uint64_t get_selarr_el_dist(uint64_t* sa, uint16_t start, uint16_t end){ + uint64_t v_start = get_el_from_selarr(sa, start); + uint64_t v_end = get_el_from_selarr(sa, end); + return v_end > v_start ? v_end - v_start : 0; +} + +void i_DDIST_RABX_SELARR(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + sift_mode_check() + check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ * 2) + regex_tai_t i_start = ctx.extract_track_array_index(); + if (i_start >= ctx.selection_array_len) + smitsya(selection_arr_out_of_range); + regex_tai_t i_end = ctx.extract_track_array_index(); + if (i_end >= ctx.selection_array_len) + smitsya(selection_arr_out_of_range); + ctx.RAX = get_selarr_el_dist(ctx.active_thread.SAptr, i_start, i_end); + ctx.RBX = get_selarr_el_dist(ctx.sifting_with->SAptr, i_start, i_end); +} + +void finish_conflict_homesteader_wins(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + if (ctx.who_started_sift == regex024_opcodes::READ){ + ctx.active_thread.delete_thread(); + ctx.try_to_continue_scheduled(); + } else { + /* FORK or MATCH (which will also be shown as FORK) */ + /* Cloning conflict ends, active_thread jumps to offsprings IP */ + ctx.active_thread.IP = ctx.child_ret_IP; + } + ctx.sifting_with = NULL; +} + +void finish_conflict_intruder_wins(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + ctx.sifting_with->delete_thread(); + ctx.active_thread.IP = ctx.intruder_IP; + if (ctx.who_started_sift == regex024_opcodes::READ){ + /* noncloning conflict won by intruder+ */ + *ctx.sifting_with = ctx.active_thread; + 
ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val; + ctx.try_to_continue_scheduled(); + } else { + /* End of cloning conflict (it involved cloning) */ + clone_thread_into_slot(ctx.active_thread, *ctx.sifting_with); + ctx.active_thread.IP = ctx.child_ret_IP; + } + ctx.sifting_with = NULL; +} + +void i_SIFTPRIOR_MIN_RABX(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + sift_mode_check() + if (ctx.RAX < ctx.RBX){ + finish_conflict_intruder_wins(ctx); + } else if (ctx.RAX > ctx.RBX){ + finish_conflict_homesteader_wins(ctx); + } +} + +void i_SIFTPRIOR_MAX_RABX(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + sift_mode_check() + if (ctx.RAX > ctx.RBX){ + finish_conflict_intruder_wins(ctx); + } else if (ctx.RAX < ctx.RBX){ + finish_conflict_homesteader_wins(ctx); + } +} + +void i_SIFT_DONE(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + sift_mode_check() + finish_conflict_homesteader_wins(ctx); +} + +/* Can give errors */ +void ca_branch_new_node(REGEX_IS024_CONTEXT& ctx, regex_tai_t key, uint64_t val){ + ctx_print_debug(ctx); + if (ctx.CAN_total >= ctx.CA_TREE_LIMIT) + smitsya(ca_tree_limit_violation); + REGEX024_CollectionArrayNode* node = new REGEX024_CollectionArrayNode{key, val, ctx.active_thread.CAHptr, 1}; + // if (ctx.active_thread.CAHptr) + // (ctx.active_thread.CAHptr->refs)++; + ctx.active_thread.CAHptr = node; + ctx.CAN_total++; +} + +void i_MOV_COLARR_IMM(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8) + regex_tai_t ca_ind = ctx.extract_track_array_index(); + uint64_t imm = ctx.extract_qw(); + ca_branch_new_node(ctx, ca_ind, imm); +} + +void i_MOV_COLARR_BTPOS(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) + regex_tai_t ca_ind = ctx.extract_track_array_index(); + ca_branch_new_node(ctx, ca_ind, ctx.passed_bytes); +} + +/* Can throw 
error, should be placed at the end. Call ONLY in general matching mode */ +void edit_selection_array(REGEX_IS024_CONTEXT& ctx, uint64_t key, uint64_t val){ + ctx_print_debug(ctx); + uint64_t N = ctx.selection_array_len; + if (key >= N) + smitsya(selection_arr_out_of_range); + if (!ctx.active_thread.SAptr){ + uint64_t* sa_instance = (uint64_t*)calloc(N + 1, 8); + if (!sa_instance) + throw std::bad_alloc(); + sa_instance[0] = 1; + sa_instance[key + 1] = val; + ctx.active_thread.SAptr = sa_instance; + } else if (ctx.active_thread.SAptr[0] == 1){ + ctx.active_thread.SAptr[key + 1] = val; + } else { + uint64_t* sa_instance = (uint64_t*)calloc(N + 1, 8); + if (!sa_instance) + throw std::bad_alloc(); + sa_instance[0] = 1; + for (uint64_t i = 1; i <= ctx.selection_array_len; i++) + sa_instance[i] = ctx.active_thread.SAptr[i]; + sa_instance[key + 1] = val; + ctx.active_thread.SAptr[0]--; + ctx.active_thread.SAptr = sa_instance; + } +} + +void i_MOV_SELARR_IMM(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8) + regex_tai_t sa_ind = ctx.extract_track_array_index(); + uint64_t imm = ctx.extract_qw(); + edit_selection_array(ctx, sa_ind, imm); +} + +void i_MOV_SELARR_CHPOS(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + general_matching_mode_check() + check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ) + regex_tai_t sa_ind = ctx.extract_track_array_index(); + edit_selection_array(ctx, sa_ind, ctx.passed_chars); +} + +void calloc_stack_slots(REGEX_IS024_Stack& stack, regex_sslot_id_t nmemb) { + assert(stack.sz == 0 && !stack.slots); + regex_sslot_id_t* storage = static_cast(calloc(nmemb, sizeof(regex_sslot_id_t))); + if (!storage) + throw std::bad_alloc(); + stack.slots = storage; +} + +REGEX_IS024_Thread* calloc_slots_array(regex_sslot_id_t nmemb) { + REGEX_IS024_Thread* ptr = static_cast(calloc(nmemb, sizeof(REGEX_IS024_Thread))); + if (!ptr) + throw 
std::bad_alloc(); + return ptr; +} + +void i_INIT(REGEX_IS024_CONTEXT& ctx){ + ctx_print_debug(ctx); + initialization_phase_check() + if (ctx.selection_array_len > ctx.SA_LEN_LIMIT) + smitsya(sa_length_limit_violation); + if (ctx.read_slots_number > ctx.READ_SS_LIMIT) + smitsya(read_sslot_count_limit_violation); + if (ctx.fork_slots_number > ctx.FORK_SS_LIMIT) + smitsya(fork_sslot_count_limit_violation); + if (ctx.portion_of_second_read_halt_ns > ctx.read_slots_number) + smitsya(fork_sslot_out_of_range); + ctx.READ_halted_slots = calloc_slots_array(ctx.read_slots_number); + calloc_stack_slots(ctx.READ_halted_stack_old, ctx.read_slots_number); + + ctx.portion_of_FIRST_read_halt_ns = ctx.read_slots_number - ctx.portion_of_second_read_halt_ns; + calloc_stack_slots(ctx.READ_halted_stack_new_first, ctx.portion_of_FIRST_read_halt_ns); + calloc_stack_slots(ctx.READ_halted_stack_new_second, ctx.portion_of_second_read_halt_ns); + + ctx.FORK_halted_slots = calloc_slots_array(ctx.fork_slots_number); + calloc_stack_slots(ctx.FORK_halted_stack, ctx.fork_slots_number); + + ctx.initialized = true; + ctx.unnatural_started_thread_IP = ctx.active_thread.IP; + ctx.active_thread.delete_thread(); +} + +void i_THROW(REGEX_IS024_CONTEXT& ctx){ + ctx.error = regex024_error_codes::program_throw; +} + +void instruction_table(REGEX_IS024_CONTEXT &ctx) { + ctx_print_debug(ctx); + uint8_t opcode = ctx.extract_instruction(); + +#define rcase(inst) case regex024_opcodes::inst: return i_ ## inst (ctx); +#define jumpC(UN, st) case regex024_opcodes::JC ## UN ## _B: return i_JC(ctx); \ + case regex024_opcodes::JC ## UN ## _W: return i_JC(ctx); \ + case regex024_opcodes::JC ## UN ## _DW: return i_JC(ctx); \ + case regex024_opcodes::JC ## UN ## _QW: return i_JC(ctx); + switch (opcode) { + rcase(READ) + rcase(READZ) + rcase(JUMP) + + jumpC(EQUAL, condEqual) + jumpC(LESS, condLess) + jumpC(GRTR, condGrtr) + + rcase(FORK) + rcase(MATCH) + rcase(DIE) + rcase(PARAM_READ_SS_NUMBER) + 
rcase(PARAM_FORK_SS_NUMBER) + rcase(PARAM_SELARR_LEN) + rcase(PARAM_COLSIFTFUNC_SET) + rcase(PARAM_COLSIFTFUNC_WIPE) + rcase(MSG_MULTISTART_ALLOWED) + rcase(MSG_FED_INPUT_EXTENDED) + rcase(DMOV_RABX_SELARR) + rcase(DDIST_RABX_SELARR) + rcase(SIFTPRIOR_MIN_RABX) + rcase(SIFTPRIOR_MAX_RABX) + rcase(SIFT_DONE) + rcase(MOV_COLARR_IMM) + rcase(MOV_COLARR_BTPOS) + rcase(MOV_SELARR_IMM) + rcase(MOV_SELARR_CHPOS) + rcase(INIT) + rcase(THROW) + default: + ctx.error = regex024_error_codes::invalid_opcode; + } +} diff --git a/src/libregexis024vm/instruction_implementation.h b/src/libregexis024vm/instruction_implementation.h new file mode 100644 index 0000000..50ee2b4 --- /dev/null +++ b/src/libregexis024vm/instruction_implementation.h @@ -0,0 +1,35 @@ +#ifndef LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H +#define LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H + +/* This file should not be included outside libregex024 virtual machine implementation */ + +#include +#include +#include + +#define smitsya(error_type) do {ctx.error = regex024_error_codes::error_type; return; } while (0) + +#define SLOT_EMPTY_val 0 +#define SLOT_OCCUPIED 1 +#define SLOT_OCCUPIED_val SLOT_OCCUPIED +#define SLOT_NEW 2 +#define SLOT_NEW_val (SLOT_OCCUPIED | SLOT_NEW) + +#define check_available_prg(regionSz) if (!ctx.check_inboundness(regionSz)){ \ + ctx.error = regex024_error_codes::improper_finish; return; } + + +#if defined(LIBREGEXIS024_DEBUG) && defined(LIBREGEXIS024_ALLOW_LOUD) +#include +#define my_assert(expr) assert(expr) +#define ctx_print_debug(ctx) debug_print_context(ctx, __func__) +#define thread_print_debug(thread) debug_print_thread(thread, __func__) +#else +#define my_assert(expr) assert(expr) +#define ctx_print_debug(ctx) +#define thread_print_debug(thread) +#endif + +void instruction_table(REGEX_IS024_CONTEXT& ctx); + +#endif //LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H \ No newline at end of file diff --git a/src/libregexis024vm/libregex024opcodes_stringification.cpp 
b/src/libregexis024vm/libregex024opcodes_stringification.cpp new file mode 100644 index 0000000..ce0e330 --- /dev/null +++ b/src/libregexis024vm/libregex024opcodes_stringification.cpp @@ -0,0 +1,47 @@ +#include +#include + +#define rcase(name) case regex024_opcodes::name: return #name; + +const char *regex024_opcode_tostr(regex024_opcode x) { + switch (x) { + rcase(READ) + rcase(READZ) + rcase(JUMP) + rcase(JCEQUAL_B) + rcase(JCEQUAL_W) + rcase(JCEQUAL_DW) + rcase(JCEQUAL_QW) + rcase(JCLESS_B) + rcase(JCLESS_W) + rcase(JCLESS_DW) + rcase(JCLESS_QW) + rcase(JCGRTR_B) + rcase(JCGRTR_W) + rcase(JCGRTR_DW) + rcase(JCGRTR_QW) + rcase(FORK) + rcase(MATCH) + rcase(DIE) + rcase(PARAM_READ_SS_NUMBER) + rcase(PARAM_FORK_SS_NUMBER) + rcase(PARAM_SELARR_LEN) + rcase(PARAM_COLSIFTFUNC_SET) + rcase(PARAM_COLSIFTFUNC_WIPE) + rcase(MSG_MULTISTART_ALLOWED) + rcase(MSG_FED_INPUT_EXTENDED) + rcase(DMOV_RABX_SELARR) + rcase(DDIST_RABX_SELARR) + rcase(SIFTPRIOR_MIN_RABX) + rcase(SIFTPRIOR_MAX_RABX) + rcase(SIFT_DONE) + rcase(MOV_COLARR_IMM) + rcase(MOV_COLARR_BTPOS) + rcase(MOV_SELARR_IMM) + rcase(MOV_SELARR_CHPOS) + rcase(INIT) + rcase(THROW) + default: + return "Invalid opcode"; + } +} diff --git a/src/libregexis024vm/libregexis024vm.h b/src/libregexis024vm/libregexis024vm.h new file mode 100644 index 0000000..6a33aa8 --- /dev/null +++ b/src/libregexis024vm/libregexis024vm.h @@ -0,0 +1,158 @@ +#ifndef LIBREGEXIS024_LIBREGEXIS024VM_H +#define LIBREGEXIS024_LIBREGEXIS024VM_H + +/* This thing is bloated. And slow (Because I designed it imperfectly and because it is bloated). + * I could have halven the amount of bloat, but that would require me writing code in headers. + * I am gonna use it for KM, even more bloated project. So I thought that this design is on the spot. + * C++ is such a funny language. Code is divided into .cpp and .h files. But it only makes problems. + * All of my work on this C++ project was not serious from the beginning. It's all funny stuff. 
*/ + +/* Also, please, consider using libregexis024vm/libregexis024vm_interface.h + * Naming in this project is super inconsistent. I don't want it to trash your namespace */ + +#include +#include +#include +#include +#include + +struct REGEX_IS024_Stack{ + regex_sslot_id_t* slots = NULL; + regex_sslot_id_t sz = 0; + + regex_sslot_id_t pop(); + void append(regex_sslot_id_t x); + bool empty() const; + bool non_empty() const; + + REGEX_IS024_Stack(const REGEX_IS024_Stack&) = delete; + REGEX_IS024_Stack& operator=(const REGEX_IS024_Stack&) = delete; + REGEX_IS024_Stack() = default; + + ~REGEX_IS024_Stack(); +}; + +struct REGEX024_CollectionArrayNode{ + /* Key is small for historical reasons I do not rememeber. Who cares anyway */ + regex_tai_t key; + uint64_t value; + /* NULL at the beginning */ + REGEX024_CollectionArrayNode* prev; + /* Reference counting */ + uint64_t refs = 0; +}; + +struct REGEX_IS024_Thread{ + /* First byte field is used only when thread is located in slot */ + uint8_t slot_occupation_status = 0; + regex_near_ptr_t IP = 0; + REGEX024_CollectionArrayNode* CAHptr = NULL; + /* Pointer to the seletion array. SA's are reference counted. Because of that every SA + * is elongated by one meta element in the beginning - reference counter. So the actual elements + * are enumerated starting from one. 
*/ + uint64_t* SAptr = NULL; + + void delete_thread() noexcept; + void debug_print(const char* place); +}; + +struct REGEX_IS024_CONTEXT{ + REGEX_IS024_CONTEXT(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, regex_tai_t saLenLimit, + regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit, uint64_t timeTickLimit); + + regex024_error_code feedSOF(); + /* You can safely pile up calls to this command, nothing bad will happen */ + regex024_error_code startThread(); + regex024_error_code extendedFeedCharacter(uint64_t input); + regex024_error_code feedCharacter(uint64_t INP, uint64_t corresponding_byte_amount); + + + ~REGEX_IS024_CONTEXT(); + + /* Program size larger than 2^62 is forbidden */ + size_t program_size = 0; + const uint8_t* prg = NULL; + + /* Max allowed index of CA is 2^16 - 1 + * Max allowed index of SA is 2^16 - 1. VM can be configured to allow even less */ + /* CA = Collecton array. */ + uint64_t CA_TREE_LIMIT; + /* SA = Selection array */ + regex_tai_t SA_LEN_LIMIT; + regex_sslot_id_t READ_SS_LIMIT; + regex_sslot_id_t FORK_SS_LIMIT; + + /* If time_tick_limit is non-zero, regex virtual machine will stop with error + * after this many ticks. This parameter set's the timeout.*/ + uint64_t time_tick_limit; + + /* This context is used only for one FA match session. 
This field measures each tick + * timer <= time_tick_limit */ + uint64_t timer = 0; + /* CAN_total <= CA_TREE_LIMIT */ + uint64_t CAN_total = 0; + + /* Program selects it */ + regex_tai_t selection_array_len = 0; + regex_sslot_id_t read_slots_number = 0; + regex_sslot_id_t fork_slots_number = 0; + + bool have_sift_function = false; + regex_near_ptr_t sift_function; + + bool allows_multistart = false; + uint8_t fed_input_extends_left = 0, fed_input_extends_right = 0; + regex_sslot_id_t portion_of_second_read_halt_ns = 0, portion_of_FIRST_read_halt_ns = 0; + + bool initialized = false; + regex_near_ptr_t unnatural_started_thread_IP = 1337; + regex024_error_code error = regex024_error_codes::stable; + + REGEX_IS024_Thread* READ_halted_slots; + REGEX_IS024_Stack READ_halted_stack_old; + REGEX_IS024_Stack READ_halted_stack_new_first; + REGEX_IS024_Stack READ_halted_stack_new_second; + REGEX_IS024_Thread* FORK_halted_slots; + REGEX_IS024_Stack FORK_halted_stack; + + REGEX_IS024_Thread active_thread; + + /* Environment for sifting stuff */ + REGEX_IS024_Thread* sifting_with = NULL; + /* specifies the type of operation vm should do after shift (there are only two distinct options) */ + uint8_t who_started_sift; + /* Sifting process uses IP field of active thread. Other data of thread is not modified or used during collision + * procudure. Old IP is stored there, if needed */ + regex_near_ptr_t child_ret_IP; + regex_near_ptr_t intruder_IP; + /* RAX corresponds to intruder. Its data is stored in active thread field*/ + uint64_t RAX; + /* RBX corresponds to homesteader. Its data is accessible by `REGEX_IS024_Thread* sifting_with` pointer*/ + uint64_t RBX; + + /* Will be unoccupied if no threads matched. 
After each feed of character this field will be wiped + * User should take care of intermediate success himself */ + REGEX_IS024_Thread matched_thread; + + uint64_t INP = 0; + uint64_t passed_chars = 0; + uint64_t passed_bytes = 0; + + void try_to_continue_scheduled(); + + bool check_inboundness(int region); + + uint8_t extract_b(); + uint16_t extract_w(); + uint32_t extract_dw(); + uint64_t extract_qw(); + + uint8_t extract_instruction(); + regex_sslot_id_t extract_sslot_id(); + regex_near_ptr_t extract_near_pointer(); + regex_tai_t extract_track_array_index(); + + void debug_print(const char* place); +}; + +#endif //LIBREGEXIS024_LIBREGEXIS024VM_H diff --git a/src/libregexis024vm/libregexis024vm_context.cpp b/src/libregexis024vm/libregexis024vm_context.cpp new file mode 100644 index 0000000..81c5e41 --- /dev/null +++ b/src/libregexis024vm/libregexis024vm_context.cpp @@ -0,0 +1,197 @@ +#include +#include +#include + +regex_sslot_id_t REGEX_IS024_Stack::pop() { + assert(sz != 0); + return slots[--sz]; +} + +void REGEX_IS024_Stack::append(regex_sslot_id_t x) { + assert(slots); + slots[sz] = x; + sz++; +} + +bool REGEX_IS024_Stack::empty() const { + return !non_empty(); +} + +bool REGEX_IS024_Stack::non_empty() const { + return sz; +} + +REGEX_IS024_Stack::~REGEX_IS024_Stack() { + assert(empty()); + free(slots); +} + +REGEX_IS024_CONTEXT::REGEX_IS024_CONTEXT(size_t programSize, const uint8_t *data, + uint64_t caTreeLimit, regex_tai_t saLenLimit, + regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit, + uint64_t timeTickLimit) : + program_size(programSize), prg(data), CA_TREE_LIMIT(caTreeLimit), SA_LEN_LIMIT(saLenLimit), + READ_SS_LIMIT(readSsLimit), FORK_SS_LIMIT(forkSsLimit), time_tick_limit(timeTickLimit) +{ + if (program_size > (1UL << 62)) + exitf("Program is too huge\n"); + active_thread.slot_occupation_status = SLOT_OCCUPIED; +} + +/* No only will it launch a wave of deallocation in CA tree, but as a nice bonus it's + * gonna deoccupy 
slot_occupation_status*/ +void REGEX_IS024_Thread::delete_thread() noexcept { + thread_print_debug(*this); + my_assert(slot_occupation_status & SLOT_OCCUPIED); + slot_occupation_status = SLOT_EMPTY_val; + REGEX024_CollectionArrayNode* cur_CAptr = CAHptr; + while (cur_CAptr){ + assert(cur_CAptr->refs > 0); + if (--(cur_CAptr->refs) == 0){ + REGEX024_CollectionArrayNode* next_CAptr = cur_CAptr->prev; + delete cur_CAptr; + cur_CAptr = next_CAptr; + } else + break; + } + if (SAptr){ + if (--(SAptr[0]) == 0) + free(SAptr); + } +} + +void emptify_one_of_new_read_halted_stacks(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Stack& type_new_stack){ + while (type_new_stack.non_empty()){ + REGEX_IS024_Thread& thread = ctx.READ_halted_slots[type_new_stack.pop()]; + assert(thread.slot_occupation_status & SLOT_OCCUPIED); + thread.delete_thread(); + } +} + +/* First it will try to pop pending thread from FORK_halted_stack + * Then it will try popping thread from READ_halted_stack_old (checking if top + * thread here is not actually SLOT_NEW). If something succeded, corresponding slot will be deoccupied, and + * active slot will be occupied with it. + * + * try_to_continue_scheduled() assumes that active thread is unoccupied.*/ +void REGEX_IS024_CONTEXT::try_to_continue_scheduled(){ + ctx_print_debug(*this); + my_assert(!(active_thread.slot_occupation_status & SLOT_OCCUPIED)); + if (FORK_halted_stack.sz){ + regex_sslot_id_t ssid = FORK_halted_stack.pop(); + active_thread = FORK_halted_slots[ssid]; + FORK_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val; + return; + } + while (READ_halted_stack_old.sz){ + regex_sslot_id_t ssid = READ_halted_stack_old.pop(); + if (READ_halted_slots[ssid].slot_occupation_status & SLOT_NEW){ + /* This is the case when old thread was silently replaced by settled new thread */ + continue; + } + active_thread = READ_halted_slots[ssid]; + READ_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val; + return; + } + /* Failure here will be detected. 
We started with unoccupied active thread. iterator inside kick will see it */ +} + +void kick(REGEX_IS024_CONTEXT& ctx) { + ctx_print_debug(ctx); + while ((ctx.active_thread.slot_occupation_status & SLOT_OCCUPIED) + && ctx.error == regex024_error_codes::stable){ + if (ctx.timer >= ctx.time_tick_limit) + smitsya(timeout); + ctx.timer++; + + check_available_prg(REGEX024_BYTECODE_INSTRUCTION_SZ) // May return from kick(ctx) + // smivanie from those instructions will be immediately detected. Everything is OK + instruction_table(ctx); + } +} + + +regex024_error_code REGEX_IS024_CONTEXT::feedSOF() { + ctx_print_debug(*this); + kick(*this); + return error; +} + +regex024_error_code REGEX_IS024_CONTEXT::startThread() { + ctx_print_debug(*this); + active_thread.slot_occupation_status = SLOT_OCCUPIED; + active_thread.IP = unnatural_started_thread_IP; + active_thread.SAptr = NULL; + active_thread.CAHptr = NULL; + kick(*this); + return error; +} + +/* I hate C++ (aka antichrist), won't use move sementic (aka drink cornsyrup) */ +void swap_stacks(REGEX_IS024_Stack& A, REGEX_IS024_Stack& B) { + std::swap(A.sz, B.sz); + std::swap(A.slots, B.slots); +} + +void fill_empty_old_read_halted_stack(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Stack& read_halted_stack_new){ + ctx_print_debug(ctx); + my_assert(!ctx.READ_halted_stack_old.non_empty()); + + // Actually, READ_halted_stack_old is always empty in this case + assert(ctx.READ_halted_stack_old.empty()); + swap_stacks(ctx.READ_halted_stack_old, read_halted_stack_new); + for (uint32_t i = 0; i < ctx.READ_halted_stack_old.sz; i++){ + REGEX_IS024_Thread& slot = ctx.READ_halted_slots[ctx.READ_halted_stack_old.slots[i]]; + /* Should get rid of 'NEW' qualifier */ + assert(slot.slot_occupation_status & SLOT_OCCUPIED); + if (slot.slot_occupation_status & SLOT_OCCUPIED) + slot.slot_occupation_status = SLOT_OCCUPIED; + } +} + +regex024_error_code REGEX_IS024_CONTEXT::feedCharacter(uint64_t input, uint64_t corresponding_byte_amount) { + 
ctx_print_debug(*this); + if (matched_thread.slot_occupation_status & SLOT_OCCUPIED) + matched_thread.delete_thread(); + emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second); + fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_first); + INP = input; + passed_bytes += corresponding_byte_amount; + passed_chars++; + try_to_continue_scheduled(); + kick(*this); + return error; +} + +regex024_error_code REGEX_IS024_CONTEXT::extendedFeedCharacter(uint64_t input) { + ctx_print_debug(*this); + if (matched_thread.slot_occupation_status & SLOT_OCCUPIED) + matched_thread.delete_thread(); + fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_second); + INP = input; + try_to_continue_scheduled(); + kick(*this); + return error; +} + +REGEX_IS024_CONTEXT::~REGEX_IS024_CONTEXT() { + ctx_print_debug(*this); + if (initialized){ + emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_first); + emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second); + while (READ_halted_stack_old.non_empty()){ + REGEX_IS024_Thread& thread = READ_halted_slots[READ_halted_stack_old.pop()]; + assert(thread.slot_occupation_status & SLOT_OCCUPIED); + if (!(thread.slot_occupation_status & SLOT_NEW)) + thread.delete_thread(); + } + free(READ_halted_slots); + while (FORK_halted_stack.non_empty()) + FORK_halted_slots[FORK_halted_stack.pop()].delete_thread(); + free(FORK_halted_slots); + + if (matched_thread.slot_occupation_status & SLOT_OCCUPIED){ + matched_thread.delete_thread(); + } + } +} diff --git a/src/libregexis024vm/libregexis024vm_disassembly.cpp b/src/libregexis024vm/libregexis024vm_disassembly.cpp new file mode 100644 index 0000000..8d94165 --- /dev/null +++ b/src/libregexis024vm/libregexis024vm_disassembly.cpp @@ -0,0 +1,38 @@ +#include +#include + +bool REGEX_IS024_CONTEXT::check_inboundness(int region){ + return vmprog_check_inboundness(program_size, active_thread.IP, region); +} + +uint8_t 
REGEX_IS024_CONTEXT::extract_b() { + return vmprog_extract_b(&active_thread.IP, prg); +} + +uint16_t REGEX_IS024_CONTEXT::extract_w() { + return vmprog_extract_w(&active_thread.IP, prg); +} + +uint32_t REGEX_IS024_CONTEXT::extract_dw() { + return vmprog_extract_dw(&active_thread.IP, prg); +} + +uint64_t REGEX_IS024_CONTEXT::extract_qw() { + return vmprog_extract_qw(&active_thread.IP, prg); +} + +uint8_t REGEX_IS024_CONTEXT::extract_instruction() { + return extract_b(); +} + +regex_sslot_id_t REGEX_IS024_CONTEXT::extract_sslot_id() { + return extract_dw(); +} + +regex_near_ptr_t REGEX_IS024_CONTEXT::extract_near_pointer() { + return extract_qw(); +} + +regex_tai_t REGEX_IS024_CONTEXT::extract_track_array_index() { + return extract_w(); +} diff --git a/src/libregexis024vm/libregexis024vm_interface.cpp b/src/libregexis024vm/libregexis024vm_interface.cpp new file mode 100644 index 0000000..371cce9 --- /dev/null +++ b/src/libregexis024vm/libregexis024vm_interface.cpp @@ -0,0 +1,105 @@ +#include +#include +#include + +bool REGEX_IS024_CAEvent::operator==(const REGEX_IS024_CAEvent &other) const { + return (key == other.key) && (value == other.value); +} + +#define reveal ((REGEX_IS024_CONTEXT*)opaque) + +REGEX_IS024_VirtualMachine::REGEX_IS024_VirtualMachine(size_t programSize, const uint8_t *data, + uint64_t caTreeLimit, regex_tai_t saLenLimit, + regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit, + uint64_t timeTickLimit) { + opaque = new REGEX_IS024_CONTEXT(programSize, data, caTreeLimit, saLenLimit, + readSsLimit, forkSsLimit, timeTickLimit); +} + +regex024_error_code REGEX_IS024_VirtualMachine::initialize() { + if (gave_SOF) + exitf("double feedSOF\n"); + gave_SOF = true; + return reveal->feedSOF(); +} + +bool REGEX_IS024_VirtualMachine::isInitialized() { + return reveal->initialized; +} + +bool REGEX_IS024_VirtualMachine::isUsable() { + return isInitialized() && reveal->error == regex024_error_codes::stable; +} + 
+REGEX_IS024_VirtualMachine::~REGEX_IS024_VirtualMachine() { // destroys the context allocated by the constructor
+    delete reveal;
+}
+
+regex_tai_t REGEX_IS024_VirtualMachine::getSelectionArrayLength() { // the getters below return a neutral value (0 / false) while the machine is not usable
+    return isUsable() ? reveal->selection_array_len : 0;
+}
+
+bool REGEX_IS024_VirtualMachine::isAllowMultistart() {
+    return isUsable() ? reveal->allows_multistart : false;
+}
+
+uint8_t REGEX_IS024_VirtualMachine::getInputLeftExtensionSize() {
+    return isUsable() ? reveal->fed_input_extends_left : 0;
+}
+
+uint8_t REGEX_IS024_VirtualMachine::getInputRightExtensionSize() {
+    return isUsable() ? reveal->fed_input_extends_right : 0;
+}
+
+regex024_error_code REGEX_IS024_VirtualMachine::getErrno() { // unlike the getters above, this one is not guarded by isUsable()
+    return reveal->error;
+}
+
+/* Checks whether any thread is ready to continue reading the actual string (READ_halted_stack_new_first is non-empty) */
+bool REGEX_IS024_VirtualMachine::haveSurvivors() {
+    return isUsable() && (reveal->READ_halted_stack_new_first.non_empty());
+}
+
+bool REGEX_IS024_VirtualMachine::isMatched() { // true when the matched-thread slot is occupied
+    return isUsable() && static_cast<bool>((reveal->matched_thread.slot_occupation_status & SLOT_OCCUPIED)); // <bool> restored: the cast target was missing in this dump
+}
+
+std::vector<REGEX_IS024_CAEvent> REGEX_IS024_VirtualMachine::getMatchedThreadCABranchReverse() { // element type restored from the push_back below; empty vector when nothing matched
+    if (!isMatched())
+        return {};
+    std::vector<REGEX_IS024_CAEvent> res;
+    REGEX024_CollectionArrayNode* cur = reveal->matched_thread.CAHptr;
+    while (cur != NULL){ // follows the prev links, so events come out starting from the node CAHptr points to
+        res.push_back({cur->key, cur->value});
+        cur = cur->prev;
+    }
+    return res;
+}
+
+uint64_t REGEX_IS024_VirtualMachine::getMatchedThreadSAValue(uint16_t key) { // 0 for an out-of-range key, when nothing matched, or when SAptr is null
+    if (key >= getSelectionArrayLength())
+        return 0;
+    if (!isMatched())
+        return 0;
+    return reveal->matched_thread.SAptr ? reveal->matched_thread.SAptr[key + 1] : 0; // NOTE(review): index is key + 1, so SAptr[0] presumably holds bookkeeping - confirm against the thread layout
+}
+
+regex024_error_code REGEX_IS024_VirtualMachine::addNewMatchingThread() { // the three entry points below terminate the process through exitf when the machine is not usable
+    if (!isUsable())
+        exitf("unusable\n");
+    // if (started_first_thread && !isAllowMultistart())
+    //     exitf("Multistart is forbidden, bad usage of program\n");
+    return reveal->startThread();
+}
+
+regex024_error_code REGEX_IS024_VirtualMachine::extendedFeedCharacter(uint64_t input) {
+    if (!isUsable())
+        exitf("unusable\n");
+    return reveal->extendedFeedCharacter(input);
+}
+
+regex024_error_code REGEX_IS024_VirtualMachine::feedCharacter(uint64_t input, uint64_t bytesResembled) {
+    if (!isUsable())
+        exitf("unusable\n");
+    return reveal->feedCharacter(input, bytesResembled);
+}
diff --git a/src/libregexis024vm/libregexis024vm_interface.h b/src/libregexis024vm/libregexis024vm_interface.h
new file mode 100644
index 0000000..a0d1583
--- /dev/null
+++ b/src/libregexis024vm/libregexis024vm_interface.h
@@ -0,0 +1,46 @@
+#ifndef LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H
+#define LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H
+
+#include // NOTE(review): the include targets are missing in this dump; restore them from the upstream file
+#include
+#include
+#include
+
+struct REGEX_IS024_CAEvent{ // one (key, value) pair, as returned by getMatchedThreadCABranchReverse()
+    regex_tai_t key;
+    uint64_t value;
+    bool operator==(const REGEX_IS024_CAEvent& other) const;
+};
+
+class REGEX_IS024_VirtualMachine{ // wrapper whose methods forward to the context object kept behind `opaque`
+public:
+    REGEX_IS024_VirtualMachine(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, uint16_t saLenLimit,
+        uint32_t readSsLimit, uint32_t forkSsLimit, uint64_t timeTickLimit);
+
+    REGEX_IS024_VirtualMachine(const REGEX_IS024_VirtualMachine& ) = delete; // non-copyable: the object owns `opaque`
+    REGEX_IS024_VirtualMachine& operator=(const REGEX_IS024_VirtualMachine&) = delete;
+
+    regex024_error_code initialize(); // must be called exactly once; a second call terminates the process
+    bool isInitialized();
+    bool isUsable();
+    virtual ~REGEX_IS024_VirtualMachine();
+    regex_tai_t getSelectionArrayLength();
+    bool isAllowMultistart();
+    uint8_t getInputLeftExtensionSize();
+    uint8_t getInputRightExtensionSize();
+    regex024_error_code getErrno();
+    bool haveSurvivors();
+    bool isMatched();
+    std::vector<REGEX_IS024_CAEvent> getMatchedThreadCABranchReverse(); // element type restored; it was missing in this dump
+    uint64_t getMatchedThreadSAValue(uint16_t key);
+
+    regex024_error_code addNewMatchingThread(); // this and the two feed functions terminate the process when !isUsable()
+    regex024_error_code extendedFeedCharacter(uint64_t input);
+    regex024_error_code feedCharacter(uint64_t input, uint64_t bytesResembled);
+
+private:
+    bool gave_SOF = false; // set by initialize()
+    void* opaque; // context created in the constructor, deleted in the destructor
+};
+
+#endif //LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H
diff --git a/src/libregexis024vm/utils.cpp b/src/libregexis024vm/utils.cpp
new file mode 100644
index 0000000..1b2abe2
--- /dev/null
+++ b/src/libregexis024vm/utils.cpp
@@ -0,0 +1,84 @@
+#include // NOTE(review): the include targets are missing in this dump; restore them from the upstream file
+
+#include
+#include
+#include
+#include
+#include
+
+/* __ORDER_LITTLE_ENDIAN__ alone is defined on every GCC/Clang target; the real test compares it with __BYTE_ORDER__ */
+#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__)
+#error "Big endian is currently unsupported"
+#endif
+
+/* Prints a printf-style message to stderr, then calls exit(1). Never returns */
+void exitf(const char *fmt, ...) {
+    va_list va;
+    va_start(va, fmt);
+    vfprintf(stderr, fmt, va);
+    va_end(va);
+    exit(1);
+}
+
+/* Length (1..4) of the UTF-8 sequence announced by its first byte; -1 for a continuation or otherwise invalid lead byte */
+int utf8_retrieve_size(uint8_t firstByte) {
+    if (!(firstByte & 0b10000000))
+        return 1;
+    uint8_t a = 0b11000000; /* marker bits expected for an i-byte sequence */
+    uint8_t b = 0b00100000; /* the bit right after the marker, which must be 0 */
+    for (int i = 2; i <= 4; i++){
+        if ((firstByte & (a | b)) == a)
+            return i;
+        a |= b;
+        b >>= 1;
+    }
+    return -1;
+}
+
+/* Decodes the sz-byte sequence starting at string[pos]. Returns -1 when a trailing byte is not 10xxxxxx.
+ * Performs no bounds check (the caller must ensure sz bytes are readable) and does not reject overlong forms */
+int32_t utf8_retrieve_character(int sz, size_t pos, const uint8_t *string) {
+    if (sz == 1)
+        return string[pos];
+    uint32_t v = string[pos] & (0b01111111 >> sz); /* payload bits of the lead byte */
+    pos++;
+    for (int i = 1; i < sz; i++){
+        uint32_t th = string[pos];
+        if ((th & 0b11000000) != 0b10000000)
+            return -1;
+        v <<= 6;
+        v |= (th & 0b00111111);
+        pos++;
+    }
+    assert(v <= INT32_MAX);
+    return static_cast<int32_t>(v);
+}
+
+#define AAAAAA {cp = -1; return;}
+
+/* Decodes one code point at string[pos] with bounds checks. On any failure cp is set to -1.
+ * adj receives the sequence length; for an invalid lead byte it is set to 1 (the offending byte itself) */
+void utf8_string_iterat(int32_t &cp, size_t &adj, size_t pos, const uint8_t *string, size_t string_size) {
+    if (pos >= string_size) AAAAAA
+    /* Keep the size signed: once -1 is stored in the size_t adj, an `adj < 0` test can never fire */
+    const int sz = utf8_retrieve_size(string[pos]);
+    if (sz < 0) {
+        adj = 1;
+        AAAAAA
+    }
+    adj = static_cast<size_t>(sz);
+    /* pos < string_size here, so this subtraction cannot wrap (pos + adj could) */
+    if (adj > string_size - pos) AAAAAA
+    if ((cp = utf8_retrieve_character(sz, pos, string)) < 0) AAAAAA
+}
+
+/* True when strSample compares equal (strcmp) to one entry of strSet, an array terminated by a NULL pointer */
+bool is_string_in_stringset(const char *strSample, const char **strSet) {
+    const char** cmpSubject = strSet;
+    while ((*cmpSubject) != NULL){
+        if (strcmp(strSample, *cmpSubject) == 0)
+            return true;
+        cmpSubject++; // next pointer in the array
+    }
+    return false;
+}
diff --git a/src/libregexis024vm/utils.h b/src/libregexis024vm/utils.h
new file mode 100644
index 0000000..3650f19
--- /dev/null
+++ b/src/libregexis024vm/utils.h
@@ -0,0 +1,21 @@
+#ifndef LIBREGEXIS024_UTILS_H
+#define LIBREGEXIS024_UTILS_H
+
+#include // NOTE(review): the include targets are missing in this dump; restore them from the upstream file
+#include
+
+void exitf(const char* fmt, ...); // prints to stderr and terminates the process with exit(1)
+
+/* 1, 2, 3, 4 on success; -1 on error */
+int utf8_retrieve_size(uint8_t firstByte);
+
+/* sz is a positive value returned by utf8_retrieve_size. Returns negative on error */
+int32_t utf8_retrieve_character(int sz, size_t pos, const uint8_t* string);
+
+/* cp is negative on error. adj is the size of letter in bytes (1 when the lead byte is invalid).
+ * Can be used to adjust pos. All safety checks will be performed */
+void utf8_string_iterat(int32_t& cp, size_t& adj, size_t pos, const uint8_t* string, size_t string_size);
+
+bool is_string_in_stringset(const char* strSample, const char* strSet[]); // strSet must end with a NULL entry
+
+#endif //LIBREGEXIS024_UTILS_H
diff --git a/src/libregexis024vm/vm_errno.cpp b/src/libregexis024vm/vm_errno.cpp
new file mode 100644
index 0000000..78dbcb0
--- /dev/null
+++ b/src/libregexis024vm/vm_errno.cpp
@@ -0,0 +1,29 @@
+#include // NOTE(review): the include target is missing in this dump; restore it from the upstream file
+
+/* Name of an error code exactly as spelled in regex024_error_codes; "unknown_error_code" for any other value */
+const char *regex024_error_code_tostr(regex024_error_code x) {
+#define rcase(name) case regex024_error_codes::name: return #name;
+    switch (x) {
+        rcase(stable)
+        rcase(ca_tree_limit_violation)
+        rcase(sa_length_limit_violation)
+        rcase(read_sslot_count_limit_violation)
+        rcase(fork_sslot_count_limit_violation)
+        rcase(timeout)
+        rcase(improper_finish)
+        rcase(too_early)
+        rcase(too_late)
+        rcase(selection_arr_out_of_range)
+        rcase(read_sslot_out_of_range)
+        rcase(fork_sslot_out_of_range)
+        rcase(invalid_opcode)
+        rcase(invalid_register_code)
+        rcase(instruction_not_for_general_thread)
+        rcase(instruction_not_for_collision_thread)
+        rcase(program_throw) /* was missing: -16 used to be reported as "unknown_error_code" */
+        rcase(bad_alloc)
+        default:
+            return "unknown_error_code";
+    }
+#undef rcase
+}
diff --git a/src/libregexis024vm/vm_errno.h
b/src/libregexis024vm/vm_errno.h
new file mode 100644
index 0000000..cdfa1cd
--- /dev/null
+++ b/src/libregexis024vm/vm_errno.h
@@ -0,0 +1,45 @@
+#ifndef LIBREGEXIS024_VM_ERRNO_H
+#define LIBREGEXIS024_VM_ERRNO_H
+
+#include // NOTE(review): the include target is missing in this dump; restore it from the upstream file
+
+namespace regex024_error_codes {
+    enum regex024_error_code_I: int { // 0 means no error; every failure is a distinct negative value
+        stable = 0,
+        ca_tree_limit_violation = -1,
+        sa_length_limit_violation = -2,
+        read_sslot_count_limit_violation = -3,
+        fork_sslot_count_limit_violation = -4,
+        timeout = -5,
+        /* Threads should be either abandoned by user of virtual machine after MATCH,
+         * or be stopped by DIE instruction. Out of bound jump is disallowed */
+        improper_finish = -6,
+        /* Operation for general phase is executed in init phase */
+        too_early = -7,
+        /* Operation for init phase is executed in general phase */
+        too_late = -8,
+        /* Used selection array index is out of range */
+        selection_arr_out_of_range = -9,
+        /* Used read slot is out of range */
+        read_sslot_out_of_range = -10,
+        /* Used fork slot is out of range */
+        fork_sslot_out_of_range = -11,
+
+        invalid_opcode = -12,
+        invalid_register_code = -13,
+        /* Next operation scheduled for execution is forbidden in general thread */
+        instruction_not_for_general_thread = -14,
+        /* Next operation scheduled for execution is forbidden in collision thread */
+        instruction_not_for_collision_thread = -15,
+        /* Program willingly threw exception */
+        program_throw = -16,
+        /* Presumably reported when an allocation fails (it is raised outside this file) */
+        bad_alloc = -17,
+    };
+}
+
+typedef regex024_error_codes::regex024_error_code_I regex024_error_code;
+
+const char* regex024_error_code_tostr(regex024_error_code x); // name of the code, or "unknown_error_code"
+
+#endif //LIBREGEXIS024_VM_ERRNO_H
diff --git a/src/libregexis024vm/vm_opcodes.h b/src/libregexis024vm/vm_opcodes.h
new file mode 100644
index 0000000..e76e5af
--- /dev/null
+++ b/src/libregexis024vm/vm_opcodes.h
@@ -0,0 +1,99 @@
+#ifndef LIBREGEXIS024_VM_OPCODES_H
+#define LIBREGEXIS024_VM_OPCODES_H
+
+#include // NOTE(review): the include target is missing in this dump; restore it from the upstream file
+
+namespace regex024_opcodes {
+    enum regex024_opcode_I: uint8_t{ // NOTE(review): the operand lists in the comments below look truncated in this dump
+        /* READ */
+        READ = 0,
+        /* READZ = READ 0 */
+        READZ = 1,
+        /* JUMP */
+        JUMP = 2,
+
+        /* JCEQUAL - jump conditional (equal): JCEQUAL */
+        JCEQUAL_B = 3,
+        JCEQUAL_W = 4,
+        JCEQUAL_DW = 5,
+        JCEQUAL_QW = 6,
+        /* JCLESS - jump conditional (less): JCLESS */
+        JCLESS_B = 7,
+        JCLESS_W = 8,
+        JCLESS_DW = 9,
+        JCLESS_QW = 10,
+        /* JCGRTR - jump conditional (greater): JCGRTR */
+        JCGRTR_B = 11,
+        JCGRTR_W = 12,
+        JCGRTR_DW = 13,
+        JCGRTR_QW = 14,
+
+        /* FORK */
+        FORK = 15,
+        /* MATCH | */
+        MATCH = 16,
+        /* DIE | */
+        DIE = 17,
+        /* PARAM_READ_SS_NUMBER */
+        PARAM_READ_SS_NUMBER = 18,
+        /* PARAM_FORK_SS_NUMBER */
+        PARAM_FORK_SS_NUMBER = 19,
+        /* PARAM_SELARR_LEN */
+        PARAM_SELARR_LEN = 20,
+        /* PARAM_COLSIFTFUNC_SET */
+        PARAM_COLSIFTFUNC_SET = 21,
+        /* PARAM_COLSIFTFUNC_WIPE */
+        PARAM_COLSIFTFUNC_WIPE = 22,
+        /* MSG_MULTISTART_ALLOWED <1B> */
+        MSG_MULTISTART_ALLOWED = 23,
+        /* MSG_FED_INPUT_EXTENDED <1B> <1B> */
+        MSG_FED_INPUT_EXTENDED = 24,
+        /* DMOVRABXSELARR */
+        DMOV_RABX_SELARR = 25,
+        /* DDISTRABXSELARR */
+        DDIST_RABX_SELARR = 26,
+        /* SIFTPRIOR_MIN_RABX */
+        SIFTPRIOR_MIN_RABX = 27,
+        /* SIFTPRIOR_MAX_RABX */
+        SIFTPRIOR_MAX_RABX = 28,
+        /* SIFT_DONE */
+        SIFT_DONE = 29,
+        /* MOV_COLARR_IMM <8B> */
+        MOV_COLARR_IMM = 30,
+        /* MOV_COLARR_BTPOS */
+        MOV_COLARR_BTPOS = 31,
+        /* MOV_SELARR_IMM <8B> */
+        MOV_SELARR_IMM = 32,
+        /* MOV_SELARR_CHPOS */
+        MOV_SELARR_CHPOS = 33,
+        /* INIT */
+        INIT = 34,
+        /* THROW */
+        THROW = 35,
+        regex024_opcode_greaterMax = 36 // one past the last opcode listed above (THROW = 35)
+    };
+}
+
+typedef regex024_opcodes::regex024_opcode_I regex024_opcode;
+
+const char* regex024_opcode_tostr(regex024_opcode x);
+
+
+constexpr uint64_t REGEX024_BYTECODE_INSTRUCTION_SZ = 1; // sizes in bytes; they match what the vmprog_extract_* functions consume
+constexpr uint64_t REGEX024_BYTECODE_SSLOT_ID_SZ = 4;
+constexpr uint64_t REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ = 2;
+constexpr uint64_t REGEX024_BYTECODE_NEAR_POINTER_SZ = 8;
+
+bool vmprog_check_inboundness(regex_near_ptr_t prgSize, regex_near_ptr_t IP, regex_near_ptr_t region); // true when [IP, IP + region) lies inside the program
+
+uint8_t vmprog_extract_b(regex_near_ptr_t* IPptr, const uint8_t* prg); // the extractors read at prg[*IPptr] and advance *IPptr
+uint16_t vmprog_extract_w(regex_near_ptr_t* IPptr, const uint8_t* prg);
+uint32_t vmprog_extract_dw(regex_near_ptr_t* IPptr, const uint8_t* prg);
+uint64_t vmprog_extract_qw(regex_near_ptr_t* IPptr, const uint8_t* prg);
+
+uint8_t vmprog_extract_instruction(regex_near_ptr_t* IPptr, const uint8_t* prg);
+regex_sslot_id_t vmprog_extract_sslot_id(regex_near_ptr_t* IPptr, const uint8_t* prg);
+regex_near_ptr_t vmprog_extract_near_pointer(regex_near_ptr_t* IPptr, const uint8_t* prg);
+regex_tai_t vmprog_extrack_track_array_index(regex_near_ptr_t* IPptr, const uint8_t* prg); // sic: "extrack" is the exported spelling, and the .cpp defines the same name
+
+#endif //LIBREGEXIS024_VM_OPCODES_H
diff --git a/src/libregexis024vm/vm_opcodes_disassembly.cpp b/src/libregexis024vm/vm_opcodes_disassembly.cpp
new file mode 100644
index 0000000..b0d9ffa
--- /dev/null
+++ b/src/libregexis024vm/vm_opcodes_disassembly.cpp
@@ -0,0 +1,55 @@
+#include // NOTE(review): the include target is missing in this dump; restore it from the upstream file
+
+/* __ORDER_LITTLE_ENDIAN__ alone is defined on every GCC/Clang target; the real test compares it with __BYTE_ORDER__ */
+#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__)
+#error "Big endian is currently unsupported"
+#endif
+
+/* True when `region` bytes starting at IP lie inside a program of prgSz bytes.
+ * Written with a subtraction because IP + region can wrap around for huge operands */
+bool vmprog_check_inboundness(regex_near_ptr_t prgSz, regex_near_ptr_t IP, regex_near_ptr_t region) {
+    return IP <= prgSz && region <= prgSz - IP;
+}
+
+/* The extractors below read at prg[*IPptr] and advance *IPptr past what they read.
+ * They do not check bounds themselves (vmprog_check_inboundness exists for that) */
+uint8_t vmprog_extract_b(regex_near_ptr_t *IPptr, const uint8_t *prg) {
+    return prg[(*IPptr)++];
+}
+
+/* Multi-byte operands are assembled byte by byte, least significant byte first. Reading them through a
+ * wider pointer cast would be an unaligned access and would break the strict aliasing rule */
+uint16_t vmprog_extract_w(regex_near_ptr_t *IPptr, const uint8_t *prg) {
+    const uint8_t *p = prg + *IPptr;
+    *IPptr += 2;
+    return static_cast<uint16_t>(p[0] | (p[1] << 8));
+}
+
+uint32_t vmprog_extract_dw(regex_near_ptr_t *IPptr, const uint8_t *prg) {
+    const uint8_t *p = prg + *IPptr;
+    *IPptr += 4;
+    return static_cast<uint32_t>(p[0]) | (static_cast<uint32_t>(p[1]) << 8)
+        | (static_cast<uint32_t>(p[2]) << 16) | (static_cast<uint32_t>(p[3]) << 24);
+}
+
+uint64_t vmprog_extract_qw(regex_near_ptr_t *IPptr, const uint8_t *prg) {
+    const uint64_t lo = vmprog_extract_dw(IPptr, prg); /* low half comes first in the byte stream */
+    const uint64_t hi = vmprog_extract_dw(IPptr, prg);
+    return lo | (hi << 32);
+}
+
+uint8_t vmprog_extract_instruction(regex_near_ptr_t *IPptr, const uint8_t *prg) {
+    return vmprog_extract_b(IPptr, prg);
+}
+
+regex_sslot_id_t vmprog_extract_sslot_id(regex_near_ptr_t *IPptr, const uint8_t *prg) {
+    return vmprog_extract_dw(IPptr, prg);
+}
+
+regex_near_ptr_t vmprog_extract_near_pointer(regex_near_ptr_t *IPptr, const uint8_t *prg) {
+    return vmprog_extract_qw(IPptr, prg);
+}
+
+regex_tai_t vmprog_extrack_track_array_index(regex_near_ptr_t *IPptr, const uint8_t *prg) {
+    return vmprog_extract_w(IPptr, prg);
+}
diff --git a/src/libregexis024vm/vm_opcodes_types.h b/src/libregexis024vm/vm_opcodes_types.h
new file mode 100644
index 0000000..0707402
--- /dev/null
+++ b/src/libregexis024vm/vm_opcodes_types.h
@@ -0,0 +1,11 @@
+#ifndef VM_OPCODES_TYPES_H
+#define VM_OPCODES_TYPES_H
+
+#include // NOTE(review): the include target is missing in this dump; restore it from the upstream file
+
+typedef uint32_t regex_sslot_id_t; // 4 bytes in bytecode (REGEX024_BYTECODE_SSLOT_ID_SZ)
+typedef uint64_t regex_near_ptr_t; // 8 bytes in bytecode (REGEX024_BYTECODE_NEAR_POINTER_SZ)
+typedef uint16_t regex_tai_t; // 2 bytes in bytecode (REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ)
+
+
+#endif //VM_OPCODES_TYPES_H