libregexis024/src/libregexis024sol/backslash_expression.cpp

64 lines
2.4 KiB
C++

#include <libregexis024sol/special_terminals.h>
#include <libregexis024sol/sol_misc_base.h>
#include <assert.h>
namespace regexis024 {
/**
 * Reads exactly `sz` hexadecimal digits from the input and folds them into a single value,
 * most significant digit first.
 *
 * Each character is inspected with peep() and only consumed with readChar() after it has been
 * validated, so a rejected character is left in the input.
 *
 * @param ctx parsing context, accessed through peep / readChar / report.
 * @param sz  number of hex digits to read (the callers in this file pass 4 or 8).
 * @return the accumulated value; 0 if a non-hex character was met (an error is reported on ctx
 *         in that case, so the 0 must not be mistaken for a successfully parsed U+0000).
 */
uint32_t read_hex(REGEX_IS024_MeaningContext& ctx, int sz){
    uint32_t res = 0;
    for (int i = 0; i < sz; i++){
        const int32_t ch = peep(ctx);
        uint32_t digit = 0;
        if ('0' <= ch && ch <= '9') {
            digit = static_cast<uint32_t>(ch - '0');
        } else if ('a' <= ch && ch <= 'f') {
            // Only a-f are hex digits; anything past 'f' must fall through to the error branch.
            digit = static_cast<uint32_t>(ch - 'a') + 10;
        } else if ('A' <= ch && ch <= 'F') {
            digit = static_cast<uint32_t>(ch - 'A') + 10;
        } else {
            report(ctx, "escape backslash expression: bad unicode code");
            return 0;
        }
        res = (res << 4) | digit;
        readChar(ctx);
    }
    return res;
}
/**
 * Handles a `\u` / `\U` style escape: skips the current character, reads `sz` hex digits and
 * produces the codeset containing just that one code.
 *
 * @param ctx              parsing context.
 * @param ret_is_multicode out: always set to false.
 * @param ret_set          out: codeset_of_one_char() of the parsed value.
 * @param sz               number of hex digits expected after the leader.
 *
 * If read_hex() reports an error it returns 0, and ret_set is still assigned (built from 0);
 * callers are presumably expected to check ctx.error before using the result.
 */
void unicode_in_bs_case(REGEX_IS024_MeaningContext &ctx, bool &ret_is_multicode, codeset_t &ret_set, int sz){
    // Step over the character the caller only peeked at (the 'u' / 'U' leader).
    readChar(ctx);
    const uint32_t code_value = read_hex(ctx, sz);  // may flag an error on ctx
    ret_is_multicode = false;
    ret_set = codeset_of_one_char(code_value);
}
/**
 * Interprets the character that follows a backslash and yields the codeset it stands for.
 *
 * Recognised leaders:
 *   s          -> the single character U' '
 *   t, n, r    -> tab, line feed, carriage return
 *   e / E      -> cc.spaces / its inversion
 *   w / W      -> cc.word_constituents / its inversion
 *   u / U      -> a code given by 4 / 8 hex digits (see unicode_in_bs_case)
 *   any other non-negative value -> that character itself
 * A negative leader is reported as "backslash in the wrong place".
 *
 * @param ctx              parsing context; if peep() leaves ctx.error set, nothing else is done.
 * @param cc               predefined codesets used for the class escapes.
 * @param ret_is_multicode out: true for class escapes (e, E, w, W), false otherwise.
 * @param ret_set          out: the resulting codeset.
 */
void
backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc,
        bool &ret_is_multicode, codeset_t &ret_set)
{
    const int32_t leader = peep(ctx);
    if (ctx.error)
        return;
// Local helper macro: one `case` that fills both outputs and consumes the leader.
// It is #undef'd right after the switch so the name does not leak into the rest of the file.
#define REGEXIS024_BS_CASE(l, b, E) case l: ret_is_multicode = b; ret_set = E; readChar(ctx); break;
    switch (leader) {
        REGEXIS024_BS_CASE('s', false, codeset_of_one_char(U' '))
        REGEXIS024_BS_CASE('t', false, codeset_of_one_char(U'\t'))
        REGEXIS024_BS_CASE('n', false, codeset_of_one_char(U'\n'))
        REGEXIS024_BS_CASE('r', false, codeset_of_one_char(U'\r'))
        REGEXIS024_BS_CASE('e', true, cc.spaces)
        REGEXIS024_BS_CASE('E', true, invert_set(cc.spaces))
        REGEXIS024_BS_CASE('w', true, cc.word_constituents)
        REGEXIS024_BS_CASE('W', true, invert_set(cc.word_constituents))
        case 'u':
            unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 4);
            break;
        case 'U':
            unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 8);
            break;
        default:
            // NOTE(review): unlike every other branch, this one does not call readChar(), so the
            // escaped character stays in the input. Confirm against the caller whether that is
            // intended before changing it.
            if (leader >= 0){
                ret_is_multicode = false;
                ret_set = codeset_of_one_char(leader);
            } else {
                report(ctx, "backslash in the wrong place");
            }
    }
#undef REGEXIS024_BS_CASE
}
}