// SPDX-License-Identifier: MPL-2.0 // (c) Hare authors use ascii; use bufio; use encoding::utf8; use io; use memio; use strconv; use strings; use types; // An error string describing a compilation error. export type error = !str; export type inst_lit = rune; export type inst_charset = struct { idx: size, is_positive: bool }; export type inst_any = void; export type inst_split = size; export type inst_jump = size; export type inst_skip = void; export type inst_match = bool; export type inst_groupstart = size; export type inst_groupend = void; export type inst_repeat = struct { id: size, origin: size, min: (void | size), max: (void | size), }; export type inst = (inst_lit | inst_any | inst_split | inst_jump | inst_skip | inst_match | inst_charset | inst_groupstart | inst_groupend | inst_repeat); // The resulting match of a [[regex]] applied to a string. // // The first [[capture]] corresponds to the implicit zeroth capture group, // i.e. the whole expression. // // The rest of the [[capture]]s correspond to the rest of the capture groups, // i.e. the sub-expressions. export type result = []capture; // A (sub)match corresponding to a regular expression's capture group. export type capture = struct { content: str, start: size, start_bytesize: size, end: size, end_bytesize: size }; type thread = struct { pc: size, start_idx: size, start_bytesize: size, root_capture: capture, captures: []capture, rep_counters: []size, matched: bool, failed: bool, }; type newmatch = void; export type charset = [](charset_lit_item | charset_range_item | charset_class_item); export type charset_lit_item = rune; export type charset_range_item = (u32, u32); export type charset_class_item = (str, *fn(c: rune) bool); const charclass_map: [](str, *fn(c: rune) bool) = [ (":alnum:]", &ascii::isalnum), (":alpha:]", &ascii::isalpha), (":blank:]", &ascii::isblank), (":cntrl:]", &ascii::iscntrl), (":digit:]", &ascii::isdigit), (":graph:]", &ascii::isgraph), (":lower:]", &ascii::islower), (":print:]", &ascii::isprint), (":punct:]", &ascii::ispunct), (":space:]", &ascii::isspace), (":upper:]", &ascii::isupper), (":xdigit:]", &ascii::isxdigit), ]; export type regex = struct { insts: []inst, charsets: []charset, n_reps: size, }; // Frees resources associated with a [[regex]]. export fn finish(re: *regex) void = { free(re.insts); for (let charset .. re.charsets) { free(charset); }; free(re.charsets); }; fn find_last_groupstart(insts: []inst) (size | error) = { let nested = 0u; for (let i = len(insts); i > 0; i -= 1) { match (insts[i - 1]) { case inst_groupstart => if (nested == 0) { return i - 1; }; nested -= 1; case inst_groupend => nested += 1; case => void; }; }; return `Unmatched ')'`: error; }; // increments all [[inst_jump]] and [[inst_split]] instructions to account for a // newly inserted instruction before the given slice fn shift(sl: []inst) void = { for (let inst &.. sl) { match (*inst) { case let z: inst_jump => *inst = (z + 1): inst_jump; case let z: inst_split => *inst = (z + 1): inst_split; case => void; }; }; }; fn handle_bracket( insts: *[]inst, r: rune, r_idx: *size, bracket_idx: *int, iter: *strings::iterator, charsets: *[]charset, skip_charclass_rest: *bool, is_charset_positive: *bool, in_bracket: *bool ) (void | error | nomem) = { const peek1 = strings::next(iter); const peek2 = strings::next(iter); const peek3 = strings::next(iter); if (!(peek1 is done)) { strings::prev(iter); }; if (!(peek2 is done)) { strings::prev(iter); }; if (!(peek3 is done)) { strings::prev(iter); }; if (*bracket_idx == -1) { append(charsets, [])?; }; *bracket_idx += 1; if (*skip_charclass_rest) { if (r == ']') { *skip_charclass_rest = false; }; *r_idx += 1; return; }; const is_range = peek1 is rune && peek1 as rune == '-' && !(peek2 is done) && !(peek3 is done) && !(peek2 as rune == ']'); const range_end = peek2; const is_first_char = *bracket_idx == 0 || *bracket_idx == 1 && !*is_charset_positive; if (r == '\\') { if (peek1 is done) { return `Trailing backslash '\'`: error; } else { append(charsets[len(charsets) - 1], peek1: charset_lit_item)?; strings::next(iter); *r_idx += 1; }; } else if (r == ']' && !is_first_char) { const newinst = inst_charset { idx = len(charsets) - 1, is_positive = *is_charset_positive, }; append(insts, newinst)?; *in_bracket = false; *bracket_idx = -1; *is_charset_positive = true; } else if (r == '^' && *bracket_idx == 0) { *is_charset_positive = false; } else if (r == '[' && !(peek1 is done) && peek1 as rune == ':') { const rest = strings::iterstr(iter); const n_cc = len(charclass_map); for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) { if (strings::hasprefix(rest, charclass_map[cc_idx].0)) { append(charsets[len(charsets) - 1], charclass_map[cc_idx])?; *skip_charclass_rest = true; break; }; }; if (!*skip_charclass_rest) { return `No character class after '[:'`: error; }; } else if (is_range) { const start_b = r: u32; const end_b = range_end as rune: u32; if (end_b < start_b) { return `Descending bracket expression range '[z-a]'`: error; }; append(charsets[len(charsets) - 1], (start_b, end_b): charset_range_item)?; strings::next(iter); strings::next(iter); *r_idx += 2; } else { append(charsets[len(charsets) - 1], r: charset_lit_item)?; }; *r_idx += 1; }; // Compiles a regular expression string into a [[regex]]. export fn compile(expr: str) (regex | error | nomem) = { let ok = false; let insts: []inst = []; defer if (!ok) free(insts); let charsets: []charset = []; defer if (!ok) { for (let c .. charsets) free(c); free(charsets); }; let iter = strings::iter(expr); let r_idx = 0z; let jump_idxs: [][]size = []; append(jump_idxs, [])?; defer { for (let j .. jump_idxs) { free(j); }; free(jump_idxs); }; let in_bracket = false; let skip_charclass_rest = false; let bracket_idx = -1; let is_charset_positive = true; let was_prev_rune_pipe = false; let n_reps = 0z; let group_level = 0z; let capture_idx = 0z; for (true) { const next = strings::next(&iter); if (r_idx == 0 && next is rune && next: rune != '^') { append(insts, inst_skip)?; }; if (in_bracket) { if (next is done) { return `Unmatched '['`: error; }; const r = next: rune; handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter, &charsets, &skip_charclass_rest, &is_charset_positive, &in_bracket)?; continue; }; const r = match (next) { case done => if (group_level > 0) { return `Unmatched '('`: error; }; break; case let r: rune => yield r; }; switch (r) { case '\\' => const peek1 = strings::next(&iter); if (peek1 is done) { return `Trailing backslash '\'`: error; } else { append(insts, (peek1 as rune): inst_lit)?; r_idx += 1; }; case '^' => if (group_level > 0) { return `Anchor '^' in capture groups is unsupported`: error; }; if (!(r_idx == 0 || was_prev_rune_pipe)) { return `Anchor '^' not at start of whole pattern or alternation`: error; }; case '$' => if (group_level > 0) { return `Anchor '$' in capture groups is unsupported`: error; }; const peek1 = strings::next(&iter); if (peek1 is rune) { if (peek1 as rune != '|') { return `Anchor '$' not at end of whole pattern or alternation`: error; }; strings::prev(&iter); }; append(insts, true: inst_match)?; case '[' => in_bracket = true; case ']' => append(insts, r: inst_lit)?; case '(' => append(insts, capture_idx: inst_groupstart)?; group_level += 1; capture_idx += 1; for (len(jump_idxs) < group_level + 1) { append(jump_idxs, [])?; }; case ')' => if (group_level == 0) { return `Unmatched ')'`: error; }; append(insts, inst_groupend)?; for (let jump_idx .. jump_idxs[group_level]) { assert(insts[jump_idx] is inst_jump); insts[jump_idx] = (len(insts) - 1): inst_jump; }; delete(jump_idxs[group_level][..]); group_level -= 1; case '|' => append(insts, types::SIZE_MAX: inst_jump)?; const origin = match (find_last_groupstart(insts)) { case error => yield 0z; case let sz: size => yield sz + 1; }; const newinst = (len(insts) + 1): inst_split; // add split after last jump (if any) or at origin const split_idx = if (len(jump_idxs[group_level]) > 0) jump_idxs[group_level][len(jump_idxs[group_level]) - 1] + 1 else origin; insert(insts[split_idx], newinst)?; shift(insts[split_idx + 1..]); // our insertion of our split_idx should never interfere // with an existing jump_idx for (let jump_idx .. jump_idxs[group_level]) { // if this assertion ends up being hit in the // future, it is a sign that jump_idx should be // incremented assert(jump_idx < split_idx, `Found jump_idx interference. Please report this as a bug`); }; append(jump_idxs[group_level], len(insts) - 1)?; // add skip if it's a whole-expression alternation if (origin == 0) { const peek1 = strings::next(&iter); if (peek1 is rune) { if (peek1 as rune != '^') { append(insts, inst_skip)?; }; strings::prev(&iter); }; }; case '{' => let origin = len(insts) - 1; if (insts[origin] is inst_groupend) { origin = find_last_groupstart(insts[..origin])?; }; const rest = strings::iterstr(&iter); const rep_parts = parse_repetition(rest)?; const can_skip = rep_parts.0 == 0; const min = if (rep_parts.0 == 0) { yield 1z; } else { yield rep_parts.0; }; if (can_skip) { // len(insts) - 1 is the current last instruction // len(insts) is the next instruction // advance to len(insts) + 1 to make space for the `inst_split` // advance to len(insts) + 2 to make space for the `inst_repeat` insert(insts[origin], len(insts) + 2: inst_split)?; shift(insts[origin + 1..]); origin += 1; }; const newinst = inst_repeat { id = n_reps, origin = origin, min = min, max = rep_parts.1, }; for (let i = 0z; i <= rep_parts.2; i += 1) { strings::next(&iter); r_idx += 1; }; append(insts, newinst)?; n_reps += 1; case '?' => if (r_idx == 0 || len(insts) == 0) { return `Unused '?'`: error; }; let term_start_idx = len(insts) - 1; match (insts[term_start_idx]) { case (inst_lit | inst_charset | inst_any) => void; case inst_groupend => term_start_idx = find_last_groupstart( insts[..term_start_idx])?; case inst_groupstart => return `Unused '?'`: error; case => return `Misused '?'`: error; }; const after_idx = len(insts) + 1; insert(insts[term_start_idx], after_idx: inst_split)?; shift(insts[term_start_idx + 1..]); case '*' => if (r_idx == 0 || len(insts) == 0) { return `Unused '*'`: error; }; const new_inst_offset = 1z; const jump_idx = len(insts) + new_inst_offset; const after_idx = jump_idx + 1z; let term_start_idx = len(insts) - 1z; match (insts[term_start_idx]) { case (inst_lit | inst_charset | inst_any) => void; case inst_groupend => term_start_idx = find_last_groupstart( insts[..term_start_idx])?; case inst_groupstart => return `Unused '*'`: error; case => return `Misused '*'`: error; }; const split_idx = term_start_idx; term_start_idx += new_inst_offset; insert(insts[split_idx], after_idx: inst_split)?; shift(insts[split_idx + 1..]); append(insts, split_idx: inst_jump)?; case '+' => if (r_idx == 0 || len(insts) == 0) { return `Unused '+'`: error; }; let term_start_idx = len(insts) - 1; match (insts[term_start_idx]) { case (inst_lit | inst_charset | inst_any) => void; case inst_groupend => term_start_idx = find_last_groupstart( insts[..term_start_idx])?; case inst_groupstart => return `Unused '+'`: error; case => return `Misused '+'`: error; }; append(insts, term_start_idx: inst_split)?; case '.' => append(insts, inst_any)?; case => append(insts, r: inst_lit)?; }; was_prev_rune_pipe = (r == '|'); r_idx += 1; }; // handle whole expression alternation for (let jump_idx .. jump_idxs[0]) { assert(insts[jump_idx] is inst_jump); insts[jump_idx] = len(insts): inst_jump; }; if (len(insts) == 0 || !(insts[len(insts) - 1] is inst_match)) { append(insts, false: inst_match)?; }; ok = true; return regex { insts = insts, charsets = charsets, n_reps = n_reps, }; }; fn parse_repetition( s: str ) (((void | size), (void | size), size) | error) = { const first_comma = strings::index(s, ","); const first_endbrace = strings::index(s, "}"); if (first_endbrace is void) { return `Repetition expression syntax error '{n}'`: error; }; const first_endbrace = first_endbrace as size; let min_str = ""; let max_str = ""; let is_single_arg = false; if (first_comma is void || first_endbrace < first_comma as size) { const cut = strings::cut(s, "}"); min_str = cut.0; max_str = cut.0; is_single_arg = true; } else { const cut = strings::cut(s, ","); min_str = cut.0; max_str = strings::cut(cut.1, "}").0; }; let min: (void | size) = void; let max: (void | size) = void; if (len(min_str) > 0) { min = match (strconv::stoi(min_str)) { case let res: int => yield if (res < 0) { return `Negative repetition count '{-n}'`: error; } else { yield res: size; }; case => return `Repetition expression syntax error '{n}'`: error; }; } else { min = 0; }; if (len(max_str) > 0) { max = match (strconv::stoi(max_str)) { case let res: int => yield if (res < 0) { return `Negative repetition count '{-n}'`: error; } else { yield res: size; }; case => return `Repetition expression syntax error '{n}'`: error; }; }; const rep_len = if (is_single_arg) { yield len(min_str); } else { yield len(min_str) + 1 + len(max_str); }; return (min, max, rep_len); }; fn delete_thread(i: size, threads: *[]thread) void = { free(threads[i].captures); free(threads[i].rep_counters); delete(threads[i]); }; fn is_consuming_inst(a: inst) bool = { return a is (inst_lit | inst_any | inst_charset); }; fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) (void | nomem) = { // Do not add this thread if there is already another thread with // the same PC for (let thread &.. *threads) { if (thread.pc == new_pc && !thread.matched && thread.start_idx < threads[parent_idx].start_idx) { return; }; }; let ok = false; let captures = alloc(threads[parent_idx].captures...)?; defer if (!ok) free(captures); let rep_counters = alloc(threads[parent_idx].rep_counters...)?; defer if (!ok) free(rep_counters); append(threads, thread { pc = new_pc, start_idx = threads[parent_idx].start_idx, start_bytesize = threads[parent_idx].start_bytesize, matched = threads[parent_idx].matched, failed = threads[parent_idx].failed, captures = captures, rep_counters = rep_counters, ... })?; ok = true; }; fn run_thread( i: size, re: *regex, string: str, threads: *[]thread, r_or_end: (rune | io::EOF), str_idx: size, str_bytesize: size ) (void | newmatch | nomem) = { const str_bytes = strings::toutf8(string); if (threads[i].matched) { return; }; for (!is_consuming_inst(re.insts[threads[i].pc])) { match (re.insts[threads[i].pc]) { case inst_lit => abort(); case inst_any => abort(); case inst_split => const new_pc = re.insts[threads[i].pc]: inst_split: size; add_thread(threads, i, new_pc)?; threads[i].pc += 1; case inst_jump => threads[i].pc = re.insts[threads[i].pc]: inst_jump: size; case inst_skip => const new_pc = threads[i].pc + 1; threads[i].start_idx = str_idx; threads[i].start_bytesize = str_bytesize; add_thread(threads, i, new_pc)?; break; case let anchored: inst_match => // Do not match if we need an end-anchored match, but we // have not exhausted our string if (anchored && !(r_or_end is io::EOF)) { threads[i].failed = true; return; }; const content = strings::fromutf8_unsafe(str_bytes[ threads[i].start_bytesize..str_bytesize]); threads[i].root_capture = capture { start = threads[i].start_idx, start_bytesize = threads[i].start_bytesize, end = str_idx, end_bytesize = str_bytesize, content = content, }; threads[i].matched = true; return newmatch; case let idx: inst_groupstart => if (idx >= len(threads[i].captures)) { append(threads[i].captures, [capture { ... }...], idx - len(threads[i].captures) + 1)?; }; assert(threads[i].captures[idx].end != types::SIZE_MAX); threads[i].captures[idx] = capture { content = "", start = str_idx, start_bytesize = str_bytesize, // end=types::SIZE_MAX indicates that the // capture group hasn't ended yet end = types::SIZE_MAX, end_bytesize = types::SIZE_MAX, }; threads[i].pc += 1; case inst_groupend => let curr_capture = len(threads[i].captures); for (curr_capture > 0; curr_capture -= 1) { // find inner-most unclosed capture group if (threads[i].captures[curr_capture - 1].end == types::SIZE_MAX) { break; }; }; assert(curr_capture > 0, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`); let capture = &threads[i].captures[curr_capture - 1]; capture.end = str_idx; capture.end_bytesize = str_bytesize; capture.content = strings::fromutf8_unsafe(str_bytes[ capture.start_bytesize..capture.end_bytesize]); threads[i].pc += 1; case let ir: inst_repeat => assert(ir.id < len(threads[i].rep_counters)); threads[i].rep_counters[ir.id] += 1; if (ir.max is size && threads[i].rep_counters[ir.id] > ir.max as size) { threads[i].failed = true; return; }; const new_pc = threads[i].pc + 1; threads[i].pc = ir.origin; if (ir.min is void || threads[i].rep_counters[ir.id] >= ir.min as size) { add_thread(threads, i, new_pc)?; }; }; }; // From now on, we're only matching consuming instructions, and these // can't do anything without another rune. if (r_or_end is io::EOF) { threads[i].failed = true; return; }; const r = r_or_end as rune; match (re.insts[threads[i].pc]) { case inst_skip => return; case let lit: inst_lit => if (r != lit) { threads[i].failed = true; }; case inst_any => void; case let cs: inst_charset => const charset = re.charsets[cs.idx]; // Disprove the match if we're looking for a negative match // Prove the match if we're looking for a positive match let matched = !cs.is_positive; for (let i = 0z; i < len(charset); i += 1) match (charset[i]) { case let lit: charset_lit_item => if (r == lit) { // Succeeded if positive match // Failed if negative match matched = cs.is_positive; break; }; case let range: charset_range_item => const r_b = r: u32; if (r_b >= range.0 && r_b <= range.1) { // Succeeded if positive match // Failed if negative match matched = cs.is_positive; break; }; case let class_item: charset_class_item => const classfn = class_item.1; if (classfn(r)) { // Succeeded if positive match // Failed if negative match matched = cs.is_positive; break; }; }; if (!matched) { threads[i].failed = true; }; case => abort(); // unreachable }; threads[i].pc += 1; }; // Attempts to match a regular expression against a string and returns the // either the longest leftmost match or all matches. fn search( re: *regex, string: str, handle: io::handle, need_captures: bool ) (void | []capture | nomem) = { let threads: []thread = alloc([ thread { captures = [], ... } ])?; defer { for (let i = 0z; i < len(threads); i += 1) { free(threads[i].captures); free(threads[i].rep_counters); }; free(threads); }; if (re.n_reps > 0) { threads[0].rep_counters = alloc([0...], re.n_reps)?; }; let str_idx = 0z; let first_match_idx: (void | size) = void; let str_bytesize = 0z; let last_bytesize = 0z; const scan = bufio::newscanner(handle); defer bufio::finish(&scan); for (true) { str_bytesize += last_bytesize; if (len(threads) == 0) { return void; }; let all_matched = true; for (let i = 0z; i < len(threads); i += 1) { if (!threads[i].matched) { all_matched = false; break; }; }; if (all_matched) { let best_len = 0z; let best_n_captures = 0z; let best_idx = 0z; for (let i = 0z; i < len(threads); i += 1) { let match_len = threads[i].root_capture.end - threads[i].root_capture.start; const is_better = match_len > best_len || match_len == best_len && len(threads[i].captures) > best_n_captures; if (is_better) { best_len = match_len; best_idx = i; best_n_captures = len(threads[i].captures); }; }; // length = number of captures (index of final group + // 1) + root capture let length = 1z; for (let i = len(re.insts); i > 0; i -= 1) { match (re.insts[i - 1]) { case let z: inst_groupstart => length = z + 2; break; case => void; }; }; let res: result = alloc([], length)?; static append(res, threads[best_idx].root_capture)!; static append(res, threads[best_idx].captures...)!; if (length != len(res)) { static append(res, [capture { ... }...], length - len(res))!; }; return res; }; const r_or_end = match (bufio::scan_rune(&scan)) { case let r_or_end: (rune | io::EOF) => yield r_or_end; case nomem => return nomem; case (io::error | utf8::invalid) => abort(); }; if (r_or_end is rune) { last_bytesize = utf8::runesz(r_or_end as rune); }; for (let i = 0z; i < len(threads); i += 1) { const res = run_thread(i, re, string, &threads, r_or_end, str_idx, str_bytesize); const matchlen = threads[i].root_capture.end - threads[i].root_capture.start; if (res is newmatch && matchlen > 0 && !need_captures) { return []; }; const is_better = res is newmatch && matchlen > 0 && (first_match_idx is void || threads[i].start_idx < first_match_idx as size); if (is_better) { first_match_idx = threads[i].start_idx; }; }; str_idx += 1; // When we only want the leftmost match, delete all threads that // start after the earliest non-zero-length matched thread if (first_match_idx is size) { for (let thread &.. threads) { if (thread.start_idx > first_match_idx as size) { thread.failed = true; }; }; }; // Delete threads that have a PC that has already been // encountered in previous threads. Prioritise threads that // have an earlier start_idx, and threads that were added // earlier. for (let i = 0i64; i < len(threads): i64 - 1; i += 1) { for (let j = i + 1; j < len(threads): i64; j += 1) { const same_pc = threads[i].pc == threads[j].pc; const none_matched = !threads[j].matched && !threads[i].matched; if (same_pc && none_matched) { if (threads[i].start_idx <= threads[j].start_idx) { delete_thread(j: size, &threads); j -= 1; } else { delete_thread(i: size, &threads); i -= 1; break; }; }; }; }; for (let i = 0z; i < len(threads); i += 1) { if (threads[i].failed) { delete_thread(i, &threads); i -= 1; }; }; }; }; // Returns whether or not a [[regex]] matches any part of a given string. export fn test(re: *regex, string: str) (bool | nomem) = { let strm = memio::fixed(strings::toutf8(string)); return search(re, string, &strm, false)? is []capture; }; // Attempts to match a [[regex]] against a string and returns the longest // leftmost match as a [[result]]. The caller must free the return value with // [[result_free]]. export fn find(re: *regex, string: str) (result | nomem) = { let strm = memio::fixed(strings::toutf8(string)); match (search(re, string, &strm, true)?) { case let m: []capture => return m; case void => return []; }; }; // Attempts to match a [[regex]] against a string and returns all // non-overlapping matches as a slice of [[result]]s. The caller must free the // return value with [[result_freeall]]. export fn findall(re: *regex, string: str) ([]result | nomem) = { let ok = false; let res: []result = []; defer if (!ok) result_freeall(res); let str_idx = 0z, str_bytesize = 0z; let strm = memio::fixed(strings::toutf8(string)); const str_bytes = strings::toutf8(string); for (true) { let substring = strings::fromutf8_unsafe( str_bytes[str_bytesize..]); match (search(re, substring, &strm, true)?) { case let m: []capture => append(res, m)?; m[0].start += str_idx; m[0].end += str_idx; m[0].start_bytesize += str_bytesize; m[0].end_bytesize += str_bytesize; str_idx = m[0].end; str_bytesize = m[0].end_bytesize; if (m[0].start_bytesize == len(str_bytes)) { // end-of-string reached break; }; if (m[0].start_bytesize == m[0].end_bytesize) { // zero-length match // forward rune and byte indices str_idx += 1; str_bytesize += utf8::utf8sz( str_bytes[str_bytesize])!; }; io::seek(&strm, str_bytesize: io::off, io::whence::SET)!; case void => break; }; }; ok = true; return res; }; // Replaces all non-overlapping matches of a regular expression against a string // with 'targetstr'. // // A backslash followed by a single decimal number within 'targetstr' is // replaced by the capture at that index (starting at 1), or an empty string if // no such capture exists. For example, `\1` is replaced with the first capture, // `\2` with the second, etc. `\0` is substituted with the entire substring that // was matched. `\\` is replaced with a literal backslash. The caller must free // the return value. // // An error is only returned if 'targetstr' isn't formatted correctly. export fn replace(re: *regex, string: str, targetstr: str) (str | error | nomem) = { return replacen(re, string, targetstr, types::SIZE_MAX); }; // Replaces up to 'n' non-overlapping matches of a regular expression against a // string with 'targetstr', in the same manner as [[replace]]. The caller must // free the return value. export fn replacen( re: *regex, string: str, targetstr: str, n: size, ) (str | error | nomem) = { const target = parse_replace_target(targetstr)?; defer free(target); // Check if n == 0 after parse_replace_target so errors are propagated if (n == 0) { return strings::dup(string)?; }; const matches = findall(re, string)?; if (len(matches) == 0) { return strings::dup(string)?; }; defer result_freeall(matches); const bytes = strings::toutf8(string); let ok = false; let buf = alloc(bytes[..matches[0][0].start_bytesize]...)?; defer if (!ok) free(buf); const n = if (len(matches) > n) n else len(matches); for (let i = 0z; i < n; i += 1) { for (let j = 0z; j < len(target); j += 1) { match (target[j]) { case let b: []u8 => append(buf, b...)?; case let z: size => if (z >= len(matches[i])) yield; const b = strings::toutf8(matches[i][z].content); append(buf, b...)?; }; }; const start = matches[i][0].end_bytesize; const end = if (i == n - 1) len(bytes) else matches[i + 1][0].start_bytesize; append(buf, bytes[start..end]...)?; }; ok = true; return strings::fromutf8(buf)!; }; fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error | nomem) = { const bytes = strings::toutf8(targetstr); let ok = false; let target: []([]u8 | size) = alloc([], 1)?; defer if (!ok) free(target); let iter = strings::iter(targetstr); let start = 0z, end = 0z; for (true) match (strings::next(&iter)) { case done => if (start != end) { append(target, bytes[start..])?; }; break; case let r: rune => if (r == '\\') { const r = match (strings::next(&iter)) { case done => return "Trailing backslash": error; case let r: rune => yield r; }; if (r == '\\') { append(target, bytes[start..end + 1])?; } else if (ascii::isdigit(r)) { if (start != end) { append(target, bytes[start..end])?; }; append(target, r: u32: size - 0x30)?; } else { return "Backslash must be followed by positive decimal number or a backslash": error; }; end += 2; start = end; } else { end += utf8::runesz(r); }; }; ok = true; return target; }; // Replaces all non-overlapping matches of a regular expression against a string // with 'targetstr'. 'targetstr' is isn't interpreted in any special way; all // backslashes are treated literally. The caller must free the return value. export fn rawreplace(re: *regex, string: str, targetstr: str) (str | nomem) = { return rawreplacen(re, string, targetstr, types::SIZE_MAX); }; // Replaces up to 'n' non-overlapping matches of a regular expression against a // string with 'targetstr', in the same manner as [[rawreplace]]. The caller // must free the return value. export fn rawreplacen(re: *regex, string: str, targetstr: str, n: size) (str | nomem) = { if (n == 0) { return strings::dup(string)?; }; const matches = findall(re, string)?; if (len(matches) == 0) { return strings::dup(string)?; }; defer result_freeall(matches); const target = strings::toutf8(targetstr); const bytes = strings::toutf8(string); let buf: []u8 = []; let ok = false; defer if (!ok) free(buf); append(buf, bytes[..matches[0][0].start_bytesize]...)?; const n = if (len(matches) > n) n else len(matches); for (let i = 1z; i < n; i += 1) { append(buf, target...)?; const start = matches[i - 1][0].end_bytesize; const end = matches[i][0].start_bytesize; append(buf, bytes[start..end]...)?; }; append(buf, target...)?; append(buf, bytes[matches[n - 1][0].end_bytesize..]...)?; ok = true; return strings::fromutf8(buf)!; }; // Frees a [[result]]. export fn result_free(s: result) void = { free(s); }; // Frees a slice of [[result]]s. export fn result_freeall(s: []result) void = { for (let res .. s) { result_free(res); }; free(s); }; // Converts an [[error]] into a user-friendly string. export fn strerror(err: error) str = err;