diff --git a/benchmark/toStdRegex_output_length-result.txt b/benchmark/toStdRegex_output_length-result.txt index acb10b1..33b39f6 100644 --- a/benchmark/toStdRegex_output_length-result.txt +++ b/benchmark/toStdRegex_output_length-result.txt @@ -1,6 +1,6 @@ failed instances: -- parseError : 45 +- parseError : 49 - cacheOverflow : 89 - veryLargeSyntaTree : 24 - stackOverflow : 12 diff --git a/src/ast.ts b/src/ast.ts index 12bdbb7..f93988b 100644 --- a/src/ast.ts +++ b/src/ast.ts @@ -5,15 +5,15 @@ import { assert, checkedAllCases, isOneOf } from './utils' /** * TODO: docs - * + * * @public */ -export type RepeatBounds = +export type RepeatBounds = | number | { min: number, max?: number } | { min?: number, max: number } -export type RegExpAST = +export type RegExpAST = | { type: "epsilon" } | { type: "literal", charset: CharSet.CharSet } | { type: "concat", left: RegExpAST, right: RegExpAST } @@ -78,7 +78,7 @@ function desugar(ast: RegExpAST): RegExpAST { case 'star': return star(desugar(ast.inner)) case 'start-anchor': return startAnchor(desugar(ast.left), desugar(ast.right)) case 'end-anchor': return endAnchor(desugar(ast.left), desugar(ast.right)) - case 'lookahead': return lookahead(ast.isPositive, desugar(ast.inner), desugar(ast.right)) + case 'lookahead': return lookahead(ast.isPositive, desugar(ast.inner), desugar(ast.right)) // sugar nodes: case 'capture-group': return desugar(ast.inner) case 'plus': { @@ -98,7 +98,7 @@ function desugar(ast: RegExpAST): RegExpAST { } else { const { min = 0, max = Infinity } = ast.bounds assert(0 <= min && min <= max) - return desugarRepeat(inner, min, max) + return desugarRepeat(inner, min, max) } } @@ -110,24 +110,24 @@ function desugarRepeat(ast: RegExpAST, min: number, max: number): RegExpAST { if (max === Infinity) return concat(requiredPrefix, star(ast)) - else + else return concat( requiredPrefix, seq(Array(max - min).fill(union(epsilon, ast))) ) } -function pullUpStartAnchor(ast: RegExpAST): RegExpAST { +function pullUpStartAnchor(ast: RegExpAST, isLeftClosed: boolean): RegExpAST { assert(!isOneOf(ast.type, sugarNodeTypes), `Got ${ast.type} node. Expected desugared AST.`) switch (ast.type) { case "epsilon": return ast case "literal": return ast case "concat": { - // Pull up start anchors on subexpressions first, so if they contain start + // Pull up start anchors on subexpressions first, so if they contain start // anchors then `left` and `right` will have the start anchor at the top. - const left = pullUpStartAnchor(ast.left) - const right = pullUpStartAnchor(ast.right) + const left = pullUpStartAnchor(ast.left, isLeftClosed) + const right = pullUpStartAnchor(ast.right, true) // TODO: maybe more like `hasPrefix || left != epsilon` if (right.type === 'start-anchor') { // Expression has the form `l^r` where `r` contains no start anchor. // `l` may contain one but it does not matter. `l` can at most match epsilon, @@ -149,53 +149,65 @@ function pullUpStartAnchor(ast: RegExpAST): RegExpAST { } } case "union": { - const left = pullUpStartAnchor(ast.left) - const right = pullUpStartAnchor(ast.right) - if (left.type === 'start-anchor' && right.type === 'start-anchor') + const left = pullUpStartAnchor(ast.left, isLeftClosed) + const right = pullUpStartAnchor(ast.right, isLeftClosed) + if (left.type === 'start-anchor' && right.type === 'start-anchor') { // Expression has the form `(^l|^r)`: return startAnchor(undefined, union(left.right, right.right)) // i.e. `^(l|r)` - else if (left.type === 'start-anchor') - // Expression has the form `(^l|r)`: - return startAnchor(undefined, union(left.right, concat(dotStar, right))) // i.e. `^(l|.*r)` - else if (right.type === 'start-anchor') - // Expression has the form `(l|^r)`: - return startAnchor(undefined, union(concat(dotStar, left), right.right)) // i.e. `^(.*l|r)` - else + } else if (left.type === 'start-anchor') { + if (isLeftClosed) { + // Expression has the form `p(^l|r)`: + throw new UnsupportedSyntaxError('union with non-empty prefix where only some members have anchors like a(^b|c)') + } else { + // Expression has the form `(^l|r)`: + return startAnchor(undefined, union(left.right, concat(dotStar, right))) // i.e. `^(l|.*r)` + } + } else if (right.type === 'start-anchor') { + if (isLeftClosed) + // Expression has the form `p(l|^r)`: + throw new UnsupportedSyntaxError('union with non-empty prefix where only some members have anchors like a(b|^c)') + else + // Expression has the form `(l|^r)`: + return startAnchor(undefined, union(concat(dotStar, left), right.right)) // i.e. `^(.*l|r)` + } else { // Expression has the form `(l|r)`: return union(left, right) + } } case "star": { - const inner = pullUpStartAnchor(ast.inner) - if (inner.type === 'start-anchor') + const inner = pullUpStartAnchor(ast.inner, true) // TODO: correct? + if (inner.type === 'start-anchor') { // Expression has the form `(^r)*`. We can expand the star to: // - // (^r)* == ε | (^r) | (^r)(^r)* - // - // It turns out that the case `(^r)(^r)*` can be eliminated. - // If `r` is nullable then the `(^r)` on the left can only match epsilon, so: + // (^r)* == ε | (^r) | (^r)(^r)(^r)* // - // (^r)(^r)* == (^r)* - // - // ==> (^r)* == ε | (^r) | (^r)* == ε | (^r) + // It turns out that the case `(^r)(^r)(^r)*` can be eliminated. + // If `r` is nullable then the leftmost `(^r)` can only match epsilon, so: // - // Otherwise, `r` is not nullable and the expression collapses to the empty set: - // - // (^r)(^r)* == ∅ - // - // ==> (^r)* == ε | (^r) | ∅ == ε | (^r) - // - // Either way, we have: - // - // (^r)* == ε | (^r) == ^(.*|r) == ^.* - // - return startAnchor(undefined, dotStar) - else + // (^r)(^r)(^r)* == (^r)(^r)* + // + // ==> (^r)* == ε | (^r) | (^r)(^r)* + // == ε | (^r) + // + // If `r` is not nullable the expression collapses to the empty set: + // + // (^r)(^r)(^r)* == ∅ + // + // ==> (^r)* == ε | (^r) | ∅ + // == ε | (^r) + // + if (isLeftClosed) + throw new UnsupportedSyntaxError('start anchor inside quantifier with non-empty prefix like (^a)*') + else + return startAnchor(undefined, union(dotStar, inner.right)) // i.e. `^(.*|r)` + } else { // Expression has the form `r*` so no start anchor to deal with: return star(inner) + } } case "start-anchor": { - const left = pullUpEndAnchor(ast.left) - const right = pullUpStartAnchor(ast.right) + const left = pullUpEndAnchor(ast.left, isLeftClosed) + const right = pullUpStartAnchor(ast.right, true) if (!isNullable(left)) { // Expression has the form `l^r` where `l` is not nullable. Thus, the whole @@ -218,8 +230,8 @@ function pullUpStartAnchor(ast: RegExpAST): RegExpAST { } } case "end-anchor": { - const left = pullUpStartAnchor(ast.left) - const right = pullUpStartAnchor(ast.right) + const left = pullUpStartAnchor(ast.left, isLeftClosed) + const right = pullUpStartAnchor(ast.right, true) if (!isNullable(ast.right)) { // Expression has the form `l$r` where `r` is not nullable. Thus, the whole @@ -241,31 +253,31 @@ function pullUpStartAnchor(ast: RegExpAST): RegExpAST { } } case "lookahead": { - const inner = pullUpStartAnchor(ast.inner) - const right = pullUpStartAnchor(ast.right) + const inner = pullUpStartAnchor(ast.inner, true) + const right = pullUpStartAnchor(ast.right, isLeftClosed) if (inner.type === 'start-anchor') { - throw new UnsupportedSyntaxError('start anchors inside lookaheads like (?=^a) are not supported') + throw new UnsupportedSyntaxError('start anchors inside lookaheads like (?=^a)') } else if (right.type === 'start-anchor') { return startAnchor(undefined, lookahead(ast.isPositive, ast.inner, right.right)) } else { - return lookahead(ast.isPositive, ast.inner, right) + return lookahead(ast.isPositive, inner, right) } } } checkedAllCases(ast.type) } -function pullUpEndAnchor(ast: RegExpAST): RegExpAST { +function pullUpEndAnchor(ast: RegExpAST, isRightClosed: boolean): RegExpAST { assert(!isOneOf(ast.type, sugarNodeTypes), `Got ${ast.type} node. Expected desugared AST.`) - + switch (ast.type) { case "epsilon": return ast case "literal": return ast case "concat": { // Pull up end anchors on subexpressions first, so if they contain end // anchors then `left` and `right` will have the end anchor at the top. - const left = pullUpEndAnchor(ast.left) - const right = pullUpEndAnchor(ast.right) + const left = pullUpEndAnchor(ast.left, true) // TODO: rather `isRightClosed || right != epsilon` + const right = pullUpEndAnchor(ast.right, isRightClosed) if (left.type === 'end-anchor') { // Expression has the form `l$r` where `l` contains no end anchor. // `r` may contain one but it does not matter. `r` can at most match epsilon, @@ -287,53 +299,47 @@ function pullUpEndAnchor(ast: RegExpAST): RegExpAST { } } case "union": { - const left = pullUpEndAnchor(ast.left) - const right = pullUpEndAnchor(ast.right) - if (left.type === 'end-anchor' && right.type === 'end-anchor') + const left = pullUpEndAnchor(ast.left, isRightClosed) + const right = pullUpEndAnchor(ast.right, isRightClosed) + if (left.type === 'end-anchor' && right.type === 'end-anchor') { // Expression has the form `(l$|r$)`: return endAnchor(union(left.left, right.left), undefined) // i.e. `(l$|r$)` - else if (left.type === 'end-anchor') - // Expression has the form `(l$|r)`: - return endAnchor(union(left.left, concat(right, dotStar)), undefined) // i.e. `(l|r.*)$` - else if (right.type === 'end-anchor') - // Expression has the form `(l|r$)`: - return endAnchor(union(concat(left, dotStar), right.left), undefined) // i.e. `(l.*|r)$` - else + } else if (left.type === 'end-anchor') { + if (isRightClosed) + // Expression has the form `(l$|r)s`: + throw new UnsupportedSyntaxError('union with non-empty suffix where only some members have anchors like (a$|b)c') + else + // Expression has the form `(l$|r)`: + return endAnchor(union(left.left, concat(right, dotStar)), undefined) // i.e. `(l|r.*)$` + } else if (right.type === 'end-anchor') { + // Expression has the form `(l|r$)s`: + if (isRightClosed) + throw new UnsupportedSyntaxError('union with non-empty suffix where only some members have anchors like (a|b$)c') + else + // Expression has the form `(l|r$)`: + return endAnchor(union(concat(left, dotStar), right.left), undefined) // i.e. `(l.*|r)$` + } else { // Expression has the form `(l|r)`: return union(left, right) + } } case "star": { - const inner = pullUpEndAnchor(ast.inner) - if (inner.type === 'end-anchor') - // Expression has the form `(l$)*`. We can expand the star to: - // - // (l$)* == ε | (l$) | (l$)*(l$) - // - // It turns out that the case `(l$)*(l$)` can be eliminated. - // If `l` is nullable then the `(l$)` on the right can only match epsilon, so: - // - // (l$)*(l$) == (l$)* - // - // ==> (l$)* == ε | (l$) | (l$)* == ε | (l$) - // - // Otherwise, `l` is not nullable and the expression collapses to the empty set: - // - // (l$)*(l$) == ∅ - // - // ==> (l$)* == ε | (l$) | ∅ == ε | (l$) - // - // Either way, we have: - // - // (l$)* == ε | (l$) == (.*|l)$ == .*$ - // - return endAnchor(dotStar, undefined) + const inner = pullUpEndAnchor(ast.inner, true) // TODO: correct? + if (inner.type === 'end-anchor') + if (isRightClosed) + // Expression has the form `(l$)*s`: + // (see explanation for the "star" case in `pullUpStartAnchor`) + throw new UnsupportedSyntaxError('end anchors inside quantifiers with non-empty suffix like (a$)*b') + else + // Expression has the form `(l$)*` + return endAnchor(union(dotStar, inner.left), undefined) // i.e. `(.*|l)$` else // Expression has the form `l*` so no end anchor to deal with: return star(inner) } case "start-anchor": { - const left = pullUpEndAnchor(ast.left) - const right = pullUpEndAnchor(ast.right) + const left = pullUpEndAnchor(ast.left, true) + const right = pullUpEndAnchor(ast.right, isRightClosed) if (!isNullable(left)) { // Expression has the form `l^r` where `r` is not nullable. Thus, the whole @@ -355,8 +361,8 @@ function pullUpEndAnchor(ast: RegExpAST): RegExpAST { } } case "end-anchor": { - const left = pullUpEndAnchor(ast.left) - const right = pullUpStartAnchor(ast.right) + const left = pullUpEndAnchor(ast.left, true) + const right = pullUpStartAnchor(ast.right, isRightClosed) if (!isNullable(right)) { // Expression has the form `l$r` where `r` is not nullable. Thus, the whole @@ -379,14 +385,14 @@ function pullUpEndAnchor(ast: RegExpAST): RegExpAST { } } case "lookahead": { - const inner = pullUpEndAnchor(ast.inner) - const right = pullUpEndAnchor(ast.right) + const inner = pullUpEndAnchor(ast.inner, false) + const right = pullUpEndAnchor(ast.right, isRightClosed) if (inner.type === 'end-anchor') { - throw new UnsupportedSyntaxError('end anchors inside lookaheads like (?=a$) are not supported') + throw new UnsupportedSyntaxError('end anchors inside lookaheads like (?=a$)') } else if (right.type === 'end-anchor') { return endAnchor(lookahead(ast.isPositive, ast.inner, right.left), undefined) } else { - return lookahead(ast.isPositive, ast.inner, right) + return lookahead(ast.isPositive, inner, right) } } } @@ -398,7 +404,7 @@ export function toExtRegex(ast: RegExpAST): RE.ExtRegex { ast = desugar(ast) // Then eliminate start anchors by first pulling them to the top: - ast = pullUpStartAnchor(ast) + ast = pullUpStartAnchor(ast, false) if (ast.type === 'start-anchor') { // If the root node is indeed a start anchor now, then start anchors have been // eliminated from all sub-expressions and we can just drop the root-level one: @@ -410,7 +416,7 @@ export function toExtRegex(ast: RegExpAST): RE.ExtRegex { } // Then eliminate end anchors by first pulling them to the top: - ast = pullUpEndAnchor(ast) + ast = pullUpEndAnchor(ast, false) if (ast.type === 'end-anchor') { // If the root node is indeed an end anchor now, then end anchors have been // eliminated from all sub-expressions and we can just drop the root-level one: @@ -436,7 +442,7 @@ function toExtRegexAux(ast: RegExpAST): RE.ExtRegex { case 'lookahead': { const inner = toExtRegexAux(ast.inner) const right = toExtRegexAux(ast.right) - if (ast.isPositive) + if (ast.isPositive) return RE.intersection(inner, right) else return RE.intersection(RE.complement(inner), right) @@ -529,9 +535,9 @@ function repeatBoundsToString(bounds: RepeatBounds): string { } function captureGroupToString(name: string | undefined, inner: RegExpAST, options: RenderOptions) { - if (name === undefined) + if (name === undefined) return `(${toString(inner, options)})` - else + else return `(?<${name}>${toString(inner, options)})` } @@ -583,36 +589,36 @@ export function toString(ast: RegExpAST, options: RenderOptions): string { return CharSet.toString(ast.charset) case 'concat': return maybeWithParens(ast.left, ast, options) + maybeWithParens(ast.right, ast, options) - case 'union': - return maybeWithParens(ast.left, ast, options) + '|' + maybeWithParens(ast.right, ast, options) + case 'union': + return maybeWithParens(ast.left, ast, options) + '|' + maybeWithParens(ast.right, ast, options) // For postfix operators if we have to check whether `ast.inner` is not effectively epsilon. // In that case we shouldn't append the operator, otherwise can generate invalid expressions. // For example, `aε*` would become `a*`. case 'star': { const innerStr = maybeWithParens(ast.inner, ast, options) - if (innerStr === '') + if (innerStr === '') return '' else return innerStr + '*' } case 'plus': { const innerStr = maybeWithParens(ast.inner, ast, options) - if (innerStr === '') + if (innerStr === '') return '' else return innerStr + '+' } case 'optional': { const innerStr = maybeWithParens(ast.inner, ast, options) - if (innerStr === '') + if (innerStr === '') return '' else return innerStr + '?' } case 'repeat': { const innerStr = maybeWithParens(ast.inner, ast, options) - if (innerStr === '') + if (innerStr === '') return '' else if (typeof ast.bounds === 'number' && ast.bounds <= 3 && innerStr.length === 1) // Just duplicate `innerStr` if that makes rendered expression shorter. @@ -635,7 +641,7 @@ export function toString(ast: RegExpAST, options: RenderOptions): string { } checkedAllCases(ast) } - + // TODO: information is duplicated in parser: function precLevel(nodeType: RegExpAST['type']) { switch (nodeType) { @@ -667,8 +673,8 @@ function precLevel(nodeType: RegExpAST['type']) { */ const needsNoParensOnSamePrecLevel = new Set([ 'concat', - 'positive-lookahead', - 'negative-lookahead', + 'positive-lookahead', + 'negative-lookahead', 'start-anchor', 'end-anchor', 'union', @@ -681,7 +687,7 @@ const needsNoParensOnSamePrecLevel = new Set([ * semantics. */ function maybeWithParens(ast: RegExpAST, parent: RegExpAST, options: RenderOptions): string { - if (precLevel(ast.type) > precLevel(parent.type)) + if (precLevel(ast.type) > precLevel(parent.type)) return toString(ast, options) else if (precLevel(ast.type) === precLevel(parent.type) && needsNoParensOnSamePrecLevel.has(ast.type)) return toString(ast, options) diff --git a/src/parser.ts b/src/parser.ts index 4661a7f..dbcff6a 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -4,6 +4,8 @@ export type ParseResult = { value: T, restInput: string } export class ParseError extends Error { + name = "ParseError" + constructor( message: string, public readonly restInput: string diff --git a/src/regex-parser.ts b/src/regex-parser.ts index 16678db..e2766d9 100644 --- a/src/regex-parser.ts +++ b/src/regex-parser.ts @@ -29,7 +29,9 @@ const unescapedCharInsideBrackets = P.satisfy(char => !Range.mustBeEscapedInside const unescapedCharOutsideBrackets = P.satisfy(char => !Range.mustBeEscapedOutsideBrackets(char)) .map(CharSet.singleton) -export class UnsupportedSyntaxError extends Error {} +export class UnsupportedSyntaxError extends Error { + name = "UnsupportedSyntaxError" +} const escapeSequence = P.string('\\').andThen(_ => P.anyChar).andThen(escapedChar => { switch (escapedChar) { @@ -45,16 +47,16 @@ const escapeSequence = P.string('\\').andThen(_ => P.anyChar).andThen(escapedCha case 'v': return P.pure(CharSet.singleton('\v')) // vertical tab case 'f': return P.pure(CharSet.singleton('\f')) // form feed case '0': return P.pure(CharSet.singleton('\0')) // NUL character - case 'b': throw new UnsupportedSyntaxError('\b word-boundary assertion not supported') - case 'c': throw new UnsupportedSyntaxError('\cX control characters not supported') + case 'b': throw new UnsupportedSyntaxError('\b word-boundary assertion') + case 'c': throw new UnsupportedSyntaxError('\cX control characters') case 'x': return P.count(2, P.hexChar).map(chars => CharSet.fromRange(Range.singleton(parseInt(chars.join(''), 16))) ) case 'u': return P.count(4, P.hexChar).map(chars => CharSet.fromRange(Range.singleton(parseInt(chars.join(''), 16))) ) - case 'p': throw new UnsupportedSyntaxError('\\p not supported') - case 'P': throw new UnsupportedSyntaxError('\\P not supported') + case 'p': throw new UnsupportedSyntaxError('\\p') + case 'P': throw new UnsupportedSyntaxError('\\P') default: return P.pure(CharSet.singleton(escapedChar)) // match character literally } }) @@ -175,7 +177,7 @@ const lookbehind: P.Parser = P.string(')'), regex(), ).map(_ => { - throw new UnsupportedSyntaxError('lookbehind assertions are not supported') + throw new UnsupportedSyntaxError('lookbehind assertions') }) function regexTerm() { diff --git a/src/regex.ts b/src/regex.ts index 44835be..7d4aa90 100644 --- a/src/regex.ts +++ b/src/regex.ts @@ -428,7 +428,9 @@ export function isEmpty(regex: ExtRegex): boolean { return regex.type === 'literal' && CharSet.isEmpty(regex.charset) } -export class CacheOverflowError extends Error {} +export class CacheOverflowError extends Error { + name = "CacheOverflowError" +} export function codePointDerivative(codePoint: number, regex: StdRegex, cache: Table.Table): StdRegex export function codePointDerivative(codePoint: number, regex: ExtRegex, cache: Table.Table): ExtRegex @@ -484,7 +486,7 @@ function codePointDerivativeAux(codePoint: number, regex: ExtRegex, cache: Table // At least errors can be caught and handled. The limit is somewhat arbitrary. // TODO: maybe make this user configurable: if (Table.size(cache) >= 10_000) { - throw new CacheOverflowError('Cache overflow while computing DFA transitions.') + throw new CacheOverflowError('while computing DFA transitions.') } const result = codePointDerivative(codePoint, regex, cache) @@ -668,7 +670,9 @@ function derivativeClassesAux( ///// exclusive standard regex utils ///// ////////////////////////////////////////////// -export class VeryLargeSyntaxTreeError extends Error {} +export class VeryLargeSyntaxTreeError extends Error { + name = "VeryLargeSyntaxTreeError" +} /** * TODO: docs diff --git a/test/arbitrary-ast.ts b/test/arbitrary-ast.ts index 7f76b75..fa25d41 100644 --- a/test/arbitrary-ast.ts +++ b/test/arbitrary-ast.ts @@ -78,7 +78,7 @@ function endAnchor(childArb: () => fc.Arbitrary): fc.Arbitrary { [/^a^b/, RE.empty], // but two ^^ directly in a row are not a contradiction: [/(^^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))], - // in fact, as long as anything between two ^ can match epsilon, + // in fact, as long as anything between two ^ can match epsilon, // there is no contradiction: [/(^(c|)^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))], [/(^c*^a|b)/, prefix(RE.union(RE.singleChar('a'), suffix(RE.singleChar('b'))))], @@ -60,6 +60,9 @@ describe('toExtRegex', () => { // Nullable expressions on the left and right can be ignored: [/(a?)$^(b*)/, RE.epsilon], + // Contradiction inside lookahead collapses to empty set. Then empty set lookahead can't match anything: + [/(?=a^)/, RE.empty], + [/(^a|)^b/, RE.seq([RE.singleChar('b'), dotStar])], [/^a(b^|c)/, RE.seq([RE.string('ac'), dotStar]) ], [/(^|a)b/, prefix(RE.concat(RE.optional(suffix(RE.singleChar('a'))), RE.singleChar('b')))], @@ -89,7 +92,7 @@ describe('toExtRegex', () => { // negative lookahead: [/^a(?!b)c$/, RE.concat(RE.string('a'), RE.intersection(RE.complement(RE.string('b')), RE.string('c')))], // TODO: lookahead + lookbehind - // [/^a(?=b)(?<=a)b$/, RE.string('ab')], + // [/^a(?=b)(?<=a)b$/, RE.string('ab')], // [/^b(?=ab)a(?<=ba)b$/, RE.string('bab')], // [/^a(?=b)(?<=a)(?!a)(? { it('fixme', { todo: true }, () => { const actual = AST.toExtRegex(parseRegExp(/^(a(?!b))*$/)) const expected = RE.star(RE.string('a')) - assert.equal(actual.hash, expected.hash) + assert.equal(actual.hash, expected.hash) }) }) - + }) describe('toString', () => { diff --git a/test/regex-parser.spec.ts b/test/regex-parser.spec.ts index 084b0c5..0a5fd58 100644 --- a/test/regex-parser.spec.ts +++ b/test/regex-parser.spec.ts @@ -1,6 +1,6 @@ import { describe, it, test } from "node:test" import assert from "node:assert" -import { parseRegExp, parseRegExpString } from "../src/regex-parser" +import { parseRegExp, parseRegExpString, UnsupportedSyntaxError } from "../src/regex-parser" import { RB } from "../src/index" import { ParseError } from "../src/parser" import * as AST from "../src/ast" @@ -12,7 +12,7 @@ import * as Arbitrary from './arbitrary-ast' function char(c: string) { return AST.literal(CharSet.singleton(c)) } -function str(s: string) { +function str(s: string) { const chars = [...s].map(char) // Use right-associative concatenation: a(bc) not (ab)c return chars.reduceRight((acc, curr) => AST.concat(curr, acc)) @@ -44,7 +44,7 @@ describe('parseRegExp', () => { [/a{3,5}/, AST.repeat(char('a'), { min: 3, max: 5 })], // if curly bracket is not terminated the whole thing is interpreted literally: [/a{3,5/, str('a{3,5')], - // same if max value is given but min value is missing: + // same if max value is given but min value is missing: [/a{,5}/, str('a{,5}')], // char classes / escaping: [/\w/, AST.literal(CharSet.wordChars)], @@ -72,21 +72,21 @@ describe('parseRegExp', () => { [/^abc$/, AST.startAnchor(undefined, AST.endAnchor(str('abc'), undefined))], [/$a^/, AST.startAnchor(AST.endAnchor(undefined, char('a')), undefined)], // positive lookahead - now parsed as lookahead AST nodes, not intersections - [/(?=a)b/, AST.lookahead(true, char('a'), char('b'))], - [/(?=a)(?:b)/, AST.lookahead(true, char('a'), char('b'))], - [/(?=a)(?=b)c/, AST.lookahead(true, char('a'), AST.lookahead(true, char('b'), char('c')))], - [/a(?=b)c/, AST.concat(char('a'), AST.lookahead(true, char('b'), char('c')))], - [/a(?=b)/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.epsilon))], - [/a(?=b)c(?=d)e/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.concat(char('c'), AST.lookahead(true, char('d'), char('e')))))], - [/(?=)/, AST.lookahead(true, AST.epsilon, AST.epsilon)], + [/(?=a)b/, AST.lookahead(true, char('a'), char('b'))], + [/(?=a)(?:b)/, AST.lookahead(true, char('a'), char('b'))], + [/(?=a)(?=b)c/, AST.lookahead(true, char('a'), AST.lookahead(true, char('b'), char('c')))], + [/a(?=b)c/, AST.concat(char('a'), AST.lookahead(true, char('b'), char('c')))], + [/a(?=b)/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.epsilon))], + [/a(?=b)c(?=d)e/, AST.concat(char('a'), AST.lookahead(true, char('b'), AST.concat(char('c'), AST.lookahead(true, char('d'), char('e')))))], + [/(?=)/, AST.lookahead(true, AST.epsilon, AST.epsilon)], // negative lookahead - [/(?!a)b/, AST.lookahead(false, char('a'), char('b'))], + [/(?!a)b/, AST.lookahead(false, char('a'), char('b'))], [/(?!a)b|c/, AST.union(AST.lookahead(false, char('a'), char('b')), char('c'))], [/(?!)/, AST.lookahead(false, AST.epsilon, AST.epsilon)], // TODO: positive lookbehind - // [/(?<=a)/, AST.positiveLookbehind(char('a'))], + // [/(?<=a)/, AST.positiveLookbehind(char('a'))], // TODO: negative lookbehind - // [/(? { }) +function parse_skipKnownIssues(re: RegExp) { + try { + return RB(re) + } catch (error) { + if (error instanceof UnsupportedSyntaxError) { + fc.pre(false) + } else { + throw error + } + } +} + test('parse/stringify roundtrip preserves equivalence', {todo:true}, () => { fc.assert( fc.property( Arbitrary.regexp(), (inputRegExp: RegExp) => { - const builder = RB(inputRegExp) + const builder = parse_skipKnownIssues(inputRegExp) const outputRegExp = builder.toRegExp() for (const str of builder.enumerate().take(10)) { @@ -148,19 +160,6 @@ test('parse/stringify roundtrip preserves equivalence', {todo:true}, () => { } }, ), - // { numRuns: 1000 }, + { numRuns: 100 }, ) }) - -test('fixme 1', { todo: true }, () => { - const inputRegExp = /(^)+a/ - const builder = RB(inputRegExp) - const outputRegExp = builder.toRegExp() - - // console.debug(outputRegExp) - - for (const str of builder.enumerate().take(10)) { - assert.match(str, outputRegExp) - assert.match(str, inputRegExp) - } -})