From 4cdb58ae7207f2a7399a555a6ac4d3e410e97a1d Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 7 Oct 2025 11:25:18 -0500 Subject: [PATCH 1/5] [perf] Create a flattened representation of `DSLTree` (#831) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change simplifies the pre-compilation representation of the regex to store a pre-order traversal of the syntax tree in an array instead of an indirect enum. This enables an optimization pass that can index into and mutate the tree more easily. This change includes tests that verify that list-based compilation generates the same instructions as the original tree-based compilation, and switches to the list-based compilation. This change also eliminates the "literal wrapper" node that preserved AST – we aren't using the saved AST, so it resulted in unneeded links in the chain. Because parsing still generates an AST which is converted to a DSLTree before the new list, regex compilation may be slower until the intermediate DSLTree is fully removed. 
--- .../ByteCodeGen+DSLList.swift | 855 ++++++++++++++++++ Sources/_StringProcessing/ByteCodeGen.swift | 18 +- Sources/_StringProcessing/Compiler.swift | 15 + .../_StringProcessing/LiteralPrinter.swift | 6 +- .../_StringProcessing/PrintAsPattern.swift | 27 +- .../Regex/ASTConversion.swift | 20 +- Sources/_StringProcessing/Regex/DSLList.swift | 96 ++ Sources/_StringProcessing/Regex/DSLTree.swift | 124 ++- Tests/RegexTests/DSLListTests.swift | 37 + Tests/RegexTests/MatchTests.swift | 36 +- 10 files changed, 1132 insertions(+), 102 deletions(-) create mode 100644 Sources/_StringProcessing/ByteCodeGen+DSLList.swift create mode 100644 Sources/_StringProcessing/Regex/DSLList.swift create mode 100644 Tests/RegexTests/DSLListTests.swift diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift new file mode 100644 index 000000000..c61c37fdf --- /dev/null +++ b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift @@ -0,0 +1,855 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +internal import _RegexParser + +extension Compiler.ByteCodeGen { + mutating func emitRoot(_ root: DSLList) throws -> MEProgram { + // If the whole regex is a matcher, then the whole-match value + // is the constructed value. Denote that the current value + // register is the processor's value output. + switch root.nodes.first { + case .matcher: + builder.denoteCurrentValueIsWholeMatchValue() + default: + break + } + + var list = root.nodes[...] 
+ try emitNode(&list) + + builder.canOnlyMatchAtStart = canOnlyMatchAtStart(in: root) + builder.buildAccept() + return try builder.assemble() + } +} + +fileprivate extension Compiler.ByteCodeGen { + /// Implementation for `canOnlyMatchAtStart`, which maintains the option + /// state. + /// + /// For a given specific node, this method can return one of three values: + /// + /// - `true`: This node is guaranteed to match only at the start of a subject. + /// - `false`: This node can match anywhere in the subject. + /// - `nil`: This node is inconclusive about where it can match. + /// + /// In particular, non-required groups and option-setting groups are + /// inconclusive about where they can match. + private mutating func _canOnlyMatchAtStartImpl( + _ list: inout ArraySlice + ) -> Bool? { + guard let node = list.popFirst() else { return false } + switch node { + // Defining cases + case .atom(.assertion(.startOfSubject)): + return true + case .atom(.assertion(.caretAnchor)): + return !options.anchorsMatchNewlines + + // Changing options doesn't determine `true`/`false`. + case .atom(.changeMatchingOptions(let sequence)): + options.apply(sequence.ast) + return nil + + // Any other atom or consuming node returns `false`. + case .atom, .customCharacterClass, .quotedLiteral: + return false + + // Trivia/empty have no effect. + case .trivia, .empty: + return nil + + // In an alternation, all of its children must match only at start. + case .orderedChoice(let children): + for _ in 0.. Bool { + let currentOptions = options + options = MatchingOptions() + defer { options = currentOptions } + + var list = list.nodes[...] + return _canOnlyMatchAtStartImpl(&list) ?? false + } + + mutating func emitAlternationGen( + _ elements: inout ArraySlice, + alternationCount: Int, + withBacktracking: Bool, + _ body: (inout Compiler.ByteCodeGen, inout ArraySlice) throws -> Void + ) rethrows { + // Alternation: p0 | p1 | ... 
| pn + // save next_p1 + // + // branch done + // next_p1: + // save next_p2 + // + // branch done + // next_p2: + // save next_p... + // + // branch done + // ... + // next_pn: + // + // done: + let done = builder.makeAddress() + for _ in 1.., + alternationCount count: Int + ) throws { + try emitAlternationGen(&list, alternationCount: count, withBacktracking: true) { + try $0.emitNode(&$1) + } + } + + mutating func emitPositiveLookahead(_ list: inout ArraySlice) throws { + /* + save(restoringAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + fail(preservingCaptures: true) // ->success + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... + */ + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSave(success) + builder.buildSave(intercept) + try emitNode(&list) + builder.buildClearThrough(intercept) + builder.buildFail(preservingCaptures: true) // Lookahead succeeds here + + builder.label(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(success) + } + + mutating func emitNegativeLookahead(_ list: inout ArraySlice) throws { + /* + save(restoringAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + clearSavePoint // remove success + fail // propagate failure + intercept: + fail // ->success + success: + ... 
+ */ + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSave(success) + builder.buildSave(intercept) + try emitNode(&list) + builder.buildClearThrough(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(intercept) + builder.buildFail() + + builder.label(success) + } + + mutating func emitLookaround( + _ kind: (forwards: Bool, positive: Bool), + _ list: inout ArraySlice + ) throws { + guard kind.forwards else { + throw Unsupported("backwards assertions") + } + if kind.positive { + try emitPositiveLookahead(&list) + } else { + try emitNegativeLookahead(&list) + } + } + + mutating func emitAtomicNoncapturingGroup( + _ list: inout ArraySlice + ) throws { + /* + save(continuingAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + fail(preservingCaptures: true) // ->success + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... 
+ */ + + let intercept = builder.makeAddress() + let success = builder.makeAddress() + + builder.buildSaveAddress(success) + builder.buildSave(intercept) + try emitNode(&list) + builder.buildClearThrough(intercept) + builder.buildFail(preservingCaptures: true) // Atomic group succeeds here + + builder.label(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(success) + } + + mutating func emitNoncapturingGroup( + _ kind: AST.Group.Kind, + _ list: inout ArraySlice + ) throws { + assert(!kind.isCapturing) + + options.beginScope() + defer { options.endScope() } + + if let lookaround = kind.lookaroundKind { + try emitLookaround(lookaround, &list) + return + } + + switch kind { + case .lookahead, .negativeLookahead, + .lookbehind, .negativeLookbehind: + throw Unreachable("TODO: reason") + + case .capture, .namedCapture, .balancedCapture: + throw Unreachable("These should produce a capture node") + + case .changeMatchingOptions(let optionSequence): + if !hasEmittedFirstMatchableAtom { + builder.initialOptions.apply(optionSequence) + } + options.apply(optionSequence) + try emitNode(&list) + + case .atomicNonCapturing: + try emitAtomicNoncapturingGroup(&list) + + default: + // FIXME: Other kinds... + try emitNode(&list) + } + } + + mutating func emitQuantification( + _ amount: AST.Quantification.Amount, + _ kind: DSLTree.QuantificationKind, + _ list: inout ArraySlice + ) throws { + let updatedKind: AST.Quantification.Kind + switch kind { + case .explicit(let kind): + updatedKind = kind.ast + case .syntax(let kind): + updatedKind = kind.ast.applying(options) + case .default: + updatedKind = options.defaultQuantificationKind + } + + let (low, high) = amount.bounds + guard let low = low else { + throw Unreachable("Must have a lower bound") + } + switch (low, high) { + case (_, 0): + try skipNode(&list) + return + case let (n, m?) where n > m: + // TODO: Should error out earlier, maybe DSL and parser + // has validation logic? 
+ return + + case let (n, m) where m == nil || n <= m!: + // Ok + break + default: + throw Unreachable("TODO: reason") + } + + // Compiler and/or parser should enforce these invariants + // before we are called + assert(high != 0) + assert((0...(high ?? Int.max)).contains(low)) + + let maxExtraTrips: Int? + if let h = high { + maxExtraTrips = h - low + } else { + maxExtraTrips = nil + } + let minTrips = low + assert((maxExtraTrips ?? 1) >= 0) + + var tmp = list + if tryEmitFastQuant(&tmp, updatedKind, minTrips, maxExtraTrips) { + list = tmp + return + } + + // The below is a general algorithm for bounded and unbounded + // quantification. It can be specialized when the min + // is 0 or 1, or when extra trips is 1 or unbounded. + // + // Stuff inside `<` and `>` are decided at compile time, + // while run-time values stored in registers start with a `%` + _ = """ + min-trip-count control block: + if %minTrips is zero: + goto exit-policy control block + else: + decrement %minTrips and fallthrough + + loop-body: + : + mov currentPosition %pos + evaluate the subexpression + : + if %pos is currentPosition: + goto exit + goto min-trip-count control block + + exit-policy control block: + if %maxExtraTrips is zero: + goto exit + else: + decrement %maxExtraTrips and fallthrough + + : + save exit and goto loop-body + : + ratchet and goto loop + : + save loop-body and fallthrough (i.e. goto exit) + + exit + ... the rest of the program ... + """ + + // Specialization based on `minTrips` for 0 or 1: + _ = """ + min-trip-count control block: + : + goto exit-policy + : + /* fallthrough */ + + loop-body: + evaluate the subexpression + + /* fallthrough */ + """ + + // Specialization based on `maxExtraTrips` for 0 or unbounded + _ = """ + exit-policy control block: + : + goto exit + : + /* fallthrough */ + """ + + /* + NOTE: These specializations don't emit the optimal + code layout (e.g. 
fallthrough vs goto), but that's better + done later (not prematurely) and certainly better + done by an optimizing compiler. + + NOTE: We're intentionally emitting essentially the same + algorithm for all quantifications for now, for better + testing and surfacing difficult bugs. We can specialize + for other things, like `.*`, later. + + When it comes time for optimizing, we can also look into + quantification instructions (e.g. reduce save-point traffic) + */ + + let minTripsControl = builder.makeAddress() + let loopBody = builder.makeAddress() + let exitPolicy = builder.makeAddress() + let exit = builder.makeAddress() + + // We'll need registers if we're (non-trivially) bounded + let minTripsReg: IntRegister? + if minTrips > 1 { + minTripsReg = builder.makeIntRegister( + initialValue: minTrips) + } else { + minTripsReg = nil + } + + let maxExtraTripsReg: IntRegister? + if (maxExtraTrips ?? 0) > 0 { + maxExtraTripsReg = builder.makeIntRegister( + initialValue: maxExtraTrips!) + } else { + maxExtraTripsReg = nil + } + + // Set up a dummy save point for possessive to update + if updatedKind == .possessive { + builder.pushEmptySavePoint() + } + + // min-trip-count: + // condBranch(to: exitPolicy, ifZeroElseDecrement: %min) + builder.label(minTripsControl) + switch minTrips { + case 0: builder.buildBranch(to: exitPolicy) + case 1: break + default: + assert(minTripsReg != nil, "logic inconsistency") + builder.buildCondBranch( + to: exitPolicy, ifZeroElseDecrement: minTripsReg!) + } + + // FIXME: Possessive needs a "dummy" save point to ratchet + + // loop: + // + // branch min-trip-count + builder.label(loopBody) + + // if we aren't sure if the child node will have forward progress and + // we have an unbounded quantification + let startPosition: PositionRegister? + // FIXME: forward progress check?! 
+ let emitPositionChecking = + (!optimizationsEnabled || (list.first?.guaranteesForwardProgress != true)) && + maxExtraTrips == nil + + if emitPositionChecking { + startPosition = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: startPosition!) + } else { + startPosition = nil + } + try emitNode(&list) + if emitPositionChecking { + // in all quantifier cases, no matter what minTrips or maxExtraTrips is, + // if we have a successful non-advancing match, branch to exit because it + // can match an arbitrary number of times + builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!) + } + + if minTrips <= 1 { + // fallthrough + } else { + builder.buildBranch(to: minTripsControl) + } + + // exit-policy: + // condBranch(to: exit, ifZeroElseDecrement: %maxExtraTrips) + // + // + // , + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ maxExtraTrips: Int? + ) -> Bool { + let isScalarSemantics = options.semanticLevel == .unicodeScalar + guard optimizationsEnabled + && minTrips <= QuantifyPayload.maxStorableTrips + && maxExtraTrips ?? 
0 <= QuantifyPayload.maxStorableTrips + && kind != .reluctant else { + return false + } + guard let child = list.popFirst() else { return false } + + switch child { + case .customCharacterClass(let ccc): + // ascii only custom character class + guard let bitset = ccc.asAsciiBitset(options) else { + return false + } + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + + case .atom(let atom): + switch atom { + case .char(let c): + if options.isCaseInsensitive && c.isCased { + // Cased character with case-insensitive matching; match only as an ASCII bitset + guard let bitset = DSLTree.CustomCharacterClass(members: [.atom(atom)]).asAsciiBitset(options) else { + return false + } + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + } else { + // Uncased character OR case-sensitive matching; match as a single scalar ascii value character + guard let val = c._singleScalarAsciiValue else { + return false + } + builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + } + + case .any: + builder.buildQuantifyAny( + matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + case .anyNonNewline: + builder.buildQuantifyAny( + matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + case .dot: + builder.buildQuantifyAny( + matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + + case .characterClass(let cc): + // Custom character class that consumes a single grapheme + let model = cc.asRuntimeModel(options) + builder.buildQuantify( + model: model, + kind, + minTrips, + maxExtraTrips, + isScalarSemantics: isScalarSemantics) + default: + return false + } + case .limitCaptureNesting(let node): + if tryEmitFastQuant(&list, kind, minTrips, maxExtraTrips) { + return true + } else { + return 
false + } + case .nonCapturingGroup(let groupKind, let node): + // .nonCapture nonCapturingGroups are ignored during compilation + guard groupKind.ast == .nonCapture else { + return false + } + if tryEmitFastQuant(&list, kind, minTrips, maxExtraTrips) { + return true + } else { + return false + } + default: + return false + } + return true + } + + mutating func emitConcatenation( + _ list: inout ArraySlice, + componentCount: Int + ) throws { + // Unlike the tree-based bytecode generator, in a DSLList concatenations + // have already been flattened. + for _ in 0..) throws -> ValueRegister? { + guard let node = list.popFirst() else { return nil } + switch node { + + case let .orderedChoice(children): + let n = children.count + try emitAlternation(&list, alternationCount: n) + + case let .concatenation(children): + let n = children.count + try emitConcatenation(&list, componentCount: n) + + case let .capture(name, refId, _, transform): + options.beginScope() + defer { options.endScope() } + + let cap = builder.makeCapture(id: refId, name: name) + builder.buildBeginCapture(cap) + let value = try emitNode(&list) + builder.buildEndCapture(cap) + // If the child node produced a custom capture value, e.g. the result of + // a matcher, this should override the captured substring. + if let value { + builder.buildMove(value, into: cap) + } + // If there's a capture transform, apply it now. + if let transform = transform { + let fn = builder.makeTransformFunction { input, cap in + // If it's a substring capture with no custom value, apply the + // transform directly to the substring to avoid existential traffic. + // + // FIXME: separate out this code path. 
This is fragile, + // slow, and these are clearly different constructs + if let range = cap.range, cap.value == nil { + return try transform(input[range]) + } + + let value = constructExistentialOutputComponent( + from: input, + component: cap.deconstructed, + optionalCount: 0) + return try transform(value) + } + builder.buildTransformCapture(cap, fn) + } + + case let .nonCapturingGroup(kind, _): + try emitNoncapturingGroup(kind.ast, &list) + + case let .ignoreCapturesInTypedOutput(_): + try emitNode(&list) + + case let .limitCaptureNesting(_): + return try emitNode(&list) + + case .conditional: + throw Unsupported("Conditionals") + + case let .quantification(amt, kind, _): + try emitQuantification(amt.ast, kind, &list) + + case let .customCharacterClass(ccc): + if ccc.containsDot { + if !ccc.isInverted { + try emitDot() + } else { + throw Unsupported("Inverted any") + } + } else { + try emitCustomCharacterClass(ccc) + } + + case let .atom(a): + try emitAtom(a) + + case let .quotedLiteral(s): + emitQuotedLiteral(s) + + case .absentFunction: + throw Unsupported("absent function") + case .consumer: + throw Unsupported("consumer") + + case let .matcher(_, f): + return emitMatcher(f) + + case .characterPredicate: + throw Unsupported("character predicates") + + case .trivia, .empty: + return nil + } + return nil + } +} + +// MARK: Skip node + +extension Compiler.ByteCodeGen { + mutating func skipNode( + _ list: inout ArraySlice, + preservingCaptures: Bool = true + ) throws { + guard let node = list.popFirst() else { return } + switch node { + case let .orderedChoice(children): + let n = children.count + for _ in 0.. 
0 && child.guaranteesForwardProgress + case .limitCaptureNesting(let node), .ignoreCapturesInTypedOutput(let node): + return node.guaranteesForwardProgress default: return false } } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 33cffaf20..e2fd2a284 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -32,6 +32,10 @@ class Compiler { } __consuming func emit() throws -> MEProgram { + try emitViaList() + } + + __consuming func emitViaTree() throws -> MEProgram { // TODO: Handle global options var codegen = ByteCodeGen( options: options, @@ -40,6 +44,17 @@ class Compiler { captureList: tree.captureList) return try codegen.emitRoot(tree.root) } + + __consuming func emitViaList() throws -> MEProgram { + // TODO: Handle global options + let dslList = DSLList(tree: tree) + var codegen = ByteCodeGen( + options: options, + compileOptions: + compileOptions, + captureList: tree.captureList) + return try codegen.emitRoot(dslList) + } } /// Hashable wrapper for `Any.Type`. 
diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index 5c136827c..fa80f032d 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -116,11 +116,9 @@ extension LiteralPrinter { outputNode(child) output(")") - case let .ignoreCapturesInTypedOutput(child): + case let .ignoreCapturesInTypedOutput(child), + let .limitCaptureNesting(child): outputNode(child) - case .convertedRegexLiteral(let node, _): - outputNode(node) - case let .quantification(amount, kind, node): outputQuantification(amount, kind, node) case let .customCharacterClass(charClass): diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 34ca44f0d..2f6ebab64 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -179,6 +179,9 @@ extension PrettyPrinter { case let .ignoreCapturesInTypedOutput(child): printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel) + case let .limitCaptureNesting(child): + printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel) + case .conditional: print("/* TODO: conditional */") @@ -258,20 +261,6 @@ extension PrettyPrinter { break - case let .convertedRegexLiteral(.atom(a), _): - if let pattern = a._patternBase(&self), pattern.canBeWrapped { - printAtom(pattern.0) - return - } - - break - case let .convertedRegexLiteral(.customCharacterClass(ccc), _): - if ccc.isSimplePrint { - printSimpleCCC(ccc) - return - } - - break default: break } @@ -305,13 +294,6 @@ extension PrettyPrinter { case let .quotedLiteral(v): print(v._quoted) - case let .convertedRegexLiteral(n, _): - // FIXME: This recursion coordinates with back-off - // check above, so it should work out. Need a - // cleaner way to do this. This means the argument - // label is a lie. 
- printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel) - case let .customCharacterClass(ccc): printAsPattern(ccc) @@ -1431,9 +1413,6 @@ extension DSLTree.Node { result += node.getNamedCaptures() } - case .convertedRegexLiteral(let node, _): - result += node.getNamedCaptures() - case .quantification(_, _, let node): result += node.getNamedCaptures() diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 49094d4f1..fbb189559 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -13,28 +13,13 @@ internal import _RegexParser extension AST { var dslTree: DSLTree { - return DSLTree(root.dslTreeNode) + return DSLTree(.limitCaptureNesting(root.dslTreeNode)) } } extension AST.Node { /// Converts an AST node to a `convertedRegexLiteral` node. var dslTreeNode: DSLTree.Node { - func wrap(_ node: DSLTree.Node) -> DSLTree.Node { - switch node { - case .convertedRegexLiteral: - // FIXME: DSL can have one item concats -// assertionFailure("Double wrapping?") - return node - default: - break - } - // TODO: Should we do this for the - // single-concatenation child too, or should? - // we wrap _that_? - return .convertedRegexLiteral(node, .init(ast: self)) - } - // Convert the top-level node without wrapping func convert() throws -> DSLTree.Node { switch self { @@ -105,9 +90,8 @@ extension AST.Node { } } - // FIXME: make total function again let converted = try! 
convert() - return wrap(converted) + return converted } } diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift new file mode 100644 index 000000000..1bbb0c9cb --- /dev/null +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +struct DSLList { + var nodes: [DSLTree.Node] + + init(_ initial: DSLTree.Node) { + self.nodes = [initial] + } + + init(_ nodes: [DSLTree.Node]) { + self.nodes = nodes + } + + init(tree: DSLTree) { + self.nodes = Array(tree.depthFirst) + } +} + +extension DSLTree.Node { + var directChildren: Int { + switch self { + case .trivia, .empty, .quotedLiteral, + .consumer, .matcher, .characterPredicate, + .customCharacterClass, .atom: + return 0 + + case .orderedChoice(let c), .concatenation(let c): + return c.count + + case .capture, .nonCapturingGroup, + .quantification, .ignoreCapturesInTypedOutput, + .limitCaptureNesting, .conditional: + return 1 + + case .absentFunction: + return 0 + } + } +} + +extension DSLTree { + struct DepthFirst: Sequence, IteratorProtocol { + typealias Element = DSLTree.Node + private var stack: [Frame] + private let getChildren: (Element) -> [Element] + + private struct Frame { + let node: Element + let children: [Element] + var nextIndex: Int = 0 + } + + fileprivate init( + root: Element, + getChildren: @escaping (Element) -> [Element] + ) { + self.getChildren = getChildren + self.stack = [Frame(node: root, children: getChildren(root))] + } + + mutating func next() -> Element? 
{ + guard let top = stack.popLast() else { return nil } + // Push children in reverse so leftmost comes out first. + for child in top.children.reversed() { + stack.append(Frame(node: child, children: getChildren(child))) + } + + // Since we coalesce the children before adding them to the stack, + // we need an exact matching number of children in the list's + // concatenation node, so that it can provide the correct component + // count. This will go away/change when .concatenation only stores + // a count. + return switch top.node { + case .concatenation: + .concatenation(top.node.coalescedChildren) + default: + top.node + } + } + } + + var depthFirst: DepthFirst { + DepthFirst(root: root, getChildren: { + $0.coalescedChildren + }) + } +} diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 5971cd93a..03a563978 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -44,7 +44,8 @@ extension DSLTree { /// Marks all captures in a subpattern as ignored in strongly-typed output. case ignoreCapturesInTypedOutput(Node) - + case limitCaptureNesting(Node) + // TODO: Consider splitting off grouped conditions, or have // our own kind @@ -79,13 +80,6 @@ extension DSLTree { /// TODO: Consider splitting off expression functions, or have our own kind case absentFunction(_AST.AbsentFunction) - // MARK: - Tree conversions - - /// The target of AST conversion. 
- /// - /// Keeps original AST around for rich syntactic and source information - case convertedRegexLiteral(Node, _AST.ASTNode) - // MARK: - Extensibility points case consumer(_ConsumerInterface) @@ -384,8 +378,9 @@ extension DSLTree.Node { case .orderedChoice(let c), .concatenation(let c): return !c.isEmpty - case .convertedRegexLiteral, .capture, .nonCapturingGroup, - .quantification, .ignoreCapturesInTypedOutput, .conditional: + case .capture, .nonCapturingGroup, + .quantification, .ignoreCapturesInTypedOutput, .limitCaptureNesting, + .conditional: return true case .absentFunction(let abs): @@ -398,16 +393,72 @@ extension DSLTree.Node { switch self { case let .orderedChoice(v): return v - case let .concatenation(v): return v + case let .concatenation(v): return v + + case let .capture(_, _, n, _): return [n] + case let .nonCapturingGroup(_, n): return [n] + case let .quantification(_, _, n): return [n] + case let .ignoreCapturesInTypedOutput(n): return [n] + case let .limitCaptureNesting(n): return [n] + + case let .conditional(_, t, f): return [t,f] + + case .trivia, .empty, .quotedLiteral, + .consumer, .matcher, .characterPredicate, + .customCharacterClass, .atom: + return [] + + case let .absentFunction(abs): + return abs.ast.children.map(\.dslTreeNode) + } + } + + public var coalescedChildren: [DSLTree.Node] { + // Before converting a concatenation in a tree to list form, we need to + // flatten out any nested concatenations, and coalesce any adjacent + // characters and scalars, forming quoted literals of their contents, + // over which we can perform grapheme breaking. 
- case let .convertedRegexLiteral(n, _): - // Treat this transparently - return n.children + func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { + switch node { + case .concatenation(let ch): + return ch.flatMap(flatten) + case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n): + return flatten(n) + default: + return [node] + } + } + + switch self { + case let .orderedChoice(v): return v + case let .concatenation(v): + let children = v + .flatMap(flatten) + .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in + switch node { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + str.append(c) + return true + case .quotedLiteral(let q): + str += q + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !str.isEmpty + default: + return false + } + } + return children case let .capture(_, _, n, _): return [n] case let .nonCapturingGroup(_, n): return [n] case let .quantification(_, _, n): return [n] - case let .ignoreCapturesInTypedOutput(n): return [n] + case let .ignoreCapturesInTypedOutput(n): return [n] + case let .limitCaptureNesting(n): return [n] case let .conditional(_, t, f): return [t,f] @@ -424,18 +475,12 @@ extension DSLTree.Node { extension DSLTree.Node { var astNode: AST.Node? { - switch self { - case let .convertedRegexLiteral(_, literal): return literal.ast - default: return nil - } + nil } /// If this node is for a converted literal, look through it. 
var lookingThroughConvertedLiteral: Self { - switch self { - case let .convertedRegexLiteral(n, _): return n - default: return self - } + self } } @@ -468,10 +513,6 @@ extension DSLTree.Node { switch self { case .capture: return true - case let .convertedRegexLiteral(n, re): - assert(n.hasCapture == re.ast.hasCapture) - return n.hasCapture - default: return self.children.any(\.hasCapture) } @@ -655,6 +696,9 @@ extension CaptureList.Builder { case let .ignoreCapturesInTypedOutput(child): addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: false) + case let .limitCaptureNesting(child): + addCaptures(of: child, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) + case let .conditional(cond, trueBranch, falseBranch): switch cond.ast { case .group(let g): @@ -685,11 +729,11 @@ extension CaptureList.Builder { #endif } - case let .convertedRegexLiteral(n, _): - // We disable nesting for converted AST trees, as literals do not nest - // captures. This includes literals nested in a DSL. - return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) - +// case let .convertedRegexLiteral(n, _): +// // We disable nesting for converted AST trees, as literals do not nest +// // captures. This includes literals nested in a DSL. 
+// return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) +// case .matcher: break @@ -717,8 +761,8 @@ extension DSLTree.Node { return true case .orderedChoice, .concatenation, .capture, .conditional, .quantification, .customCharacterClass, .atom, - .trivia, .empty, .quotedLiteral, .absentFunction, - .convertedRegexLiteral, .consumer, + .trivia, .empty, .quotedLiteral, .limitCaptureNesting, + .consumer, .absentFunction, .characterPredicate, .matcher: return false } @@ -805,8 +849,7 @@ extension DSLTree.Node { options.beginScope() defer { options.endScope() } return child._canOnlyMatchAtStartImpl(&options) - case .ignoreCapturesInTypedOutput(let child), - .convertedRegexLiteral(let child, _): + case .ignoreCapturesInTypedOutput(let child), .limitCaptureNesting(let child): return child._canOnlyMatchAtStartImpl(&options) // A quantification that doesn't require its child to exist can still @@ -869,14 +912,13 @@ extension DSLTree { case let .orderedChoice(v): return v.map(_Tree.init) case let .concatenation(v): return v.map(_Tree.init) - case let .convertedRegexLiteral(n, _): - // Treat this transparently - return _Tree(n).children - case let .capture(_, _, n, _): return [_Tree(n)] case let .nonCapturingGroup(_, n): return [_Tree(n)] case let .quantification(_, _, n): return [_Tree(n)] - case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] + case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] + case let .limitCaptureNesting(n): + // This is a transparent wrapper + return _Tree(n).children case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] diff --git a/Tests/RegexTests/DSLListTests.swift b/Tests/RegexTests/DSLListTests.swift new file mode 100644 index 000000000..d8acec737 --- /dev/null +++ b/Tests/RegexTests/DSLListTests.swift @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source 
project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import Testing +@testable import _StringProcessing + +@Suite +struct DSLListTests { + @available(macOS 9999, *) + @Test(arguments: [ + (#/a/#, 2), // literal, a + (#/abcd+/#, 5), // literal, concat, abc, quant, d + (#/a(?:b+)c*/#, 8), // literal, concat, a, noncap grp, quant, b, quant, c + ]) + func convertedNodeCount(regex: Regex, nodeCount: Int) { + let dslList = DSLList(tree: regex.program.tree) + #expect(dslList.nodes.count == nodeCount) + } + + @Test(arguments: [#/a|b/#, #/a+b?c/#, #/abc/#, #/a(?:b+)c*/#, #/;[\r\n]/#, #/(?=(?:[1-9]|(?:a|b)))/#]) + func compilationComparison(regex: Regex) throws { + let listCompiler = Compiler(tree: regex.program.tree) + let listProgram = try listCompiler.emitViaList() + let treeCompiler = Compiler(tree: regex.program.tree) + let treeProgram = try treeCompiler.emit() + + #expect(treeProgram.instructions == listProgram.instructions) + } +} diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e20beeafb..e36285ae6 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -37,16 +37,34 @@ func _roundTripLiteral( return remadeRegex } +func _validateListCompilation( + _ regex: Regex +) throws -> Bool { + let treeCompiler = Compiler(tree: regex.program.tree) + let treeProgram = try treeCompiler.emitViaTree() + let listCompiler = Compiler(tree: regex.program.tree) + let listProgram = try listCompiler.emitViaList() + return treeProgram.instructions == listProgram.instructions +} + func _firstMatch( _ regexStr: String, input: String, validateOptimizations: Bool, semanticLevel: RegexSemanticLevel = .graphemeCluster, - syntax: SyntaxOptions = .traditional + syntax: 
SyntaxOptions = .traditional, + file: StaticString = #file, + line: UInt = #line ) throws -> (String, [String?])? { var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) let result = try regex.firstMatch(in: input) - + + if try !_validateListCompilation(regex) { + XCTFail( + "List compilation failed for '\(regexStr)'", + file: file, line: line) + } + func validateSubstring(_ substringInput: Substring) throws { // Sometimes the characters we add to a substring merge with existing // string members. This messes up cross-validation, so skip the test. @@ -105,14 +123,18 @@ func _firstMatch( For input '\(input)' Original: '\(regexStr)' _literalPattern: '\(roundTripRegex?._literalPattern ?? "")' - """) + """, + file: file, + line: line) case let (_, rtMatch?): XCTFail(""" Incorrectly matched as '\(rtMatch)' For input '\(input)' Original: '\(regexStr)' _literalPattern: '\(roundTripRegex!._literalPattern!)' - """) + """, + file: file, + line: line) } } @@ -184,7 +206,8 @@ func flatCaptureTest( input: test, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax + syntax: syntax, + file: file, line: line ) else { if expect == nil { continue @@ -303,7 +326,8 @@ func firstMatchTest( input: input, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax)?.0 + syntax: syntax, + file: file, line: line)?.0 if xfail { XCTAssertNotEqual(found, match, file: file, line: line) From 62b0ae2e9b699c8fdf9caa63b04a375adc6d1557 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 8 Oct 2025 23:20:17 -0500 Subject: [PATCH 2/5] Add benchmark job in CI (#833) Also tweaks the RegexBenchmark formatting/settings a bit. 
--- .github/workflows/pull_request.yml | 58 +++++++++++++++++++ Sources/RegexBenchmark/BenchmarkResults.swift | 33 +++++++++-- Sources/RegexBenchmark/Utils/Stats.swift | 2 +- 3 files changed, 86 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index cd669b492..16352631e 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -19,3 +19,61 @@ jobs: license_header_check_project_name: "Swift.org" unacceptable_language_check_enabled: false format_check_enabled: false + bench: + name: Benchmark + runs-on: ubuntu-latest + env: + BUILD_CMD: swift build -c release + BENCH_CMD: .build/release/RegexBenchmark + BASELINE_FILE: benchmark-baseline + COMPARE_FILE: benchmark-pr + COMPARE_OUT_FILE: benchmark-results.txt + steps: + - name: Check out baseline branch + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.base.sha }} + path: base + fetch-depth: 0 + - name: Build baseline branch + working-directory: base + run: | + set -euo pipefail + eval "$BUILD_CMD" + - name: Run baseline benchmark + working-directory: base + run: | + set -euo pipefail + eval "$BENCH_CMD --save $RUNNER_TEMP/$BASELINE_FILE" + test -s "$RUNNER_TEMP/$BASELINE_FILE" || { echo "Baseline not created at $BASELINE_FILE"; exit 1; } + - name: Check out PR branch + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: pr + fetch-depth: 0 + - name: Build PR branch + working-directory: pr + run: | + set -euo pipefail + eval "$BUILD_CMD" + - name: Run PR benchmark + working-directory: pr + run: | + set -euo pipefail + eval "$BENCH_CMD --save $RUNNER_TEMP/$COMPARE_FILE" + test -s "$RUNNER_TEMP/$COMPARE_FILE" || { echo "Comparison not created at $COMPARE_FILE"; exit 1; } + eval "$BENCH_CMD --compare $RUNNER_TEMP/$BASELINE_FILE" | tee "$RUNNER_TEMP/$COMPARE_OUT_FILE" + - name: 📊 Compare benchmarks + working-directory: pr + run: | + set -euo pipefail + eval 
"$BENCH_CMD --load $RUNNER_TEMP/$COMPARE_FILE --compare $RUNNER_TEMP/$BASELINE_FILE --compare-compile-time $RUNNER_TEMP/$BASELINE_FILE" | tee "$RUNNER_TEMP/$COMPARE_OUT_FILE" + - name: Upload benchmark artifacts + uses: actions/upload-artifact@v4 + with: + name: benchmark-results + path: | + ${{ runner.temp }}/${{ env.BASELINE_FILE }} + ${{ runner.temp }}/${{ env.COMPARE_FILE }} + ${{ runner.temp }}/${{ env.COMPARE_OUT_FILE }} diff --git a/Sources/RegexBenchmark/BenchmarkResults.swift b/Sources/RegexBenchmark/BenchmarkResults.swift index 824322300..02eba5a14 100644 --- a/Sources/RegexBenchmark/BenchmarkResults.swift +++ b/Sources/RegexBenchmark/BenchmarkResults.swift @@ -115,12 +115,12 @@ extension BenchmarkRunner { .sorted(by: {(a,b) in a.diff!.seconds < b.diff!.seconds}) print("Comparing against \(against)") - print("=== Regressions ======================================================================") + print("=== Regressions ================================================================") for item in regressions { print(item) } - print("=== Improvements =====================================================================") + print("=== Improvements ===============================================================") for item in improvements { print(item) } @@ -128,7 +128,7 @@ extension BenchmarkRunner { #if os(macOS) && canImport(Charts) if showChart { print(""" - === Comparison chart ================================================================= + === Comparison chart =========================================================== Press Control-C to close... 
""") BenchmarkResultApp.comparisons = comparisons @@ -234,9 +234,17 @@ extension BenchmarkResult { return "- \(name) N/A" } let percentage = (1000 * diff.seconds / baselineTime.seconds).rounded()/10 - let len = max(40 - name.count, 1) - let nameSpacing = String(repeating: " ", count: len) - return "- \(name)\(nameSpacing)\(latestTime)\t\(baselineTime)\t\(diff)\t\t\(percentage)%" + let start = if name.count > 40 { + "- \(name)\n" + String(repeating: " ", count: 43) + } else { + "- \(name, paddingTo: 40) " + } + return start + """ + \(latestTime, paddingTo: 8, alignRight: true) \ + \(baselineTime, paddingTo: 8, alignRight: true) \ + \(diff, paddingTo: 8, alignRight: true) \ + \(percentage, paddingTo: 5, alignRight: true)% + """ } var asCsv: String { @@ -334,3 +342,16 @@ extension SuiteResult: Codable { return try decoder.decode(SuiteResult.self, from: data) } } + +extension DefaultStringInterpolation { + mutating func appendInterpolation(_ value: T, paddingTo length: Int, alignRight: Bool = false) { + let s = String(describing: value) + let paddingCount = max(0, length - s.count) + let padding = String(repeating: " ", count: paddingCount) + if alignRight { + appendLiteral(padding + s) + } else { + appendLiteral(s + padding) + } + } +} diff --git a/Sources/RegexBenchmark/Utils/Stats.swift b/Sources/RegexBenchmark/Utils/Stats.swift index bc1490d8e..8200adca5 100644 --- a/Sources/RegexBenchmark/Utils/Stats.swift +++ b/Sources/RegexBenchmark/Utils/Stats.swift @@ -15,7 +15,7 @@ enum Stats {} extension Stats { // Maximum allowed standard deviation is 7.5% of the median runtime - static let maxAllowedStdev = 0.075 + static let maxAllowedStdev = 0.15 static func tTest(_ a: Measurement, _ b: Measurement) -> Bool { // Student's t-test From e539ac103affb4c3456b2b8efb492bc5ea014734 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 16 Oct 2025 14:03:32 -0500 Subject: [PATCH 3/5] [perf] Implement auto-possessification (#832) This optimization recognizes quantifications followed 
by an atom that can't be matched at the start of the quantification, and converts them to possessive, eliminating any backtracking at that position. This change includes an accessor for the first "required" atom (which will be used in other optimizations), and converts DSL.QuantificationKind to a struct to support the possessification changes. When deciding on exclusion during auto-possessification, awareness of the current matching options is important for correct analysis. For example, /a+A/ can be auto-possessified, but the case insensitive pattern /(?i)a+A/ cannot be. --- .../ByteCodeGen+DSLList.swift | 18 +- Sources/_StringProcessing/ByteCodeGen.swift | 10 +- Sources/_StringProcessing/Compiler.swift | 4 +- .../_StringProcessing/LiteralPrinter.swift | 15 +- .../Optimizations/AutoPossessification.swift | 398 ++++++++++++++++++ Sources/_StringProcessing/Regex/DSLList.swift | 32 ++ Sources/_StringProcessing/Regex/DSLTree.swift | 183 +++++++- Tests/RegexTests/MatchTests.swift | 32 +- Tests/RegexTests/OptimizationTests.swift | 68 +++ 9 files changed, 722 insertions(+), 38 deletions(-) create mode 100644 Sources/_StringProcessing/Optimizations/AutoPossessification.swift create mode 100644 Tests/RegexTests/OptimizationTests.swift diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift index c61c37fdf..330018878 100644 --- a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift +++ b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift @@ -12,7 +12,7 @@ internal import _RegexParser extension Compiler.ByteCodeGen { - mutating func emitRoot(_ root: DSLList) throws -> MEProgram { + mutating func emitRoot(_ root: inout DSLList) throws -> MEProgram { // If the whole regex is a matcher, then the whole-match value // is the constructed value. Denote that the current value // register is the processor's value output. 
@@ -22,7 +22,11 @@ extension Compiler.ByteCodeGen { default: break } - + + if optimizationsEnabled { + root.autoPossessify() + } + var list = root.nodes[...] try emitNode(&list) @@ -352,15 +356,7 @@ fileprivate extension Compiler.ByteCodeGen { _ kind: DSLTree.QuantificationKind, _ list: inout ArraySlice ) throws { - let updatedKind: AST.Quantification.Kind - switch kind { - case .explicit(let kind): - updatedKind = kind.ast - case .syntax(let kind): - updatedKind = kind.ast.applying(options) - case .default: - updatedKind = options.defaultQuantificationKind - } + let updatedKind = kind.applying(options: options) let (low, high) = amount.bounds guard let low = low else { diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d6ec4d716..24c94da11 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -506,15 +506,7 @@ extension Compiler.ByteCodeGen { _ kind: DSLTree.QuantificationKind, _ child: DSLTree.Node ) throws { - let updatedKind: AST.Quantification.Kind - switch kind { - case .explicit(let kind): - updatedKind = kind.ast - case .syntax(let kind): - updatedKind = kind.ast.applying(options) - case .default: - updatedKind = options.defaultQuantificationKind - } + let updatedKind = kind.applying(options: options) let (low, high) = amount.bounds guard let low = low else { diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index e2fd2a284..25e6e4cf6 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -47,13 +47,13 @@ class Compiler { __consuming func emitViaList() throws -> MEProgram { // TODO: Handle global options - let dslList = DSLList(tree: tree) + var dslList = DSLList(tree: tree) var codegen = ByteCodeGen( options: options, compileOptions: compileOptions, captureList: tree.captureList) - return try codegen.emitRoot(dslList) + return try 
codegen.emitRoot(&dslList) } } diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index fa80f032d..e1dc3fa23 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -224,13 +224,16 @@ extension LiteralPrinter { } mutating func outputQuantificationKind(_ kind: DSLTree.QuantificationKind) { - switch kind { - case .`default`: + guard let astKind = kind.quantificationKind?.ast else { // We can treat this as if the current default had been given explicity. outputQuantificationKind( .explicit(.init(ast: options.defaultQuantificationKind))) - case let .explicit(kind): - switch kind.ast { + return + } + + if kind.isExplicit { + // Explicitly provided modifiers need to match the current option state. + switch astKind { case .eager: output(options.isReluctantByDefault ? "?" : "") case .reluctant: @@ -242,9 +245,9 @@ extension LiteralPrinter { fatalError() #endif } - case let .syntax(kind): + } else { // Syntactically-specified quantification modifiers can stay as-is. - switch kind.ast { + switch astKind { case .eager: output("") case .reluctant: diff --git a/Sources/_StringProcessing/Optimizations/AutoPossessification.swift b/Sources/_StringProcessing/Optimizations/AutoPossessification.swift new file mode 100644 index 000000000..7a728365c --- /dev/null +++ b/Sources/_StringProcessing/Optimizations/AutoPossessification.swift @@ -0,0 +1,398 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +extension DSLList { + private func _requiredAtomImpl( + _ position: inout Int, + options: inout MatchingOptions, + allowOptionsChanges: Bool + ) -> DSLTree.Atom?? { + guard position < nodes.count else { + return nil + } + + switch nodes[position] { + case .atom(let atom): + switch atom { + case .changeMatchingOptions(let seq): + // Exit early if an atom changes the matching options. + if allowOptionsChanges { + options.apply(seq.ast) + return nil + } else { + return .some(nil) + } + default: + return atom + } + + // In a concatenation, the first definitive child provides the answer, + // and then we need to skip past (in some cases at least) the remaining + // concatenation elements. + case .concatenation(let children): + var result: DSLTree.Atom?? = nil + var i = 0 + while i < children.count { + i += 1 + position += 1 + if let r = _requiredAtomImpl(&position, options: &options, allowOptionsChanges: allowOptionsChanges) { + result = r + break + } + } + + for _ in i.. DSLTree.Atom? { + var position = 0 + var options = MatchingOptions() + return _requiredAtomImpl(&position, options: &options, allowOptionsChanges: allowOptionsChanges) ?? nil + } + + internal mutating func autoPossessifyNextQuantification( + _ position: inout Int, + options: inout MatchingOptions + ) -> (Int, DSLTree.Atom)? { + guard position < nodes.count else { + return nil + } + + switch nodes[position] { + case .quantification(_, _, _): + let quantPosition = position + position += 1 + + // Limit auto-possessification to a single quantified atom, to avoid + // issues of overlapped matches. 
+ guard position < nodes.count else { + return nil + } + switch nodes[position] { + case .atom(let atom) where atom.isMatchable: + return (quantPosition, atom) + default: + var innerPosition = position + _ = autoPossessifyNextQuantification(&innerPosition, options: &options) + return nil + } + + case .concatenation(let children): + // If we find a valid quantification among this concatenation's components, + // we must look for a required atom in the sibling. If a definitive result + // is not found, pop up the recursion stack to find a sibling at a higher + // level. + var foundQuantification: (Int, DSLTree.Atom)? = nil + var foundNextAtom: DSLTree.Atom? = nil + var i = 0 + position += 1 + while i < children.count { + i += 1 + if let result = autoPossessifyNextQuantification(&position, options: &options) { + foundQuantification = result + break + } + } + + while i < children.count { + i += 1 + position += 1 + if let result = _requiredAtomImpl(&position, options: &options, allowOptionsChanges: false) { + foundNextAtom = result + break + } + } + + for _ in i.. Bool { + switch (self, other) { + case (.char(let a), .char(let b)): + // Two characters are mutually exclusive if one does not match against + // the other. + // + // Relevant options: + // - semantic level + // - case insensitivity + + if options.semanticLevel == .graphemeCluster { + // Just call String.match(Character, ...) + let s = String(a) + return nil == s.match( + b, at: s.startIndex, + limitedBy: s.endIndex, + isCaseInsensitive: options.isCaseInsensitive) + } else { + // Call String.matchScalar(Scalar, ...) 
for each in scalar sequence + let s = String(a) + var i = s.startIndex + var j = b.unicodeScalars.startIndex + while i < s.endIndex { + guard j < b.unicodeScalars.endIndex else { return true } + guard let nextIndex = s.matchScalar(b.unicodeScalars[j], at: i, limitedBy: s.endIndex, boundaryCheck: false, isCaseInsensitive: options.isCaseInsensitive) else { + return true + } + i = nextIndex + b.unicodeScalars.formIndex(after: &j) + } + return false + } + + case (.scalar(let a), .scalar(let b)): + // Two scalars are mutually exclusive if one does not match against + // the other. + // + // Relevant options: + // - case insensitivity + let s = String(a) + return nil == s.matchScalar( + b, at: s.startIndex, + limitedBy: s.endIndex, + boundaryCheck: false, + isCaseInsensitive: options.isCaseInsensitive) + + case (.characterClass(let a), .characterClass(let b)): + // Certain character classes are mutually exclusive of each other. + return a.excludes(b, options: options) + + // For character class and char/scalar, we can test against the class's model. 
+ case (.characterClass(let a), .char(let b)), (.char(let b), .characterClass(let a)): + let s = "\(b)" + return nil == a.asRuntimeModel(options).matches(in: s, at: s.startIndex, limitedBy: s.endIndex) + case (.characterClass(let a), .scalar(let b)), (.scalar(let b), .characterClass(let a)): + let s = "\(b)" + return nil == a.asRuntimeModel(options).matches(in: s, at: s.startIndex, limitedBy: s.endIndex) + + default: + return false + } + } +} + +extension DSLTree.Atom.CharacterClass { + func excludes(_ other: Self, options: MatchingOptions) -> Bool { + if other == .anyGrapheme || other == .anyUnicodeScalar { + return false + } + + return switch self { + case .anyGrapheme, .anyUnicodeScalar: + false + + case .digit: + switch other { + case .whitespace, .horizontalWhitespace, .verticalWhitespace, .newlineSequence, + .notWord, .notDigit: true + default: false + } + case .notDigit: + other == .digit + + case .horizontalWhitespace: + switch other { + case .word, .digit, .verticalWhitespace, .newlineSequence, + .notWhitespace, .notHorizontalWhitespace: true + default: false + } + case .notHorizontalWhitespace: + other == .horizontalWhitespace + + case .newlineSequence: + switch other { + case .word, .digit, .horizontalWhitespace, .notNewline: true + default: false + } + case .notNewline: + other == .newlineSequence + + case .whitespace: + switch other { + case .word, .digit, .notWhitespace: true + default: false + } + case .notWhitespace: + other == .whitespace + + case .verticalWhitespace: + switch other { + case .word, .digit, .notWhitespace, .notVerticalWhitespace: true + default: false + } + case .notVerticalWhitespace: + other == .verticalWhitespace + + case .word: + switch other { + case .whitespace, .horizontalWhitespace, .verticalWhitespace, .newlineSequence, + .notWord: true + default: false + } + case .notWord: + other == .word + } + } +} diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift index 
1bbb0c9cb..f8d09a953 100644 --- a/Sources/_StringProcessing/Regex/DSLList.swift +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -94,3 +94,35 @@ extension DSLTree { }) } } + +extension DSLList { + internal func skipNode(_ position: inout Int) { + guard position < nodes.count else { + return + } + switch nodes[position] { + case let .orderedChoice(children): + let n = children.count + for _ in 0.. Self { + .init(quantificationKind: kind, isExplicit: true, canAutoPossessify: nil) + } + /// A kind set via syntax, which can be affected by options. - case syntax(_AST.QuantificationKind) + static func syntax(_ kind: _AST.QuantificationKind) -> Self { + .init(quantificationKind: kind, isExplicit: false, canAutoPossessify: nil) + } var ast: AST.Quantification.Kind? { - switch self { - case .default: return nil - case .explicit(let kind), .syntax(let kind): - return kind.ast + quantificationKind?.ast + } + + func applying(options: MatchingOptions) -> AST.Quantification.Kind { + guard let kind = quantificationKind?.ast else { + return options.defaultQuantificationKind + } + return if isExplicit { + kind + } else { + kind.applying(options) } } } @@ -889,6 +908,146 @@ extension DSLTree.Node { } } +// MARK: Required first and last atoms + +extension DSLTree.Node { + private func _requiredAtomImpl(forward: Bool) -> DSLTree.Atom?? { + switch self { + case .atom(let atom): + return switch atom { + case .changeMatchingOptions: + nil + default: + atom + } + + // In a concatenation, the first definitive child provides the answer. + case .concatenation(let children): + if forward { + for child in children { + if let result = child._requiredAtomImpl(forward: forward) { + return result + } + } + } else { + for child in children.reversed() { + if let result = child._requiredAtomImpl(forward: forward) { + return result + } + } + } + return nil + + // For a quoted literal, we can look at the first char + // TODO: matching semantics??? 
+ case .quotedLiteral(let str): + return str.first.map(DSLTree.Atom.char) + + // TODO: custom character classes could/should participate here somehow + case .customCharacterClass: + return .some(nil) + + // Trivia/empty have no effect. + case .trivia, .empty: + return nil + + // For alternation and conditional, no required first (this could change + // if we identify the _same_ required first atom across all possibilities). + case .orderedChoice, .conditional: + return .some(nil) + + // Groups (and other parent nodes) defer to the child. + case .nonCapturingGroup(_, let child), .capture(_, _, let child, _), + .ignoreCapturesInTypedOutput(let child), + .limitCaptureNesting(let child): + return child._requiredAtomImpl(forward: forward) + + // A quantification that doesn't require its child to exist can still + // allow a start-only match. (e.g. `/(foo)?^bar/`) + case .quantification(let amount, _, let child): + return amount.requiresAtLeastOne + ? child._requiredAtomImpl(forward: forward) + : .some(nil) + + // Extended behavior isn't known, so we return `false` for safety. + case .consumer, .matcher, .characterPredicate, .absentFunction: + return .some(nil) + } + } + + internal func requiredFirstAtom() -> DSLTree.Atom? { + self._requiredAtomImpl(forward: true) ?? nil + } + + internal func requiredLastAtom() -> DSLTree.Atom? { + self._requiredAtomImpl(forward: false) ?? nil + } +} + + +private func _requiredAtomImpl(_ list: inout ArraySlice) -> DSLTree.Atom?? { + guard let node = list.popFirst() else { + return nil + } + switch node { + case .atom(let atom): + return switch atom { + case .changeMatchingOptions: + nil + default: + atom + } + + // In a concatenation, the first definitive child provides the answer. + case .concatenation(let children): + for _ in 0..) -> DSLTree.Atom? { + _requiredAtomImpl(&list) ?? 
nil +} + // MARK: AST wrapper types // // These wrapper types are required because even @_spi-marked public APIs can't @@ -952,6 +1111,14 @@ extension DSLTree { internal var isNegativeLookahead: Bool { self.ast == .negativeLookahead } + + internal var isChangeMatchingOptions: Bool { + if case let .changeMatchingOptions = ast { + return true + } else { + return false + } + } } @_spi(RegexBuilder) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e36285ae6..a87112b9e 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -37,12 +37,15 @@ func _roundTripLiteral( return remadeRegex } +// Validate that the given regex compiles to the same instructions whether +// as a tree (original) or a list (new). We need to compile with optimizations +// disabled, since new optimizations are primarily landing in list compilation. func _validateListCompilation( _ regex: Regex ) throws -> Bool { - let treeCompiler = Compiler(tree: regex.program.tree) + let treeCompiler = Compiler(tree: regex.program.tree, compileOptions: .disableOptimizations) let treeProgram = try treeCompiler.emitViaTree() - let listCompiler = Compiler(tree: regex.program.tree) + let listCompiler = Compiler(tree: regex.program.tree, compileOptions: .disableOptimizations) let listProgram = try listCompiler.emitViaList() return treeProgram.instructions == listProgram.instructions } @@ -734,6 +737,31 @@ extension RegexTests { ("baaaaabc", nil), ("baaaaaaaabc", nil)) + // Auto-possessification tests: + // - case sensitive + firstMatchTests( + "a+A", + ("aaaaA", "aaaaA"), + ("aaaaa", nil), + ("aaAaa", "aaA")) + // - case insensitive + firstMatchTests( + "(?i:a+A)", + ("aaaaA", "aaaaA"), + ("aaaaa", "aaaaa")) + firstMatchTests( + "(?i)a+A", + ("aaaaA", "aaaaA"), + ("aaaaa", "aaaaa")) + firstMatchTests( + "a+(?i:A)", + ("aaaaA", "aaaaA"), + ("aaaaa", "aaaaa")) + firstMatchTests( + "a+(?:(?i)A)", + ("aaaaA", "aaaaA"), + ("aaaaa", "aaaaa")) + // 
XFAIL'd possessive tests firstMatchTests( "a?+a", diff --git a/Tests/RegexTests/OptimizationTests.swift b/Tests/RegexTests/OptimizationTests.swift new file mode 100644 index 000000000..0fff0ebb1 --- /dev/null +++ b/Tests/RegexTests/OptimizationTests.swift @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import Testing +@testable @_spi(RegexBuilder) import _StringProcessing +@testable import _RegexParser + +@Suite struct OptimizationTests { + @available(macOS 9999, *) + @Test(arguments: [#/a/#, #/a+/#, #/(?:a+)/#, #/(?:a)+/#, #/(?m)a+/#, #/ab?c/#, #/(?:a+)+$/#, #/(?:(?:a+b)+b)/#]) + func requiredFirstAtom(pattern: Regex) throws { + let list = DSLList(tree: pattern.program.tree) + let atom = list.requiredFirstAtom(allowOptionsChanges: true) + #expect(atom?.literalCharacterValue == "a", "Missing first character atom in '\(pattern._literalPattern!)'") + } + + @available(macOS 9999, *) + @Test(arguments: [#/a?/#, #/(?:a|b)/#, #/[a]/#, #/a?bc/#]) + func noRequiredFirstAtom(pattern: Regex) throws { + let list = DSLList(tree: pattern.program.tree) + let atom = list.requiredFirstAtom(allowOptionsChanges: true) + #expect(atom == nil, "Unexpected required first atom in '\(pattern._literalPattern!)'") + } + + @available(macOS 9999, *) + @Test(arguments: [#/a+b/#, #/a*b/#, #/\w+\s/#, #/(?:a+b|b+a)/#, #/\d+a/#, #/a+A/#]) + func autoPossessify(pattern: Regex) throws { + var list = DSLList(tree: pattern.program.tree) + list.autoPossessify() + for node in list.nodes { + switch node { + case .quantification(_, let kind, _): + #expect( + kind.isExplicit && 
kind.quantificationKind?.ast == .possessive, + "Expected possessification in '\(pattern._literalPattern!)'") + default: break + } + } + } + + @available(macOS 9999, *) + @Test(arguments: [ + #/a?/#, #/a+a/#, #/a+(?:b|c)/#, #/(?:a+|b+)/#, #/[a]/#, #/a?a/#, + #/(?i)a+A/#, #/(?i:a+A)/#, // case insensitivity when checking exclusion + #/(?:(?:ab)+b)/#, // single atom quantifications only + ]) + func noAutoPossessify(pattern: Regex) throws { + var list = DSLList(tree: pattern.program.tree) + list.autoPossessify() + for node in list.nodes { + switch node { + case .quantification(_, let kind, _): + #expect( + kind.quantificationKind?.ast != .possessive, + "Unexpected possessification in '\(pattern._literalPattern!)'") + default: break + } + } + } +} From f27b1b7776f026fe2b1b57af73e8f6fd4eed4204 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 22 Oct 2025 18:48:46 -0500 Subject: [PATCH 4/5] Add steps for comparing with 'main' branch (#842) --- .github/workflows/pull_request.yml | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 16352631e..2a550d454 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -26,6 +26,7 @@ jobs: BUILD_CMD: swift build -c release BENCH_CMD: .build/release/RegexBenchmark BASELINE_FILE: benchmark-baseline + MAIN_FILE: benchmark-main COMPARE_FILE: benchmark-pr COMPARE_OUT_FILE: benchmark-results.txt steps: @@ -46,6 +47,26 @@ jobs: set -euo pipefail eval "$BENCH_CMD --save $RUNNER_TEMP/$BASELINE_FILE" test -s "$RUNNER_TEMP/$BASELINE_FILE" || { echo "Baseline not created at $BASELINE_FILE"; exit 1; } + - name: Check out main branch + if: ${{ github.event.pull_request.base.ref != 'main' }} + uses: actions/checkout@v4 + with: + ref: main + path: main-branch + fetch-depth: 0 + - name: Build main branch + if: ${{ github.event.pull_request.base.ref != 'main' }} + working-directory: main-branch + 
run: | + set -euo pipefail + eval "$BUILD_CMD" + - name: Run main benchmark + if: ${{ github.event.pull_request.base.ref != 'main' }} + working-directory: main-branch + run: | + set -euo pipefail + eval "$BENCH_CMD --save $RUNNER_TEMP/$MAIN_FILE" + test -s "$RUNNER_TEMP/$MAIN_FILE" || { echo "Baseline (main) not created at $MAIN_FILE"; exit 1; } - name: Check out PR branch uses: actions/checkout@v4 with: @@ -64,11 +85,17 @@ jobs: eval "$BENCH_CMD --save $RUNNER_TEMP/$COMPARE_FILE" test -s "$RUNNER_TEMP/$COMPARE_FILE" || { echo "Comparison not created at $COMPARE_FILE"; exit 1; } eval "$BENCH_CMD --compare $RUNNER_TEMP/$BASELINE_FILE" | tee "$RUNNER_TEMP/$COMPARE_OUT_FILE" - - name: 📊 Compare benchmarks + - name: 📊 Compare benchmarks with base working-directory: pr run: | set -euo pipefail eval "$BENCH_CMD --load $RUNNER_TEMP/$COMPARE_FILE --compare $RUNNER_TEMP/$BASELINE_FILE --compare-compile-time $RUNNER_TEMP/$BASELINE_FILE" | tee "$RUNNER_TEMP/$COMPARE_OUT_FILE" + - name: 📊 Compare benchmarks with `main` + if: ${{ github.event.pull_request.base.ref != 'main' }} + working-directory: pr + run: | + set -euo pipefail + eval "$BENCH_CMD --load $RUNNER_TEMP/$COMPARE_FILE --compare $RUNNER_TEMP/$MAIN_FILE --compare-compile-time $RUNNER_TEMP/$MAIN_FILE" - name: Upload benchmark artifacts uses: actions/upload-artifact@v4 with: From 9f0c4590c14a5fbc2e5220c5a7345e5c8d48eb83 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 11 Dec 2025 13:08:20 -0600 Subject: [PATCH 5/5] [perf] Switch to direct generation of the DSL List (#841) Implements the direct AST -> DSLList conversion --- .../ByteCodeGen+DSLList.swift | 57 +++++- Sources/_StringProcessing/Compiler.swift | 37 ++-- .../_StringProcessing/LiteralPrinter.swift | 166 +++++++++++++++++- .../Regex/ASTConversion.swift | 133 ++++++++++++++ .../Regex/AnyRegexOutput.swift | 6 +- Sources/_StringProcessing/Regex/Core.swift | 94 ++++++++-- Sources/_StringProcessing/Regex/DSLList.swift | 93 ++++++++++ 
Sources/_StringProcessing/Regex/DSLTree.swift | 112 ++++++++++++ Sources/_StringProcessing/Regex/Options.swift | 6 +- .../Utility/RegexFactory.swift | 59 +++---- .../Utility/TypeVerification.swift | 4 +- Tests/RegexTests/CaptureTests.swift | 2 +- Tests/RegexTests/DSLListTests.swift | 8 +- Tests/RegexTests/LiteralPrinterTests.swift | 3 + Tests/RegexTests/MatchTests.swift | 29 +-- Tests/RegexTests/OptimizationTests.swift | 8 +- 16 files changed, 721 insertions(+), 96 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift index 330018878..3394b319f 100644 --- a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift +++ b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift @@ -351,6 +351,59 @@ fileprivate extension Compiler.ByteCodeGen { } } + func _guaranteesForwardProgressImpl(_ list: ArraySlice, position: inout Int) -> Bool { + guard position < list.endIndex else { return false } + let node = list[position] + position += 1 + switch node { + case .orderedChoice(let children): + return (0.. 0 else { return false } + return _guaranteesForwardProgressImpl(list, position: &position) + case .limitCaptureNesting, .ignoreCapturesInTypedOutput: + return _guaranteesForwardProgressImpl(list, position: &position) + default: return false + } + } + + func guaranteesForwardProgress(_ list: ArraySlice) -> Bool { + var pos = list.startIndex + return _guaranteesForwardProgressImpl(list, position: &pos) + } + mutating func emitQuantification( _ amount: AST.Quantification.Amount, _ kind: DSLTree.QuantificationKind, @@ -526,8 +579,8 @@ fileprivate extension Compiler.ByteCodeGen { let startPosition: PositionRegister? // FIXME: forward progress check?! 
let emitPositionChecking = - (!optimizationsEnabled || (list.first?.guaranteesForwardProgress != true)) && - maxExtraTrips == nil + (!optimizationsEnabled || !guaranteesForwardProgress(list)) + && maxExtraTrips == nil if emitPositionChecking { startPosition = builder.makePositionRegister() diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 25e6e4cf6..b34e0e5f7 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -12,21 +12,30 @@ internal import _RegexParser class Compiler { - let tree: DSLTree + let tree: DSLList // TODO: Or are these stored on the tree? var options = MatchingOptions() private var compileOptions: _CompileOptions = .default init(ast: AST) { - self.tree = ast.dslTree + self.tree = DSLList(tree: ast.dslTree) } init(tree: DSLTree) { - self.tree = tree + self.tree = DSLList(tree: tree) + } + + init(list: DSLList) { + self.tree = list } init(tree: DSLTree, compileOptions: _CompileOptions) { + self.tree = DSLList(tree: tree) + self.compileOptions = compileOptions + } + + init(tree: DSLList, compileOptions: _CompileOptions) { self.tree = tree self.compileOptions = compileOptions } @@ -42,18 +51,20 @@ class Compiler { compileOptions: compileOptions, captureList: tree.captureList) - return try codegen.emitRoot(tree.root) + fatalError() +// return try codegen.emitRoot(tree.root) } __consuming func emitViaList() throws -> MEProgram { // TODO: Handle global options - var dslList = DSLList(tree: tree) +// var dslList = DSLList(tree: tree) var codegen = ByteCodeGen( options: options, compileOptions: compileOptions, captureList: tree.captureList) - return try codegen.emitRoot(&dslList) + var tree = tree + return try codegen.emitRoot(&tree) } } @@ -105,20 +116,22 @@ func _compileRegex( _ syntax: SyntaxOptions = .traditional, _ semanticLevel: RegexSemanticLevel? 
= nil ) throws -> MEProgram { - let ast = try parse(regex, syntax) - let dsl: DSLTree + var ast = try parse(regex, syntax) + let dsl: DSLList switch semanticLevel?.base { case .graphemeCluster: let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)]) - dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root)) + ast.root = AST.Node.group(AST.Group(.init(faking: .changeMatchingOptions(sequence)), ast.root, .fake)) + dsl = DSLList(ast: ast) case .unicodeScalar: let sequence = AST.MatchingOptionSequence(adding: [.init(.unicodeScalarSemantics, location: .fake)]) - dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root)) + ast.root = AST.Node.group(AST.Group(.init(faking: .changeMatchingOptions(sequence)), ast.root, .fake)) + dsl = DSLList(ast: ast) case .none: - dsl = ast.dslTree + dsl = DSLList(ast: ast) } - let program = try Compiler(tree: dsl).emit() + let program = try Compiler(list: dsl).emit() return program } diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index e1dc3fa23..d9cdbb04e 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -36,7 +36,8 @@ extension Regex { @available(SwiftStdlib 6.0, *) public var _literalPattern: String? { var gen = LiteralPrinter(options: MatchingOptions()) - gen.outputNode(self.program.tree.root) + var list = self.program.list.nodes[...] + try? 
gen.outputList(&list) return gen.canonicalLiteralString } } @@ -83,6 +84,159 @@ fileprivate struct LiteralPrinter { mutating func saveInconvertible(_ node: DSLTree.Node) { segments.append(.inconvertible(node)) } + + mutating func inconvertible(_ node: DSLTree.Node) throws { + segments.append(.inconvertible(node)) + throw Incovertible.error + } +} + +extension LiteralPrinter { + enum Incovertible: Error { + case error + } + + mutating func outputList(_ list: inout ArraySlice) throws { + guard let node = list.popFirst() else { + return + } + + switch node { + case let .orderedChoice(children): + try outputAlternation(&list, count: children.count) + case let .concatenation(children): + try outputConcatenation(&list, count: children.count) + + case let .capture(name, nil, _, nil): + options.beginScope() + defer { options.endScope() } + try outputCapture(&list, name: name) + case .capture: + // Captures that use a reference or a transform are unsupported + try inconvertible(node) + return + + case let .nonCapturingGroup(kind, _): + guard let kindPattern = kind._patternString else { + try inconvertible(node) + return + } + options.beginScope() + defer { options.endScope() } + + output(kindPattern) + if case .changeMatchingOptions(let optionSequence) = kind.ast { + options.apply(optionSequence) + } + try outputList(&list) + output(")") + + case .ignoreCapturesInTypedOutput(_), + .limitCaptureNesting(_): + try outputList(&list) + case let .quantification(amount, kind, _): + try outputQuantification(&list, amount: amount, kind: kind) + case let .customCharacterClass(charClass): + outputCustomCharacterClass(charClass) + case let .atom(atom): + outputAtom(atom) + case let .quotedLiteral(literal): + output(prepareQuotedLiteral(literal)) + + case .trivia(_): + // TODO: Include trivia? 
+ return + case .empty: + return + + case .conditional, .absentFunction, .consumer, .matcher, .characterPredicate: + saveInconvertible(node) + } + } + + mutating func outputAlternation(_ list: inout ArraySlice, count: Int) throws { + for i in 0.., count: Int) throws { + for _ in 0.., name: String?) throws { + if let name { + output("(?<\(name)>") + } else { + output("(") + } + try outputList(&list) + output(")") + } + + func requiresGrouping(_ list: ArraySlice) -> Bool { + guard let node = list.first else { return false } // malformed? + switch node { + case .concatenation(let children): + switch children.count { + case 0: + return false + case 1: + return requiresGrouping(list.dropFirst()) + default: + return true + } + + case .quotedLiteral(let literal): + return prepareQuotedLiteral(literal).count > 1 + + default: + return false + } + } + + mutating func outputQuantification( + _ list: inout ArraySlice, + amount: DSLTree._AST.QuantificationAmount, + kind: DSLTree.QuantificationKind + ) throws { + // RegexBuilder regexes can have children that need to be wrapped in a non-capturing group before the quantifier is printed + if requiresGrouping(list) { + output("(?:") + try outputList(&list) + output(")") + } else { + try outputList(&list) + } + + switch amount.ast { + case .zeroOrMore: + output("*") + case .oneOrMore: + output("+") + case .zeroOrOne: + output("?") + case let .exactly(n): + output("{\(n.value!)}") + case let .nOrMore(n): + output("{\(n.value!),}") + case let .upToN(n): + output("{,\(n.value!)}") + case let .range(low, high): + output("{\(low.value!),\(high.value!)}") + #if RESILIENT_LIBRARIES + @unknown default: + fatalError() + #endif + } + + outputQuantificationKind(kind) + } } extension LiteralPrinter { @@ -455,7 +609,15 @@ extension String { } func escapingConfusableCharacters() -> String { - lazy.map(\.escapingConfusable).joined() + reduce(into: "") { result, ch in + for scalar in ch.unicodeScalars { + if scalar.isPrintableASCII { + result.append(Character(scalar)) + } else { + result.append(scalar.escapedString) + 
} + } + } } } diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index fbb189559..2c376fd6d 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -17,6 +17,139 @@ extension AST { } } +extension AST.Node { + func convert(into list: inout [DSLTree.Node]) throws { + switch self { + case .alternation(let alternation): + list.append(.orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: alternation.children.count))) + for child in alternation.children { + try child.convert(into: &list) + } + case .concatenation(let concatenation): + let coalesced = self.coalescedChildren + list.append(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: coalesced.count))) + for child in coalesced { + try child.convert(into: &list) + } + case .group(let group): + let child = group.child + switch group.kind.value { + case .capture: + list.append(.capture(TEMP_FAKE_NODE)) + try child.convert(into: &list) + case .namedCapture(let name): + list.append(.capture(name: name.value, TEMP_FAKE_NODE)) + try child.convert(into: &list) + case .balancedCapture: + throw Unsupported("TODO: balanced captures") + default: + list.append(.nonCapturingGroup(.init(ast: group.kind.value), TEMP_FAKE_NODE)) + try child.convert(into: &list) + } + case .conditional(let conditional): + list.append(.conditional(.init(ast: conditional.condition.kind), TEMP_FAKE_NODE, TEMP_FAKE_NODE)) + try conditional.trueBranch.convert(into: &list) + try conditional.falseBranch.convert(into: &list) + case .quantification(let quant): + list.append( + .quantification(.init(ast: quant.amount.value), .syntax(.init(ast: quant.kind.value)), TEMP_FAKE_NODE)) + try quant.child.convert(into: &list) + case .quote(let node): + list.append(.quotedLiteral(node.literal)) + case .trivia(let node): + list.append(.trivia(node.contents)) + case .interpolation(_): + throw Unsupported("TODO: interpolation") + 
case .atom(let atom): + switch atom.kind { + case .scalarSequence(let seq): + // The DSL doesn't have an equivalent node for scalar sequences. Emit + // them as a single quoted literal built from the scalar values. + // list.append(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: seq.scalarValues.count))) + list.append(.quotedLiteral(String(seq.scalarValues))) + default: + list.append(.atom(atom.dslTreeAtom)) + } + case .customCharacterClass(let ccc): + list.append(.customCharacterClass(ccc.dslTreeClass)) + case .absentFunction(let abs): + // TODO: What should this map to? + list.append(.absentFunction(.init(ast: abs))) + case .empty(_): + list.append(.empty) + } + } + + var coalescedChildren: [AST.Node] { + // Before converting a concatenation in a tree to list form, we need to + // flatten out any nested concatenations, and coalesce any adjacent + // characters and scalars, forming quoted literals of their contents, + // over which we can perform grapheme breaking. + + func flatten(_ node: AST.Node) -> [AST.Node] { + switch node { + case .concatenation(let concat): + return concat.children.flatMap(flatten) + default: + return [node] + } + } + + func appendAtom(_ atom: AST.Atom, to str: inout String) -> Bool { + switch atom.kind { + case .char(let c): + str.append(c) + return true + case .scalar(let s): + str.append(Character(s.value)) + return true + case .escaped(let c): + guard let value = c.scalarValue else { return false } + str.append(Character(value)) + return true + case .scalarSequence(let seq): + str.append(contentsOf: seq.scalarValues.lazy.map(Character.init)) + return true + + default: + return false + } + } + + switch self { + case .alternation(let v): return v.children + case .concatenation(let v): + let children = v.children + .flatMap(flatten) + .coalescing(with: "", into: { AST.Node.quote(.init($0, .fake)) }) { str, node in + switch node { + case .atom(let a): + return appendAtom(a, to: &str) + case .quote(let q): + str += q.literal + return true + case .trivia: + 
// Trivia can be completely ignored if we've already coalesced + // something. + return !str.isEmpty + default: + return false + } + } + return children + + case .group(let group): + return [group.child] + case .conditional(let conditional): + return [conditional.trueBranch, conditional.falseBranch] + case .quantification(let quant): + return [quant.child] + case .quote, .trivia, .interpolation, .atom, .customCharacterClass, .absentFunction, .empty: + return [] + } + } +} + extension AST.Node { /// Converts an AST node to a `convertedRegexLiteral` node. var dslTreeNode: DSLTree.Node { diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index ae8193804..a4e405f8c 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -265,7 +265,7 @@ extension Regex { /// - Parameter name: The name to look for among the regular expression's /// capture groups. Capture group names are case sensitive. public func contains(captureNamed name: String) -> Bool { - program.tree.captureList.captures.contains(where: { + program.list.captureList.captures.contains(where: { $0.name == name }) } @@ -284,7 +284,7 @@ extension Regex where Output == AnyRegexOutput { /// - Parameter regex: A regular expression to convert to use a dynamic /// capture list. 
public init(_ regex: Regex) { - self.init(node: regex.root) + self.init(list: regex.list) } } @@ -331,7 +331,7 @@ extension Regex { _ regex: Regex, as outputType: Output.Type = Output.self ) { - self.init(node: regex.root) + self.init(list: regex.list) guard _verifyType().0 else { return nil diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 11445531c..425f64549 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -11,6 +11,8 @@ internal import _RegexParser +let TEMP_FAKE_NODE = DSLTree.Node.empty + /// A type that represents a regular expression. /// /// You can use types that conform to `RegexComponent` as parameters to string @@ -91,7 +93,10 @@ public struct Regex: RegexComponent { let program: Program var hasCapture: Bool { - program.tree.hasCapture + program.list.hasCapture + } + var hasChildren: Bool { + program.list.hasChildren } init(ast: AST) { @@ -148,7 +153,7 @@ extension Regex { /// FIXME: If Regex is the unit of composition, then it should be a Node instead, /// and we should have a separate type that handled both global options and, /// likely, compilation/caching. - let tree: DSLTree + var list: DSLList /// OptionSet of compiler options for testing purposes fileprivate var compileOptions: _CompileOptions = .default @@ -178,7 +183,7 @@ extension Regex { } // Compile the DSLTree into a lowered program and store it atomically. - let compiledProgram = try! Compiler(tree: tree, compileOptions: compileOptions).emit() + let compiledProgram = try! 
Compiler(tree: list, compileOptions: compileOptions).emit() let storedNewProgram = _stdlib_atomicInitializeARCRef( object: _loweredProgramStoragePtr, desired: ProgramBox(compiledProgram)) @@ -191,11 +196,15 @@ extension Regex { } init(ast: AST) { - self.tree = ast.dslTree + self.list = DSLList(ast: ast) } init(tree: DSLTree) { - self.tree = tree + self.list = DSLList(tree: tree) + } + + init(list: DSLList) { + self.list = list } } @@ -214,12 +223,77 @@ extension Regex { @available(SwiftStdlib 5.7, *) extension Regex { - var root: DSLTree.Node { - program.tree.root + var list: DSLList { + program.list } - + init(node: DSLTree.Node) { - self.program = Program(tree: .init(node)) + self.program = Program(list: .init(node)) + } + + init(list: DSLList) { + self.program = Program(list: list) + } + + func appending(_ node: DSLTree.Node) -> Regex { + var list = program.list + list.append(node) + return Regex(list: list) + } + + func appending(contentsOf node: [DSLTree.Node]) -> Regex { + var list = program.list + list.append(contentsOf: node) + return Regex(list: list) + } + + func concatenating(_ other: DSLList) -> Regex { + // TODO: Quick check to see if these copies are necessary? + var list = program.list + var other = other + list.coalesce(withFirstAtomIn: &other) + + // Sometimes coalescing consumes all of `other` + guard !other.nodes.isEmpty else { + return Regex(list: list) + } + + // Use an existing concatenation if it's already the root; + // otherwise, embed self and other in a new concatenation root. 
+ switch list.nodes[0] { + case .concatenation(let children): + list.nodes[0] = .concatenation(Array(repeating: TEMP_FAKE_NODE, count: children.count + 1)) + list.nodes.append(contentsOf: other.nodes) + default: + list.nodes.insert(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: 2)), at: 0) + list.nodes.append(contentsOf: other.nodes) + } + return Regex(list: list) + } + + func alternating(with other: some Collection) -> Regex { + var nodes = program.list.nodes + switch nodes[0] { + case .orderedChoice(let children): + nodes[0] = .orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: children.count + 1)) + nodes.append(contentsOf: other) + default: + nodes.insert(.orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: 2)), at: 0) + nodes.append(contentsOf: other) + } + return Regex(list: DSLList(nodes)) + } + + func prepending(_ node: DSLTree.Node) -> Regex { + var list = program.list + list.prepend(node) + return Regex(list: list) + } + + func prepending(contentsOf node: some Collection) -> Regex { + var list = program.list + list.prepend(contentsOf: node) + return Regex(list: list) } } @@ -242,7 +316,7 @@ extension Regex { return true case .recompile: let _ = try Compiler( - tree: program.tree, + tree: program.list, compileOptions: program.compileOptions).emit() return true } diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift index f8d09a953..8e53c87d1 100644 --- a/Sources/_StringProcessing/Regex/DSLList.swift +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -9,9 +9,21 @@ // //===----------------------------------------------------------------------===// +internal import _RegexParser + struct DSLList { var nodes: [DSLTree.Node] + // experimental + var hasCapture: Bool = false + var hasChildren: Bool { + (nodes.first?.directChildren ?? 
0) > 0 + } + + var captureList: CaptureList { + .Builder.build(self) + } + init(_ initial: DSLTree.Node) { self.nodes = [initial] } @@ -23,6 +35,33 @@ struct DSLList { init(tree: DSLTree) { self.nodes = Array(tree.depthFirst) } + + init(ast: AST) { + self.nodes = [.limitCaptureNesting(TEMP_FAKE_NODE)] + try! ast.root.convert(into: &nodes) + } + + var first: DSLTree.Node { + nodes.first ?? .empty + } +} + +extension DSLList { + mutating func append(_ node: DSLTree.Node) { + nodes.append(node) + } + + mutating func append(contentsOf other: some Sequence) { + nodes.append(contentsOf: other) + } + + mutating func prepend(_ node: DSLTree.Node) { + nodes.insert(node, at: 0) + } + + mutating func prepend(contentsOf other: some Collection) { + nodes.insert(contentsOf: other, at: 0) + } } extension DSLTree.Node { @@ -125,4 +164,58 @@ extension DSLList { break } } + + func indexOfCoalescableAtom(startingAt position: Int, findLast: Bool = false) -> Int? { + switch nodes[position] { + case .concatenation(let children): + var position = position + 1 + if findLast { + for _ in 0..<(children.count - 1) { + skipNode(&position) + position += 1 + } + } + return indexOfCoalescableAtom(startingAt: position, findLast: findLast) + case .ignoreCapturesInTypedOutput, .limitCaptureNesting: + return indexOfCoalescableAtom(startingAt: position + 1, findLast: findLast) + case .atom(let atom): + if atom.literalCharacterValue != nil { + return position + } + case .quotedLiteral: + return position + default: + break + } + return nil + } + + mutating func coalesce(withFirstAtomIn other: inout DSLList) { + // Find the last coalescable node in the LHS and the first in the RHS + guard let prefixIndex = indexOfCoalescableAtom(startingAt: 0, findLast: true), + let postfixIndex = other.indexOfCoalescableAtom(startingAt: 0), + let prefixValue = nodes[prefixIndex].literalStringValue, + let postfixValue = other.nodes[postfixIndex].literalStringValue + else { return } + + // Replace the prefix node with a 
coalesced version of the two + nodes[prefixIndex] = .quotedLiteral(prefixValue + postfixValue) + + // Remove the postfix node and fix up any parent concatenations + other.nodes.remove(at: postfixIndex) + var i = postfixIndex - 1 + Loop: + while i >= 0 { + switch other.nodes[i] { + case .concatenation(let children): + other.nodes[i] = .concatenation(.init(repeating: .empty, count: children.count - 1)) + break Loop + case .limitCaptureNesting, .ignoreCapturesInTypedOutput: + other.nodes.remove(at: i) + i -= 1 + default: + break Loop + } + } + } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 55d8902fa..12f559729 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -514,6 +514,16 @@ extension DSLTree.Atom { } } +extension DSLTree.Node { + var literalStringValue: String? { + switch self { + case .atom(let a): return a.literalCharacterValue.map(String.init) + case .quotedLiteral(let s): return s + default: return nil + } + } +} + extension DSLTree { struct Options { // TBD @@ -769,6 +779,91 @@ extension CaptureList.Builder { builder.addCaptures(of: dsl.root, optionalNesting: .init(canNest: true), visibleInTypedOutput: true) return builder.captures } + + mutating func addCaptures( + in list: inout ArraySlice, optionalNesting nesting: OptionalNesting, visibleInTypedOutput: Bool + ) { + guard let node = list.popFirst() else { return } + switch node { + case let .orderedChoice(children): + for _ in 0.. CaptureList { + var builder = Self() + builder.captures.append( + .init(type: dsl.first.wholeMatchType, optionalDepth: 0, visibleInTypedOutput: true, .fake)) + var nodes = dsl.nodes[...] 
+ builder.addCaptures(in: &nodes, optionalNesting: .init(canNest: true), visibleInTypedOutput: true) + return builder.captures + } } extension DSLTree.Node { @@ -806,6 +901,23 @@ extension DSLTree.Node { } } +extension DSLList { + + /// Returns the output-defining node, peering through any output-forwarding + /// nodes. + var outputDefiningNode: DSLTree.Node? { + nodes.first(where: { !$0.isOutputForwarding }) + } + + /// Returns the type of the whole match, i.e. `.0` element type of the output. + var wholeMatchType: Any.Type { + if case .matcher(let type, _) = outputDefiningNode { + return type + } + return Substring.self + } +} + extension DSLTree.Node { /// Implementation for `canOnlyMatchAtStart`, which maintains the option /// state. diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 6911af911..5b3121831 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -294,7 +294,9 @@ extension RegexComponent { let sequence = shouldAdd ? AST.MatchingOptionSequence(adding: [.init(option, location: .fake)]) : AST.MatchingOptionSequence(removing: [.init(option, location: .fake)]) - return Regex(node: .nonCapturingGroup( - .init(ast: .changeMatchingOptions(sequence)), regex.root)) + + var list = regex.program.list + list.nodes.insert(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), TEMP_FAKE_NODE), at: 0) + return Regex(list: list) } } diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 0c224e159..5f8dc83a2 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -26,9 +26,9 @@ public struct _RegexFactory { _ child: some RegexComponent ) -> Regex { // Don't wrap `child` again if it's a leaf node. - child.regex.root.hasChildNodes - ? 
.init(node: .ignoreCapturesInTypedOutput(child.regex.root)) - : .init(node: child.regex.root) + child.regex.list.hasChildren + ? child.regex.prepending(.ignoreCapturesInTypedOutput(TEMP_FAKE_NODE)) as Regex + : .init(list: child.regex.program.list) } @available(SwiftStdlib 5.7, *) @@ -36,7 +36,7 @@ public struct _RegexFactory { _ left: some RegexComponent, _ right: some RegexComponent ) -> Regex { - .init(node: left.regex.root.appending(right.regex.root)) + left.regex.concatenating(right.regex.program.list) } @available(SwiftStdlib 5.7, *) @@ -44,7 +44,7 @@ public struct _RegexFactory { _ left: some RegexComponent, _ right: some RegexComponent ) -> Regex { - .init(node: left.regex.root.appendingAlternationCase(right.regex.root)) + left.regex.alternating(with: right.regex.program.list.nodes) } @_spi(RegexBuilder) @@ -107,7 +107,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return .init(node: .quantification(.zeroOrOne, kind, component.regex.root)) + return component.regex.prepending(.quantification(.zeroOrOne, kind, TEMP_FAKE_NODE)) } @available(SwiftStdlib 5.7, *) @@ -116,7 +116,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return .init(node: .quantification(.zeroOrMore, kind, component.regex.root)) + return component.regex.prepending(.quantification(.zeroOrMore, kind, TEMP_FAKE_NODE)) } @available(SwiftStdlib 5.7, *) @@ -125,7 +125,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default - return .init(node: .quantification(.oneOrMore, kind, component.regex.root)) + return component.regex.prepending(.quantification(.oneOrMore, kind, TEMP_FAKE_NODE)) } @available(SwiftStdlib 5.7, *) @@ -133,7 +133,7 @@ public struct _RegexFactory { _ count: Int, _ component: some RegexComponent ) -> Regex { - .init(node: .quantification(.exactly(count), .default, component.regex.root)) + component.regex.prepending(.quantification(.exactly(count), .default, TEMP_FAKE_NODE)) } @available(SwiftStdlib 5.7, *) @@ -142,14 +142,14 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior?, _ component: some RegexComponent ) -> Regex { - .init(node: .repeating(range, behavior, component.regex.root)) + component.regex.prepending(.repeating(range, behavior, TEMP_FAKE_NODE)) } @available(SwiftStdlib 5.7, *) public func atomicNonCapturing( _ component: some RegexComponent ) -> Regex { - .init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + component.regex.prepending(.nonCapturingGroup(.atomicNonCapturing, TEMP_FAKE_NODE)) } @_spi(RegexBuilder) @@ -157,7 +157,7 @@ public struct _RegexFactory { public func lookaheadNonCapturing( _ component: some RegexComponent ) -> Regex { - .init(node: .nonCapturingGroup(.lookahead, component.regex.root)) + component.regex.prepending(.nonCapturingGroup(.lookahead, TEMP_FAKE_NODE)) } @_spi(RegexBuilder) @@ -165,21 +165,21 @@ public struct _RegexFactory { public func negativeLookaheadNonCapturing( _ component: some RegexComponent ) -> Regex { - .init(node: .nonCapturingGroup(.negativeLookahead, component.regex.root)) + component.regex.prepending(.nonCapturingGroup(.negativeLookahead, TEMP_FAKE_NODE)) } @available(SwiftStdlib 5.7, *) public func orderedChoice( _ component: some RegexComponent ) -> Regex { - .init(node: .orderedChoice([component.regex.root])) + component.regex.prepending(.orderedChoice([TEMP_FAKE_NODE])) } @available(SwiftStdlib 5.7, *) public func capture( - _ r: some RegexComponent + _ 
component: some RegexComponent ) -> Regex { - .init(node: .capture(r.regex.root)) + component.regex.prepending(.capture(TEMP_FAKE_NODE)) } @available(SwiftStdlib 5.7, *) @@ -187,10 +187,7 @@ public struct _RegexFactory { _ component: some RegexComponent, _ reference: Int ) -> Regex { - .init(node: .capture( - reference: ReferenceID(reference), - component.regex.root - )) + component.regex.prepending(.capture(reference: ReferenceID(reference), TEMP_FAKE_NODE)) } @available(SwiftStdlib 5.7, *) @@ -199,11 +196,12 @@ public struct _RegexFactory { _ reference: Int? = nil, _ transform: @escaping (W) throws -> NewCapture ) -> Regex { - .init(node: .capture( - reference: reference.map { ReferenceID($0) }, - component.regex.root, - CaptureTransform(transform) - )) + component.regex.prepending( + .capture( + reference: reference.map { ReferenceID($0) }, + TEMP_FAKE_NODE, + CaptureTransform(transform) + )) } @available(SwiftStdlib 5.7, *) @@ -212,10 +210,11 @@ public struct _RegexFactory { _ reference: Int? = nil, _ transform: @escaping (W) throws -> NewCapture? ) -> Regex { - .init(node: .capture( - reference: reference.map { ReferenceID($0) }, - component.regex.root, - CaptureTransform(transform) - )) + component.regex.prepending( + .capture( + reference: reference.map { ReferenceID($0) }, + TEMP_FAKE_NODE, + CaptureTransform(transform) + )) } } diff --git a/Sources/_StringProcessing/Utility/TypeVerification.swift b/Sources/_StringProcessing/Utility/TypeVerification.swift index 11796d1e3..566127220 100644 --- a/Sources/_StringProcessing/Utility/TypeVerification.swift +++ b/Sources/_StringProcessing/Utility/TypeVerification.swift @@ -21,7 +21,7 @@ extension Regex { var tupleElements: [Any.Type] = [] var labels = "" - for capture in program.tree.captureList.captures { + for capture in program.list.captureList.captures { var captureType = capture.type var i = capture.optionalDepth @@ -41,7 +41,7 @@ extension Regex { // If we have no captures, then our Regex must be Regex. 
if tupleElements.count == 1 { - let wholeMatchType = program.tree.root.wholeMatchType + let wholeMatchType = program.list.wholeMatchType return (Output.self == wholeMatchType, wholeMatchType) } diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 63ee266ec..34cc20ad7 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -157,7 +157,7 @@ func captureTest( } // Ensure DSLTree preserves literal captures - var dslCapList = ast.dslTree.captureList + var dslCapList = DSLList(ast: ast).captureList // Peel off the whole match element. dslCapList.captures.removeFirst() guard dslCapList == capList else { diff --git a/Tests/RegexTests/DSLListTests.swift b/Tests/RegexTests/DSLListTests.swift index d8acec737..3b99b40f3 100644 --- a/Tests/RegexTests/DSLListTests.swift +++ b/Tests/RegexTests/DSLListTests.swift @@ -21,17 +21,15 @@ struct DSLListTests { (#/a(?:b+)c*/#, 8), // literal, concat, a, noncap grp, quant, b, quant, c ]) func convertedNodeCount(regex: Regex, nodeCount: Int) { - let dslList = DSLList(tree: regex.program.tree) + let dslList = regex.program.list #expect(dslList.nodes.count == nodeCount) } @Test(arguments: [#/a|b/#, #/a+b?c/#, #/abc/#, #/a(?:b+)c*/#, #/;[\r\n]/#, #/(?=(?:[1-9]|(?:a|b)))/#]) func compilationComparison(regex: Regex) throws { - let listCompiler = Compiler(tree: regex.program.tree) + let listCompiler = Compiler(list: regex.program.list) let listProgram = try listCompiler.emitViaList() - let treeCompiler = Compiler(tree: regex.program.tree) - let treeProgram = try treeCompiler.emit() - #expect(treeProgram.instructions == listProgram.instructions) +// #expect(treeProgram.instructions == listProgram.instructions) } } diff --git a/Tests/RegexTests/LiteralPrinterTests.swift b/Tests/RegexTests/LiteralPrinterTests.swift index dd15d8cd1..69f273fd5 100644 --- a/Tests/RegexTests/LiteralPrinterTests.swift +++ b/Tests/RegexTests/LiteralPrinterTests.swift @@ -41,6 +41,9 @@ extension 
RegexTests { } func testUnicodeEscapes() throws { + let regex0 = #/[a]\u0301/# + _literalTest(regex0, expected: #"[a]\u0301"#) + let regex = #/\r\n\t cafe\u{301} \u{1D11E}/# _literalTest(regex, expected: #"\r\n\t cafe\u0301 \U0001D11E"#) } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index a87112b9e..d67041c55 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -26,30 +26,19 @@ struct MatchError: Error { @available(SwiftStdlib 6.0, *) func _roundTripLiteral( _ regexStr: String, - syntax: SyntaxOptions + syntax: SyntaxOptions, + file: StaticString = #file, + line: UInt = #line ) throws -> Regex? { guard let pattern = try Regex(regexStr, syntax: syntax)._literalPattern else { return nil } let remadeRegex = try Regex(pattern) - XCTAssertEqual(pattern, remadeRegex._literalPattern) + XCTAssertEqual(pattern, remadeRegex._literalPattern, file: file, line: line) return remadeRegex } -// Validate that the given regex compiles to the same instructions whether -// as a tree (original) or a list (new). We need to compile with optimizations -// disabled, since new optimizations are primarily landing in list compilation. 
-func _validateListCompilation( - _ regex: Regex -) throws -> Bool { - let treeCompiler = Compiler(tree: regex.program.tree, compileOptions: .disableOptimizations) - let treeProgram = try treeCompiler.emitViaTree() - let listCompiler = Compiler(tree: regex.program.tree, compileOptions: .disableOptimizations) - let listProgram = try listCompiler.emitViaList() - return treeProgram.instructions == listProgram.instructions -} - func _firstMatch( _ regexStr: String, input: String, @@ -62,12 +51,6 @@ func _firstMatch( var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) let result = try regex.firstMatch(in: input) - if try !_validateListCompilation(regex) { - XCTFail( - "List compilation failed for '\(regexStr)'", - file: file, line: line) - } - func validateSubstring(_ substringInput: Substring) throws { // Sometimes the characters we add to a substring merge with existing // string members. This messes up cross-validation, so skip the test. @@ -110,14 +93,14 @@ func _firstMatch( } if #available(SwiftStdlib 6.0, *) { - let roundTripRegex = try? _roundTripLiteral(regexStr, syntax: syntax) + let roundTripRegex = try? _roundTripLiteral(regexStr, syntax: syntax, file: file, line: line) let roundTripResult = try? roundTripRegex? 
.matchingSemantics(semanticLevel) .firstMatch(in: input)?[0] .substring switch (result?[0].substring, roundTripResult) { case let (match?, rtMatch?): - XCTAssertEqual(match, rtMatch) + XCTAssertEqual(match, rtMatch, file: file, line: line) case (nil, nil): break // okay case let (match?, _): diff --git a/Tests/RegexTests/OptimizationTests.swift b/Tests/RegexTests/OptimizationTests.swift index 0fff0ebb1..a60d9bf5f 100644 --- a/Tests/RegexTests/OptimizationTests.swift +++ b/Tests/RegexTests/OptimizationTests.swift @@ -17,7 +17,7 @@ import Testing @available(macOS 9999, *) @Test(arguments: [#/a/#, #/a+/#, #/(?:a+)/#, #/(?:a)+/#, #/(?m)a+/#, #/ab?c/#, #/(?:a+)+$/#, #/(?:(?:a+b)+b)/#]) func requiredFirstAtom(pattern: Regex) throws { - let list = DSLList(tree: pattern.program.tree) + let list = pattern.program.list let atom = list.requiredFirstAtom(allowOptionsChanges: true) #expect(atom?.literalCharacterValue == "a", "Missing first character atom in '\(pattern._literalPattern!)'") } @@ -25,7 +25,7 @@ import Testing @available(macOS 9999, *) @Test(arguments: [#/a?/#, #/(?:a|b)/#, #/[a]/#, #/a?bc/#]) func noRequiredFirstAtom(pattern: Regex) throws { - let list = DSLList(tree: pattern.program.tree) + let list = pattern.program.list let atom = list.requiredFirstAtom(allowOptionsChanges: true) #expect(atom == nil, "Unexpected required first atom in '\(pattern._literalPattern!)'") } @@ -33,7 +33,7 @@ import Testing @available(macOS 9999, *) @Test(arguments: [#/a+b/#, #/a*b/#, #/\w+\s/#, #/(?:a+b|b+a)/#, #/\d+a/#, #/a+A/#]) func autoPossessify(pattern: Regex) throws { - var list = DSLList(tree: pattern.program.tree) + var list = pattern.program.list list.autoPossessify() for node in list.nodes { switch node { @@ -53,7 +53,7 @@ import Testing #/(?:(?:ab)+b)/#, // single atom quantifications only ]) func noAutoPossessify(pattern: Regex) throws { - var list = DSLList(tree: pattern.program.tree) + var list = pattern.program.list list.autoPossessify() for node in list.nodes { 
switch node {