diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2813a1676..eb8e9f86e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,7 +79,7 @@ jobs: rust: stable-x86_64-gnu steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -137,30 +137,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: toolchain: 1.65.0 - # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it - # turned out that on aarch64, it was using something that wasn't stabilized - # until Rust 1.61[1]. (This was an oversight on my part. I had previously - # thought everything I needed was on Rust 1.60.) To resolve that, I just - # bumped memchr's MSRV to 1.61. Since it was so soon after the memchr 2.6 - # release, I treated this as a bugfix. - # - # But the regex crate's MSRV is at Rust 1.60, and it now depends on at - # least memchr 2.6 (to make use of its `alloc` feature). So we can't set - # a lower minimal version. And I can't just bump the MSRV in a patch - # release as a bug fix because regex 1.9 was released quite some time ago. - # I could just release regex 1.10 and bump the MSRV there, but eh, I don't - # want to put out another minor version release just for this. - # - # So... pin memchr to 2.6.2, which at least works on x86-64 on Rust 1.60. 
- # - # [1]: https://github.com/BurntSushi/memchr/issues/136 - - name: Pin memchr to 2.6.2 - run: cargo update -p memchr --precise 2.6.2 - name: Basic build run: cargo build --verbose - name: Build docs @@ -177,7 +158,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -190,7 +171,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -203,7 +184,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -216,7 +197,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -231,7 +212,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -248,7 +229,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 420e08f74..586191d75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,37 @@ +1.10.5 (2024-06-09) +=================== +This is a new patch release with some minor fixes. + +Bug fixes: + +* [BUG #1203](https://github.com/rust-lang/regex/pull/1203): +Escape invalid UTF-8 when in the `Debug` impl of `regex::bytes::Match`. + + +1.10.4 (2024-03-22) +=================== +This is a new patch release with some minor fixes. 
+ +* [BUG #1169](https://github.com/rust-lang/regex/issues/1169): +Fixes a bug with compiling a reverse NFA automaton in `regex-automata`. +* [BUG #1178](https://github.com/rust-lang/regex/pull/1178): +Clarifies that when `Cow::Borrowed` is returned from replace APIs, it is +equivalent to the input. + + +1.10.3 (2024-01-21) +=================== +This is a new patch release that fixes the feature configuration of optional +dependencies, and fixes an unsound use of bounds check elision. + +Bug fixes: + +* [BUG #1147](https://github.com/rust-lang/regex/issues/1147): +Set `default-features=false` for the `memchr` and `aho-corasick` dependencies. +* [BUG #1154](https://github.com/rust-lang/regex/pull/1154): +Fix unsound bounds check elision. + + 1.10.2 (2023-10-16) =================== This is a new patch release that fixes a search regression where incorrect diff --git a/Cargo.toml b/Cargo.toml index 3ba14c904..4fe3be20a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.10.2" #:version +version = "1.10.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" @@ -165,16 +165,18 @@ pattern = [] [dependencies.aho-corasick] version = "1.0.0" optional = true +default-features = false # For skipping along search text quickly when a leading byte is known. [dependencies.memchr] version = "2.6.0" optional = true +default-features = false # For the actual regex engines. 
[dependencies.regex-automata] path = "regex-automata" -version = "0.4.3" +version = "0.4.4" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 new file mode 100644 index 000000000..e236ae735 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 differ diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 3cb3d7c8e..97bfacfec 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.3" #:version +version = "0.4.7" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index fd96bc878..ed37d3b84 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -9,7 +9,7 @@ This module also contains a [`dense::Builder`](Builder) and a #[cfg(feature = "dfa-build")] use core::cmp; -use core::{convert::TryFrom, fmt, iter, mem::size_of, slice}; +use core::{fmt, iter, mem::size_of, slice}; #[cfg(feature = "dfa-build")] use alloc::{ @@ -2340,8 +2340,8 @@ impl<'a> DFA<&'a [u32]> { // table, match states and accelerators below. If any validation fails, // then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.tt)?; + dfa.tt.validate(&dfa)?; + dfa.st.validate(&dfa)?; dfa.ms.validate(&dfa)?; dfa.accels.validate()?; // N.B. 
dfa.special doesn't have a way to do unchecked deserialization, @@ -2498,7 +2498,7 @@ impl OwnedDFA { self.tt.set(from, byte, to); } - /// An an empty state (a state where all transitions lead to a dead state) + /// An empty state (a state where all transitions lead to a dead state) /// and return its identifier. The identifier returned is guaranteed to /// not point to any other existing state. /// @@ -3593,7 +3593,8 @@ impl> TransitionTable { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let sp = &dfa.special; for state in self.states() { // We check that the ID itself is well formed. That is, if it's // a special state then it must actually be a quit, dead, accel, @@ -3611,6 +3612,13 @@ impl> TransitionTable { wasn't actually special", )); } + if sp.is_match_state(state.id()) + && dfa.match_len(state.id()) == 0 + { + return Err(DeserializeError::generic( + "found match state with zero pattern IDs", + )); + } } for (_, to) in state.transitions() { if !self.is_valid(to) { @@ -4127,10 +4135,8 @@ impl> StartTable { /// it against the given transition table (which must be for the same DFA). /// /// That is, every state ID can be used to correctly index a state. - fn validate( - &self, - tt: &TransitionTable, - ) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let tt = &dfa.tt; if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) { return Err(DeserializeError::generic( "found invalid universal unanchored starting state ID", diff --git a/regex-automata/src/dfa/mod.rs b/regex-automata/src/dfa/mod.rs index fd58cac23..0e6a968e3 100644 --- a/regex-automata/src/dfa/mod.rs +++ b/regex-automata/src/dfa/mod.rs @@ -190,7 +190,7 @@ assert_eq!(matches, vec![ ``` Note that unlike dense DFAs, sparse DFAs have no alignment requirements. 
-Conversely, dense DFAs must be be aligned to the same alignment as a +Conversely, dense DFAs must be aligned to the same alignment as a [`StateID`](crate::util::primitives::StateID). # Support for `no_std` and `alloc`-only diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index d461e0a0f..46278c181 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -38,11 +38,7 @@ assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match()); #[cfg(feature = "dfa-build")] use core::iter; -use core::{ - convert::{TryFrom, TryInto}, - fmt, - mem::size_of, -}; +use core::{fmt, mem::size_of}; #[cfg(feature = "dfa-build")] use alloc::{vec, vec::Vec}; diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index a06d2bb48..8cfdecbec 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -1826,7 +1826,7 @@ impl Regex { /// /// The precise meaning of "accelerated" is specifically left unspecified, /// but the general meaning is that the search is a high likelihood of - /// running faster than than a character-at-a-time loop inside a standard + /// running faster than a character-at-a-time loop inside a standard /// regex engine. /// /// When a regex is accelerated, it is only a *probabilistic* claim. That @@ -2282,7 +2282,7 @@ impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {} /// /// Most of the regex engines in this crate require some kind of /// mutable state in order to execute a search. This mutable state is -/// explicitly separated from the the core regex object (such as a +/// explicitly separated from the core regex object (such as a /// [`thompson::NFA`](crate::nfa::thompson::NFA)) so that the read-only regex /// object can be shared across multiple threads simultaneously without any /// synchronization. 
Conversely, a `Cache` must either be duplicated if using diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2d2172957..668bca87c 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -961,10 +961,12 @@ impl Compiler { // for all matches. When an unanchored prefix is not added, then the // NFA's anchored and unanchored start states are equivalent. let all_anchored = exprs.iter().all(|e| { - e.borrow() - .properties() - .look_set_prefix() - .contains(hir::Look::Start) + let props = e.borrow().properties(); + if self.config.get_reverse() { + props.look_set_suffix().contains(hir::Look::End) + } else { + props.look_set_prefix().contains(hir::Look::Start) + } }); let anchored = !self.config.get_unanchored_prefix() || all_anchored; let unanchored_prefix = if anchored { @@ -1876,11 +1878,11 @@ impl Utf8Node { #[cfg(test)] mod tests { - use alloc::{vec, vec::Vec}; + use alloc::vec; use crate::{ - nfa::thompson::{SparseTransitions, State, Transition, NFA}, - util::primitives::{PatternID, SmallIndex, StateID}, + nfa::thompson::{SparseTransitions, State}, + util::primitives::SmallIndex, }; use super::*; @@ -1928,6 +1930,11 @@ mod tests { State::Sparse(SparseTransitions { transitions }) } + fn s_look(look: Look, next: usize) -> State { + let next = sid(next); + State::Look { look, next } + } + fn s_bin_union(alt1: usize, alt2: usize) -> State { State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) } } @@ -1978,6 +1985,80 @@ mod tests { ); } + #[test] + fn compile_no_unanchored_prefix_with_start_anchor() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"^a") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)] + ); + } + + #[test] + fn compile_yes_unanchored_prefix_with_end_anchor() { + let nfa = NFA::compiler() + 
.configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"a$") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_byte(b'a', 3), + s_look(Look::End, 4), + s_match(0), + ] + ); + } + + #[test] + fn compile_yes_reverse_unanchored_prefix_with_start_anchor() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .reverse(true) + .which_captures(WhichCaptures::None), + ) + .build(r"^a") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_byte(b'a', 3), + // Anchors get flipped in a reverse automaton. + s_look(Look::End, 4), + s_match(0), + ], + ); + } + + #[test] + fn compile_no_reverse_unanchored_prefix_with_end_anchor() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .reverse(true) + .which_captures(WhichCaptures::None), + ) + .build(r"a$") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + // Anchors get flipped in a reverse automaton. + s_look(Look::Start, 1), + s_byte(b'a', 2), + s_match(0), + ], + ); + } + #[test] fn compile_empty() { assert_eq!(build("").states(), &[s_match(0),]); diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c92d4c0b8..7f074a353 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. 
diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 75c9b796b..93cce1699 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -141,7 +141,7 @@ construction later by virtue of producing a much much smaller NFA. [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601 */ -use core::{cell::RefCell, convert::TryFrom, fmt, mem, ops::RangeInclusive}; +use core::{cell::RefCell, fmt, mem, ops::RangeInclusive}; use alloc::{format, string::String, vec, vec::Vec}; @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } @@ -693,7 +693,7 @@ impl NextInsert { /// handle: /// /// 1. The part where the two ranges actually overlap. i.e., The intersection. -/// 2. The part of the existing range that is not in the the new range. +/// 2. The part of the existing range that is not in the new range. /// 3. The part of the new range that is not in the old range. 
/// /// (1) is guaranteed to always occur since all overlapping ranges have a @@ -915,10 +915,6 @@ fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool { #[cfg(test)] mod tests { - use core::ops::RangeInclusive; - - use regex_syntax::utf8::Utf8Range; - use super::*; fn r(range: RangeInclusive) -> Utf8Range { diff --git a/regex-automata/src/util/alphabet.rs b/regex-automata/src/util/alphabet.rs index 22b5a7644..e0e4d2fc1 100644 --- a/regex-automata/src/util/alphabet.rs +++ b/regex-automata/src/util/alphabet.rs @@ -699,7 +699,7 @@ impl ByteClassSet { ByteClassSet(ByteSet::empty()) } - /// Indicate the the range of byte given (inclusive) can discriminate a + /// Indicate the range of byte given (inclusive) can discriminate a /// match between it and all other bytes outside of the range. pub(crate) fn set_range(&mut self, start: u8, end: u8) { debug_assert!(start <= end); diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index 05db6a993..93a0a8afa 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -1643,7 +1643,7 @@ impl GroupInfo { /// /// This also returns `None` for all inputs if these captures are empty /// (e.g., built from an empty [`GroupInfo`]). To check whether captures - /// are are present for a specific pattern, use [`GroupInfo::group_len`]. + /// are present for a specific pattern, use [`GroupInfo::group_len`]. /// /// # Example /// @@ -1695,7 +1695,7 @@ impl GroupInfo { /// /// This also returns `None` for all inputs if these captures are empty /// (e.g., built from an empty [`GroupInfo`]). To check whether captures - /// are are present for a specific pattern, use [`GroupInfo::group_len`]. + /// are present for a specific pattern, use [`GroupInfo::group_len`]. 
/// # Example /// diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index effa6f44d..540d5d4d1 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -57,7 +57,7 @@ can only be used for adding NFA state IDs and recording some assertions. The expected flow here is to use the above builders to construct a candidate DFA state to check if it already exists. If it does, then there's no need to -freeze it into a `State`. It it doesn't exist, then `StateBuilderNFA::to_state` +freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state` can be called to freeze the builder into an immutable `State`. In either case, `clear` should be called on the builder to turn it back into a `StateBuilderEmpty` that reuses the underlying memory. @@ -86,7 +86,7 @@ serialized anywhere. So any kind of change can be made with reckless abandon, as long as everything in this module agrees. */ -use core::{convert::TryFrom, mem}; +use core::mem; use alloc::{sync::Arc, vec::Vec}; diff --git a/regex-automata/src/util/int.rs b/regex-automata/src/util/int.rs index e6b13bff9..b726e93f8 100644 --- a/regex-automata/src/util/int.rs +++ b/regex-automata/src/util/int.rs @@ -41,6 +41,10 @@ like `u64::from` where possible, or even `usize::try_from()` for when we do explicitly want to panic or when we want to return an error for overflow. */ +// We define a little more than what we need, but I'd rather just have +// everything via a consistent and uniform API than have holes. 
+#![allow(dead_code)] + pub(crate) trait U8 { fn as_usize(self) -> usize; } @@ -240,13 +244,3 @@ impl Pointer for *const T { self as usize } } - -pub(crate) trait PointerMut { - fn as_usize(self) -> usize; -} - -impl PointerMut for *mut T { - fn as_usize(self) -> usize { - self as usize - } -} diff --git a/regex-automata/src/util/prefilter/mod.rs b/regex-automata/src/util/prefilter/mod.rs index 51fc92233..d20442a69 100644 --- a/regex-automata/src/util/prefilter/mod.rs +++ b/regex-automata/src/util/prefilter/mod.rs @@ -146,6 +146,8 @@ pub struct Prefilter { pre: Arc, #[cfg(feature = "alloc")] is_fast: bool, + #[cfg(feature = "alloc")] + max_needle_len: usize, } impl Prefilter { @@ -202,12 +204,19 @@ impl Prefilter { kind: MatchKind, needles: &[B], ) -> Option { - Choice::new(kind, needles).and_then(Prefilter::from_choice) + Choice::new(kind, needles).and_then(|choice| { + let max_needle_len = + needles.iter().map(|b| b.as_ref().len()).max().unwrap_or(0); + Prefilter::from_choice(choice, max_needle_len) + }) } /// This turns a prefilter selection into a `Prefilter`. That is, in turns /// the enum given into a trait object. - fn from_choice(choice: Choice) -> Option { + fn from_choice( + choice: Choice, + max_needle_len: usize, + ) -> Option { #[cfg(not(feature = "alloc"))] { None @@ -224,7 +233,7 @@ impl Prefilter { Choice::AhoCorasick(p) => Arc::new(p), }; let is_fast = pre.is_fast(); - Some(Prefilter { pre, is_fast }) + Some(Prefilter { pre, is_fast, max_needle_len }) } } @@ -411,6 +420,20 @@ impl Prefilter { } } + /// Return the length of the longest needle + /// in this Prefilter + #[inline] + pub fn max_needle_len(&self) -> usize { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.max_needle_len + } + } + /// Implementations might return true here if they believe themselves to /// be "fast." 
The concept of "fast" is deliberately left vague, but in /// practice this usually corresponds to whether it's believed that SIMD @@ -429,7 +452,7 @@ impl Prefilter { /// *know* a prefilter will be fast without actually trying the prefilter. /// (Which of course we cannot afford to do.) #[inline] - pub(crate) fn is_fast(&self) -> bool { + pub fn is_fast(&self) -> bool { #[cfg(not(feature = "alloc"))] { unreachable!() diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs index 39aec522b..05b1cff54 100644 --- a/regex-automata/src/util/search.rs +++ b/regex-automata/src/util/search.rs @@ -110,9 +110,14 @@ impl<'h> Input<'h> { /// Create a new search configuration for the given haystack. #[inline] pub fn new>(haystack: &'h H) -> Input<'h> { + // Perform only one call to `haystack.as_ref()` to protect from incorrect + // implementations that return different values from multiple calls. + // This is important because there's code that relies on `span` not being + // out of bounds with respect to the stored `haystack`. 
+ let haystack = haystack.as_ref(); Input { - haystack: haystack.as_ref(), - span: Span { start: 0, end: haystack.as_ref().len() }, + haystack, + span: Span { start: 0, end: haystack.len() }, anchored: Anchored::No, earliest: false, } @@ -1966,4 +1971,23 @@ mod tests { let expected_size = 3 * core::mem::size_of::(); assert_eq!(expected_size, core::mem::size_of::()); } + + #[test] + fn incorrect_asref_guard() { + struct Bad(std::cell::Cell); + + impl AsRef<[u8]> for Bad { + fn as_ref(&self) -> &[u8] { + if self.0.replace(false) { + &[] + } else { + &[0; 1000] + } + } + } + + let bad = Bad(std::cell::Cell::new(true)); + let input = Input::new(&bad); + assert!(input.end() <= input.haystack().len()); + } } diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index 27153780e..97988b44b 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -323,7 +323,7 @@ impl core::fmt::Debug for StartByteMap { /// Represents the six possible starting configurations of a DFA search. /// -/// The starting configuration is determined by inspecting the the beginning +/// The starting configuration is determined by inspecting the beginning /// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID /// (if specified) and the type of search (anchored or not) is what selects the /// start state to use in a DFA. diff --git a/regex-automata/src/util/wire.rs b/regex-automata/src/util/wire.rs index ecf4fd8c0..b1351c7e9 100644 --- a/regex-automata/src/util/wire.rs +++ b/regex-automata/src/util/wire.rs @@ -41,11 +41,7 @@ generally requires serializing both its big-endian and little-endian variants, and then loading the correct one based on the target's endianness. */ -use core::{ - cmp, - convert::{TryFrom, TryInto}, - mem::size_of, -}; +use core::{cmp, mem::size_of}; #[cfg(feature = "alloc")] use alloc::{vec, vec::Vec}; @@ -867,11 +863,6 @@ pub(crate) trait Endian { /// this panics. 
fn write_u32(n: u32, dst: &mut [u8]); - /// Writes a u64 to the given destination buffer in a particular - /// endianness. If the destination buffer has a length smaller than 8, then - /// this panics. - fn write_u64(n: u64, dst: &mut [u8]); - /// Writes a u128 to the given destination buffer in a particular /// endianness. If the destination buffer has a length smaller than 16, /// then this panics. @@ -897,10 +888,6 @@ impl Endian for LE { dst[..4].copy_from_slice(&n.to_le_bytes()); } - fn write_u64(n: u64, dst: &mut [u8]) { - dst[..8].copy_from_slice(&n.to_le_bytes()); - } - fn write_u128(n: u128, dst: &mut [u8]) { dst[..16].copy_from_slice(&n.to_le_bytes()); } @@ -915,10 +902,6 @@ impl Endian for BE { dst[..4].copy_from_slice(&n.to_be_bytes()); } - fn write_u64(n: u64, dst: &mut [u8]) { - dst[..8].copy_from_slice(&n.to_be_bytes()); - } - fn write_u128(n: u128, dst: &mut [u8]) { dst[..16].copy_from_slice(&n.to_be_bytes()); } diff --git a/regex-capi/src/error.rs b/regex-capi/src/error.rs index a269a3913..7b91fb9d3 100644 --- a/regex-capi/src/error.rs +++ b/regex-capi/src/error.rs @@ -4,7 +4,6 @@ use std::fmt; use std::str; use libc::c_char; -use regex; #[derive(Debug)] pub struct Error { @@ -22,7 +21,7 @@ pub enum ErrorKind { impl Error { pub fn new(kind: ErrorKind) -> Error { - Error { message: None, kind: kind } + Error { message: None, kind } } pub fn is_err(&self) -> bool { diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index 3fe5390aa..ac69c9ec4 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-cli" -version = "0.1.1" #:version +version = "0.2.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = """ A command line tool for debugging, ad hoc benchmarking and generating regular @@ -27,7 +27,7 @@ anyhow = "1.0.28" bstr = { version = "1.4.0", default-features = false, features = ["std"] } lexopt = "0.3.0" log = { version = "0.4.17", features = ["std"] } -memmap2 = 
"0.5.10" +memmap2 = "0.9.4" regex = { version = "1.9.0", path = ".." } regex-automata = { version = "0.4.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } diff --git a/regex-cli/args/flags.rs b/regex-cli/args/flags.rs index db8a847ef..61732a28e 100644 --- a/regex-cli/args/flags.rs +++ b/regex-cli/args/flags.rs @@ -152,3 +152,55 @@ impl std::str::FromStr for MatchKind { Ok(MatchKind { kind }) } } + +/// Provides an implementation of the --captures flag, for use with Thompson +/// NFA configuration. +#[derive(Debug)] +pub struct WhichCaptures { + pub which: regex_automata::nfa::thompson::WhichCaptures, +} + +impl WhichCaptures { + pub const USAGE: Usage = Usage::new( + "--captures ", + "One of: all, implicit or none.", + r#" +Selects which capture states should be included in the Thompson NFA. The +choices are 'all' (the default), 'implicit' or 'none'. + +'all' means that both explicit and implicit capture states are included. + +'implicit' means that only implicit capture states are included. That is, the +Thompson NFA will only be able to report the overall match offsets and not the +match offsets of each explicit capture group. + +'none' means that no capture states will be included. This is useful when +capture states aren't needed (like when building a DFA) or if they aren't +supported (like when building a reverse NFA). 
+"#, + ); +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures { + which: regex_automata::nfa::thompson::WhichCaptures::All, + } + } +} + +impl std::str::FromStr for WhichCaptures { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let which = match s { + "all" => regex_automata::nfa::thompson::WhichCaptures::All, + "implicit" => { + regex_automata::nfa::thompson::WhichCaptures::Implicit + } + "none" => regex_automata::nfa::thompson::WhichCaptures::None, + unk => anyhow::bail!("unrecognized captures option '{}'", unk), + }; + Ok(WhichCaptures { which }) + } +} diff --git a/regex-cli/args/thompson.rs b/regex-cli/args/thompson.rs index 151fc6a0b..bd8388d11 100644 --- a/regex-cli/args/thompson.rs +++ b/regex-cli/args/thompson.rs @@ -70,11 +70,11 @@ impl Configurable for Config { Arg::Long("shrink") => { self.thompson = self.thompson.clone().shrink(true); } - Arg::Long("no-captures") => { - self.thompson = self - .thompson - .clone() - .which_captures(thompson::WhichCaptures::None); + Arg::Long("captures") => { + let which: flags::WhichCaptures = + args::parse(p, "--captures")?; + self.thompson = + self.thompson.clone().which_captures(which.which); } Arg::Long("line-terminator") => { let byte: flags::OneByte = @@ -136,19 +136,7 @@ spent shrinking the NFA can lead to far larger savings in the subsequent DFA determinization. "#, ), - Usage::new( - "--no-captures", - "Disable capture states.", - r#" -Disables capture states. By default, NFAs include special "capture" states that -instruct some regex engines (like the PikeVM) to record offset positions in -ancillary state. - -It can be useful to disable capture states in order to reduce "clutter" in the -automaton when debugging it. Also, at time of writing, reverse NFAs require -that capture groups are disabled. 
-"#, - ), + flags::WhichCaptures::USAGE, Usage::new( "--line-terminator", "Set the line terminator used by line anchors.", diff --git a/regex-cli/cmd/debug/dfa.rs b/regex-cli/cmd/debug/dfa.rs index 9381cdadc..f16610fbe 100644 --- a/regex-cli/cmd/debug/dfa.rs +++ b/regex-cli/cmd/debug/dfa.rs @@ -5,7 +5,7 @@ use crate::{ util::{self, Table}, }; -use {lexopt, regex_automata::dfa::Automaton}; +use regex_automata::dfa::Automaton; pub fn run_dense(p: &mut lexopt::Parser) -> anyhow::Result<()> { const USAGE: &'static str = "\ diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index c287f6f52..404c47721 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -178,7 +178,7 @@ impl TomlTest { // this trade off (to this extent anyway), so it really wants all // capturing groups... // - // So what we do here is is look for the number of groups in the + // So what we do here is look for the number of groups in the // pattern and then just pad out the capture matches with None // values to make the number of capture matches equal to what we // would expect from the pattern. (We actually parse the regex to diff --git a/regex-cli/logger.rs b/regex-cli/logger.rs index 0fe063f1c..4e783872e 100644 --- a/regex-cli/logger.rs +++ b/regex-cli/logger.rs @@ -3,7 +3,7 @@ // print to stderr. We therefore avoid bringing in extra dependencies just // for this functionality. -use log::{self, Log}; +use log::Log; /// The simplest possible logger that logs to stderr. /// diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index 9b394a480..0aca8221d 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -257,7 +257,7 @@ let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| { // regex matches, and in this context, we know we have a match. 
// // Note that we use `caps.name("y").unwrap().as_str()` instead of - // `&caps["y"]` because the the lifetime of the former is the same as the + // `&caps["y"]` because the lifetime of the former is the same as the // lifetime of `hay` above, but the lifetime of the latter is tied to the // lifetime of `caps` due to how the `Index` trait is defined. let year = caps.name("y").unwrap().as_str(); @@ -821,7 +821,7 @@ it, a longer haystack will take more time to search. * Very large regexes can searches to be quite slow due to increasing the size `m` in the worst case `O(m * n)` bound. This is especially true when they are combined with counted repetitions. While the regex size limit above will -protect you from the most egregious cases, the the default size limit still +protect you from the most egregious cases, the default size limit still permits pretty big regexes that can execute more slowly than one might expect. * While routines like [`Regex::find`] and [`Regex::captures`] guarantee worst case `O(m * n)` search time, routines like [`Regex::find_iter`] and diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 4e4de9068..5fe30ade3 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -1717,8 +1717,8 @@ impl<'h> Captures<'h> { /// /// This returns a tuple where the first element corresponds to the full /// substring of the haystack that matched the regex. The second element is - /// an array of substrings, with each corresponding to the to the substring - /// that matched for a particular capture group. + /// an array of substrings, with each corresponding to the substring that + /// matched for a particular capture group. 
/// # Panics /// diff --git a/regex-lite/src/utf8.rs b/regex-lite/src/utf8.rs index 5f2a6a153..2730b602d 100644 --- a/regex-lite/src/utf8.rs +++ b/regex-lite/src/utf8.rs @@ -87,7 +87,7 @@ fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { // Splits the space of all bytes into equivalence classes, such that // any byte in the same class can never discriminate between whether a // particular sequence is valid UTF-8 or not. - #[cfg_attr(rustfmt, rustfmt::skip)] + #[rustfmt::skip] const CLASSES: [u8; 256] = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -101,7 +101,7 @@ fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { // A state machine taken from `bstr` which was in turn adapted from: // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ - #[cfg_attr(rustfmt, rustfmt::skip)] + #[rustfmt::skip] const STATES_FORWARD: &'static [u8] = &[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index c9ce87da7..3f213542b 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.2" #:version +version = "0.8.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 6a77ee134..ce79a89ab 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -711,7 +711,7 @@ pub enum LiteralKind { /// The literal is written as an octal escape, e.g., `\141`. Octal, /// The literal is written as a hex code with a fixed number of digits - /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or + /// depending on the type of the escape, e.g., `\x61` or `\u0061` or /// `\U00000061`.
HexFixed(HexLiteralKind), /// The literal is written as a hex code with a bracketed number of diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 593b14fbc..0c2a35265 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -124,6 +124,7 @@ pub struct ParserBuilder { ignore_whitespace: bool, nest_limit: u32, octal: bool, + empty_min_range: bool, } impl Default for ParserBuilder { @@ -139,6 +140,7 @@ impl ParserBuilder { ignore_whitespace: false, nest_limit: 250, octal: false, + empty_min_range: false, } } @@ -149,6 +151,7 @@ impl ParserBuilder { capture_index: Cell::new(0), nest_limit: self.nest_limit, octal: self.octal, + empty_min_range: self.empty_min_range, initial_ignore_whitespace: self.ignore_whitespace, ignore_whitespace: Cell::new(self.ignore_whitespace), comments: RefCell::new(vec![]), @@ -221,6 +224,18 @@ impl ParserBuilder { self.ignore_whitespace = yes; self } + + /// Allow using `{,n}` as an equivalent to `{0,n}`. + /// + /// When enabled, the parser accepts `{,n}` as valid syntax for `{0,n}`. + /// Most regular expression engines don't support the `{,n}` syntax, but + /// some others do it, namely Python's `re` library. + /// + /// This is disabled by default. + pub fn empty_min_range(&mut self, yes: bool) -> &mut ParserBuilder { + self.empty_min_range = yes; + self + } } /// A regular expression parser. @@ -246,6 +261,9 @@ pub struct Parser { /// The initial setting for `ignore_whitespace` as provided by /// `ParserBuilder`. It is used when resetting the parser's state. initial_ignore_whitespace: bool, + /// Whether the parser supports `{,n}` repetitions as an equivalent to + /// `{0,n}.` + empty_min_range: bool, /// Whether whitespace should be ignored. When enabled, comments are /// also permitted. 
ignore_whitespace: Cell<bool>, @@ -1114,15 +1132,14 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { self.parse_decimal(), ast::ErrorKind::DecimalEmpty, ast::ErrorKind::RepetitionCountDecimalEmpty, - )?; - let mut range = ast::RepetitionRange::Exactly(count_start); + ); if self.is_eof() { return Err(self.error( Span::new(start, self.pos()), ast::ErrorKind::RepetitionCountUnclosed, )); } - if self.char() == ',' { + let range = if self.char() == ',' { if !self.bump_and_bump_space() { return Err(self.error( Span::new(start, self.pos()), @@ -1130,16 +1147,33 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { )); } if self.char() != '}' { + let count_start = match count_start { + Ok(c) => c, + Err(err) + if err.kind + == ast::ErrorKind::RepetitionCountDecimalEmpty => + { + if self.parser().empty_min_range { + 0 + } else { + return Err(err); + } + } + err => err?, + }; let count_end = specialize_err( self.parse_decimal(), ast::ErrorKind::DecimalEmpty, ast::ErrorKind::RepetitionCountDecimalEmpty, )?; - range = ast::RepetitionRange::Bounded(count_start, count_end); + ast::RepetitionRange::Bounded(count_start, count_end) } else { - range = ast::RepetitionRange::AtLeast(count_start); + ast::RepetitionRange::AtLeast(count_start?) } - } + } else { + ast::RepetitionRange::Exactly(count_start?)
+ }; + if self.is_eof() || self.char() != '}' { return Err(self.error( Span::new(start, self.pos()), @@ -2405,8 +2439,6 @@ mod tests { use alloc::format; - use crate::ast::{self, Ast, Position, Span}; - use super::*; // Our own assert_eq, which has slightly better formatting (but honestly @@ -2461,6 +2493,11 @@ mod tests { ParserI::new(parser, pattern) } + fn parser_empty_min_range(pattern: &str) -> ParserI<'_, Parser> { + let parser = ParserBuilder::new().empty_min_range(true).build(); + ParserI::new(parser, pattern) + } + fn parser_nest_limit( pattern: &str, nest_limit: u32, @@ -3378,6 +3415,20 @@ bar ast: Box::new(lit('a', 0)), })) ); + assert_eq!( + parser_empty_min_range(r"a{,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(0, 9) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); assert_eq!( parser_ignore_whitespace(r"a{5,9} ?").parse(), Ok(Ast::repetition(ast::Repetition { @@ -4598,8 +4649,8 @@ bar assert_eq!( parser(r"\b{ ").parse().unwrap_err(), TestError { - span: span(4..4), - kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + span: span(2..4), + kind: ast::ErrorKind::RepetitionCountUnclosed, } ); // In this case, we got some valid chars that makes it look like the diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index e063390a8..d507ee724 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -479,23 +479,6 @@ pub trait Interval: ret } - /// Compute the symmetric difference the given range from this range. This - /// returns the union of the two ranges minus its intersection. 
- fn symmetric_difference( - &self, - other: &Self, - ) -> (Option<Self>, Option<Self>) { - let union = match self.union(other) { - None => return (Some(self.clone()), Some(other.clone())), - Some(union) => union, - }; - let intersection = match self.intersect(other) { - None => return (Some(self.clone()), Some(other.clone())), - Some(intersection) => intersection, - }; - union.difference(&intersection) - } - /// Returns true if and only if the two ranges are contiguous. Two ranges /// are contiguous if and only if the ranges are either overlapping or /// adjacent. diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ce38ead7b..5db784388 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -322,6 +322,22 @@ impl Hir { /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); /// assert_eq!(&expected, concat.kind()); /// ``` + /// + /// # Example: building a literal from a `char` + /// + /// This example shows how to build a single `Hir` literal from a `char` + /// value.
Since a [`Literal`] is just bytes, we just need to UTF-8 + /// encode a `char` value: + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let ch = '☃'; + /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes()); + /// + /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, got.kind()); + /// ``` #[inline] pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir { let bytes = lit.into(); @@ -642,16 +658,12 @@ impl Hir { #[inline] pub fn dot(dot: Dot) -> Hir { match dot { - Dot::AnyChar => { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) - } - Dot::AnyByte => { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\xFF')); - Hir::class(Class::Bytes(cls)) - } + Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([ + ClassUnicodeRange::new('\0', '\u{10FFFF}'), + ]))), + Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([ + ClassBytesRange::new(b'\0', b'\xFF'), + ]))), Dot::AnyCharExcept(ch) => { let mut cls = ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); @@ -659,17 +671,17 @@ impl Hir { Hir::class(Class::Unicode(cls)) } Dot::AnyCharExceptLF => { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\x09')); - cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) + Hir::class(Class::Unicode(ClassUnicode::new([ + ClassUnicodeRange::new('\0', '\x09'), + ClassUnicodeRange::new('\x0B', '\u{10FFFF}'), + ]))) } Dot::AnyCharExceptCRLF => { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\x09')); - cls.push(ClassUnicodeRange::new('\x0B', '\x0C')); - cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) + Hir::class(Class::Unicode(ClassUnicode::new([ + ClassUnicodeRange::new('\0', '\x09'), + ClassUnicodeRange::new('\x0B', '\x0C'), + ClassUnicodeRange::new('\x0E',
'\u{10FFFF}'), + ]))) } Dot::AnyByteExcept(byte) => { let mut cls = @@ -678,17 +690,17 @@ impl Hir { Hir::class(Class::Bytes(cls)) } Dot::AnyByteExceptLF => { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\x09')); - cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); - Hir::class(Class::Bytes(cls)) + Hir::class(Class::Bytes(ClassBytes::new([ + ClassBytesRange::new(b'\0', b'\x09'), + ClassBytesRange::new(b'\x0B', b'\xFF'), + ]))) } Dot::AnyByteExceptCRLF => { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\x09')); - cls.push(ClassBytesRange::new(b'\x0B', b'\x0C')); - cls.push(ClassBytesRange::new(b'\x0E', b'\xFF')); - Hir::class(Class::Bytes(cls)) + Hir::class(Class::Bytes(ClassBytes::new([ + ClassBytesRange::new(b'\0', b'\x09'), + ClassBytesRange::new(b'\x0B', b'\x0C'), + ClassBytesRange::new(b'\x0E', b'\xFF'), + ]))) } } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 313a1e9e8..3749ce307 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -1358,9 +1358,8 @@ fn ascii_class_as_chars( #[cfg(test)] mod tests { use crate::{ - ast::{self, parse::ParserBuilder, Ast, Position, Span}, - hir::{self, Hir, HirKind, Look, Properties}, - unicode::{self, ClassQuery}, + ast::{parse::ParserBuilder, Position}, + hir::{Look, Properties}, }; use super::*; diff --git a/regex-syntax/src/utf8.rs b/regex-syntax/src/utf8.rs index e13b55abf..69d749451 100644 --- a/regex-syntax/src/utf8.rs +++ b/regex-syntax/src/utf8.rs @@ -302,9 +302,9 @@ impl Utf8Sequences { /// Create a new iterator over UTF-8 byte ranges for the scalar value range /// given. 
pub fn new(start: char, end: char) -> Self { - let mut it = Utf8Sequences { range_stack: vec![] }; - it.push(u32::from(start), u32::from(end)); - it + let range = + ScalarRange { start: u32::from(start), end: u32::from(end) }; + Utf8Sequences { range_stack: vec![range] } } /// reset resets the scalar value range. diff --git a/regex-test/lib.rs b/regex-test/lib.rs index 2b630666e..7b5ab830c 100644 --- a/regex-test/lib.rs +++ b/regex-test/lib.rs @@ -99,9 +99,7 @@ See [`MatchKind`] for more details. This is an optional field and defaults to /// For this reason, `anyhow` is a public dependency and is re-exported here. pub extern crate anyhow; -use std::{ - borrow::Borrow, collections::HashSet, convert::TryFrom, fs, path::Path, -}; +use std::{borrow::Borrow, collections::HashSet, fs, path::Path}; use { anyhow::{bail, Context, Result}, diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 19f5701af..39af6e71c 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -651,6 +651,9 @@ impl Regex { /// case, this implementation will likely return a `Cow::Borrowed` value /// such that no allocation is performed. /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// # Replacement string syntax /// /// All instances of `$ref` in the replacement string are replaced with @@ -761,6 +764,13 @@ impl Regex { /// replacement provided. This is the same as calling `replacen` with /// `limit` set to `0`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. 
/// @@ -855,6 +865,13 @@ impl Regex { /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is /// equivalent to `Regex::replacen(hay, 0, rep)`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// @@ -1538,18 +1555,13 @@ impl<'h> Match<'h> { impl<'h> core::fmt::Debug for Match<'h> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use regex_automata::util::escape::DebugHaystack; + let mut fmt = f.debug_struct("Match"); - fmt.field("start", &self.start).field("end", &self.end); - if let Ok(s) = core::str::from_utf8(self.as_bytes()) { - fmt.field("bytes", &s); - } else { - // FIXME: It would be nice if this could be printed as a string - // with invalid UTF-8 replaced with hex escapes. A alloc would - // probably okay if that makes it easier, but regex-automata does - // (at time of writing) have internal routines that do this. So - // maybe we should expose them. - fmt.field("bytes", &self.as_bytes()); - } + fmt.field("start", &self.start) + .field("end", &self.end) + .field("bytes", &DebugHaystack(&self.as_bytes())); + fmt.finish() } } @@ -1568,10 +1580,15 @@ impl<'h> From<Match<'h>> for core::ops::Range<usize> { /// Represents the capture groups for a single match. /// -/// Capture groups refer to parts of a regex enclosed in parentheses. They can -/// be optionally named. The purpose of capture groups is to be able to -/// reference different parts of a match based on the original pattern. For -/// example, say you want to match the individual letters in a 5-letter word: +/// Capture groups refer to parts of a regex enclosed in parentheses.
They +/// can be optionally named. The purpose of capture groups is to be able to +/// reference different parts of a match based on the original pattern. In +/// essence, a `Captures` is a container of [`Match`] values for each group +/// that participated in a regex match. Each `Match` can be looked up by either +/// its capture group index or name (if it has one). +/// +/// For example, say you want to match the individual letters in a 5-letter +/// word: /// /// ```text /// (?<first>\w)(\w)(?:\w)\w(?<last>\w) @@ -1689,8 +1706,8 @@ impl<'h> Captures<'h> { /// /// This returns a tuple where the first element corresponds to the full /// substring of the haystack that matched the regex. The second element is - /// an array of substrings, with each corresponding to the to the substring - /// that matched for a particular capture group. + /// an array of substrings, with each corresponding to the substring that + /// matched for a particular capture group. /// /// # Panics /// @@ -1984,7 +2001,7 @@ impl<'h> core::ops::Index<usize> for Captures<'h> { /// The haystack substring returned can't outlive the `Captures` object if this /// method is used, because of how `Index` is defined (normally `a[i]` is part /// of `a` and can't outlive it). To work around this limitation, do that, use -/// [`Captures::get`] instead. +/// [`Captures::name`] instead.
/// /// `'h` is the lifetime of the matched haystack, but the lifetime of the /// `&str` returned by this implementation is the lifetime of the `Captures` @@ -2598,3 +2615,88 @@ fn no_expansion>(replacement: &T) -> Option> { None => Some(Cow::Borrowed(replacement)), } } + +#[cfg(test)] +mod tests { + use super::*; + use alloc::format; + + #[test] + fn test_match_properties() { + let haystack = b"Hello, world!"; + let m = Match::new(haystack, 7, 12); + + assert_eq!(m.start(), 7); + assert_eq!(m.end(), 12); + assert_eq!(m.is_empty(), false); + assert_eq!(m.len(), 5); + assert_eq!(m.as_bytes(), b"world"); + } + + #[test] + fn test_empty_match() { + let haystack = b""; + let m = Match::new(haystack, 0, 0); + + assert_eq!(m.is_empty(), true); + assert_eq!(m.len(), 0); + } + + #[test] + fn test_debug_output_valid_utf8() { + let haystack = b"Hello, world!"; + let m = Match::new(haystack, 7, 12); + let debug_str = format!("{:?}", m); + + assert_eq!( + debug_str, + r#"Match { start: 7, end: 12, bytes: "world" }"# + ); + } + + #[test] + fn test_debug_output_invalid_utf8() { + let haystack = b"Hello, \xFFworld!"; + let m = Match::new(haystack, 7, 13); + let debug_str = format!("{:?}", m); + + assert_eq!( + debug_str, + r#"Match { start: 7, end: 13, bytes: "\xffworld" }"# + ); + } + + #[test] + fn test_debug_output_various_unicode() { + let haystack = + "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!".as_bytes(); + let m = Match::new(haystack, 0, haystack.len()); + let debug_str = format!("{:?}", m); + + assert_eq!( + debug_str, + r#"Match { start: 0, end: 62, bytes: "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!" }"# + ); + } + + #[test] + fn test_debug_output_ascii_escape() { + let haystack = b"Hello,\tworld!\nThis is a \x1b[31mtest\x1b[0m."; + let m = Match::new(haystack, 0, haystack.len()); + let debug_str = format!("{:?}", m); + + assert_eq!( + debug_str, + r#"Match { start: 0, end: 38, bytes: "Hello,\tworld!\nThis is a \u{1b}[31mtest\u{1b}[0m." 
}"# + ); + } + + #[test] + fn test_debug_output_match_in_middle() { + let haystack = b"The quick brown fox jumps over the lazy dog."; + let m = Match::new(haystack, 16, 19); + let debug_str = format!("{:?}", m); + + assert_eq!(debug_str, r#"Match { start: 16, end: 19, bytes: "fox" }"#); + } +} diff --git a/src/regex/string.rs b/src/regex/string.rs index 880d6082a..fab178a68 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -642,6 +642,9 @@ impl Regex { /// case, this implementation will likely return a `Cow::Borrowed` value /// such that no allocation is performed. /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// # Replacement string syntax /// /// All instances of `$ref` in the replacement string are replaced with @@ -748,6 +751,13 @@ impl Regex { /// replacement provided. This is the same as calling `replacen` with /// `limit` set to `0`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// @@ -842,6 +852,13 @@ impl Regex { /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is /// equivalent to `Regex::replacen(hay, 0, rep)`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. 
+ /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// @@ -1573,10 +1590,15 @@ impl<'h> From<Match<'h>> for core::ops::Range<usize> { /// Represents the capture groups for a single match. /// -/// Capture groups refer to parts of a regex enclosed in parentheses. They can -/// be optionally named. The purpose of capture groups is to be able to -/// reference different parts of a match based on the original pattern. For -/// example, say you want to match the individual letters in a 5-letter word: +/// Capture groups refer to parts of a regex enclosed in parentheses. They +/// can be optionally named. The purpose of capture groups is to be able to +/// reference different parts of a match based on the original pattern. In +/// essence, a `Captures` is a container of [`Match`] values for each group +/// that participated in a regex match. Each `Match` can be looked up by either +/// its capture group index or name (if it has one). +/// +/// For example, say you want to match the individual letters in a 5-letter +/// word: /// /// ```text /// (?<first>\w)(\w)(?:\w)\w(?<last>\w) @@ -1694,8 +1716,8 @@ impl<'h> Captures<'h> { /// /// This returns a tuple where the first element corresponds to the full /// substring of the haystack that matched the regex. The second element is - /// an array of substrings, with each corresponding to the to the substring - /// that matched for a particular capture group. + /// an array of substrings, with each corresponding to the substring that + /// matched for a particular capture group. /// /// # Panics /// @@ -1987,7 +2009,7 @@ impl<'h> core::ops::Index<usize> for Captures<'h> { /// The haystack substring returned can't outlive the `Captures` object if this /// method is used, because of how `Index` is defined (normally `a[i]` is part /// of `a` and can't outlive it). To work around this limitation, do that, use -/// [`Captures::get`] instead. +/// [`Captures::name`] instead.
/// /// `'h` is the lifetime of the matched haystack, but the lifetime of the /// `&str` returned by this implementation is the lifetime of the `Captures` diff --git a/src/regexset/bytes.rs b/src/regexset/bytes.rs index 1220a1466..2f46abc4d 100644 --- a/src/regexset/bytes.rs +++ b/src/regexset/bytes.rs @@ -355,7 +355,7 @@ impl RegexSet { ) -> bool { // This is pretty dumb. We should try to fix this, but the // regex-automata API doesn't provide a way to store matches in an - // arbitrary &mut [bool]. Thankfully, this API is is doc(hidden) and + // arbitrary &mut [bool]. Thankfully, this API is doc(hidden) and // thus not public... But regex-capi currently uses it. We should // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet // is in regex-automata, not regex. So maybe we should just accept a diff --git a/src/regexset/string.rs b/src/regexset/string.rs index 2a3e7b802..5cb9b5608 100644 --- a/src/regexset/string.rs +++ b/src/regexset/string.rs @@ -351,7 +351,7 @@ impl RegexSet { ) -> bool { // This is pretty dumb. We should try to fix this, but the // regex-automata API doesn't provide a way to store matches in an - // arbitrary &mut [bool]. Thankfully, this API is is doc(hidden) and + // arbitrary &mut [bool]. Thankfully, this API is doc(hidden) and // thus not public... But regex-capi currently uses it. We should // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet // is in regex-automata, not regex. So maybe we should just accept a