From 20b5317f7a8accbf64ee21245b0a37f636017e13 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 20 Oct 2023 07:52:52 -0400 Subject: [PATCH 01/35] automata: fix panic in dense DFA deserialization This fixes a hole in the validation logic that accidentally permitted a dense DFA to contain a match state with zero pattern IDs. Since search code is permitted to assume that every match state has at least one corresponding pattern ID, this led to a panic. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63391 --- ...ata_deserialize_dense_dfa-5624222820728832 | Bin 0 -> 749 bytes regex-automata/src/dfa/dense.rs | 20 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 new file mode 100644 index 0000000000000000000000000000000000000000..e236ae735c7f413c90a0e9b61cc4add46ced15e7 GIT binary patch literal 749 zcmd5)TMED+469=gkFon-(j{nQDi!=03sh-q(;8}bMmo$a!2W|zr>V_O2(ZE8>!v1* z6U=#_h8}h#XfDExxv!cs^^Zrt{#L1#-lYZ{(nKt)H^)$Ct|9sII*#X6$oXD13{eTq K(N?9_h4%nug9@bp literal 0 HcmV?d00001 diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index fd96bc878..6fc61dc4f 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -2340,8 +2340,8 @@ impl<'a> DFA<&'a [u32]> { // table, match states and accelerators below. If any validation fails, // then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.tt)?; + dfa.tt.validate(&dfa)?; + dfa.st.validate(&dfa)?; dfa.ms.validate(&dfa)?; dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, @@ -3593,7 +3593,8 @@ impl> TransitionTable { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let sp = &dfa.special; for state in self.states() { // We check that the ID itself is well formed. That is, if it's // a special state then it must actually be a quit, dead, accel, @@ -3611,6 +3612,13 @@ impl> TransitionTable { wasn't actually special", )); } + if sp.is_match_state(state.id()) + && dfa.match_len(state.id()) == 0 + { + return Err(DeserializeError::generic( + "found match state with zero pattern IDs", + )); + } } for (_, to) in state.transitions() { if !self.is_valid(to) { @@ -4127,10 +4135,8 @@ impl> StartTable { /// it against the given transition table (which must be for the same DFA). /// /// That is, every state ID can be used to correctly index a state. 
- fn validate( - &self, - tt: &TransitionTable, - ) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let tt = &dfa.tt; if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) { return Err(DeserializeError::generic( "found invalid universal unanchored starting state ID", From 6b72eec64b428859702ae5ee811048112af5269e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 25 Oct 2023 09:37:53 -0400 Subject: [PATCH 02/35] syntax: add Hir::literal example for `char` The example shows a succinct way of creating an HIR literal from a `char` value by first encoding it to UTF-8. Closes #1114 --- regex-syntax/src/hir/mod.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ce38ead7b..ae3ba318e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -322,6 +322,22 @@ impl Hir { /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); /// assert_eq!(&expected, concat.kind()); /// ``` + /// + /// # Example: building a literal from a `char` + /// + /// This example shows how to build a single `Hir` literal from a `char` + /// value. Since a [`Literal`] is just bytes, we just need to UTF-8 + /// encode a `char` value: + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let ch = '☃'; + /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes()); + /// + /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, got.kind()); + /// ``` #[inline] pub fn literal>>(lit: B) -> Hir { let bytes = lit.into(); From 662a8b93afa55b5c489f14bca83565ebe62ccf67 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 1 Nov 2023 11:52:44 -0400 Subject: [PATCH 03/35] cli: change --no-captures to --captures (all|implicit|none) When we added the WhichCaptures type, we didn't update the CLI to expose the full functionality. This change does that. --- regex-automata/src/nfa/thompson/map.rs | 2 +- regex-automata/src/nfa/thompson/range_trie.rs | 2 +- regex-cli/args/flags.rs | 52 +++++++++++++++++++ regex-cli/args/thompson.rs | 24 +++------ 4 files changed, 60 insertions(+), 20 deletions(-) diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c92d4c0b8..7f074a353 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 75c9b796b..cd77cc150 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. 
Specifically when using the debug tool: // - // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } diff --git a/regex-cli/args/flags.rs b/regex-cli/args/flags.rs index db8a847ef..61732a28e 100644 --- a/regex-cli/args/flags.rs +++ b/regex-cli/args/flags.rs @@ -152,3 +152,55 @@ impl std::str::FromStr for MatchKind { Ok(MatchKind { kind }) } } + +/// Provides an implementation of the --captures flag, for use with Thompson +/// NFA configuration. +#[derive(Debug)] +pub struct WhichCaptures { + pub which: regex_automata::nfa::thompson::WhichCaptures, +} + +impl WhichCaptures { + pub const USAGE: Usage = Usage::new( + "--captures ", + "One of: all, implicit or none.", + r#" +Selects which capture states should be included in the Thompson NFA. The +choices are 'all' (the default), 'implicit' or 'none'. + +'all' means that both explicit and implicit capture states are included. + +'implicit' means that only implicit capture states are included. That is, the +Thompson NFA will only be able to report the overall match offsets and not the +match offsets of each explicit capture group. + +'none' means that no capture states will be included. This is useful when +capture states aren't needed (like when building a DFA) or if they aren't +supported (like when building a reverse NFA). +"#, + ); +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures { + which: regex_automata::nfa::thompson::WhichCaptures::All, + } + } +} + +impl std::str::FromStr for WhichCaptures { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let which = match s { + "all" => regex_automata::nfa::thompson::WhichCaptures::All, + "implicit" => { + regex_automata::nfa::thompson::WhichCaptures::Implicit + } + "none" => regex_automata::nfa::thompson::WhichCaptures::None, + unk => anyhow::bail!("unrecognized captures option '{}'", unk), + }; + Ok(WhichCaptures { which }) + } +} diff --git a/regex-cli/args/thompson.rs b/regex-cli/args/thompson.rs index 151fc6a0b..bd8388d11 100644 --- a/regex-cli/args/thompson.rs +++ b/regex-cli/args/thompson.rs @@ -70,11 +70,11 @@ impl Configurable for Config { Arg::Long("shrink") => { self.thompson = self.thompson.clone().shrink(true); } - Arg::Long("no-captures") => { - self.thompson = self - .thompson - .clone() - .which_captures(thompson::WhichCaptures::None); + Arg::Long("captures") => { + let which: flags::WhichCaptures = + args::parse(p, "--captures")?; + self.thompson = + self.thompson.clone().which_captures(which.which); } Arg::Long("line-terminator") => { let byte: flags::OneByte = @@ -136,19 +136,7 @@ spent shrinking the NFA can lead to far larger savings in the subsequent DFA determinization. "#, ), - Usage::new( - "--no-captures", - "Disable capture states.", - r#" -Disables capture states. By default, NFAs include special "capture" states that -instruct some regex engines (like the PikeVM) to record offset positions in -ancillary state. - -It can be useful to disable capture states in order to reduce "clutter" in the -automaton when debugging it. Also, at time of writing, reverse NFAs require -that capture groups are disabled. 
-"#, - ), + flags::WhichCaptures::USAGE, Usage::new( "--line-terminator", "Set the line terminator used by line anchors.", From 837fd85e79fac2a4ea64030411b9a4a7b17dfa42 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 1 Nov 2023 11:53:34 -0400 Subject: [PATCH 04/35] regex-cli-0.2.0 --- regex-cli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index 3fe5390aa..a107c09df 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-cli" -version = "0.1.1" #:version +version = "0.2.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = """ A command line tool for debugging, ad hoc benchmarking and generating regular From 4f5992fa442bb469be7042454e6a1b74181dd9a5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 4 Dec 2023 20:10:23 -0500 Subject: [PATCH 05/35] doc: tweak `Captures` documentation This was suggested [on Discord](https://discord.com/channels/273534239310479360/1120175689124036669/1181401471720370237). --- src/regex/bytes.rs | 13 +++++++++---- src/regex/string.rs | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 19f5701af..fd36c8676 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -1568,10 +1568,15 @@ impl<'h> From> for core::ops::Range { /// Represents the capture groups for a single match. /// -/// Capture groups refer to parts of a regex enclosed in parentheses. They can -/// be optionally named. The purpose of capture groups is to be able to -/// reference different parts of a match based on the original pattern. For -/// example, say you want to match the individual letters in a 5-letter word: +/// Capture groups refer to parts of a regex enclosed in parentheses. They +/// can be optionally named. The purpose of capture groups is to be able to +/// reference different parts of a match based on the original pattern. In +/// essence, a `Captures` is a container of [`Match`] values for each group +/// that participated in a regex match. Each `Match` can be looked up by either +/// its capture group index or name (if it has one). +/// +/// For example, say you want to match the individual letters in a 5-letter +/// word: /// /// ```text /// (?\w)(\w)(?:\w)\w(?\w) diff --git a/src/regex/string.rs b/src/regex/string.rs index 880d6082a..25f43a7f3 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -1573,10 +1573,15 @@ impl<'h> From> for core::ops::Range { /// Represents the capture groups for a single match. /// -/// Capture groups refer to parts of a regex enclosed in parentheses. They can -/// be optionally named. The purpose of capture groups is to be able to -/// reference different parts of a match based on the original pattern. For -/// example, say you want to match the individual letters in a 5-letter word: +/// Capture groups refer to parts of a regex enclosed in parentheses. They +/// can be optionally named. The purpose of capture groups is to be able to +/// reference different parts of a match based on the original pattern. In +/// essence, a `Captures` is a container of [`Match`] values for each group +/// that participated in a regex match. Each `Match` can be looked up by either +/// its capture group index or name (if it has one). 
+/// +/// For example, say you want to match the individual letters in a 5-letter +/// word: /// /// ```text /// (?\w)(\w)(?:\w)\w(?\w) From a3d5975c35249b0f726078a19160f9269da64964 Mon Sep 17 00:00:00 2001 From: kloune Date: Wed, 20 Dec 2023 18:22:55 +0100 Subject: [PATCH 06/35] doc: fix link in Index<&str> impl docs This referenced `Captures::get`, but it should reference `Captures::name`. This was likely a transcription error from the docs for the `Index` impl. --- src/regex/bytes.rs | 2 +- src/regex/string.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index fd36c8676..ea4f7cd65 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -1989,7 +1989,7 @@ impl<'h> core::ops::Index for Captures<'h> { /// The haystack substring returned can't outlive the `Captures` object if this /// method is used, because of how `Index` is defined (normally `a[i]` is part /// of `a` and can't outlive it). To work around this limitation, do that, use -/// [`Captures::get`] instead. +/// [`Captures::name`] instead. /// /// `'h` is the lifetime of the matched haystack, but the lifetime of the /// `&str` returned by this implementation is the lifetime of the `Captures` diff --git a/src/regex/string.rs b/src/regex/string.rs index 25f43a7f3..824f45c69 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -1992,7 +1992,7 @@ impl<'h> core::ops::Index for Captures<'h> { /// The haystack substring returned can't outlive the `Captures` object if this /// method is used, because of how `Index` is defined (normally `a[i]` is part /// of `a` and can't outlive it). To work around this limitation, do that, use -/// [`Captures::get`] instead. +/// [`Captures::name`] instead. /// /// `'h` is the lifetime of the matched haystack, but the lifetime of the /// `&str` returned by this implementation is the lifetime of the `Captures` From dc0a9d239cf34cda09cbdeffea3919b4c627836c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 28 Dec 2023 20:40:03 -0500 Subject: [PATCH 07/35] ci: small clean-ups The regex 1.10 release bumped the MSRV to Rust 1.65, so we no longer need to pin to an older memchr release. We also bump to `actions/checkout@v4`. --- .github/workflows/ci.yml | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2813a1676..eb8e9f86e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,7 +79,7 @@ jobs: rust: stable-x86_64-gnu steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -137,30 +137,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: toolchain: 1.65.0 - # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it - # turned out that on aarch64, it was using something that wasn't stabilized - # until Rust 1.61[1]. (This was an oversight on my part. I had previously - # thought everything I needed was on Rust 1.60.) To resolve that, I just - # bumped memchr's MSRV to 1.61. Since it was so soon after the memchr 2.6 - # release, I treated this as a bugfix. - # - # But the regex crate's MSRV is at Rust 1.60, and it now depends on at - # least memchr 2.6 (to make use of its `alloc` feature). So we can't set - # a lower minimal version. 
And I can't just bump the MSRV in a patch - # release as a bug fix because regex 1.9 was released quite some time ago. - # I could just release regex 1.10 and bump the MSRV there, but eh, I don't - # want to put out another minor version release just for this. - # - # So... pin memchr to 2.6.2, which at least works on x86-64 on Rust 1.60. - # - # [1]: https://github.com/BurntSushi/memchr/issues/136 - - name: Pin memchr to 2.6.2 - run: cargo update -p memchr --precise 2.6.2 - name: Basic build run: cargo build --verbose - name: Build docs @@ -177,7 +158,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -190,7 +171,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -203,7 +184,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -216,7 +197,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -231,7 +212,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -248,7 +229,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: From 027eebd6fde307076603530c999afcfd271bb037 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 10 Jan 2024 08:16:51 -0500 Subject: [PATCH 08/35] cargo: set 'default-features = false' for memchr and aho-corasick I'm not sure how this one slipped by. Without this, I'd suppose that no-std support doesn't actually work? Or at least, one would have to disable the use of both memchr and aho-corasick entirely, since they depend on std by default. Not quite sure how to test this. Fixes #1147 --- CHANGELOG.md | 11 +++++++++++ Cargo.toml | 2 ++ 2 files changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 420e08f74..38da512ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +1.10.3 (TBD) +============ +This is a new patch release that fixes the feature configuration of optional +dependencies. + +Bug fixes: + +* [BUG #1147](https://github.com/rust-lang/regex/issues/1147): +Set `default-features=false` for the `memchr` and `aho-corasick` dependencies. + + 1.10.2 (2023-10-16) =================== This is a new patch release that fixes a search regression where incorrect diff --git a/Cargo.toml b/Cargo.toml index 3ba14c904..09f70f6d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -165,11 +165,13 @@ pattern = [] [dependencies.aho-corasick] version = "1.0.0" optional = true +default-features = false # For skipping along search text quickly when a leading byte is known. [dependencies.memchr] version = "2.6.0" optional = true +default-features = false # For the actual regex engines. 
[dependencies.regex-automata] From fbd2537a58008c1045bc063b30a3e1f130742aac Mon Sep 17 00:00:00 2001 From: Giacomo Stevanato Date: Fri, 19 Jan 2024 22:03:37 +0100 Subject: [PATCH 09/35] safety: guard in Input::new against incorrect AsRef implementations Before this commit, Input::new calls haystack.as_ref() twice, once to get the actual haystack slice and the second time to get its length. It makes the assumption that the second call will return the same slice, but malicious implementations of AsRef can return different slices and thus different lengths. This is important because there's unsafe code relying on the Input's span being inbounds with respect to the haystack, but if the second call to .as_ref() returns a bigger slice this won't be true. For example, this snippet causes Miri to report UB on an unchecked slice access in find_fwd_imp (though it will also panic sometime later when run normally, but at that point the UB already happened): use regex_automata::{Input, meta::{Builder, Config}}; use std::cell::Cell; struct Bad(Cell); impl AsRef<[u8]> for Bad { fn as_ref(&self) -> &[u8] { if self.0.replace(false) { &[] } else { &[0; 1000] } } } let bad = Bad(Cell::new(true)); let input = Input::new(&bad); let regex = Builder::new() // Not setting this causes some checked access to occur before // the unchecked ones, avoiding the UB .configure(Config::new().auto_prefilter(false)) .build("a+") .unwrap(); regex.find(input); This commit fixes the problem by just calling .as_ref() once and use the length of the returned slice as the span's end value. A regression test has also been added. Closes #1154 --- CHANGELOG.md | 4 +++- regex-automata/src/util/search.rs | 28 ++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38da512ed..527950518 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,14 @@ 1.10.3 (TBD) ============ This is a new patch release that fixes the feature configuration of optional -dependencies. +dependencies, and fixes an unsound use of bounds check elision. Bug fixes: * [BUG #1147](https://github.com/rust-lang/regex/issues/1147): Set `default-features=false` for the `memchr` and `aho-corasick` dependencies. +* [BUG #1154](https://github.com/rust-lang/regex/pull/1154): +Fix unsound bounds check elision. 1.10.2 (2023-10-16) diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs index 39aec522b..05b1cff54 100644 --- a/regex-automata/src/util/search.rs +++ b/regex-automata/src/util/search.rs @@ -110,9 +110,14 @@ impl<'h> Input<'h> { /// Create a new search configuration for the given haystack. #[inline] pub fn new>(haystack: &'h H) -> Input<'h> { + // Perform only one call to `haystack.as_ref()` to protect from incorrect + // implementations that return different values from multiple calls. + // This is important because there's code that relies on `span` not being + // out of bounds with respect to the stored `haystack`. 
+ let haystack = haystack.as_ref(); Input { - haystack: haystack.as_ref(), - span: Span { start: 0, end: haystack.as_ref().len() }, + haystack, + span: Span { start: 0, end: haystack.len() }, anchored: Anchored::No, earliest: false, } @@ -1966,4 +1971,23 @@ mod tests { let expected_size = 3 * core::mem::size_of::(); assert_eq!(expected_size, core::mem::size_of::()); } + + #[test] + fn incorrect_asref_guard() { + struct Bad(std::cell::Cell); + + impl AsRef<[u8]> for Bad { + fn as_ref(&self) -> &[u8] { + if self.0.replace(false) { + &[] + } else { + &[0; 1000] + } + } + } + + let bad = Bad(std::cell::Cell::new(true)); + let input = Input::new(&bad); + assert!(input.end() <= input.haystack().len()); + } } From 1bc667d7b37c73d17d1d73d9711ff39f093a4280 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 21 Jan 2024 09:08:28 -0500 Subject: [PATCH 10/35] changelog: 1.10.3 --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 527950518..3ffd961d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ -1.10.3 (TBD) -============ +1.10.3 (2024-01-21) +=================== This is a new patch release that fixes the feature configuration of optional dependencies, and fixes an unsound use of bounds check elision. From e7b5401f6aae8db34fedcd4ed0c36f539279e4e9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 21 Jan 2024 09:08:38 -0500 Subject: [PATCH 11/35] regex-automata-0.4.4 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 3cb3d7c8e..7ca64eae7 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.3" #:version +version = "0.4.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 653bb5999647fd0abf1d11e2708dd6bb7607d749 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 21 Jan 2024 09:09:23 -0500 Subject: [PATCH 12/35] deps: bump regex-automata to 0.4.4 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 09f70f6d3..e00c39f38 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -176,7 +176,7 @@ default-features = false # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.4.3" +version = "0.4.4" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] From 0c0990399270277832fbb5b91a1fa118e6f63dba Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 21 Jan 2024 09:09:31 -0500 Subject: [PATCH 13/35] 1.10.3 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e00c39f38..c254659d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.10.2" #:version +version = "1.10.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From 07ef7f1550d59a548ee58631cf2bca263e67cb8e Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Thu, 25 Jan 2024 16:25:28 +0100 Subject: [PATCH 14/35] automata: make additional prefileter metadata public This commit exposes `is_fast` and also adds `max_needle_len` to a prefilter. This is useful for engines implemented outside of `regex-automata`. 
PR #1156 --- regex-automata/src/util/prefilter/mod.rs | 31 +++++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/regex-automata/src/util/prefilter/mod.rs b/regex-automata/src/util/prefilter/mod.rs index 51fc92233..d20442a69 100644 --- a/regex-automata/src/util/prefilter/mod.rs +++ b/regex-automata/src/util/prefilter/mod.rs @@ -146,6 +146,8 @@ pub struct Prefilter { pre: Arc, #[cfg(feature = "alloc")] is_fast: bool, + #[cfg(feature = "alloc")] + max_needle_len: usize, } impl Prefilter { @@ -202,12 +204,19 @@ impl Prefilter { kind: MatchKind, needles: &[B], ) -> Option { - Choice::new(kind, needles).and_then(Prefilter::from_choice) + Choice::new(kind, needles).and_then(|choice| { + let max_needle_len = + needles.iter().map(|b| b.as_ref().len()).max().unwrap_or(0); + Prefilter::from_choice(choice, max_needle_len) + }) } /// This turns a prefilter selection into a `Prefilter`. That is, in turns /// the enum given into a trait object. - fn from_choice(choice: Choice) -> Option { + fn from_choice( + choice: Choice, + max_needle_len: usize, + ) -> Option { #[cfg(not(feature = "alloc"))] { None @@ -224,7 +233,7 @@ impl Prefilter { Choice::AhoCorasick(p) => Arc::new(p), }; let is_fast = pre.is_fast(); - Some(Prefilter { pre, is_fast }) + Some(Prefilter { pre, is_fast, max_needle_len }) } } @@ -411,6 +420,20 @@ impl Prefilter { } } + /// Return the length of the longest needle + /// in this Prefilter + #[inline] + pub fn max_needle_len(&self) -> usize { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.max_needle_len + } + } + /// Implementations might return true here if they believe themselves to /// be "fast." The concept of "fast" is deliberately left vague, but in /// practice this usually corresponds to whether it's believed that SIMD @@ -429,7 +452,7 @@ impl Prefilter { /// *know* a prefilter will be fast without actually trying the prefilter. /// (Which of course we cannot afford to do.) #[inline] - pub(crate) fn is_fast(&self) -> bool { + pub fn is_fast(&self) -> bool { #[cfg(not(feature = "alloc"))] { unreachable!() From d7f9347f2a8a7f4e7583c88876411da12a09b572 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 25 Jan 2024 10:25:38 -0500 Subject: [PATCH 15/35] regex-automata-0.4.5 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 7ca64eae7..199985f4e 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.4" #:version +version = "0.4.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 10fe722a3fcfdc17068b21f3262189cc52227bb5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 25 Feb 2024 20:37:04 -0500 Subject: [PATCH 16/35] style: clean up some recent lint violations It looks like `dead_code` got a little smarter, and more pervasively, some new lint that detects superfluous imports found a bunch of them. 
--- regex-automata/src/dfa/dense.rs | 2 +- regex-automata/src/dfa/sparse.rs | 6 +----- regex-automata/src/nfa/thompson/compiler.rs | 6 +++--- regex-automata/src/nfa/thompson/range_trie.rs | 6 +----- regex-automata/src/util/determinize/state.rs | 2 +- regex-automata/src/util/int.rs | 14 ++++---------- regex-automata/src/util/wire.rs | 19 +------------------ regex-capi/src/error.rs | 3 +-- regex-cli/cmd/debug/dfa.rs | 2 +- regex-cli/logger.rs | 2 +- regex-syntax/src/ast/parse.rs | 2 -- regex-syntax/src/hir/interval.rs | 17 ----------------- regex-syntax/src/hir/translate.rs | 5 ++--- regex-test/lib.rs | 4 +--- 14 files changed, 18 insertions(+), 72 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 6fc61dc4f..8e0f33c03 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -9,7 +9,7 @@ This module also contains a [`dense::Builder`](Builder) and a #[cfg(feature = "dfa-build")] use core::cmp; -use core::{convert::TryFrom, fmt, iter, mem::size_of, slice}; +use core::{fmt, iter, mem::size_of, slice}; #[cfg(feature = "dfa-build")] use alloc::{ diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index d461e0a0f..46278c181 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -38,11 +38,7 @@ assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match()); #[cfg(feature = "dfa-build")] use core::iter; -use core::{ - convert::{TryFrom, TryInto}, - fmt, - mem::size_of, -}; +use core::{fmt, mem::size_of}; #[cfg(feature = "dfa-build")] use alloc::{vec, vec::Vec}; diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2d2172957..e6b1c9122 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1876,11 +1876,11 @@ impl Utf8Node { #[cfg(test)] mod tests { - use alloc::{vec, vec::Vec}; + use alloc::vec; use crate::{ - nfa::thompson::{SparseTransitions, State, Transition, NFA}, - util::primitives::{PatternID, SmallIndex, StateID}, + nfa::thompson::{SparseTransitions, State}, + util::primitives::SmallIndex, }; use super::*; diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index cd77cc150..49debda40 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -141,7 +141,7 @@ construction later by virtue of producing a much much smaller NFA. [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601 */ -use core::{cell::RefCell, convert::TryFrom, fmt, mem, ops::RangeInclusive}; +use core::{cell::RefCell, fmt, mem, ops::RangeInclusive}; use alloc::{format, string::String, vec, vec::Vec}; @@ -915,10 +915,6 @@ fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool { #[cfg(test)] mod tests { - use core::ops::RangeInclusive; - - use regex_syntax::utf8::Utf8Range; - use super::*; fn r(range: RangeInclusive) -> Utf8Range { diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index effa6f44d..8a8561a31 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -86,7 +86,7 @@ serialized anywhere. So any kind of change can be made with reckless abandon, as long as everything in this module agrees. 
*/ -use core::{convert::TryFrom, mem}; +use core::mem; use alloc::{sync::Arc, vec::Vec}; diff --git a/regex-automata/src/util/int.rs b/regex-automata/src/util/int.rs index e6b13bff9..b726e93f8 100644 --- a/regex-automata/src/util/int.rs +++ b/regex-automata/src/util/int.rs @@ -41,6 +41,10 @@ like `u64::from` where possible, or even `usize::try_from()` for when we do explicitly want to panic or when we want to return an error for overflow. */ +// We define a little more than what we need, but I'd rather just have +// everything via a consistent and uniform API then have holes. +#![allow(dead_code)] + pub(crate) trait U8 { fn as_usize(self) -> usize; } @@ -240,13 +244,3 @@ impl Pointer for *const T { self as usize } } - -pub(crate) trait PointerMut { - fn as_usize(self) -> usize; -} - -impl PointerMut for *mut T { - fn as_usize(self) -> usize { - self as usize - } -} diff --git a/regex-automata/src/util/wire.rs b/regex-automata/src/util/wire.rs index ecf4fd8c0..b1351c7e9 100644 --- a/regex-automata/src/util/wire.rs +++ b/regex-automata/src/util/wire.rs @@ -41,11 +41,7 @@ generally requires serializing both its big-endian and little-endian variants, and then loading the correct one based on the target's endianness. */ -use core::{ - cmp, - convert::{TryFrom, TryInto}, - mem::size_of, -}; +use core::{cmp, mem::size_of}; #[cfg(feature = "alloc")] use alloc::{vec, vec::Vec}; @@ -867,11 +863,6 @@ pub(crate) trait Endian { /// this panics. fn write_u32(n: u32, dst: &mut [u8]); - /// Writes a u64 to the given destination buffer in a particular - /// endianness. If the destination buffer has a length smaller than 8, then - /// this panics. - fn write_u64(n: u64, dst: &mut [u8]); - /// Writes a u128 to the given destination buffer in a particular /// endianness. If the destination buffer has a length smaller than 16, /// then this panics. @@ -897,10 +888,6 @@ impl Endian for LE { dst[..4].copy_from_slice(&n.to_le_bytes()); } - fn write_u64(n: u64, dst: &mut [u8]) { - dst[..8].copy_from_slice(&n.to_le_bytes()); - } - fn write_u128(n: u128, dst: &mut [u8]) { dst[..16].copy_from_slice(&n.to_le_bytes()); } @@ -915,10 +902,6 @@ impl Endian for BE { dst[..4].copy_from_slice(&n.to_be_bytes()); } - fn write_u64(n: u64, dst: &mut [u8]) { - dst[..8].copy_from_slice(&n.to_be_bytes()); - } - fn write_u128(n: u128, dst: &mut [u8]) { dst[..16].copy_from_slice(&n.to_be_bytes()); } diff --git a/regex-capi/src/error.rs b/regex-capi/src/error.rs index a269a3913..7b91fb9d3 100644 --- a/regex-capi/src/error.rs +++ b/regex-capi/src/error.rs @@ -4,7 +4,6 @@ use std::fmt; use std::str; use libc::c_char; -use regex; #[derive(Debug)] pub struct Error { @@ -22,7 +21,7 @@ pub enum ErrorKind { impl Error { pub fn new(kind: ErrorKind) -> Error { - Error { message: None, kind: kind } + Error { message: None, kind } } pub fn is_err(&self) -> bool { diff --git a/regex-cli/cmd/debug/dfa.rs b/regex-cli/cmd/debug/dfa.rs index 9381cdadc..f16610fbe 100644 --- a/regex-cli/cmd/debug/dfa.rs +++ b/regex-cli/cmd/debug/dfa.rs @@ -5,7 +5,7 @@ use crate::{ util::{self, Table}, }; -use {lexopt, regex_automata::dfa::Automaton}; +use regex_automata::dfa::Automaton; pub fn run_dense(p: &mut lexopt::Parser) -> anyhow::Result<()> { const USAGE: &'static str = "\ diff --git a/regex-cli/logger.rs b/regex-cli/logger.rs index 0fe063f1c..4e783872e 100644 --- a/regex-cli/logger.rs +++ b/regex-cli/logger.rs @@ -3,7 +3,7 @@ // print to stderr. We therefore avoid bringing in extra dependencies just // for this functionality. 
-use log::{self, Log}; +use log::Log; /// The simplest possible logger that logs to stderr. /// diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 593b14fbc..1a3df56b5 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -2405,8 +2405,6 @@ mod tests { use alloc::format; - use crate::ast::{self, Ast, Position, Span}; - use super::*; // Our own assert_eq, which has slightly better formatting (but honestly diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index e063390a8..d507ee724 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -479,23 +479,6 @@ pub trait Interval: ret } - /// Compute the symmetric difference the given range from this range. This - /// returns the union of the two ranges minus its intersection. - fn symmetric_difference( - &self, - other: &Self, - ) -> (Option, Option) { - let union = match self.union(other) { - None => return (Some(self.clone()), Some(other.clone())), - Some(union) => union, - }; - let intersection = match self.intersect(other) { - None => return (Some(self.clone()), Some(other.clone())), - Some(intersection) => intersection, - }; - union.difference(&intersection) - } - /// Returns true if and only if the two ranges are contiguous. Two ranges /// are contiguous if and only if the ranges are either overlapping or /// adjacent. diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 313a1e9e8..3749ce307 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -1358,9 +1358,8 @@ fn ascii_class_as_chars( #[cfg(test)] mod tests { use crate::{ - ast::{self, parse::ParserBuilder, Ast, Position, Span}, - hir::{self, Hir, HirKind, Look, Properties}, - unicode::{self, ClassQuery}, + ast::{parse::ParserBuilder, Position}, + hir::{Look, Properties}, }; use super::*; diff --git a/regex-test/lib.rs b/regex-test/lib.rs index 2b630666e..7b5ab830c 100644 --- a/regex-test/lib.rs +++ b/regex-test/lib.rs @@ -99,9 +99,7 @@ See [`MatchKind`] for more details. This is an optional field and defaults to /// For this reason, `anyhow` is a public dependency and is re-exported here. pub extern crate anyhow; -use std::{ - borrow::Borrow, collections::HashSet, convert::TryFrom, fs, path::Path, -}; +use std::{borrow::Borrow, collections::HashSet, fs, path::Path}; use { anyhow::{bail, Context, Result}, From 9cf4a42a9361f42d9aa6afd1245c0e37dc0c8771 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 4 Mar 2024 07:30:16 -0500 Subject: [PATCH 17/35] automata: fix bug where reverse NFA lacked an unanchored prefix Previously, when compiling a Thompson NFA, we were omitting an unanchored prefix when the HIR contained a `^` in its prefix. We did this because unanchored prefix in that case would never match because of the requirement imposed by `^`. The problem with that is it's incorrect when compiling a reverse automaton. For example, in the case of building a reverse NFA for `^Qu`, we should sitll include an unanchored prefix because the `^` in that case has no conflict with it. It would be like if we omitted an unanchored prefix for `Qu$` in a forward NFA, which is obviously wrong. The fix here is pretty simple: in the reverse case, check for `$` in the suffix of the HIR rather than a `^` in the prefix. 
Fixes #1169 --- regex-automata/src/nfa/thompson/compiler.rs | 89 ++++++++++++++++++++- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index e6b1c9122..668bca87c 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -961,10 +961,12 @@ impl Compiler { // for all matches. When an unanchored prefix is not added, then the // NFA's anchored and unanchored start states are equivalent. let all_anchored = exprs.iter().all(|e| { - e.borrow() - .properties() - .look_set_prefix() - .contains(hir::Look::Start) + let props = e.borrow().properties(); + if self.config.get_reverse() { + props.look_set_suffix().contains(hir::Look::End) + } else { + props.look_set_prefix().contains(hir::Look::Start) + } }); let anchored = !self.config.get_unanchored_prefix() || all_anchored; let unanchored_prefix = if anchored { @@ -1928,6 +1930,11 @@ mod tests { State::Sparse(SparseTransitions { transitions }) } + fn s_look(look: Look, next: usize) -> State { + let next = sid(next); + State::Look { look, next } + } + fn s_bin_union(alt1: usize, alt2: usize) -> State { State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) } } @@ -1978,6 +1985,80 @@ mod tests { ); } + #[test] + fn compile_no_unanchored_prefix_with_start_anchor() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"^a") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)] + ); + } + + #[test] + fn compile_yes_unanchored_prefix_with_end_anchor() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"a$") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_byte(b'a', 3), + s_look(Look::End, 4), + s_match(0), + ] + ); + } + + #[test] + fn compile_yes_reverse_unanchored_prefix_with_start_anchor() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .reverse(true) + .which_captures(WhichCaptures::None), + ) + .build(r"^a") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_byte(b'a', 3), + // Anchors get flipped in a reverse automaton. + s_look(Look::End, 4), + s_match(0), + ], + ); + } + + #[test] + fn compile_no_reverse_unanchored_prefix_with_end_anchor() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .reverse(true) + .which_captures(WhichCaptures::None), + ) + .build(r"a$") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + // Anchors get flipped in a reverse automaton. + s_look(Look::Start, 1), + s_byte(b'a', 2), + s_match(0), + ], + ); + } + #[test] fn compile_empty() { assert_eq!(build("").states(), &[s_match(0),]); From a5ae35153a6ec61e64cb297155f7d91c11b629c7 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 4 Mar 2024 07:49:17 -0500 Subject: [PATCH 18/35] regex-automata-0.4.6 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 199985f4e..40a0ebfb9 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.5" #:version +version = "0.4.6" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." 
documentation = "https://docs.rs/regex-automata" From 088d7f3269665a11aabadd89335eb09316e9c785 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 22 Mar 2024 20:34:26 -0400 Subject: [PATCH 19/35] api: add Cow guarantee to replace API This adds a guarantee to the API of the `replace`, `replace_all` and `replacen` routines that, when `Cow::Borrowed` is returned, it is guaranteed that it is equivalent to the `haystack` given. The implementation has always matched this behavior, but this elevates the implementation behavior to an API guarantee. There do exists implementations where this guarantee might not be upheld in every case. For example, if the final result were the empty string, we could return a `Cow::Borrowed`. Similarly, if the final result were a substring of `haystack`, then `Cow::Borrowed` could be returned in that case too. In practice, these sorts of optimizations are tricky to do in practice, and seem like niche corner cases that aren't important to optimize. Nevertheless, having this guarantee is useful because it can be used as a signal that the original input remains unchanged. This came up in discussions with @quicknir on Discord. Namely, in cases where one is doing a sequence of replacements and in most cases nothing is replaced, using a `Cow` is nice to be able to avoid copying the haystack over and over again. But to get this to work right, you have to know whether a `Cow::Borrowed` matches the input or not. If it doesn't, then you'd need to transform it into an owned string. For example, this code tries to do replacements on each of a sequence of `Cow` values, where the common case is no replacement: ```rust use std::borrow::Cow; use regex::Regex; fn trim_strs(strs: &mut Vec>) { strs .iter_mut() .for_each(|s| moo(s, ®ex_replace)); } fn moo Cow>(c: &mut Cow, f: F) { let result = f(&c); match result { Cow::Owned(s) => *c = Cow::Owned(s), Cow::Borrowed(s) => { *c = Cow::Borrowed(s); } } } fn regex_replace(s: &str) -> Cow { Regex::new(r"does-not-matter").unwrap().replace_all(s, "whatever") } ``` But this doesn't pass `borrowck`. Instead, you could write `moo` like this: ```rust fn moo Cow>(c: &mut Cow, f: F) { let result = f(&c); match result { Cow::Owned(s) => *c = Cow::Owned(s), Cow::Borrowed(s) => { if !std::ptr::eq(s, &**c) { *c = Cow::Owned(s.to_owned()) } } } } ``` But the `std::ptr:eq` call here is a bit strange. Instead, after this PR and the new guarantee, one can write it like this: ```rust fn moo Cow>(c: &mut Cow, f: F) { if let Cow::Owned(s) = f(&c) { *c = Cow::Owned(s); } } ``` --- src/regex/bytes.rs | 17 +++++++++++++++++ src/regex/string.rs | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index ea4f7cd65..7b7aad574 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -651,6 +651,9 @@ impl Regex { /// case, this implementation will likely return a `Cow::Borrowed` value /// such that no allocation is performed. /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// # Replacement string syntax /// /// All instances of `$ref` in the replacement string are replaced with @@ -761,6 +764,13 @@ impl Regex { /// replacement provided. This is the same as calling `replacen` with /// `limit` set to `0`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. 
+ /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// @@ -855,6 +865,13 @@ impl Regex { /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is /// equivalent to `Regex::replacen(hay, 0, rep)`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// diff --git a/src/regex/string.rs b/src/regex/string.rs index 824f45c69..dba94d46e 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -642,6 +642,9 @@ impl Regex { /// case, this implementation will likely return a `Cow::Borrowed` value /// such that no allocation is performed. /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// # Replacement string syntax /// /// All instances of `$ref` in the replacement string are replaced with @@ -748,6 +751,13 @@ impl Regex { /// replacement provided. This is the same as calling `replacen` with /// `limit` set to `0`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// @@ -842,6 +852,13 @@ impl Regex { /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is /// equivalent to `Regex::replacen(hay, 0, rep)`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// From aa2d8bd8be283471b17b4ab6faeae5b751553572 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 22 Mar 2024 21:25:29 -0400 Subject: [PATCH 20/35] 1.10.4 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c254659d7..68ac658c6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.10.3" #:version +version = "1.10.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" From f5d0b69b750369d535c3f200222132403b0d9bff Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 26 Mar 2024 18:30:13 +0100 Subject: [PATCH 21/35] syntax: accept `{,n}` as an equivalent to `{0,n}` Most regular expression engines don't accept the `{,n}` syntax, but some other do it (namely Python's `re` library). 
This introduces a new parser configuration option that enables the `{,n}` syntax. PR #1086 --- regex-syntax/src/ast/parse.rs | 69 +++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 1a3df56b5..0c2a35265 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -124,6 +124,7 @@ pub struct ParserBuilder { ignore_whitespace: bool, nest_limit: u32, octal: bool, + empty_min_range: bool, } impl Default for ParserBuilder { @@ -139,6 +140,7 @@ impl ParserBuilder { ignore_whitespace: false, nest_limit: 250, octal: false, + empty_min_range: false, } } @@ -149,6 +151,7 @@ impl ParserBuilder { capture_index: Cell::new(0), nest_limit: self.nest_limit, octal: self.octal, + empty_min_range: self.empty_min_range, initial_ignore_whitespace: self.ignore_whitespace, ignore_whitespace: Cell::new(self.ignore_whitespace), comments: RefCell::new(vec![]), @@ -221,6 +224,18 @@ impl ParserBuilder { self.ignore_whitespace = yes; self } + + /// Allow using `{,n}` as an equivalent to `{0,n}`. + /// + /// When enabled, the parser accepts `{,n}` as valid syntax for `{0,n}`. + /// Most regular expression engines don't support the `{,n}` syntax, but + /// some others do it, namely Python's `re` library. + /// + /// This is disabled by default. + pub fn empty_min_range(&mut self, yes: bool) -> &mut ParserBuilder { + self.empty_min_range = yes; + self + } } /// A regular expression parser. @@ -246,6 +261,9 @@ pub struct Parser { /// The initial setting for `ignore_whitespace` as provided by /// `ParserBuilder`. It is used when resetting the parser's state. initial_ignore_whitespace: bool, + /// Whether the parser supports `{,n}` repetitions as an equivalent to + /// `{0,n}.` + empty_min_range: bool, /// Whether whitespace should be ignored. When enabled, comments are /// also permitted. ignore_whitespace: Cell, @@ -1114,15 +1132,14 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parse_decimal(), ast::ErrorKind::DecimalEmpty, ast::ErrorKind::RepetitionCountDecimalEmpty, - )?; - let mut range = ast::RepetitionRange::Exactly(count_start); + ); if self.is_eof() { return Err(self.error( Span::new(start, self.pos()), ast::ErrorKind::RepetitionCountUnclosed, )); } - if self.char() == ',' { + let range = if self.char() == ',' { if !self.bump_and_bump_space() { return Err(self.error( Span::new(start, self.pos()), @@ -1130,16 +1147,33 @@ impl<'s, P: Borrow> ParserI<'s, P> { )); } if self.char() != '}' { + let count_start = match count_start { + Ok(c) => c, + Err(err) + if err.kind + == ast::ErrorKind::RepetitionCountDecimalEmpty => + { + if self.parser().empty_min_range { + 0 + } else { + return Err(err); + } + } + err => err?, + }; let count_end = specialize_err( self.parse_decimal(), ast::ErrorKind::DecimalEmpty, ast::ErrorKind::RepetitionCountDecimalEmpty, )?; - range = ast::RepetitionRange::Bounded(count_start, count_end); + ast::RepetitionRange::Bounded(count_start, count_end) } else { - range = ast::RepetitionRange::AtLeast(count_start); + ast::RepetitionRange::AtLeast(count_start?) } - } + } else { + ast::RepetitionRange::Exactly(count_start?) 
+ }; + if self.is_eof() || self.char() != '}' { return Err(self.error( Span::new(start, self.pos()), @@ -2459,6 +2493,11 @@ mod tests { ParserI::new(parser, pattern) } + fn parser_empty_min_range(pattern: &str) -> ParserI<'_, Parser> { + let parser = ParserBuilder::new().empty_min_range(true).build(); + ParserI::new(parser, pattern) + } + fn parser_nest_limit( pattern: &str, nest_limit: u32, @@ -3376,6 +3415,20 @@ bar ast: Box::new(lit('a', 0)), })) ); + assert_eq!( + parser_empty_min_range(r"a{,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(0, 9) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); assert_eq!( parser_ignore_whitespace(r"a{5,9} ?").parse(), Ok(Ast::repetition(ast::Repetition { @@ -4596,8 +4649,8 @@ bar assert_eq!( parser(r"\b{ ").parse().unwrap_err(), TestError { - span: span(4..4), - kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + span: span(2..4), + kind: ast::ErrorKind::RepetitionCountUnclosed, } ); // In this case, we got some valid chars that makes it look like the From d895bd984537538240e175cc55bc010307210468 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 26 Mar 2024 13:30:26 -0400 Subject: [PATCH 22/35] regex-syntax-0.8.3 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index c9ce87da7..1a25d1ce6 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.2" #:version +version = "0.8.3" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From 66a3bca217881fe8eee9a5a898aea4ecb1eb5cf3 Mon Sep 17 00:00:00 2001 From: JohnEndson <165029498+JohnEndson@users.noreply.github.com> Date: Thu, 28 Mar 2024 20:25:05 +0800 Subject: [PATCH 23/35] doc: remove repetitive words PR #1179 --- regex-automata/src/dfa/mod.rs | 2 +- regex-automata/src/meta/regex.rs | 4 ++-- regex-automata/src/nfa/thompson/range_trie.rs | 2 +- regex-automata/src/util/alphabet.rs | 2 +- regex-automata/src/util/captures.rs | 4 ++-- regex-automata/src/util/start.rs | 2 +- regex-cli/cmd/generate/fowler.rs | 2 +- regex-lite/src/lib.rs | 4 ++-- regex-syntax/src/ast/mod.rs | 2 +- src/regexset/bytes.rs | 2 +- src/regexset/string.rs | 2 +- 11 files changed, 14 insertions(+), 14 deletions(-) diff --git a/regex-automata/src/dfa/mod.rs b/regex-automata/src/dfa/mod.rs index fd58cac23..0e6a968e3 100644 --- a/regex-automata/src/dfa/mod.rs +++ b/regex-automata/src/dfa/mod.rs @@ -190,7 +190,7 @@ assert_eq!(matches, vec![ ``` Note that unlike dense DFAs, sparse DFAs have no alignment requirements. -Conversely, dense DFAs must be be aligned to the same alignment as a +Conversely, dense DFAs must be aligned to the same alignment as a [`StateID`](crate::util::primitives::StateID). 
# Support for `no_std` and `alloc`-only diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index a06d2bb48..8cfdecbec 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -1826,7 +1826,7 @@ impl Regex { /// /// The precise meaning of "accelerated" is specifically left unspecified, /// but the general meaning is that the search is a high likelihood of - /// running faster than than a character-at-a-time loop inside a standard + /// running faster than a character-at-a-time loop inside a standard /// regex engine. /// /// When a regex is accelerated, it is only a *probabilistic* claim. That @@ -2282,7 +2282,7 @@ impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {} /// /// Most of the regex engines in this crate require some kind of /// mutable state in order to execute a search. This mutable state is -/// explicitly separated from the the core regex object (such as a +/// explicitly separated from the core regex object (such as a /// [`thompson::NFA`](crate::nfa::thompson::NFA)) so that the read-only regex /// object can be shared across multiple threads simultaneously without any /// synchronization. Conversely, a `Cache` must either be duplicated if using diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 49debda40..93cce1699 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -693,7 +693,7 @@ impl NextInsert { /// handle: /// /// 1. The part where the two ranges actually overlap. i.e., The intersection. -/// 2. The part of the existing range that is not in the the new range. +/// 2. The part of the existing range that is not in the new range. /// 3. The part of the new range that is not in the old range. /// /// (1) is guaranteed to always occur since all overlapping ranges have a diff --git a/regex-automata/src/util/alphabet.rs b/regex-automata/src/util/alphabet.rs index 22b5a7644..e0e4d2fc1 100644 --- a/regex-automata/src/util/alphabet.rs +++ b/regex-automata/src/util/alphabet.rs @@ -699,7 +699,7 @@ impl ByteClassSet { ByteClassSet(ByteSet::empty()) } - /// Indicate the the range of byte given (inclusive) can discriminate a + /// Indicate the range of byte given (inclusive) can discriminate a /// match between it and all other bytes outside of the range. pub(crate) fn set_range(&mut self, start: u8, end: u8) { debug_assert!(start <= end); diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index 05db6a993..93a0a8afa 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -1643,7 +1643,7 @@ impl GroupInfo { /// /// This also returns `None` for all inputs if these captures are empty /// (e.g., built from an empty [`GroupInfo`]). To check whether captures - /// are are present for a specific pattern, use [`GroupInfo::group_len`]. + /// are present for a specific pattern, use [`GroupInfo::group_len`]. /// /// # Example /// @@ -1695,7 +1695,7 @@ impl GroupInfo { /// /// This also returns `None` for all inputs if these captures are empty /// (e.g., built from an empty [`GroupInfo`]). To check whether captures - /// are are present for a specific pattern, use [`GroupInfo::group_len`]. + /// are present for a specific pattern, use [`GroupInfo::group_len`]. 
/// /// # Example /// diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index 27153780e..97988b44b 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -323,7 +323,7 @@ impl core::fmt::Debug for StartByteMap { /// Represents the six possible starting configurations of a DFA search. /// -/// The starting configuration is determined by inspecting the the beginning +/// The starting configuration is determined by inspecting the beginning /// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID /// (if specified) and the type of search (anchored or not) is what selects the /// start state to use in a DFA. diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index c287f6f52..404c47721 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -178,7 +178,7 @@ impl TomlTest { // this trade off (to this extent anyway), so it really wants all // capturing groups... // - // So what we do here is is look for the number of groups in the + // So what we do here is look for the number of groups in the // pattern and then just pad out the capture matches with None // values to make the number of capture matches equal to what we // would expect from the pattern. (We actually parse the regex to diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index 9b394a480..0aca8221d 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -257,7 +257,7 @@ let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| { // regex matches, and in this context, we know we have a match. // // Note that we use `caps.name("y").unwrap().as_str()` instead of - // `&caps["y"]` because the the lifetime of the former is the same as the + // `&caps["y"]` because the lifetime of the former is the same as the // lifetime of `hay` above, but the lifetime of the latter is tied to the // lifetime of `caps` due to how the `Index` trait is defined. let year = caps.name("y").unwrap().as_str(); @@ -821,7 +821,7 @@ it, a longer haystack will take more time to search. * Very large regexes can searches to be quite slow due to increasing the size `m` in the worst case `O(m * n)` bound. This is especially true when they are combined with counted repetitions. While the regex size limit above will -protect you from the most egregious cases, the the default size limit still +protect you from the most egregious cases, the default size limit still permits pretty big regexes that can execute more slowly than one might expect. * While routines like [`Regex::find`] and [`Regex::captures`] guarantee worst case `O(m * n)` search time, routines like [`Regex::find_iter`] and diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 6a77ee134..ce79a89ab 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -711,7 +711,7 @@ pub enum LiteralKind { /// The literal is written as an octal escape, e.g., `\141`. Octal, /// The literal is written as a hex code with a fixed number of digits - /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or + /// depending on the type of the escape, e.g., `\x61` or `\u0061` or /// `\U00000061`. HexFixed(HexLiteralKind), /// The literal is written as a hex code with a bracketed number of diff --git a/src/regexset/bytes.rs b/src/regexset/bytes.rs index 1220a1466..2f46abc4d 100644 --- a/src/regexset/bytes.rs +++ b/src/regexset/bytes.rs @@ -355,7 +355,7 @@ impl RegexSet { ) -> bool { // This is pretty dumb. 
We should try to fix this, but the // regex-automata API doesn't provide a way to store matches in an - // arbitrary &mut [bool]. Thankfully, this API is is doc(hidden) and + // arbitrary &mut [bool]. Thankfully, this API is doc(hidden) and // thus not public... But regex-capi currently uses it. We should // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet // is in regex-automata, not regex. So maybe we should just accept a diff --git a/src/regexset/string.rs b/src/regexset/string.rs index 2a3e7b802..5cb9b5608 100644 --- a/src/regexset/string.rs +++ b/src/regexset/string.rs @@ -351,7 +351,7 @@ impl RegexSet { ) -> bool { // This is pretty dumb. We should try to fix this, but the // regex-automata API doesn't provide a way to store matches in an - // arbitrary &mut [bool]. Thankfully, this API is is doc(hidden) and + // arbitrary &mut [bool]. Thankfully, this API is doc(hidden) and // thus not public... But regex-capi currently uses it. We should // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet // is in regex-automata, not regex. So maybe we should just accept a From 4c565c8a636aa0e1e13dd801340ad6545fd256e9 Mon Sep 17 00:00:00 2001 From: careworry <167077904+careworry@users.noreply.github.com> Date: Thu, 18 Apr 2024 20:25:44 +0800 Subject: [PATCH 24/35] doc: fix typos PR #1182 --- regex-automata/src/dfa/dense.rs | 2 +- regex-automata/src/util/determinize/state.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 8e0f33c03..ed37d3b84 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -2498,7 +2498,7 @@ impl OwnedDFA { self.tt.set(from, byte, to); } - /// An an empty state (a state where all transitions lead to a dead state) + /// An empty state (a state where all transitions lead to a dead state) /// and return its identifier. The identifier returned is guaranteed to /// not point to any other existing state. /// diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index 8a8561a31..540d5d4d1 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -57,7 +57,7 @@ can only be used for adding NFA state IDs and recording some assertions. The expected flow here is to use the above builders to construct a candidate DFA state to check if it already exists. If it does, then there's no need to -freeze it into a `State`. It it doesn't exist, then `StateBuilderNFA::to_state` +freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state` can be called to freeze the builder into an immutable `State`. In either case, `clear` should be called on the builder to turn it back into a `StateBuilderEmpty` that reuses the underlying memory. From b12a2761f91320bc8bf8246f88d2884a90034b5a Mon Sep 17 00:00:00 2001 From: Luca Bruno Date: Mon, 22 Apr 2024 16:21:41 +0200 Subject: [PATCH 25/35] syntax/utf8: avoid a spurious vector reallocation This reworks `Utf8Sequences` logic in order to avoid allocating a 0-sized vector and immediately reallocating it for the initial element. Directly create the populated vector instead. I was looking at the memory usage patterns of [rolldown] through heaptrack, and this spot showed up as a potentially-spurious temporary allocation. The consumer side is [here][consumer side]. I do not have a specific benchmark for this. 
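For context, the constructor being reworked here is the public `Utf8Sequences::new`, which enumerates the UTF-8 byte ranges covering a span of Unicode scalar values (this is how Unicode classes get lowered to byte-oriented automata). A small stand-alone sketch, not part of the patch and assuming regex-syntax 0.8, of what it produces:

```rust
use regex_syntax::utf8::Utf8Sequences;

fn main() {
    // All UTF-8 byte sequences for the scalar values U+0080 through U+07FF.
    for seq in Utf8Sequences::new('\u{80}', '\u{7FF}') {
        // Each item is a sequence of byte ranges, printed roughly as
        // [C2-DF][80-BF].
        println!("{:?}", seq);
    }
}
```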
[rolldown]: https://github.com/rolldown/rolldown [consumer side]: https://github.com/rolldown/rolldown/blob/ce36a195ed4e9ce7c446557cefff4750a2268e01/crates/rolldown/src/utils/extract_hash_pattern.rs#L12 --- regex-syntax/src/utf8.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/regex-syntax/src/utf8.rs b/regex-syntax/src/utf8.rs index e13b55abf..69d749451 100644 --- a/regex-syntax/src/utf8.rs +++ b/regex-syntax/src/utf8.rs @@ -302,9 +302,9 @@ impl Utf8Sequences { /// Create a new iterator over UTF-8 byte ranges for the scalar value range /// given. pub fn new(start: char, end: char) -> Self { - let mut it = Utf8Sequences { range_stack: vec![] }; - it.push(u32::from(start), u32::from(end)); - it + let range = + ScalarRange { start: u32::from(start), end: u32::from(end) }; + Utf8Sequences { range_stack: vec![range] } } /// reset resets the scalar value range. From 9c139f4fa5c64a89075749cd5e57148c8eea8c22 Mon Sep 17 00:00:00 2001 From: Nathan West Date: Mon, 6 May 2024 19:55:10 -0400 Subject: [PATCH 26/35] syntax: simplify `Hir::dot` constructors This also likely avoids a spurious alloc or two, although it assuredly doesn't matter in practice. --- regex-syntax/src/hir/mod.rs | 52 +++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ae3ba318e..5db784388 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -658,16 +658,12 @@ impl Hir { #[inline] pub fn dot(dot: Dot) -> Hir { match dot { - Dot::AnyChar => { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) - } - Dot::AnyByte => { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\xFF')); - Hir::class(Class::Bytes(cls)) - } + Dot::AnyChar => Hir::class(Class::Unicode(ClassUnicode::new([ + ClassUnicodeRange::new('\0', '\u{10FFFF}'), + ]))), + Dot::AnyByte => Hir::class(Class::Bytes(ClassBytes::new([ + ClassBytesRange::new(b'\0', b'\xFF'), + ]))), Dot::AnyCharExcept(ch) => { let mut cls = ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); @@ -675,17 +671,17 @@ impl Hir { Hir::class(Class::Unicode(cls)) } Dot::AnyCharExceptLF => { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\x09')); - cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) + Hir::class(Class::Unicode(ClassUnicode::new([ + ClassUnicodeRange::new('\0', '\x09'), + ClassUnicodeRange::new('\x0B', '\u{10FFFF}'), + ]))) } Dot::AnyCharExceptCRLF => { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\x09')); - cls.push(ClassUnicodeRange::new('\x0B', '\x0C')); - cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) + Hir::class(Class::Unicode(ClassUnicode::new([ + ClassUnicodeRange::new('\0', '\x09'), + ClassUnicodeRange::new('\x0B', '\x0C'), + ClassUnicodeRange::new('\x0E', '\u{10FFFF}'), + ]))) } Dot::AnyByteExcept(byte) => { let mut cls = @@ -694,17 +690,17 @@ impl Hir { Hir::class(Class::Bytes(cls)) } Dot::AnyByteExceptLF => { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\x09')); - cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); - Hir::class(Class::Bytes(cls)) + Hir::class(Class::Bytes(ClassBytes::new([ + ClassBytesRange::new(b'\0', b'\x09'), + ClassBytesRange::new(b'\x0B', b'\xFF'), + ]))) } Dot::AnyByteExceptCRLF => { - let mut cls = 
ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\x09')); - cls.push(ClassBytesRange::new(b'\x0B', b'\x0C')); - cls.push(ClassBytesRange::new(b'\x0E', b'\xFF')); - Hir::class(Class::Bytes(cls)) + Hir::class(Class::Bytes(ClassBytes::new([ + ClassBytesRange::new(b'\0', b'\x09'), + ClassBytesRange::new(b'\x0B', b'\x0C'), + ClassBytesRange::new(b'\x0E', b'\xFF'), + ]))) } } } From 023f1c9ac117cd5ef2e45119b61b94f85d109667 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 8 May 2024 11:08:11 -0400 Subject: [PATCH 27/35] lite: fix attribute warning about rustfmt I'm not sure why I wrote it like this originally? --- regex-lite/src/utf8.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-lite/src/utf8.rs b/regex-lite/src/utf8.rs index 5f2a6a153..2730b602d 100644 --- a/regex-lite/src/utf8.rs +++ b/regex-lite/src/utf8.rs @@ -87,7 +87,7 @@ fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { // Splits the space of all bytes into equivalence classes, such that // any byte in the same class can never discriminate between whether a // particular sequence is valid UTF-8 or not. - #[cfg_attr(rustfmt, rustfmt::skip)] + #[rustfmt::skip] const CLASSES: [u8; 256] = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -101,7 +101,7 @@ fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { // A state machine taken from `bstr` which was in turn adapted from: // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ - #[cfg_attr(rustfmt, rustfmt::skip)] + #[rustfmt::skip] const STATES_FORWARD: &'static [u8] = &[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, From ddeb85eaa3bdf79d6306cc92a9d8bd89d839b5cd Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 8 May 2024 11:08:35 -0400 Subject: [PATCH 28/35] cli/deps: update memmap2 to 0.9 --- regex-cli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index a107c09df..ac69c9ec4 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -27,7 +27,7 @@ anyhow = "1.0.28" bstr = { version = "1.4.0", default-features = false, features = ["std"] } lexopt = "0.3.0" log = { version = "0.4.17", features = ["std"] } -memmap2 = "0.5.10" +memmap2 = "0.9.4" regex = { version = "1.9.0", path = ".." } regex-automata = { version = "0.4.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } From ab4c8d1f210a2e1011a4408476b2c708e64dcede Mon Sep 17 00:00:00 2001 From: denzenin Date: Mon, 3 Jun 2024 00:30:07 +0100 Subject: [PATCH 29/35] doc: fix duplicate phrasing typo PR #1198 --- regex-lite/src/string.rs | 4 ++-- src/regex/bytes.rs | 4 ++-- src/regex/string.rs | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 4e4de9068..5fe30ade3 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -1717,8 +1717,8 @@ impl<'h> Captures<'h> { /// /// This returns a tuple where the first element corresponds to the full /// substring of the haystack that matched the regex. The second element is - /// an array of substrings, with each corresponding to the to the substring - /// that matched for a particular capture group. + /// an array of substrings, with each corresponding to the substring that + /// matched for a particular capture group. 
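The doc comment being fixed here describes `Captures::extract`. A quick sketch of that API in action (not part of the patch; the same method exists on both `regex` and `regex-lite`, shown here with `regex`):

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
    // The tuple is (full match, [one substring per capture group]).
    let (full, [year, month, day]) =
        re.captures("2024-06-09").unwrap().extract();
    assert_eq!(("2024-06-09", "2024", "06", "09"), (full, year, month, day));
}
```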
/// /// # Panics /// diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 7b7aad574..3de4022a8 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -1711,8 +1711,8 @@ impl<'h> Captures<'h> { /// /// This returns a tuple where the first element corresponds to the full /// substring of the haystack that matched the regex. The second element is - /// an array of substrings, with each corresponding to the to the substring - /// that matched for a particular capture group. + /// an array of substrings, with each corresponding to the substring that + /// matched for a particular capture group. /// /// # Panics /// diff --git a/src/regex/string.rs b/src/regex/string.rs index dba94d46e..fab178a68 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -1716,8 +1716,8 @@ impl<'h> Captures<'h> { /// /// This returns a tuple where the first element corresponds to the full /// substring of the haystack that matched the regex. The second element is - /// an array of substrings, with each corresponding to the to the substring - /// that matched for a particular capture group. + /// an array of substrings, with each corresponding to the substring that + /// matched for a particular capture group. /// /// # Panics /// From 1f9f9ccd393fc5342aff6db5e3d47915e87a2554 Mon Sep 17 00:00:00 2001 From: Lee ByeongJun Date: Sun, 9 Jun 2024 20:29:34 +0900 Subject: [PATCH 30/35] bytes: escape invalid UTF-8 bytes in debug output for Match PR #1203 --- src/regex/bytes.rs | 102 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 11 deletions(-) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 3de4022a8..39af6e71c 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -1555,18 +1555,13 @@ impl<'h> Match<'h> { impl<'h> core::fmt::Debug for Match<'h> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use regex_automata::util::escape::DebugHaystack; + let mut fmt = f.debug_struct("Match"); - fmt.field("start", &self.start).field("end", &self.end); - if let Ok(s) = core::str::from_utf8(self.as_bytes()) { - fmt.field("bytes", &s); - } else { - // FIXME: It would be nice if this could be printed as a string - // with invalid UTF-8 replaced with hex escapes. A alloc would - // probably okay if that makes it easier, but regex-automata does - // (at time of writing) have internal routines that do this. So - // maybe we should expose them. 
- fmt.field("bytes", &self.as_bytes()); - } + fmt.field("start", &self.start) + .field("end", &self.end) + .field("bytes", &DebugHaystack(&self.as_bytes())); + fmt.finish() } } @@ -2620,3 +2615,88 @@ fn no_expansion>(replacement: &T) -> Option> { None => Some(Cow::Borrowed(replacement)), } } + +#[cfg(test)] +mod tests { + use super::*; + use alloc::format; + + #[test] + fn test_match_properties() { + let haystack = b"Hello, world!"; + let m = Match::new(haystack, 7, 12); + + assert_eq!(m.start(), 7); + assert_eq!(m.end(), 12); + assert_eq!(m.is_empty(), false); + assert_eq!(m.len(), 5); + assert_eq!(m.as_bytes(), b"world"); + } + + #[test] + fn test_empty_match() { + let haystack = b""; + let m = Match::new(haystack, 0, 0); + + assert_eq!(m.is_empty(), true); + assert_eq!(m.len(), 0); + } + + #[test] + fn test_debug_output_valid_utf8() { + let haystack = b"Hello, world!"; + let m = Match::new(haystack, 7, 12); + let debug_str = format!("{:?}", m); + + assert_eq!( + debug_str, + r#"Match { start: 7, end: 12, bytes: "world" }"# + ); + } + + #[test] + fn test_debug_output_invalid_utf8() { + let haystack = b"Hello, \xFFworld!"; + let m = Match::new(haystack, 7, 13); + let debug_str = format!("{:?}", m); + + assert_eq!( + debug_str, + r#"Match { start: 7, end: 13, bytes: "\xffworld" }"# + ); + } + + #[test] + fn test_debug_output_various_unicode() { + let haystack = + "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!".as_bytes(); + let m = Match::new(haystack, 0, haystack.len()); + let debug_str = format!("{:?}", m); + + assert_eq!( + debug_str, + r#"Match { start: 0, end: 62, bytes: "Hello, 😊 world! 안녕하세요? مرحبا بالعالم!" }"# + ); + } + + #[test] + fn test_debug_output_ascii_escape() { + let haystack = b"Hello,\tworld!\nThis is a \x1b[31mtest\x1b[0m."; + let m = Match::new(haystack, 0, haystack.len()); + let debug_str = format!("{:?}", m); + + assert_eq!( + debug_str, + r#"Match { start: 0, end: 38, bytes: "Hello,\tworld!\nThis is a \u{1b}[31mtest\u{1b}[0m." }"# + ); + } + + #[test] + fn test_debug_output_match_in_middle() { + let haystack = b"The quick brown fox jumps over the lazy dog."; + let m = Match::new(haystack, 16, 19); + let debug_str = format!("{:?}", m); + + assert_eq!(debug_str, r#"Match { start: 16, end: 19, bytes: "fox" }"#); + } +} From 1430b65baeebeb67b3335e26a71f251cce9964ef Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 9 Jun 2024 07:32:22 -0400 Subject: [PATCH 31/35] changelog: 1.10.4 --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ffd961d7..4fc5b9197 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +1.10.4 (2024-06-09) +=================== +This is a new patch release with some minor fixes. + +Bug fixes: + +* [BUG #1203](https://github.com/rust-lang/regex/pull/1203): +Escape invalid UTF-8 when in the `Debug` impl of `regex::bytes::Match`. 
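A quick sketch of what that changelog entry means in practice, not part of the patch: with the `DebugHaystack`-based impl above, matched bytes that aren't valid UTF-8 are rendered as a string with hex escapes rather than as a raw byte array (exact output may vary slightly by version):

```rust
use regex::bytes::Regex;

fn main() {
    // \xFF can never appear in valid UTF-8, so this haystack is not UTF-8.
    let hay = b"quux \xFFfoo\xFF bar";
    // `(?-u)` disables Unicode mode so `\xFF` matches the raw byte 0xFF.
    let re = Regex::new(r"(?-u)\xFFfoo\xFF").unwrap();
    let m = re.find(hay).unwrap();
    // Prints something like:
    // Match { start: 5, end: 10, bytes: "\xfffoo\xff" }
    println!("{:?}", m);
}
```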
+ + 1.10.3 (2024-01-21) =================== This is a new patch release that fixes the feature configuration of optional From 4757b5f01a7b9b6c8d89bd63b3d1500f7e0efa9e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 9 Jun 2024 07:33:32 -0400 Subject: [PATCH 32/35] regex-syntax-0.8.4 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 1a25d1ce6..3f213542b 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.3" #:version +version = "0.8.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" From 68c4f0b7b7f500e0ab3fbdd42c14f837c4ed1be4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 9 Jun 2024 07:33:44 -0400 Subject: [PATCH 33/35] regex-automata-0.4.7 --- regex-automata/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 40a0ebfb9..97bfacfec 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.6" #:version +version = "0.4.7" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" From 377463bd8200c038e7997a550aa708e0c686d90f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 9 Jun 2024 07:40:20 -0400 Subject: [PATCH 34/35] changelog: 1.10.4 and 1.10.5 We had previously release regex 1.10.4 but omitted a changelog entry for it. So this adds it. --- CHANGELOG.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fc5b9197..586191d75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -1.10.4 (2024-06-09) +1.10.5 (2024-06-09) =================== This is a new patch release with some minor fixes. @@ -8,6 +8,17 @@ Bug fixes: Escape invalid UTF-8 when in the `Debug` impl of `regex::bytes::Match`. +1.10.4 (2024-03-22) +=================== +This is a new patch release with some minor fixes. + +* [BUG #1169](https://github.com/rust-lang/regex/issues/1169): +Fixes a bug with compiling a reverse NFA automaton in `regex-automata`. +* [BUG #1178](https://github.com/rust-lang/regex/pull/1178): +Clarifies that when `Cow::Borrowed` is returned from replace APIs, it is +equivalent to the input. + + 1.10.3 (2024-01-21) =================== This is a new patch release that fixes the feature configuration of optional From 0718fc5acbe91b84322ef1bd2c32e189e4668254 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 9 Jun 2024 07:40:47 -0400 Subject: [PATCH 35/35] 1.10.5 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 68ac658c6..4fe3be20a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.10.4" #:version +version = "1.10.5" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md"
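One of the 1.10.4 changelog entries above clarifies that `Cow::Borrowed` returned from the replace APIs is equivalent to the input. A short sketch (not part of these patches) of how that guarantee can serve as a cheap "did anything change?" check:

```rust
use std::borrow::Cow;

use regex::Regex;

fn main() {
    let re = Regex::new(r"[0-9]{4}").unwrap();
    for hay in ["released in 2024", "no digits here"] {
        match re.replace(hay, "****") {
            // Borrowed is documented to be equivalent to the input,
            // i.e. nothing was replaced.
            Cow::Borrowed(s) => println!("unchanged: {}", s),
            Cow::Owned(s) => println!("rewritten: {}", s),
        }
    }
}
```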