diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2813a1676..eb8e9f86e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,7 +79,7 @@ jobs: rust: stable-x86_64-gnu steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -137,30 +137,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: toolchain: 1.65.0 - # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it - # turned out that on aarch64, it was using something that wasn't stabilized - # until Rust 1.61[1]. (This was an oversight on my part. I had previously - # thought everything I needed was on Rust 1.60.) To resolve that, I just - # bumped memchr's MSRV to 1.61. Since it was so soon after the memchr 2.6 - # release, I treated this as a bugfix. - # - # But the regex crate's MSRV is at Rust 1.60, and it now depends on at - # least memchr 2.6 (to make use of its `alloc` feature). So we can't set - # a lower minimal version. And I can't just bump the MSRV in a patch - # release as a bug fix because regex 1.9 was released quite some time ago. - # I could just release regex 1.10 and bump the MSRV there, but eh, I don't - # want to put out another minor version release just for this. - # - # So... pin memchr to 2.6.2, which at least works on x86-64 on Rust 1.60. - # - # [1]: https://github.com/BurntSushi/memchr/issues/136 - - name: Pin memchr to 2.6.2 - run: cargo update -p memchr --precise 2.6.2 - name: Basic build run: cargo build --verbose - name: Build docs @@ -177,7 +158,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -190,7 +171,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -203,7 +184,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -216,7 +197,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -231,7 +212,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: @@ -248,7 +229,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@master with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 420e08f74..3ffd961d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +1.10.3 (2024-01-21) +=================== +This is a new patch release that fixes the feature configuration of optional +dependencies, and fixes an unsound use of bounds check elision. + +Bug fixes: + +* [BUG #1147](https://github.com/rust-lang/regex/issues/1147): +Set `default-features=false` for the `memchr` and `aho-corasick` dependencies. +* [BUG #1154](https://github.com/rust-lang/regex/pull/1154): +Fix unsound bounds check elision. 
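The "unsound bounds check elision" entry above (BUG #1154) refers to `Input::new` calling `AsRef::as_ref` more than once: a misbehaving implementation could hand back slices of different lengths, leaving the stored span longer than the stored haystack. Below is a minimal sketch of that failure mode and of the invariant the fix restores; the `Flaky` type is made up for illustration and mirrors the regression test added later in this patch.

```rust
use std::cell::Cell;

use regex_automata::Input;

// A pathological AsRef impl that returns a different slice on each call.
// Before the fix, `Input::new` called `as_ref()` twice: once for the stored
// haystack and once to compute the span, so the span's end could exceed the
// stored haystack's length and defeat bounds check elision downstream.
struct Flaky(Cell<bool>);

impl AsRef<[u8]> for Flaky {
    fn as_ref(&self) -> &[u8] {
        if self.0.replace(false) {
            b"short"
        } else {
            b"a much longer haystack on the second call"
        }
    }
}

fn main() {
    let flaky = Flaky(Cell::new(true));
    let input = Input::new(&flaky);
    // With the fix, the span is always derived from the same slice that was
    // stored, so this invariant holds even for misbehaving AsRef impls.
    assert!(input.end() <= input.haystack().len());
}
```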
+ + 1.10.2 (2023-10-16) =================== This is a new patch release that fixes a search regression where incorrect diff --git a/Cargo.toml b/Cargo.toml index 3ba14c904..68ac658c6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.10.2" #:version +version = "1.10.4" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" @@ -165,16 +165,18 @@ pattern = [] [dependencies.aho-corasick] version = "1.0.0" optional = true +default-features = false # For skipping along search text quickly when a leading byte is known. [dependencies.memchr] version = "2.6.0" optional = true +default-features = false # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.4.3" +version = "0.4.4" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 new file mode 100644 index 000000000..e236ae735 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5624222820728832 differ diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 3cb3d7c8e..40a0ebfb9 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.3" #:version +version = "0.4.6" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index fd96bc878..8e0f33c03 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -9,7 +9,7 @@ This module also contains a [`dense::Builder`](Builder) and a #[cfg(feature = "dfa-build")] use core::cmp; -use core::{convert::TryFrom, fmt, iter, mem::size_of, slice}; +use core::{fmt, iter, mem::size_of, slice}; #[cfg(feature = "dfa-build")] use alloc::{ @@ -2340,8 +2340,8 @@ impl<'a> DFA<&'a [u32]> { // table, match states and accelerators below. If any validation fails, // then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.tt)?; + dfa.tt.validate(&dfa)?; + dfa.st.validate(&dfa)?; dfa.ms.validate(&dfa)?; dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, @@ -3593,7 +3593,8 @@ impl> TransitionTable { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let sp = &dfa.special; for state in self.states() { // We check that the ID itself is well formed. 
That is, if it's // a special state then it must actually be a quit, dead, accel, @@ -3611,6 +3612,13 @@ impl> TransitionTable { wasn't actually special", )); } + if sp.is_match_state(state.id()) + && dfa.match_len(state.id()) == 0 + { + return Err(DeserializeError::generic( + "found match state with zero pattern IDs", + )); + } } for (_, to) in state.transitions() { if !self.is_valid(to) { @@ -4127,10 +4135,8 @@ impl> StartTable { /// it against the given transition table (which must be for the same DFA). /// /// That is, every state ID can be used to correctly index a state. - fn validate( - &self, - tt: &TransitionTable, - ) -> Result<(), DeserializeError> { + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + let tt = &dfa.tt; if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) { return Err(DeserializeError::generic( "found invalid universal unanchored starting state ID", diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index d461e0a0f..46278c181 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -38,11 +38,7 @@ assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match()); #[cfg(feature = "dfa-build")] use core::iter; -use core::{ - convert::{TryFrom, TryInto}, - fmt, - mem::size_of, -}; +use core::{fmt, mem::size_of}; #[cfg(feature = "dfa-build")] use alloc::{vec, vec::Vec}; diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 2d2172957..668bca87c 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -961,10 +961,12 @@ impl Compiler { // for all matches. When an unanchored prefix is not added, then the // NFA's anchored and unanchored start states are equivalent. 
let all_anchored = exprs.iter().all(|e| { - e.borrow() - .properties() - .look_set_prefix() - .contains(hir::Look::Start) + let props = e.borrow().properties(); + if self.config.get_reverse() { + props.look_set_suffix().contains(hir::Look::End) + } else { + props.look_set_prefix().contains(hir::Look::Start) + } }); let anchored = !self.config.get_unanchored_prefix() || all_anchored; let unanchored_prefix = if anchored { @@ -1876,11 +1878,11 @@ impl Utf8Node { #[cfg(test)] mod tests { - use alloc::{vec, vec::Vec}; + use alloc::vec; use crate::{ - nfa::thompson::{SparseTransitions, State, Transition, NFA}, - util::primitives::{PatternID, SmallIndex, StateID}, + nfa::thompson::{SparseTransitions, State}, + util::primitives::SmallIndex, }; use super::*; @@ -1928,6 +1930,11 @@ mod tests { State::Sparse(SparseTransitions { transitions }) } + fn s_look(look: Look, next: usize) -> State { + let next = sid(next); + State::Look { look, next } + } + fn s_bin_union(alt1: usize, alt2: usize) -> State { State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) } } @@ -1978,6 +1985,80 @@ mod tests { ); } + #[test] + fn compile_no_unanchored_prefix_with_start_anchor() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"^a") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)] + ); + } + + #[test] + fn compile_yes_unanchored_prefix_with_end_anchor() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"a$") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_byte(b'a', 3), + s_look(Look::End, 4), + s_match(0), + ] + ); + } + + #[test] + fn compile_yes_reverse_unanchored_prefix_with_start_anchor() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .reverse(true) + .which_captures(WhichCaptures::None), + ) + .build(r"^a") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_byte(b'a', 3), + // Anchors get flipped in a reverse automaton. + s_look(Look::End, 4), + s_match(0), + ], + ); + } + + #[test] + fn compile_no_reverse_unanchored_prefix_with_end_anchor() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .reverse(true) + .which_captures(WhichCaptures::None), + ) + .build(r"a$") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + // Anchors get flipped in a reverse automaton. + s_look(Look::Start, 1), + s_byte(b'a', 2), + s_match(0), + ], + ); + } + #[test] fn compile_empty() { assert_eq!(build("").states(), &[s_match(0),]); diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c92d4c0b8..7f074a353 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. 
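The compiler change above makes the "everything is anchored" test direction-aware: a forward NFA keys off a leading `^` (`Look::Start`) in the prefix, while a reverse NFA must key off a trailing `$` (`Look::End`) in the suffix, because anchors swap roles when the automaton is reversed. A standalone sketch of that decision using `regex-syntax`'s public API follows; the `all_anchored` helper is made up, and the real logic lives inside the compiler's private code path.

```rust
use regex_syntax::{hir::Look, Parser};

// Direction-aware version of the "all anchored" check: forward NFAs look for
// a leading `^`, reverse NFAs look for a trailing `$`.
fn all_anchored(pattern: &str, reverse: bool) -> bool {
    let hir = Parser::new().parse(pattern).unwrap();
    let props = hir.properties();
    if reverse {
        props.look_set_suffix().contains(Look::End)
    } else {
        props.look_set_prefix().contains(Look::Start)
    }
}

fn main() {
    // Forward compilation: only `^a` is anchored.
    assert!(all_anchored("^a", false));
    assert!(!all_anchored("a$", false));
    // Reverse compilation: the roles flip, which is what the new
    // reverse unanchored-prefix tests exercise.
    assert!(all_anchored("a$", true));
    assert!(!all_anchored("^a", true));
}
```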
diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 75c9b796b..49debda40 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -141,7 +141,7 @@ construction later by virtue of producing a much much smaller NFA. [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601 */ -use core::{cell::RefCell, convert::TryFrom, fmt, mem, ops::RangeInclusive}; +use core::{cell::RefCell, fmt, mem, ops::RangeInclusive}; use alloc::{format, string::String, vec, vec::Vec}; @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } @@ -915,10 +915,6 @@ fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool { #[cfg(test)] mod tests { - use core::ops::RangeInclusive; - - use regex_syntax::utf8::Utf8Range; - use super::*; fn r(range: RangeInclusive) -> Utf8Range { diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index effa6f44d..8a8561a31 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -86,7 +86,7 @@ serialized anywhere. So any kind of change can be made with reckless abandon, as long as everything in this module agrees. */ -use core::{convert::TryFrom, mem}; +use core::mem; use alloc::{sync::Arc, vec::Vec}; diff --git a/regex-automata/src/util/int.rs b/regex-automata/src/util/int.rs index e6b13bff9..b726e93f8 100644 --- a/regex-automata/src/util/int.rs +++ b/regex-automata/src/util/int.rs @@ -41,6 +41,10 @@ like `u64::from` where possible, or even `usize::try_from()` for when we do explicitly want to panic or when we want to return an error for overflow. */ +// We define a little more than what we need, but I'd rather just have +// everything via a consistent and uniform API then have holes. +#![allow(dead_code)] + pub(crate) trait U8 { fn as_usize(self) -> usize; } @@ -240,13 +244,3 @@ impl Pointer for *const T { self as usize } } - -pub(crate) trait PointerMut { - fn as_usize(self) -> usize; -} - -impl PointerMut for *mut T { - fn as_usize(self) -> usize { - self as usize - } -} diff --git a/regex-automata/src/util/prefilter/mod.rs b/regex-automata/src/util/prefilter/mod.rs index 51fc92233..d20442a69 100644 --- a/regex-automata/src/util/prefilter/mod.rs +++ b/regex-automata/src/util/prefilter/mod.rs @@ -146,6 +146,8 @@ pub struct Prefilter { pre: Arc, #[cfg(feature = "alloc")] is_fast: bool, + #[cfg(feature = "alloc")] + max_needle_len: usize, } impl Prefilter { @@ -202,12 +204,19 @@ impl Prefilter { kind: MatchKind, needles: &[B], ) -> Option { - Choice::new(kind, needles).and_then(Prefilter::from_choice) + Choice::new(kind, needles).and_then(|choice| { + let max_needle_len = + needles.iter().map(|b| b.as_ref().len()).max().unwrap_or(0); + Prefilter::from_choice(choice, max_needle_len) + }) } /// This turns a prefilter selection into a `Prefilter`. That is, in turns /// the enum given into a trait object. 
- fn from_choice(choice: Choice) -> Option { + fn from_choice( + choice: Choice, + max_needle_len: usize, + ) -> Option { #[cfg(not(feature = "alloc"))] { None @@ -224,7 +233,7 @@ impl Prefilter { Choice::AhoCorasick(p) => Arc::new(p), }; let is_fast = pre.is_fast(); - Some(Prefilter { pre, is_fast }) + Some(Prefilter { pre, is_fast, max_needle_len }) } } @@ -411,6 +420,20 @@ impl Prefilter { } } + /// Return the length of the longest needle + /// in this Prefilter + #[inline] + pub fn max_needle_len(&self) -> usize { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.max_needle_len + } + } + /// Implementations might return true here if they believe themselves to /// be "fast." The concept of "fast" is deliberately left vague, but in /// practice this usually corresponds to whether it's believed that SIMD @@ -429,7 +452,7 @@ impl Prefilter { /// *know* a prefilter will be fast without actually trying the prefilter. /// (Which of course we cannot afford to do.) #[inline] - pub(crate) fn is_fast(&self) -> bool { + pub fn is_fast(&self) -> bool { #[cfg(not(feature = "alloc"))] { unreachable!() diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs index 39aec522b..05b1cff54 100644 --- a/regex-automata/src/util/search.rs +++ b/regex-automata/src/util/search.rs @@ -110,9 +110,14 @@ impl<'h> Input<'h> { /// Create a new search configuration for the given haystack. #[inline] pub fn new>(haystack: &'h H) -> Input<'h> { + // Perform only one call to `haystack.as_ref()` to protect from incorrect + // implementations that return different values from multiple calls. + // This is important because there's code that relies on `span` not being + // out of bounds with respect to the stored `haystack`. + let haystack = haystack.as_ref(); Input { - haystack: haystack.as_ref(), - span: Span { start: 0, end: haystack.as_ref().len() }, + haystack, + span: Span { start: 0, end: haystack.len() }, anchored: Anchored::No, earliest: false, } @@ -1966,4 +1971,23 @@ mod tests { let expected_size = 3 * core::mem::size_of::(); assert_eq!(expected_size, core::mem::size_of::()); } + + #[test] + fn incorrect_asref_guard() { + struct Bad(std::cell::Cell); + + impl AsRef<[u8]> for Bad { + fn as_ref(&self) -> &[u8] { + if self.0.replace(false) { + &[] + } else { + &[0; 1000] + } + } + } + + let bad = Bad(std::cell::Cell::new(true)); + let input = Input::new(&bad); + assert!(input.end() <= input.haystack().len()); + } } diff --git a/regex-automata/src/util/wire.rs b/regex-automata/src/util/wire.rs index ecf4fd8c0..b1351c7e9 100644 --- a/regex-automata/src/util/wire.rs +++ b/regex-automata/src/util/wire.rs @@ -41,11 +41,7 @@ generally requires serializing both its big-endian and little-endian variants, and then loading the correct one based on the target's endianness. */ -use core::{ - cmp, - convert::{TryFrom, TryInto}, - mem::size_of, -}; +use core::{cmp, mem::size_of}; #[cfg(feature = "alloc")] use alloc::{vec, vec::Vec}; @@ -867,11 +863,6 @@ pub(crate) trait Endian { /// this panics. fn write_u32(n: u32, dst: &mut [u8]); - /// Writes a u64 to the given destination buffer in a particular - /// endianness. If the destination buffer has a length smaller than 8, then - /// this panics. - fn write_u64(n: u64, dst: &mut [u8]); - /// Writes a u128 to the given destination buffer in a particular /// endianness. If the destination buffer has a length smaller than 16, /// then this panics. 
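The changes to `regex-automata/src/util/prefilter/mod.rs` above promote `is_fast` to `pub` and add a `max_needle_len` accessor, with the length stored next to the trait object so it survives type erasure. A hedged sketch of how a downstream caller might use the two methods; the needles and haystack are made up, and the expected span assumes the documented behavior of `Prefilter::find`.

```rust
use regex_automata::{util::prefilter::Prefilter, MatchKind, Span};

fn main() {
    // Build a literal prefilter, much like the meta regex engine does
    // internally when a pattern has required literals.
    let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Sherlock", "Holmes"])
        .expect("a prefilter should be available for a small literal set");
    // `max_needle_len` (added in this change) reports the longest needle in bytes.
    assert_eq!(8, pre.max_needle_len());
    // `is_fast` (now `pub`) reports whether the underlying searcher is
    // believed to benefit from acceleration (e.g. SIMD substring search).
    println!("fast prefilter? {}", pre.is_fast());
    // The prefilter reports candidate spans within the given search span.
    let hay = "by Sir Arthur Conan Doyle, featuring Sherlock Holmes";
    let start = hay.find("Sherlock").unwrap();
    let got = pre.find(hay.as_bytes(), Span::from(0..hay.len()));
    assert_eq!(Some(Span::from(start..start + "Sherlock".len())), got);
}
```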
@@ -897,10 +888,6 @@ impl Endian for LE { dst[..4].copy_from_slice(&n.to_le_bytes()); } - fn write_u64(n: u64, dst: &mut [u8]) { - dst[..8].copy_from_slice(&n.to_le_bytes()); - } - fn write_u128(n: u128, dst: &mut [u8]) { dst[..16].copy_from_slice(&n.to_le_bytes()); } @@ -915,10 +902,6 @@ impl Endian for BE { dst[..4].copy_from_slice(&n.to_be_bytes()); } - fn write_u64(n: u64, dst: &mut [u8]) { - dst[..8].copy_from_slice(&n.to_be_bytes()); - } - fn write_u128(n: u128, dst: &mut [u8]) { dst[..16].copy_from_slice(&n.to_be_bytes()); } diff --git a/regex-capi/src/error.rs b/regex-capi/src/error.rs index a269a3913..7b91fb9d3 100644 --- a/regex-capi/src/error.rs +++ b/regex-capi/src/error.rs @@ -4,7 +4,6 @@ use std::fmt; use std::str; use libc::c_char; -use regex; #[derive(Debug)] pub struct Error { @@ -22,7 +21,7 @@ pub enum ErrorKind { impl Error { pub fn new(kind: ErrorKind) -> Error { - Error { message: None, kind: kind } + Error { message: None, kind } } pub fn is_err(&self) -> bool { diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index 3fe5390aa..a107c09df 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-cli" -version = "0.1.1" #:version +version = "0.2.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = """ A command line tool for debugging, ad hoc benchmarking and generating regular diff --git a/regex-cli/args/flags.rs b/regex-cli/args/flags.rs index db8a847ef..61732a28e 100644 --- a/regex-cli/args/flags.rs +++ b/regex-cli/args/flags.rs @@ -152,3 +152,55 @@ impl std::str::FromStr for MatchKind { Ok(MatchKind { kind }) } } + +/// Provides an implementation of the --captures flag, for use with Thompson +/// NFA configuration. +#[derive(Debug)] +pub struct WhichCaptures { + pub which: regex_automata::nfa::thompson::WhichCaptures, +} + +impl WhichCaptures { + pub const USAGE: Usage = Usage::new( + "--captures ", + "One of: all, implicit or none.", + r#" +Selects which capture states should be included in the Thompson NFA. The +choices are 'all' (the default), 'implicit' or 'none'. + +'all' means that both explicit and implicit capture states are included. + +'implicit' means that only implicit capture states are included. That is, the +Thompson NFA will only be able to report the overall match offsets and not the +match offsets of each explicit capture group. + +'none' means that no capture states will be included. This is useful when +capture states aren't needed (like when building a DFA) or if they aren't +supported (like when building a reverse NFA). 
+"#, + ); +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures { + which: regex_automata::nfa::thompson::WhichCaptures::All, + } + } +} + +impl std::str::FromStr for WhichCaptures { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let which = match s { + "all" => regex_automata::nfa::thompson::WhichCaptures::All, + "implicit" => { + regex_automata::nfa::thompson::WhichCaptures::Implicit + } + "none" => regex_automata::nfa::thompson::WhichCaptures::None, + unk => anyhow::bail!("unrecognized captures option '{}'", unk), + }; + Ok(WhichCaptures { which }) + } +} diff --git a/regex-cli/args/thompson.rs b/regex-cli/args/thompson.rs index 151fc6a0b..bd8388d11 100644 --- a/regex-cli/args/thompson.rs +++ b/regex-cli/args/thompson.rs @@ -70,11 +70,11 @@ impl Configurable for Config { Arg::Long("shrink") => { self.thompson = self.thompson.clone().shrink(true); } - Arg::Long("no-captures") => { - self.thompson = self - .thompson - .clone() - .which_captures(thompson::WhichCaptures::None); + Arg::Long("captures") => { + let which: flags::WhichCaptures = + args::parse(p, "--captures")?; + self.thompson = + self.thompson.clone().which_captures(which.which); } Arg::Long("line-terminator") => { let byte: flags::OneByte = @@ -136,19 +136,7 @@ spent shrinking the NFA can lead to far larger savings in the subsequent DFA determinization. "#, ), - Usage::new( - "--no-captures", - "Disable capture states.", - r#" -Disables capture states. By default, NFAs include special "capture" states that -instruct some regex engines (like the PikeVM) to record offset positions in -ancillary state. - -It can be useful to disable capture states in order to reduce "clutter" in the -automaton when debugging it. Also, at time of writing, reverse NFAs require -that capture groups are disabled. -"#, - ), + flags::WhichCaptures::USAGE, Usage::new( "--line-terminator", "Set the line terminator used by line anchors.", diff --git a/regex-cli/cmd/debug/dfa.rs b/regex-cli/cmd/debug/dfa.rs index 9381cdadc..f16610fbe 100644 --- a/regex-cli/cmd/debug/dfa.rs +++ b/regex-cli/cmd/debug/dfa.rs @@ -5,7 +5,7 @@ use crate::{ util::{self, Table}, }; -use {lexopt, regex_automata::dfa::Automaton}; +use regex_automata::dfa::Automaton; pub fn run_dense(p: &mut lexopt::Parser) -> anyhow::Result<()> { const USAGE: &'static str = "\ diff --git a/regex-cli/logger.rs b/regex-cli/logger.rs index 0fe063f1c..4e783872e 100644 --- a/regex-cli/logger.rs +++ b/regex-cli/logger.rs @@ -3,7 +3,7 @@ // print to stderr. We therefore avoid bringing in extra dependencies just // for this functionality. -use log::{self, Log}; +use log::Log; /// The simplest possible logger that logs to stderr. /// diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 593b14fbc..1a3df56b5 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -2405,8 +2405,6 @@ mod tests { use alloc::format; - use crate::ast::{self, Ast, Position, Span}; - use super::*; // Our own assert_eq, which has slightly better formatting (but honestly diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index e063390a8..d507ee724 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -479,23 +479,6 @@ pub trait Interval: ret } - /// Compute the symmetric difference the given range from this range. This - /// returns the union of the two ranges minus its intersection. 
- fn symmetric_difference( - &self, - other: &Self, - ) -> (Option, Option) { - let union = match self.union(other) { - None => return (Some(self.clone()), Some(other.clone())), - Some(union) => union, - }; - let intersection = match self.intersect(other) { - None => return (Some(self.clone()), Some(other.clone())), - Some(intersection) => intersection, - }; - union.difference(&intersection) - } - /// Returns true if and only if the two ranges are contiguous. Two ranges /// are contiguous if and only if the ranges are either overlapping or /// adjacent. diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ce38ead7b..ae3ba318e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -322,6 +322,22 @@ impl Hir { /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); /// assert_eq!(&expected, concat.kind()); /// ``` + /// + /// # Example: building a literal from a `char` + /// + /// This example shows how to build a single `Hir` literal from a `char` + /// value. Since a [`Literal`] is just bytes, we just need to UTF-8 + /// encode a `char` value: + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let ch = '☃'; + /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes()); + /// + /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); + /// assert_eq!(&expected, got.kind()); + /// ``` #[inline] pub fn literal>>(lit: B) -> Hir { let bytes = lit.into(); diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 313a1e9e8..3749ce307 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -1358,9 +1358,8 @@ fn ascii_class_as_chars( #[cfg(test)] mod tests { use crate::{ - ast::{self, parse::ParserBuilder, Ast, Position, Span}, - hir::{self, Hir, HirKind, Look, Properties}, - unicode::{self, ClassQuery}, + ast::{parse::ParserBuilder, Position}, + hir::{Look, Properties}, }; use super::*; diff --git a/regex-test/lib.rs b/regex-test/lib.rs index 2b630666e..7b5ab830c 100644 --- a/regex-test/lib.rs +++ b/regex-test/lib.rs @@ -99,9 +99,7 @@ See [`MatchKind`] for more details. This is an optional field and defaults to /// For this reason, `anyhow` is a public dependency and is re-exported here. pub extern crate anyhow; -use std::{ - borrow::Borrow, collections::HashSet, convert::TryFrom, fs, path::Path, -}; +use std::{borrow::Borrow, collections::HashSet, fs, path::Path}; use { anyhow::{bail, Context, Result}, diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index 19f5701af..7b7aad574 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -651,6 +651,9 @@ impl Regex { /// case, this implementation will likely return a `Cow::Borrowed` value /// such that no allocation is performed. /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// # Replacement string syntax /// /// All instances of `$ref` in the replacement string are replaced with @@ -761,6 +764,13 @@ impl Regex { /// replacement provided. This is the same as calling `replacen` with /// `limit` set to `0`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. 
+ /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// @@ -855,6 +865,13 @@ impl Regex { /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is /// equivalent to `Regex::replacen(hay, 0, rep)`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// @@ -1568,10 +1585,15 @@ impl<'h> From> for core::ops::Range { /// Represents the capture groups for a single match. /// -/// Capture groups refer to parts of a regex enclosed in parentheses. They can -/// be optionally named. The purpose of capture groups is to be able to -/// reference different parts of a match based on the original pattern. For -/// example, say you want to match the individual letters in a 5-letter word: +/// Capture groups refer to parts of a regex enclosed in parentheses. They +/// can be optionally named. The purpose of capture groups is to be able to +/// reference different parts of a match based on the original pattern. In +/// essence, a `Captures` is a container of [`Match`] values for each group +/// that participated in a regex match. Each `Match` can be looked up by either +/// its capture group index or name (if it has one). +/// +/// For example, say you want to match the individual letters in a 5-letter +/// word: /// /// ```text /// (?\w)(\w)(?:\w)\w(?\w) @@ -1984,7 +2006,7 @@ impl<'h> core::ops::Index for Captures<'h> { /// The haystack substring returned can't outlive the `Captures` object if this /// method is used, because of how `Index` is defined (normally `a[i]` is part /// of `a` and can't outlive it). To work around this limitation, do that, use -/// [`Captures::get`] instead. +/// [`Captures::name`] instead. /// /// `'h` is the lifetime of the matched haystack, but the lifetime of the /// `&str` returned by this implementation is the lifetime of the `Captures` diff --git a/src/regex/string.rs b/src/regex/string.rs index 880d6082a..dba94d46e 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -642,6 +642,9 @@ impl Regex { /// case, this implementation will likely return a `Cow::Borrowed` value /// such that no allocation is performed. /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// # Replacement string syntax /// /// All instances of `$ref` in the replacement string are replaced with @@ -748,6 +751,13 @@ impl Regex { /// replacement provided. This is the same as calling `replacen` with /// `limit` set to `0`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// @@ -842,6 +852,13 @@ impl Regex { /// matches are replaced. 
That is, `Regex::replace_all(hay, rep)` is /// equivalent to `Regex::replacen(hay, 0, rep)`. /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// When a `Cow::Borrowed` is returned, the value returned is guaranteed + /// to be equivalent to the `haystack` given. + /// /// The documentation for [`Regex::replace`] goes into more detail about /// what kinds of replacement strings are supported. /// @@ -1573,10 +1590,15 @@ impl<'h> From> for core::ops::Range { /// Represents the capture groups for a single match. /// -/// Capture groups refer to parts of a regex enclosed in parentheses. They can -/// be optionally named. The purpose of capture groups is to be able to -/// reference different parts of a match based on the original pattern. For -/// example, say you want to match the individual letters in a 5-letter word: +/// Capture groups refer to parts of a regex enclosed in parentheses. They +/// can be optionally named. The purpose of capture groups is to be able to +/// reference different parts of a match based on the original pattern. In +/// essence, a `Captures` is a container of [`Match`] values for each group +/// that participated in a regex match. Each `Match` can be looked up by either +/// its capture group index or name (if it has one). +/// +/// For example, say you want to match the individual letters in a 5-letter +/// word: /// /// ```text /// (?\w)(\w)(?:\w)\w(?\w) @@ -1987,7 +2009,7 @@ impl<'h> core::ops::Index for Captures<'h> { /// The haystack substring returned can't outlive the `Captures` object if this /// method is used, because of how `Index` is defined (normally `a[i]` is part /// of `a` and can't outlive it). To work around this limitation, do that, use -/// [`Captures::get`] instead. +/// [`Captures::name`] instead. /// /// `'h` is the lifetime of the matched haystack, but the lifetime of the /// `&str` returned by this implementation is the lifetime of the `Captures`
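The new paragraphs on `replace`, `replacen` and `replace_all` pin down the `Cow` contract: when nothing matches, the haystack comes back unchanged, usually as a borrow, and a returned `Cow::Borrowed` is guaranteed to be equivalent to the given haystack. A small illustration of that contract; the pattern and haystacks are made up.

```rust
use std::borrow::Cow;

use regex::Regex;

fn main() {
    let re = Regex::new(r"\d+").unwrap();

    // No match: the haystack is returned unchanged, and when a borrow comes
    // back it is guaranteed to be equivalent to the haystack that was given.
    let hay = "no digits here";
    let replaced = re.replace_all(hay, "<num>");
    if let Cow::Borrowed(s) = &replaced {
        assert_eq!(*s, hay);
    }
    assert_eq!(replaced, hay);

    // A match produces an owned, rewritten string instead.
    assert_eq!(re.replace_all("call 911 now", "<num>"), "call <num> now");
}
```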
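The reworked `Captures` docs describe it as a container of `Match` values addressable by group index or by name, and the corrected cross-reference points readers at `Captures::name` to escape the lifetime limitation of indexing. A compact example of both lookup styles, using the 5-letter-word pattern from the docs:

```rust
use regex::Regex;

fn main() {
    // Two named groups and one unnamed capturing group.
    let re = Regex::new(r"(?<first>\w)(\w)(?:\w)\w(?<last>\w)").unwrap();
    let caps = re.captures("toady").unwrap();

    // Indexing works by group number or by name, but the returned &str
    // borrows from `caps` itself.
    assert_eq!("t", &caps["first"]);
    assert_eq!("o", &caps[2]);

    // `get` and `name` return `Match` values that borrow from the haystack
    // instead, which is the workaround the `Index` docs refer to.
    assert_eq!("y", caps.name("last").unwrap().as_str());
    assert_eq!(0..5, caps.get(0).unwrap().range());
}
```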