| use regex_automata::{ |
| hybrid::{ |
| dfa::DFA, |
| regex::{self, Regex}, |
| }, |
| nfa::thompson, |
| MatchKind, SyntaxConfig, |
| }; |
| use regex_syntax as syntax; |
| |
| use regex_test::{ |
| bstr::{BString, ByteSlice}, |
| CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, |
| SearchKind as TestSearchKind, TestResult, TestRunner, |
| }; |
| |
| use crate::{suite, Result}; |
| |
/// Runs the whole regex test suite against a hybrid NFA/DFA regex built from
/// an unmodified `Regex::builder()`.
#[test]
fn default() -> Result<()> {
    let re_builder = Regex::builder();
    let mut runner = TestRunner::new()?;
    let tests = suite()?;
    runner.test_iter(tests.iter(), compiler(re_builder)).assert();
    Ok(())
}
| |
/// Runs the regex test suite against the hybrid NFA/DFA with NFA shrinking
/// turned off.
///
/// This is the configuration most callers of a lazy DFA actually want.
/// Shrinking the NFA pays off mainly when a full DFA is built, because it can
/// sharply reduce determinization time, but shrinking is itself fairly
/// expensive. A lazy DFA does no up-front compilation beyond building the
/// NFA, so skipping the shrink step is usually the better trade.
#[test]
fn no_nfa_shrink() -> Result<()> {
    let nfa_config = thompson::Config::new().shrink(false);
    let mut builder = Regex::builder();
    builder.thompson(nfa_config);

    let mut runner = TestRunner::new()?;
    runner
        // With shrinking disabled, this test overflows the default cache
        // capacity, so it is excluded.
        .blacklist("expensive/regression-many-repeat-no-stack-overflow")
        .test_iter(suite()?.iter(), compiler(builder))
        .assert();
    Ok(())
}
| |
/// Runs the regex test suite against the hybrid NFA/DFA with the lazy DFA's
/// 'starts_for_each_pattern' option switched on.
#[test]
fn starts_for_each_pattern() -> Result<()> {
    let dfa_config = DFA::config().starts_for_each_pattern(true);
    let mut builder = Regex::builder();
    builder.dfa(dfa_config);

    let mut runner = TestRunner::new()?;
    let tests = suite()?;
    runner.test_iter(tests.iter(), compiler(builder)).assert();
    Ok(())
}
| |
/// Runs the regex test suite against the hybrid NFA/DFA with byte classes
/// switched off.
///
/// N.B. Turning byte classes off does not remove any indirection at search
/// time. It only means that each byte value ends up in an equivalence class
/// of its own.
#[test]
fn no_byte_classes() -> Result<()> {
    let dfa_config = DFA::config().byte_classes(false);
    let mut builder = Regex::builder();
    builder.dfa(dfa_config);

    let mut runner = TestRunner::new()?;
    let tests = suite()?;
    runner.test_iter(tests.iter(), compiler(builder)).assert();
    Ok(())
}
| |
/// Checks that, at the default cache capacity, no test in the suite makes the
/// hybrid NFA/DFA clear its cache. This is done by running the suite with
/// 'minimum_cache_clear_count' set to `Some(0)`.
///
/// N.B. Should a suite test be added that does force the cache to be cleared,
/// the right response is to exclude that test here, which the 'blacklist'
/// method on 'TestRunner' allows.
#[test]
fn no_cache_clearing() -> Result<()> {
    let dfa_config = DFA::config().minimum_cache_clear_count(Some(0));
    let mut builder = Regex::builder();
    builder.dfa(dfa_config);

    let mut runner = TestRunner::new()?;
    let tests = suite()?;
    runner.test_iter(tests.iter(), compiler(builder)).assert();
    Ok(())
}
| |
/// Runs the regex test suite against the hybrid NFA/DFA at its minimum cache
/// capacity.
///
/// The configuration asks for a cache capacity of `0` and enables
/// 'skip_cache_capacity_check'.
#[test]
fn min_cache_capacity() -> Result<()> {
    let dfa_config =
        DFA::config().cache_capacity(0).skip_cache_capacity_check(true);
    let mut builder = Regex::builder();
    builder.dfa(dfa_config);

    let mut runner = TestRunner::new()?;
    let tests = suite()?;
    runner.test_iter(tests.iter(), compiler(builder)).assert();
    Ok(())
}
| |
| fn compiler( |
| mut builder: regex::Builder, |
| ) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { |
| move |test, regexes| { |
| let regexes = regexes |
| .iter() |
| .map(|r| r.to_str().map(|s| s.to_string())) |
| .collect::<std::result::Result<Vec<String>, _>>()?; |
| |
| // Check if our regex contains things that aren't supported by DFAs. |
| // That is, Unicode word boundaries when searching non-ASCII text. |
| let mut thompson = thompson::Builder::new(); |
| thompson.syntax(config_syntax(test)).configure(config_thompson(test)); |
| if let Ok(nfa) = thompson.build_many(®exes) { |
| let non_ascii = test.input().iter().any(|&b| !b.is_ascii()); |
| if nfa.has_word_boundary_unicode() && non_ascii { |
| return Ok(CompiledRegex::skip()); |
| } |
| } |
| if !configure_regex_builder(test, &mut builder) { |
| return Ok(CompiledRegex::skip()); |
| } |
| let re = builder.build_many(®exes)?; |
| let mut cache = re.create_cache(); |
| Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { |
| run_test(&re, &mut cache, test) |
| })) |
| } |
| } |
| |
| fn run_test( |
| re: &Regex, |
| cache: &mut regex::Cache, |
| test: &RegexTest, |
| ) -> Vec<TestResult> { |
| let is_match = if re.is_match(cache, test.input()) { |
| TestResult::matched() |
| } else { |
| TestResult::no_match() |
| }; |
| let is_match = is_match.name("is_match"); |
| |
| let find_matches = match test.search_kind() { |
| TestSearchKind::Earliest => { |
| let it = re |
| .find_earliest_iter(cache, test.input()) |
| .take(test.match_limit().unwrap_or(std::usize::MAX)) |
| .map(|m| Match { |
| id: m.pattern().as_usize(), |
| start: m.start(), |
| end: m.end(), |
| }); |
| TestResult::matches(it).name("find_earliest_iter") |
| } |
| TestSearchKind::Leftmost => { |
| let it = re |
| .find_leftmost_iter(cache, test.input()) |
| .take(test.match_limit().unwrap_or(std::usize::MAX)) |
| .map(|m| Match { |
| id: m.pattern().as_usize(), |
| start: m.start(), |
| end: m.end(), |
| }); |
| TestResult::matches(it).name("find_leftmost_iter") |
| } |
| TestSearchKind::Overlapping => { |
| let it = re |
| .find_overlapping_iter(cache, test.input()) |
| .take(test.match_limit().unwrap_or(std::usize::MAX)) |
| .map(|m| Match { |
| id: m.pattern().as_usize(), |
| start: m.start(), |
| end: m.end(), |
| }); |
| TestResult::matches(it).name("find_overlapping_iter") |
| } |
| }; |
| vec![is_match, find_matches] |
| } |
| |
| /// Configures the given regex builder with all relevant settings on the given |
| /// regex test. |
| /// |
| /// If the regex test has a setting that is unsupported, then this returns |
| /// false (implying the test should be skipped). |
| fn configure_regex_builder( |
| test: &RegexTest, |
| builder: &mut regex::Builder, |
| ) -> bool { |
| let match_kind = match test.match_kind() { |
| TestMatchKind::All => MatchKind::All, |
| TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst, |
| TestMatchKind::LeftmostLongest => return false, |
| }; |
| |
| let dense_config = DFA::config() |
| .anchored(test.anchored()) |
| .match_kind(match_kind) |
| .unicode_word_boundary(true); |
| let regex_config = Regex::config().utf8(test.utf8()); |
| builder |
| .configure(regex_config) |
| .syntax(config_syntax(test)) |
| .thompson(config_thompson(test)) |
| .dfa(dense_config); |
| true |
| } |
| |
| /// Configuration of a Thompson NFA compiler from a regex test. |
| fn config_thompson(test: &RegexTest) -> thompson::Config { |
| thompson::Config::new().utf8(test.utf8()) |
| } |
| |
| /// Configuration of the regex parser from a regex test. |
| fn config_syntax(test: &RegexTest) -> SyntaxConfig { |
| SyntaxConfig::new() |
| .case_insensitive(test.case_insensitive()) |
| .unicode(test.unicode()) |
| .utf8(test.utf8()) |
| } |