| use core::cell::{Cell, RefCell}; |
| |
| use alloc::{ |
| boxed::Box, |
| string::{String, ToString}, |
| vec, |
| vec::Vec, |
| }; |
| |
| use crate::{ |
| error::Error, |
| hir::{self, Config, Flags, Hir, HirKind}, |
| }; |
| |
| // These are all of the errors that can occur while parsing a regex. Unlike |
| // regex-syntax, our errors are not particularly great. They are just enough |
| // to get a general sense of what went wrong. But in exchange, the error |
| // reporting mechanism is *much* simpler than what's in regex-syntax. |
| // |
| // By convention, we use each of these messages in exactly one place. That |
| // way, every branch that leads to an error has a unique message. This in turn |
| // means that given a message, one can precisely identify which part of the |
| // parser reported it. |
| // |
| // Finally, we give names to each message so that we can reference them in |
| // tests. |
| const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting"; |
| const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups"; |
| const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name"; |
| const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'"; |
| const ERR_UNCLOSED_GROUP_QUESTION: &str = |
| "expected closing ')', but got end of pattern"; |
| const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('"; |
| const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported"; |
| const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed"; |
| const ERR_MISSING_GROUP_NAME: &str = |
| "expected capture group name, but got end of pattern"; |
| const ERR_INVALID_GROUP_NAME: &str = "invalid group name"; |
| const ERR_UNCLOSED_GROUP_NAME: &str = |
| "expected end of capture group name, but got end of pattern"; |
| const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed"; |
| const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag"; |
| const ERR_FLAG_REPEATED_NEGATION: &str = |
| "inline flag negation cannot be repeated"; |
| const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed"; |
| const ERR_FLAG_UNEXPECTED_EOF: &str = |
| "expected ':' or ')' to end inline flags, but got end of pattern"; |
| const ERR_FLAG_DANGLING_NEGATION: &str = |
| "inline flags cannot end with negation directive"; |
| const ERR_DECIMAL_NO_DIGITS: &str = |
| "expected decimal number, but found no digits"; |
| const ERR_DECIMAL_INVALID: &str = "got invalid decimal number"; |
| const ERR_HEX_BRACE_INVALID_DIGIT: &str = |
| "expected hexadecimal number in braces, but got non-hex digit"; |
| const ERR_HEX_BRACE_UNEXPECTED_EOF: &str = |
| "expected hexadecimal number, but saw end of pattern before closing brace"; |
| const ERR_HEX_BRACE_EMPTY: &str = |
| "expected hexadecimal number in braces, but got no digits"; |
| const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces"; |
| const ERR_HEX_FIXED_UNEXPECTED_EOF: &str = |
| "expected fixed length hexadecimal number, but saw end of pattern first"; |
| const ERR_HEX_FIXED_INVALID_DIGIT: &str = |
| "expected fixed length hexadecimal number, but got non-hex digit"; |
| const ERR_HEX_FIXED_INVALID: &str = |
| "got invalid fixed length hexadecimal number"; |
| const ERR_HEX_UNEXPECTED_EOF: &str = |
| "expected hexadecimal number, but saw end of pattern first"; |
| const ERR_ESCAPE_UNEXPECTED_EOF: &str = |
| "saw start of escape sequence, but saw end of pattern before it finished"; |
| const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported"; |
| const ERR_UNICODE_CLASS_UNSUPPORTED: &str = |
| "Unicode character classes are not supported"; |
| const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence"; |
| const ERR_POSIX_CLASS_UNRECOGNIZED: &str = |
| "unrecognized POSIX character class"; |
| const ERR_UNCOUNTED_REP_SUB_MISSING: &str = |
| "uncounted repetition operator must be applied to a sub-expression"; |
| const ERR_COUNTED_REP_SUB_MISSING: &str = |
| "counted repetition operator must be applied to a sub-expression"; |
| const ERR_COUNTED_REP_UNCLOSED: &str = |
| "found unclosed counted repetition operator"; |
| const ERR_COUNTED_REP_MIN_UNCLOSED: &str = |
| "found incomplete and unclosed counted repetition operator"; |
| const ERR_COUNTED_REP_COMMA_UNCLOSED: &str = |
| "found counted repetition operator with a comma that is unclosed"; |
| const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str = |
| "found counted repetition with min and max that is unclosed"; |
| const ERR_COUNTED_REP_INVALID: &str = |
| "expected closing brace for counted repetition, but got something else"; |
| const ERR_COUNTED_REP_INVALID_RANGE: &str = |
| "found counted repetition with a min bigger than its max"; |
| const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str = |
| "non-empty character class has no closing bracket"; |
| const ERR_CLASS_INVALID_RANGE_ITEM: &str = |
| "character class ranges must start and end with a single character"; |
| const ERR_CLASS_INVALID_ITEM: &str = |
| "invalid escape sequence in character class"; |
| const ERR_CLASS_UNCLOSED_AFTER_DASH: &str = |
| "non-empty character class has no closing bracket after dash"; |
| const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str = |
| "negated character class has no closing bracket"; |
| const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str = |
| "character class begins with literal ']' but has no closing bracket"; |
| const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class"; |
| const ERR_CLASS_UNCLOSED: &str = "found unclosed character class"; |
| const ERR_CLASS_NEST_UNSUPPORTED: &str = |
| "nested character classes are not supported"; |
| const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str = |
| "character class intersection is not supported"; |
| const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = |
| "character class difference is not supported"; |
| const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = |
| "character class symmetric difference is not supported"; |
| const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = |
| "special word boundary assertion is unclosed or has an invalid character"; |
| const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = |
| "special word boundary assertion is unrecognized"; |
| const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = |
| "found start of special word boundary or repetition without an end"; |
| |
| /// A regular expression parser. |
| /// |
| /// This parses a string representation of a regular expression into an |
| /// abstract syntax tree. The size of the tree is proportional to the length |
| /// of the regular expression pattern. |
| /// |
| /// A `Parser` can be configured in more detail via a [`ParserBuilder`]. |
| #[derive(Clone, Debug)] |
| pub(super) struct Parser<'a> { |
| /// The configuration of the parser as given by the caller. |
| config: Config, |
| /// The pattern we're parsing as given by the caller. |
| pattern: &'a str, |
| /// The call depth of the parser. This is incremented for each |
| /// sub-expression parsed. Its peak value is the maximum nesting of the |
| /// pattern. |
| depth: Cell<u32>, |
| /// The current position of the parser. |
| pos: Cell<usize>, |
| /// The current codepoint of the parser. The codepoint corresponds to the |
| /// codepoint encoded in `pattern` beginning at `pos`. |
| /// |
| /// This is `None` if and only if `pos == pattern.len()`. |
| char: Cell<Option<char>>, |
| /// The current capture index. |
| capture_index: Cell<u32>, |
| /// The flags that are currently set. |
| flags: RefCell<Flags>, |
| /// A sorted sequence of capture names. This is used to detect duplicate |
| /// capture names and report an error if one is detected. |
| capture_names: RefCell<Vec<String>>, |
| } |
| |
| /// The constructor and a variety of helper routines. |
| impl<'a> Parser<'a> { |
| /// Build a parser from this configuration with the given pattern. |
| pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> { |
| Parser { |
| config, |
| pattern, |
| depth: Cell::new(0), |
| pos: Cell::new(0), |
| char: Cell::new(pattern.chars().next()), |
| capture_index: Cell::new(0), |
| flags: RefCell::new(config.flags), |
| capture_names: RefCell::new(vec![]), |
| } |
| } |
| |
| /// Returns the full pattern string that we're parsing. |
| fn pattern(&self) -> &str { |
| self.pattern |
| } |
| |
| /// Return the current byte offset of the parser. |
| /// |
| /// The offset starts at `0` from the beginning of the regular expression |
| /// pattern string. |
| fn pos(&self) -> usize { |
| self.pos.get() |
| } |
| |
| /// Increments the call depth of the parser. |
| /// |
| /// If the call depth would exceed the configured nest limit, then this |
| /// returns an error. |
| /// |
| /// This returns the old depth. |
| fn increment_depth(&self) -> Result<u32, Error> { |
| let old = self.depth.get(); |
| if old > self.config.nest_limit { |
| return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
| } |
| // OK because our depth starts at 0, and we return an error if it |
| // ever reaches the limit. So the call depth can never exceed u32::MAX. |
| let new = old.checked_add(1).unwrap(); |
| self.depth.set(new); |
| Ok(old) |
| } |
| |
| /// Decrements the call depth of the parser. |
| /// |
| /// This panics if the current depth is 0. |
| fn decrement_depth(&self) { |
| let old = self.depth.get(); |
| // If this fails then the caller has a bug in how they're incrementing |
| // and decrementing the depth of the parser's call stack. |
| let new = old.checked_sub(1).unwrap(); |
| self.depth.set(new); |
| } |
| |
| /// Return the codepoint at the current position of the parser. |
| /// |
| /// This panics if the parser is positioned at the end of the pattern. |
| fn char(&self) -> char { |
| self.char.get().expect("codepoint, but parser is done") |
| } |
| |
| /// Returns true if the next call to `bump` would return false. |
| fn is_done(&self) -> bool { |
| self.pos() == self.pattern.len() |
| } |
| |
| /// Returns the flags that are current set for this regex. |
| fn flags(&self) -> Flags { |
| *self.flags.borrow() |
| } |
| |
| /// Bump the parser to the next Unicode scalar value. |
| /// |
| /// If the end of the input has been reached, then `false` is returned. |
| fn bump(&self) -> bool { |
| if self.is_done() { |
| return false; |
| } |
| self.pos.set(self.pos() + self.char().len_utf8()); |
| self.char.set(self.pattern()[self.pos()..].chars().next()); |
| self.char.get().is_some() |
| } |
| |
| /// If the substring starting at the current position of the parser has |
| /// the given prefix, then bump the parser to the character immediately |
| /// following the prefix and return true. Otherwise, don't bump the parser |
| /// and return false. |
| fn bump_if(&self, prefix: &str) -> bool { |
| if self.pattern()[self.pos()..].starts_with(prefix) { |
| for _ in 0..prefix.chars().count() { |
| self.bump(); |
| } |
| true |
| } else { |
| false |
| } |
| } |
| |
| /// Bump the parser, and if the `x` flag is enabled, bump through any |
| /// subsequent spaces. Return true if and only if the parser is not done. |
| fn bump_and_bump_space(&self) -> bool { |
| if !self.bump() { |
| return false; |
| } |
| self.bump_space(); |
| !self.is_done() |
| } |
| |
| /// If the `x` flag is enabled (i.e., whitespace insensitivity with |
| /// comments), then this will advance the parser through all whitespace |
| /// and comments to the next non-whitespace non-comment byte. |
| /// |
| /// If the `x` flag is disabled, then this is a no-op. |
| /// |
| /// This should be used selectively throughout the parser where |
| /// arbitrary whitespace is permitted when the `x` flag is enabled. For |
| /// example, `{ 5 , 6}` is equivalent to `{5,6}`. |
| fn bump_space(&self) { |
| if !self.flags().ignore_whitespace { |
| return; |
| } |
| while !self.is_done() { |
| if self.char().is_whitespace() { |
| self.bump(); |
| } else if self.char() == '#' { |
| self.bump(); |
| while !self.is_done() { |
| let c = self.char(); |
| self.bump(); |
| if c == '\n' { |
| break; |
| } |
| } |
| } else { |
| break; |
| } |
| } |
| } |
| |
| /// Peek at the next character in the input without advancing the parser. |
| /// |
| /// If the input has been exhausted, then this returns `None`. |
| fn peek(&self) -> Option<char> { |
| if self.is_done() { |
| return None; |
| } |
| self.pattern()[self.pos() + self.char().len_utf8()..].chars().next() |
| } |
| |
| /// Peeks at the next character in the pattern from the current offset, and |
| /// will ignore spaces when the parser is in whitespace insensitive mode. |
| fn peek_space(&self) -> Option<char> { |
| if !self.flags().ignore_whitespace { |
| return self.peek(); |
| } |
| if self.is_done() { |
| return None; |
| } |
| let mut start = self.pos() + self.char().len_utf8(); |
| let mut in_comment = false; |
| for (i, ch) in self.pattern()[start..].char_indices() { |
| if ch.is_whitespace() { |
| continue; |
| } else if !in_comment && ch == '#' { |
| in_comment = true; |
| } else if in_comment && ch == '\n' { |
| in_comment = false; |
| } else { |
| start += i; |
| break; |
| } |
| } |
| self.pattern()[start..].chars().next() |
| } |
| |
| /// Return the next capturing index. Each subsequent call increments the |
| /// internal index. Since the way capture indices are computed is a public |
| /// API guarantee, use of this routine depends on the parser being depth |
| /// first and left-to-right. |
| /// |
| /// If the capture limit is exceeded, then an error is returned. |
| fn next_capture_index(&self) -> Result<u32, Error> { |
| let current = self.capture_index.get(); |
| let next = current |
| .checked_add(1) |
| .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?; |
| self.capture_index.set(next); |
| Ok(next) |
| } |
| |
| /// Adds the given capture name to this parser. If this capture name has |
| /// already been used, then an error is returned. |
| fn add_capture_name(&self, name: &str) -> Result<(), Error> { |
| let mut names = self.capture_names.borrow_mut(); |
| match names.binary_search_by(|n| name.cmp(n)) { |
| Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)), |
| Err(i) => { |
| names.insert(i, name.to_string()); |
| Ok(()) |
| } |
| } |
| } |
| |
| /// Returns true if and only if the parser is positioned at a look-around |
| /// prefix. The conditions under which this returns true must always |
| /// correspond to a regular expression that would otherwise be consider |
| /// invalid. |
| /// |
| /// This should only be called immediately after parsing the opening of |
| /// a group or a set of flags. |
| fn is_lookaround_prefix(&self) -> bool { |
| self.bump_if("?=") |
| || self.bump_if("?!") |
| || self.bump_if("?<=") |
| || self.bump_if("?<!") |
| } |
| } |
| |
| /// The actual parser. We try to break out each kind of regex syntax into its |
| /// own routine. |
| impl<'a> Parser<'a> { |
| pub(super) fn parse(&self) -> Result<Hir, Error> { |
| let hir = self.parse_inner()?; |
| // While we also check nesting during parsing, that only checks the |
| // number of recursive parse calls. It does not necessarily cover |
| // all possible recursive nestings of the Hir itself. For example, |
| // repetition operators don't require recursive parse calls. So one |
| // can stack them arbitrarily without overflowing the stack in the |
| // *parser*. But then if one recurses over the resulting Hir, a stack |
| // overflow is possible. So here we check the Hir nesting level |
| // thoroughly to ensure it isn't nested too deeply. |
| // |
| // Note that we do still need the nesting limit check in the parser as |
| // well, since that will avoid overflowing the stack during parse time |
| // before the complete Hir value is constructed. |
| check_hir_nesting(&hir, self.config.nest_limit)?; |
| Ok(hir) |
| } |
| |
| fn parse_inner(&self) -> Result<Hir, Error> { |
| let depth = self.increment_depth()?; |
| let mut alternates = vec![]; |
| let mut concat = vec![]; |
| loop { |
| self.bump_space(); |
| if self.is_done() { |
| break; |
| } |
| match self.char() { |
| '(' => { |
| // Save the old flags and reset them only when we close |
| // the group. |
| let oldflags = *self.flags.borrow(); |
| if let Some(sub) = self.parse_group()? { |
| concat.push(sub); |
| // We only reset them here because if 'parse_group' |
| // returns None, then that means it handled a flag |
| // directive, e.g., '(?ism)'. And the whole point is |
| // that those flags remain active until either disabled |
| // or the end of the pattern or current group. |
| *self.flags.borrow_mut() = oldflags; |
| } |
| if self.char.get() != Some(')') { |
| return Err(Error::new(ERR_UNCLOSED_GROUP)); |
| } |
| self.bump(); |
| } |
| ')' => { |
| if depth == 0 { |
| return Err(Error::new(ERR_UNOPENED_GROUP)); |
| } |
| break; |
| } |
| '|' => { |
| alternates.push(Hir::concat(core::mem::take(&mut concat))); |
| self.bump(); |
| } |
| '[' => concat.push(self.parse_class()?), |
| '?' | '*' | '+' => { |
| concat = self.parse_uncounted_repetition(concat)?; |
| } |
| '{' => { |
| concat = self.parse_counted_repetition(concat)?; |
| } |
| _ => concat.push(self.parse_primitive()?), |
| } |
| } |
| self.decrement_depth(); |
| alternates.push(Hir::concat(concat)); |
| // N.B. This strips off the "alternation" if there's only one branch. |
| Ok(Hir::alternation(alternates)) |
| } |
| |
| /// Parses a "primitive" pattern. A primitive is any expression that does |
| /// not contain any sub-expressions. |
| /// |
| /// This assumes the parser is pointing at the beginning of the primitive. |
| fn parse_primitive(&self) -> Result<Hir, Error> { |
| let ch = self.char(); |
| self.bump(); |
| match ch { |
| '\\' => self.parse_escape(), |
| '.' => Ok(self.hir_dot()), |
| '^' => Ok(self.hir_anchor_start()), |
| '$' => Ok(self.hir_anchor_end()), |
| ch => Ok(self.hir_char(ch)), |
| } |
| } |
| |
| /// Parse an escape sequence. This always results in a "primitive" HIR, |
| /// that is, an HIR with no sub-expressions. |
| /// |
| /// This assumes the parser is positioned at the start of the sequence, |
| /// immediately *after* the `\`. It advances the parser to the first |
| /// position immediately following the escape sequence. |
| fn parse_escape(&self) -> Result<Hir, Error> { |
| if self.is_done() { |
| return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF)); |
| } |
| let ch = self.char(); |
| // Put some of the more complicated routines into helpers. |
| match ch { |
| '0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)), |
| 'p' | 'P' => { |
| return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED)) |
| } |
| 'x' | 'u' | 'U' => return self.parse_hex(), |
| 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { |
| return Ok(self.parse_perl_class()); |
| } |
| _ => {} |
| } |
| |
| // Handle all of the one letter sequences inline. |
| self.bump(); |
| if hir::is_meta_character(ch) || hir::is_escapeable_character(ch) { |
| return Ok(self.hir_char(ch)); |
| } |
| let special = |ch| Ok(self.hir_char(ch)); |
| match ch { |
| 'a' => special('\x07'), |
| 'f' => special('\x0C'), |
| 't' => special('\t'), |
| 'n' => special('\n'), |
| 'r' => special('\r'), |
| 'v' => special('\x0B'), |
| 'A' => Ok(Hir::look(hir::Look::Start)), |
| 'z' => Ok(Hir::look(hir::Look::End)), |
| 'b' => { |
| let mut hir = Hir::look(hir::Look::Word); |
| if !self.is_done() && self.char() == '{' { |
| if let Some(special) = |
| self.maybe_parse_special_word_boundary()? |
| { |
| hir = special; |
| } |
| } |
| Ok(hir) |
| } |
| 'B' => Ok(Hir::look(hir::Look::WordNegate)), |
| '<' => Ok(Hir::look(hir::Look::WordStart)), |
| '>' => Ok(Hir::look(hir::Look::WordEnd)), |
| _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)), |
| } |
| } |
| |
| /// Attempt to parse a specialty word boundary. That is, `\b{start}`, |
| /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. |
| /// |
| /// This is similar to `maybe_parse_ascii_class` in that, in most cases, |
| /// if it fails it will just return `None` with no error. This is done |
| /// because `\b{5}` is a valid expression and we want to let that be parsed |
| /// by the existing counted repetition parsing code. (I thought about just |
| /// invoking the counted repetition code from here, but it seemed a little |
| /// ham-fisted.) |
| /// |
| /// Unlike `maybe_parse_ascii_class` though, this can return an error. |
| /// Namely, if we definitely know it isn't a counted repetition, then we |
| /// return an error specific to the specialty word boundaries. |
| /// |
| /// This assumes the parser is positioned at a `{` immediately following |
| /// a `\b`. When `None` is returned, the parser is returned to the position |
| /// at which it started: pointing at a `{`. |
| /// |
| /// The position given should correspond to the start of the `\b`. |
| fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> { |
| assert_eq!(self.char(), '{'); |
| |
| let is_valid_char = |c| match c { |
| 'A'..='Z' | 'a'..='z' | '-' => true, |
| _ => false, |
| }; |
| let start = self.pos(); |
| if !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF)); |
| } |
| // This is one of the critical bits: if the first non-whitespace |
| // character isn't in [-A-Za-z] (i.e., this can't be a special word |
| // boundary), then we bail and let the counted repetition parser deal |
| // with this. |
| if !is_valid_char(self.char()) { |
| self.pos.set(start); |
| self.char.set(Some('{')); |
| return Ok(None); |
| } |
| |
| // Now collect up our chars until we see a '}'. |
| let mut scratch = String::new(); |
| while !self.is_done() && is_valid_char(self.char()) { |
| scratch.push(self.char()); |
| self.bump_and_bump_space(); |
| } |
| if self.is_done() || self.char() != '}' { |
| return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED)); |
| } |
| self.bump(); |
| let kind = match scratch.as_str() { |
| "start" => hir::Look::WordStart, |
| "end" => hir::Look::WordEnd, |
| "start-half" => hir::Look::WordStartHalf, |
| "end-half" => hir::Look::WordEndHalf, |
| _ => { |
| return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED)) |
| } |
| }; |
| Ok(Some(Hir::look(kind))) |
| } |
| |
| /// Parse a hex representation of a Unicode codepoint. This handles both |
| /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to |
| /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to |
| /// the first character immediately following the hexadecimal literal. |
| fn parse_hex(&self) -> Result<Hir, Error> { |
| let digit_len = match self.char() { |
| 'x' => 2, |
| 'u' => 4, |
| 'U' => 8, |
| unk => unreachable!( |
| "invalid start of fixed length hexadecimal number {}", |
| unk |
| ), |
| }; |
| if !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_HEX_UNEXPECTED_EOF)); |
| } |
| if self.char() == '{' { |
| self.parse_hex_brace() |
| } else { |
| self.parse_hex_digits(digit_len) |
| } |
| } |
| |
| /// Parse an N-digit hex representation of a Unicode codepoint. This |
| /// expects the parser to be positioned at the first digit and will advance |
| /// the parser to the first character immediately following the escape |
| /// sequence. |
| /// |
| /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) |
| /// or 8 (for `\UNNNNNNNN`). |
| fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> { |
| let mut scratch = String::new(); |
| for i in 0..digit_len { |
| if i > 0 && !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF)); |
| } |
| if !is_hex(self.char()) { |
| return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT)); |
| } |
| scratch.push(self.char()); |
| } |
| // The final bump just moves the parser past the literal, which may |
| // be EOF. |
| self.bump_and_bump_space(); |
| match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { |
| None => Err(Error::new(ERR_HEX_FIXED_INVALID)), |
| Some(ch) => Ok(self.hir_char(ch)), |
| } |
| } |
| |
| /// Parse a hex representation of any Unicode scalar value. This expects |
| /// the parser to be positioned at the opening brace `{` and will advance |
| /// the parser to the first character following the closing brace `}`. |
| fn parse_hex_brace(&self) -> Result<Hir, Error> { |
| let mut scratch = String::new(); |
| while self.bump_and_bump_space() && self.char() != '}' { |
| if !is_hex(self.char()) { |
| return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT)); |
| } |
| scratch.push(self.char()); |
| } |
| if self.is_done() { |
| return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF)); |
| } |
| assert_eq!(self.char(), '}'); |
| self.bump_and_bump_space(); |
| |
| if scratch.is_empty() { |
| return Err(Error::new(ERR_HEX_BRACE_EMPTY)); |
| } |
| match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { |
| None => Err(Error::new(ERR_HEX_BRACE_INVALID)), |
| Some(ch) => Ok(self.hir_char(ch)), |
| } |
| } |
| |
| /// Parse a decimal number into a u32 while trimming leading and trailing |
| /// whitespace. |
| /// |
| /// This expects the parser to be positioned at the first position where |
| /// a decimal digit could occur. This will advance the parser to the byte |
| /// immediately following the last contiguous decimal digit. |
| /// |
| /// If no decimal digit could be found or if there was a problem parsing |
| /// the complete set of digits into a u32, then an error is returned. |
| fn parse_decimal(&self) -> Result<u32, Error> { |
| let mut scratch = String::new(); |
| while !self.is_done() && self.char().is_whitespace() { |
| self.bump(); |
| } |
| while !self.is_done() && '0' <= self.char() && self.char() <= '9' { |
| scratch.push(self.char()); |
| self.bump_and_bump_space(); |
| } |
| while !self.is_done() && self.char().is_whitespace() { |
| self.bump_and_bump_space(); |
| } |
| let digits = scratch.as_str(); |
| if digits.is_empty() { |
| return Err(Error::new(ERR_DECIMAL_NO_DIGITS)); |
| } |
| match u32::from_str_radix(digits, 10).ok() { |
| Some(n) => Ok(n), |
| None => Err(Error::new(ERR_DECIMAL_INVALID)), |
| } |
| } |
| |
| /// Parses an uncounted repetition operator. An uncounted repetition |
| /// operator includes `?`, `*` and `+`, but does not include the `{m,n}` |
| /// syntax. The current character should be one of `?`, `*` or `+`. Any |
| /// other character will result in a panic. |
| /// |
| /// This assumes that the parser is currently positioned at the repetition |
| /// operator and advances the parser to the first character after the |
| /// operator. (Note that the operator may include a single additional `?`, |
| /// which makes the operator ungreedy.) |
| /// |
| /// The caller should include the concatenation that is being built. The |
| /// concatenation returned includes the repetition operator applied to the |
| /// last expression in the given concatenation. |
| /// |
| /// If the concatenation is empty, then this returns an error. |
| fn parse_uncounted_repetition( |
| &self, |
| mut concat: Vec<Hir>, |
| ) -> Result<Vec<Hir>, Error> { |
| let sub = match concat.pop() { |
| Some(hir) => Box::new(hir), |
| None => { |
| return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING)); |
| } |
| }; |
| let (min, max) = match self.char() { |
| '?' => (0, Some(1)), |
| '*' => (0, None), |
| '+' => (1, None), |
| unk => unreachable!("unrecognized repetition operator '{}'", unk), |
| }; |
| let mut greedy = true; |
| if self.bump() && self.char() == '?' { |
| greedy = false; |
| self.bump(); |
| } |
| if self.flags().swap_greed { |
| greedy = !greedy; |
| } |
| concat.push(Hir::repetition(hir::Repetition { |
| min, |
| max, |
| greedy, |
| sub, |
| })); |
| Ok(concat) |
| } |
| |
| /// Parses a counted repetition operation. A counted repetition operator |
| /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*` or |
| /// `+` operators. |
| /// |
| /// This assumes that the parser is currently at the opening `{` and |
| /// advances the parser to the first character after the operator. (Note |
| /// that the operator may include a single additional `?`, which makes the |
| /// operator ungreedy.) |
| /// |
| /// The caller should include the concatenation that is being built. The |
| /// concatenation returned includes the repetition operator applied to the |
| /// last expression in the given concatenation. |
| /// |
| /// If the concatenation is empty, then this returns an error. |
| fn parse_counted_repetition( |
| &self, |
| mut concat: Vec<Hir>, |
| ) -> Result<Vec<Hir>, Error> { |
| assert_eq!(self.char(), '{', "expected opening brace"); |
| let sub = match concat.pop() { |
| Some(hir) => Box::new(hir), |
| None => { |
| return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING)); |
| } |
| }; |
| if !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_COUNTED_REP_UNCLOSED)); |
| } |
| let min = self.parse_decimal()?; |
| let mut max = Some(min); |
| if self.is_done() { |
| return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED)); |
| } |
| if self.char() == ',' { |
| if !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED)); |
| } |
| if self.char() != '}' { |
| max = Some(self.parse_decimal()?); |
| } else { |
| max = None; |
| } |
| if self.is_done() { |
| return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED)); |
| } |
| } |
| if self.char() != '}' { |
| return Err(Error::new(ERR_COUNTED_REP_INVALID)); |
| } |
| |
| let mut greedy = true; |
| if self.bump_and_bump_space() && self.char() == '?' { |
| greedy = false; |
| self.bump(); |
| } |
| if self.flags().swap_greed { |
| greedy = !greedy; |
| } |
| |
| if max.map_or(false, |max| min > max) { |
| return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE)); |
| } |
| concat.push(Hir::repetition(hir::Repetition { |
| min, |
| max, |
| greedy, |
| sub, |
| })); |
| Ok(concat) |
| } |
| |
| /// Parses the part of a pattern that starts with a `(`. This is usually |
| /// a group sub-expression, but might just be a directive that enables |
| /// (or disables) certain flags. |
| /// |
| /// This assumes the parser is pointing at the opening `(`. |
| fn parse_group(&self) -> Result<Option<Hir>, Error> { |
| assert_eq!(self.char(), '('); |
| self.bump_and_bump_space(); |
| if self.is_lookaround_prefix() { |
| return Err(Error::new(ERR_LOOK_UNSUPPORTED)); |
| } |
| if self.bump_if("?P<") || self.bump_if("?<") { |
| let index = self.next_capture_index()?; |
| let name = Some(Box::from(self.parse_capture_name()?)); |
| let sub = Box::new(self.parse_inner()?); |
| let cap = hir::Capture { index, name, sub }; |
| Ok(Some(Hir::capture(cap))) |
| } else if self.bump_if("?") { |
| if self.is_done() { |
| return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION)); |
| } |
| let start = self.pos(); |
| // The flags get reset in the top-level 'parse' routine. |
| *self.flags.borrow_mut() = self.parse_flags()?; |
| let consumed = self.pos() - start; |
| if self.char() == ')' { |
| // We don't allow empty flags, e.g., `(?)`. |
| if consumed == 0 { |
| return Err(Error::new(ERR_EMPTY_FLAGS)); |
| } |
| Ok(None) |
| } else { |
| assert_eq!(':', self.char()); |
| self.bump(); |
| self.parse_inner().map(Some) |
| } |
| } else { |
| let index = self.next_capture_index()?; |
| let sub = Box::new(self.parse_inner()?); |
| let cap = hir::Capture { index, name: None, sub }; |
| Ok(Some(Hir::capture(cap))) |
| } |
| } |
| |
| /// Parses a capture group name. Assumes that the parser is positioned at |
| /// the first character in the name following the opening `<` (and may |
| /// possibly be EOF). This advances the parser to the first character |
| /// following the closing `>`. |
| fn parse_capture_name(&self) -> Result<&str, Error> { |
| if self.is_done() { |
| return Err(Error::new(ERR_MISSING_GROUP_NAME)); |
| } |
| let start = self.pos(); |
| loop { |
| if self.char() == '>' { |
| break; |
| } |
| if !is_capture_char(self.char(), self.pos() == start) { |
| return Err(Error::new(ERR_INVALID_GROUP_NAME)); |
| } |
| if !self.bump() { |
| break; |
| } |
| } |
| let end = self.pos(); |
| if self.is_done() { |
| return Err(Error::new(ERR_UNCLOSED_GROUP_NAME)); |
| } |
| assert_eq!(self.char(), '>'); |
| self.bump(); |
| let name = &self.pattern()[start..end]; |
| if name.is_empty() { |
| return Err(Error::new(ERR_EMPTY_GROUP_NAME)); |
| } |
| self.add_capture_name(name)?; |
| Ok(name) |
| } |
| |
| /// Parse a sequence of flags starting at the current character. |
| /// |
| /// This advances the parser to the character immediately following the |
| /// flags, which is guaranteed to be either `:` or `)`. |
| /// |
| /// # Errors |
| /// |
| /// If any flags are duplicated, then an error is returned. |
| /// |
| /// If the negation operator is used more than once, then an error is |
| /// returned. |
| /// |
| /// If no flags could be found or if the negation operation is not followed |
| /// by any flags, then an error is returned. |
| fn parse_flags(&self) -> Result<Flags, Error> { |
| let mut flags = *self.flags.borrow(); |
| let mut negate = false; |
| // Keeps track of whether the previous flag item was a '-'. We use this |
| // to detect whether there is a dangling '-', which is invalid. |
| let mut last_was_negation = false; |
| // A set to keep track of the flags we've seen. Since all flags are |
| // ASCII, we only need 128 bytes. |
| let mut seen = [false; 128]; |
| while self.char() != ':' && self.char() != ')' { |
| if self.char() == '-' { |
| last_was_negation = true; |
| if negate { |
| return Err(Error::new(ERR_FLAG_REPEATED_NEGATION)); |
| } |
| negate = true; |
| } else { |
| last_was_negation = false; |
| self.parse_flag(&mut flags, negate)?; |
| // OK because every valid flag is ASCII, and we're only here if |
| // the flag is valid. |
| let flag_byte = u8::try_from(self.char()).unwrap(); |
| if seen[usize::from(flag_byte)] { |
| return Err(Error::new(ERR_FLAG_DUPLICATE)); |
| } |
| seen[usize::from(flag_byte)] = true; |
| } |
| if !self.bump() { |
| return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF)); |
| } |
| } |
| if last_was_negation { |
| return Err(Error::new(ERR_FLAG_DANGLING_NEGATION)); |
| } |
| Ok(flags) |
| } |
| |
| /// Parse the current character as a flag. Do not advance the parser. |
| /// |
| /// This sets the appropriate boolean value in place on the set of flags |
| /// given. The boolean is inverted when `negate` is true. |
| /// |
| /// # Errors |
| /// |
| /// If the flag is not recognized, then an error is returned. |
| fn parse_flag( |
| &self, |
| flags: &mut Flags, |
| negate: bool, |
| ) -> Result<(), Error> { |
| let enabled = !negate; |
| match self.char() { |
| 'i' => flags.case_insensitive = enabled, |
| 'm' => flags.multi_line = enabled, |
| 's' => flags.dot_matches_new_line = enabled, |
| 'U' => flags.swap_greed = enabled, |
| 'R' => flags.crlf = enabled, |
| 'x' => flags.ignore_whitespace = enabled, |
| // We make a special exception for this flag where we let it |
| // through as a recognized flag, but treat it as a no-op. This in |
| // practice retains some compatibility with the regex crate. It is |
| // a little suspect to do this, but for example, '(?-u:\b).+' in |
| // the regex crate is equivalent to '\b.+' in regex-lite. |
| 'u' => {} |
| _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)), |
| } |
| Ok(()) |
| } |
| |
| /// Parse a standard character class consisting primarily of characters or |
| /// character ranges. |
| /// |
| /// This assumes the parser is positioned at the opening `[`. If parsing |
| /// is successful, then the parser is advanced to the position immediately |
| /// following the closing `]`. |
| fn parse_class(&self) -> Result<Hir, Error> { |
| assert_eq!(self.char(), '['); |
| |
| let mut union = vec![]; |
| if !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_CLASS_UNCLOSED)); |
| } |
| // Determine whether the class is negated or not. |
| let negate = if self.char() != '^' { |
| false |
| } else { |
| if !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION)); |
| } |
| true |
| }; |
| // Accept any number of `-` as literal `-`. |
| while self.char() == '-' { |
| union.push(hir::ClassRange { start: '-', end: '-' }); |
| if !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); |
| } |
| } |
| // If `]` is the *first* char in a set, then interpret it as a literal |
| // `]`. That is, an empty class is impossible to write. |
| if union.is_empty() && self.char() == ']' { |
| union.push(hir::ClassRange { start: ']', end: ']' }); |
| if !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING)); |
| } |
| } |
| loop { |
| self.bump_space(); |
| if self.is_done() { |
| return Err(Error::new(ERR_CLASS_UNCLOSED)); |
| } |
| match self.char() { |
| '[' => { |
| // Attempt to treat this as the beginning of a POSIX class. |
| // If POSIX class parsing fails, then the parser backs up |
| // to `[`. |
| if let Some(class) = self.maybe_parse_posix_class() { |
| union.extend_from_slice(&class.ranges); |
| continue; |
| } |
| // ... otherwise we don't support nested classes. |
| return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED)); |
| } |
| ']' => { |
| self.bump(); |
| let mut class = hir::Class::new(union); |
| // Note that we must apply case folding before negation! |
| // Consider `(?i)[^x]`. If we applied negation first, then |
| // the result would be the character class that matched any |
| // Unicode scalar value. |
| if self.flags().case_insensitive { |
| class.ascii_case_fold(); |
| } |
| if negate { |
| class.negate(); |
| } |
| return Ok(Hir::class(class)); |
| } |
| '&' if self.peek() == Some('&') => { |
| return Err(Error::new( |
| ERR_CLASS_INTERSECTION_UNSUPPORTED, |
| )); |
| } |
| '-' if self.peek() == Some('-') => { |
| return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED)); |
| } |
| '~' if self.peek() == Some('~') => { |
| return Err(Error::new( |
| ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, |
| )); |
| } |
| _ => self.parse_class_range(&mut union)?, |
| } |
| } |
| } |
| |
| /// Parse a single primitive item in a character class set. The item to |
| /// be parsed can either be one of a simple literal character, a range |
| /// between two simple literal characters or a "primitive" character |
| /// class like `\w`. |
| /// |
| /// If an invalid escape is found, or if a character class is found where |
| /// a simple literal is expected (e.g., in a range), then an error is |
| /// returned. |
| /// |
| /// Otherwise, the range (or ranges) are appended to the given union of |
| /// ranges. |
| fn parse_class_range( |
| &self, |
| union: &mut Vec<hir::ClassRange>, |
| ) -> Result<(), Error> { |
| let prim1 = self.parse_class_item()?; |
| self.bump_space(); |
| if self.is_done() { |
| return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM)); |
| } |
| // If the next char isn't a `-`, then we don't have a range. |
| // There are two exceptions. If the char after a `-` is a `]`, then |
| // `-` is interpreted as a literal `-`. Alternatively, if the char |
| // after a `-` is a `-`, then `--` corresponds to a "difference" |
| // operation. (Which we don't support in regex-lite, but error about |
| // specifically in an effort to be loud about differences between the |
| // main regex crate where possible.) |
| if self.char() != '-' |
| || self.peek_space() == Some(']') |
| || self.peek_space() == Some('-') |
| { |
| union.extend_from_slice(&into_class_item_ranges(prim1)?); |
| return Ok(()); |
| } |
| // OK, now we're parsing a range, so bump past the `-` and parse the |
| // second half of the range. |
| if !self.bump_and_bump_space() { |
| return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); |
| } |
| let prim2 = self.parse_class_item()?; |
| let range = hir::ClassRange { |
| start: into_class_item_range(prim1)?, |
| end: into_class_item_range(prim2)?, |
| }; |
| if range.start > range.end { |
| return Err(Error::new(ERR_CLASS_INVALID_RANGE)); |
| } |
| union.push(range); |
| Ok(()) |
| } |
| |
| /// Parse a single item in a character class as a primitive, where the |
| /// primitive either consists of a verbatim literal or a single escape |
| /// sequence. |
| /// |
| /// This assumes the parser is positioned at the beginning of a primitive, |
| /// and advances the parser to the first position after the primitive if |
| /// successful. |
| /// |
| /// Note that it is the caller's responsibility to report an error if an |
| /// illegal primitive was parsed. |
| fn parse_class_item(&self) -> Result<Hir, Error> { |
| let ch = self.char(); |
| self.bump(); |
| if ch == '\\' { |
| self.parse_escape() |
| } else { |
| Ok(Hir::char(ch)) |
| } |
| } |
| |
| /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`. |
| /// |
| /// This assumes the parser is positioned at the opening `[`. |
| /// |
| /// If no valid POSIX character class could be found, then this does not |
| /// advance the parser and `None` is returned. Otherwise, the parser is |
| /// advanced to the first byte following the closing `]` and the |
| /// corresponding POSIX class is returned. |
| fn maybe_parse_posix_class(&self) -> Option<hir::Class> { |
| // POSIX character classes are interesting from a parsing perspective |
| // because parsing cannot fail with any interesting error. For example, |
| // in order to use an POSIX character class, it must be enclosed in |
| // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think |
| // of it as "POSIX character classes have the syntax `[:NAME:]` which |
| // can only appear within character brackets." This means that things |
| // like `[[:lower:]A]` are legal constructs. |
| // |
| // However, if one types an incorrect POSIX character class, e.g., |
| // `[[:loower:]]`, then we treat that as if it were normal nested |
| // character class containing the characters `:elorw`. (Which isn't |
| // supported and results in an error in regex-lite.) One might argue |
| // that we should return an error instead since the repeated colons |
| // give away the intent to write an POSIX class. But what if the user |
| // typed `[[:lower]]` instead? How can we tell that was intended to be |
| // a POSXI class and not just a normal nested class? |
| // |
| // Reasonable people can probably disagree over this, but for better |
| // or worse, we implement semantics that never fails at the expense of |
| // better failure modes. |
| assert_eq!(self.char(), '['); |
| |
| // If parsing fails, then we back up the parser to this starting point. |
| let start_pos = self.pos(); |
| let start_char = self.char.get(); |
| let reset = || { |
| self.pos.set(start_pos); |
| self.char.set(start_char); |
| }; |
| |
| let mut negated = false; |
| if !self.bump() || self.char() != ':' { |
| reset(); |
| return None; |
| } |
| if !self.bump() { |
| reset(); |
| return None; |
| } |
| if self.char() == '^' { |
| negated = true; |
| if !self.bump() { |
| reset(); |
| return None; |
| } |
| } |
| let name_start = self.pos(); |
| while self.char() != ':' && self.bump() {} |
| if self.is_done() { |
| reset(); |
| return None; |
| } |
| let name = &self.pattern()[name_start..self.pos()]; |
| if !self.bump_if(":]") { |
| reset(); |
| return None; |
| } |
| if let Ok(ranges) = posix_class(name) { |
| let mut class = hir::Class::new(ranges); |
| if negated { |
| class.negate(); |
| } |
| return Some(class); |
| } |
| reset(); |
| None |
| } |
| |
| /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the |
| /// parser is currently at a valid character class name and will be |
| /// advanced to the character immediately following the class. |
| fn parse_perl_class(&self) -> Hir { |
| let ch = self.char(); |
| self.bump(); |
| let mut class = hir::Class::new(match ch { |
| 'd' | 'D' => posix_class("digit").unwrap(), |
| 's' | 'S' => posix_class("space").unwrap(), |
| 'w' | 'W' => posix_class("word").unwrap(), |
| unk => unreachable!("invalid Perl class \\{}", unk), |
| }); |
| if ch.is_ascii_uppercase() { |
| class.negate(); |
| } |
| Hir::class(class) |
| } |
| |
| fn hir_dot(&self) -> Hir { |
| if self.flags().dot_matches_new_line { |
| Hir::class(hir::Class::new([hir::ClassRange { |
| start: '\x00', |
| end: '\u{10FFFF}', |
| }])) |
| } else if self.flags().crlf { |
| Hir::class(hir::Class::new([ |
| hir::ClassRange { start: '\x00', end: '\x09' }, |
| hir::ClassRange { start: '\x0B', end: '\x0C' }, |
| hir::ClassRange { start: '\x0E', end: '\u{10FFFF}' }, |
| ])) |
| } else { |
| Hir::class(hir::Class::new([ |
| hir::ClassRange { start: '\x00', end: '\x09' }, |
| hir::ClassRange { start: '\x0B', end: '\u{10FFFF}' }, |
| ])) |
| } |
| } |
| |
| fn hir_anchor_start(&self) -> Hir { |
| let look = if self.flags().multi_line { |
| if self.flags().crlf { |
| hir::Look::StartCRLF |
| } else { |
| hir::Look::StartLF |
| } |
| } else { |
| hir::Look::Start |
| }; |
| Hir::look(look) |
| } |
| |
| fn hir_anchor_end(&self) -> Hir { |
| let look = if self.flags().multi_line { |
| if self.flags().crlf { |
| hir::Look::EndCRLF |
| } else { |
| hir::Look::EndLF |
| } |
| } else { |
| hir::Look::End |
| }; |
| Hir::look(look) |
| } |
| |
| fn hir_char(&self, ch: char) -> Hir { |
| if self.flags().case_insensitive { |
| let this = hir::ClassRange { start: ch, end: ch }; |
| if let Some(folded) = this.ascii_case_fold() { |
| return Hir::class(hir::Class::new([this, folded])); |
| } |
| } |
| Hir::char(ch) |
| } |
| } |
| |
| /// This checks the depth of the given `Hir` value, and if it exceeds the given |
| /// limit, then an error is returned. |
| fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> { |
| fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> { |
| if depth > limit { |
| return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
| } |
| let Some(next_depth) = depth.checked_add(1) else { |
| return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
| }; |
| match *hir.kind() { |
| HirKind::Empty |
| | HirKind::Char(_) |
| | HirKind::Class(_) |
| | HirKind::Look(_) => Ok(()), |
| HirKind::Repetition(hir::Repetition { ref sub, .. }) => { |
| recurse(sub, limit, next_depth) |
| } |
| HirKind::Capture(hir::Capture { ref sub, .. }) => { |
| recurse(sub, limit, next_depth) |
| } |
| HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => { |
| for sub in subs.iter() { |
| recurse(sub, limit, next_depth)?; |
| } |
| Ok(()) |
| } |
| } |
| } |
| recurse(hir, limit, 0) |
| } |
| |
| /// Converts the given Hir to a literal char if the Hir is just a single |
| /// character. Otherwise this returns an error. |
| /// |
| /// This is useful in contexts where you can only accept a single character, |
| /// but where it is convenient to parse something more general. For example, |
| /// parsing a single part of a character class range. It's useful to reuse |
| /// the literal parsing code, but that code can itself return entire classes |
| /// which can't be used as the start/end of a class range. |
| fn into_class_item_range(hir: Hir) -> Result<char, Error> { |
| match hir.kind { |
| HirKind::Char(ch) => Ok(ch), |
| _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)), |
| } |
| } |
| |
| fn into_class_item_ranges( |
| mut hir: Hir, |
| ) -> Result<Vec<hir::ClassRange>, Error> { |
| match core::mem::replace(&mut hir.kind, HirKind::Empty) { |
| HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]), |
| HirKind::Class(hir::Class { ranges }) => Ok(ranges), |
| _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)), |
| } |
| } |
| |
| /// Returns an iterator of character class ranges for the given named POSIX |
| /// character class. If no such character class exists for the name given, then |
| /// an error is returned. |
| fn posix_class( |
| kind: &str, |
| ) -> Result<impl Iterator<Item = hir::ClassRange>, Error> { |
| let slice: &'static [(u8, u8)] = match kind { |
| "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], |
| "alpha" => &[(b'A', b'Z'), (b'a', b'z')], |
| "ascii" => &[(b'\x00', b'\x7F')], |
| "blank" => &[(b'\t', b'\t'), (b' ', b' ')], |
| "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], |
| "digit" => &[(b'0', b'9')], |
| "graph" => &[(b'!', b'~')], |
| "lower" => &[(b'a', b'z')], |
| "print" => &[(b' ', b'~')], |
| "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], |
| "space" => &[ |
| (b'\t', b'\t'), |
| (b'\n', b'\n'), |
| (b'\x0B', b'\x0B'), |
| (b'\x0C', b'\x0C'), |
| (b'\r', b'\r'), |
| (b' ', b' '), |
| ], |
| "upper" => &[(b'A', b'Z')], |
| "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], |
| "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], |
| _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)), |
| }; |
| Ok(slice.iter().map(|&(start, end)| hir::ClassRange { |
| start: char::from(start), |
| end: char::from(end), |
| })) |
| } |
| |
| /// Returns true if the given character is a hexadecimal digit. |
| fn is_hex(c: char) -> bool { |
| ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') |
| } |
| |
| /// Returns true if the given character is a valid in a capture group name. |
| /// |
| /// If `first` is true, then `c` is treated as the first character in the |
| /// group name (which must be alphabetic or underscore). |
| fn is_capture_char(c: char, first: bool) -> bool { |
| if first { |
| c == '_' || c.is_alphabetic() |
| } else { |
| c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| |
| fn p(pattern: &str) -> Hir { |
| Parser::new(Config::default(), pattern).parse_inner().unwrap() |
| } |
| |
| fn perr(pattern: &str) -> String { |
| Parser::new(Config::default(), pattern) |
| .parse_inner() |
| .unwrap_err() |
| .to_string() |
| } |
| |
| fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir { |
| Hir::class(hir::Class::new( |
| it.into_iter().map(|(start, end)| hir::ClassRange { start, end }), |
| )) |
| } |
| |
| fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir { |
| Hir::class(hir::Class::new( |
| it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }), |
| )) |
| } |
| |
| fn posix(name: &str) -> Hir { |
| Hir::class(hir::Class::new(posix_class(name).unwrap())) |
| } |
| |
| fn cap(index: u32, sub: Hir) -> Hir { |
| Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) }) |
| } |
| |
| fn named_cap(index: u32, name: &str, sub: Hir) -> Hir { |
| Hir::capture(hir::Capture { |
| index, |
| name: Some(Box::from(name)), |
| sub: Box::new(sub), |
| }) |
| } |
| |
| #[test] |
| fn ok_literal() { |
| assert_eq!(p("a"), Hir::char('a')); |
| assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')])); |
| assert_eq!(p("💩"), Hir::char('💩')); |
| } |
| |
| #[test] |
| fn ok_meta_escapes() { |
| assert_eq!(p(r"\*"), Hir::char('*')); |
| assert_eq!(p(r"\+"), Hir::char('+')); |
| assert_eq!(p(r"\?"), Hir::char('?')); |
| assert_eq!(p(r"\|"), Hir::char('|')); |
| assert_eq!(p(r"\("), Hir::char('(')); |
| assert_eq!(p(r"\)"), Hir::char(')')); |
| assert_eq!(p(r"\^"), Hir::char('^')); |
| assert_eq!(p(r"\$"), Hir::char('$')); |
| assert_eq!(p(r"\["), Hir::char('[')); |
| assert_eq!(p(r"\]"), Hir::char(']')); |
| } |
| |
| #[test] |
| fn ok_special_escapes() { |
| assert_eq!(p(r"\a"), Hir::char('\x07')); |
| assert_eq!(p(r"\f"), Hir::char('\x0C')); |
| assert_eq!(p(r"\t"), Hir::char('\t')); |
| assert_eq!(p(r"\n"), Hir::char('\n')); |
| assert_eq!(p(r"\r"), Hir::char('\r')); |
| assert_eq!(p(r"\v"), Hir::char('\x0B')); |
| assert_eq!(p(r"\A"), Hir::look(hir::Look::Start)); |
| assert_eq!(p(r"\z"), Hir::look(hir::Look::End)); |
| assert_eq!(p(r"\b"), Hir::look(hir::Look::Word)); |
| assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate)); |
| } |
| |
| #[test] |
| fn ok_hex() { |
| // fixed length |
| assert_eq!(p(r"\x41"), Hir::char('A')); |
| assert_eq!(p(r"\u2603"), Hir::char('☃')); |
| assert_eq!(p(r"\U0001F4A9"), Hir::char('💩')); |
| // braces |
| assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩')); |
| assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩')); |
| assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩')); |
| } |
| |
| #[test] |
| fn ok_perl() { |
| assert_eq!(p(r"\d"), posix("digit")); |
| assert_eq!(p(r"\s"), posix("space")); |
| assert_eq!(p(r"\w"), posix("word")); |
| |
| let negated = |name| { |
| let mut class = hir::Class::new(posix_class(name).unwrap()); |
| class.negate(); |
| Hir::class(class) |
| }; |
| assert_eq!(p(r"\D"), negated("digit")); |
| assert_eq!(p(r"\S"), negated("space")); |
| assert_eq!(p(r"\W"), negated("word")); |
| } |
| |
| #[test] |
| fn ok_flags_and_primitives() { |
| assert_eq!(p(r"a"), Hir::char('a')); |
| assert_eq!(p(r"(?i:a)"), singles(['A', 'a'])); |
| |
| assert_eq!(p(r"^"), Hir::look(hir::Look::Start)); |
| assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF)); |
| assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF)); |
| |
| assert_eq!(p(r"$"), Hir::look(hir::Look::End)); |
| assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF)); |
| assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF)); |
| |
| assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')])); |
| assert_eq!( |
| p(r"(?R:.)"), |
| class([ |
| ('\x00', '\x09'), |
| ('\x0B', '\x0C'), |
| ('\x0E', '\u{10FFFF}'), |
| ]) |
| ); |
| assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')])); |
| assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')])); |
| } |
| |
| #[test] |
| fn ok_alternate() { |
| assert_eq!( |
| p(r"a|b"), |
| Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) |
| ); |
| assert_eq!( |
| p(r"(?:a|b)"), |
| Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) |
| ); |
| |
| assert_eq!( |
| p(r"(a|b)"), |
| cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')])) |
| ); |
| assert_eq!( |
| p(r"(?<foo>a|b)"), |
| named_cap( |
| 1, |
| "foo", |
| Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) |
| ) |
| ); |
| |
| assert_eq!( |
| p(r"a|b|c"), |
| Hir::alternation(vec![ |
| Hir::char('a'), |
| Hir::char('b'), |
| Hir::char('c') |
| ]) |
| ); |
| |
| assert_eq!( |
| p(r"ax|by|cz"), |
| Hir::alternation(vec![ |
| Hir::concat(vec![Hir::char('a'), Hir::char('x')]), |
| Hir::concat(vec![Hir::char('b'), Hir::char('y')]), |
| Hir::concat(vec![Hir::char('c'), Hir::char('z')]), |
| ]) |
| ); |
| assert_eq!( |
| p(r"(ax|(by|(cz)))"), |
| cap( |
| 1, |
| Hir::alternation(vec![ |
| Hir::concat(vec![Hir::char('a'), Hir::char('x')]), |
| cap( |
| 2, |
| Hir::alternation(vec![ |
| Hir::concat(vec![Hir::char('b'), Hir::char('y')]), |
| cap( |
| 3, |
| Hir::concat(vec![ |
| Hir::char('c'), |
| Hir::char('z') |
| ]) |
| ), |
| ]) |
| ), |
| ]) |
| ) |
| ); |
| |
| assert_eq!( |
| p(r"|"), |
| Hir::alternation(vec![Hir::empty(), Hir::empty()]) |
| ); |
| assert_eq!( |
| p(r"||"), |
| Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()]) |
| ); |
| |
| assert_eq!( |
| p(r"a|"), |
| Hir::alternation(vec![Hir::char('a'), Hir::empty()]) |
| ); |
| assert_eq!( |
| p(r"|a"), |
| Hir::alternation(vec![Hir::empty(), Hir::char('a')]) |
| ); |
| |
| assert_eq!( |
| p(r"(|)"), |
| cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()])) |
| ); |
| assert_eq!( |
| p(r"(a|)"), |
| cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()])) |
| ); |
| assert_eq!( |
| p(r"(|a)"), |
| cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')])) |
| ); |
| } |
| |
| #[test] |
| fn ok_flag_group() { |
| assert_eq!( |
| p("a(?i:b)"), |
| Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])]) |
| ); |
| } |
| |
| #[test] |
| fn ok_flag_directive() { |
| assert_eq!(p("(?i)a"), singles(['A', 'a'])); |
| assert_eq!(p("a(?i)"), Hir::char('a')); |
| assert_eq!( |
| p("a(?i)b"), |
| Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])]) |
| ); |
| assert_eq!( |
| p("a(?i)a(?-i)a"), |
| Hir::concat(vec![ |
| Hir::char('a'), |
| singles(['A', 'a']), |
| Hir::char('a'), |
| ]) |
| ); |
| assert_eq!( |
| p("a(?:(?i)a)a"), |
| Hir::concat(vec![ |
| Hir::char('a'), |
| singles(['A', 'a']), |
| Hir::char('a'), |
| ]) |
| ); |
| assert_eq!( |
| p("a((?i)a)a"), |
| Hir::concat(vec![ |
| Hir::char('a'), |
| cap(1, singles(['A', 'a'])), |
| Hir::char('a'), |
| ]) |
| ); |
| } |
| |
| #[test] |
| fn ok_uncounted_repetition() { |
| assert_eq!( |
| p(r"a?"), |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: Some(1), |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| assert_eq!( |
| p(r"a*"), |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: None, |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| assert_eq!( |
| p(r"a+"), |
| Hir::repetition(hir::Repetition { |
| min: 1, |
| max: None, |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| |
| assert_eq!( |
| p(r"a??"), |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: Some(1), |
| greedy: false, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| assert_eq!( |
| p(r"a*?"), |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: None, |
| greedy: false, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| assert_eq!( |
| p(r"a+?"), |
| Hir::repetition(hir::Repetition { |
| min: 1, |
| max: None, |
| greedy: false, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| |
| assert_eq!( |
| p(r"a?b"), |
| Hir::concat(vec![ |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: Some(1), |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }), |
| Hir::char('b'), |
| ]), |
| ); |
| |
| assert_eq!( |
| p(r"ab?"), |
| Hir::concat(vec![ |
| Hir::char('a'), |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: Some(1), |
| greedy: true, |
| sub: Box::new(Hir::char('b')), |
| }), |
| ]), |
| ); |
| |
| assert_eq!( |
| p(r"(?:ab)?"), |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: Some(1), |
| greedy: true, |
| sub: Box::new(Hir::concat(vec![ |
| Hir::char('a'), |
| Hir::char('b') |
| ])), |
| }), |
| ); |
| |
| assert_eq!( |
| p(r"(ab)?"), |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: Some(1), |
| greedy: true, |
| sub: Box::new(cap( |
| 1, |
| Hir::concat(vec![Hir::char('a'), Hir::char('b')]) |
| )), |
| }), |
| ); |
| |
| assert_eq!( |
| p(r"|a?"), |
| Hir::alternation(vec![ |
| Hir::empty(), |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: Some(1), |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }) |
| ]), |
| ); |
| } |
| |
| #[test] |
| fn ok_counted_repetition() { |
| assert_eq!( |
| p(r"a{5}"), |
| Hir::repetition(hir::Repetition { |
| min: 5, |
| max: Some(5), |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| assert_eq!( |
| p(r"a{5}?"), |
| Hir::repetition(hir::Repetition { |
| min: 5, |
| max: Some(5), |
| greedy: false, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| |
| assert_eq!( |
| p(r"a{5,}"), |
| Hir::repetition(hir::Repetition { |
| min: 5, |
| max: None, |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| |
| assert_eq!( |
| p(r"a{5,9}"), |
| Hir::repetition(hir::Repetition { |
| min: 5, |
| max: Some(9), |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| |
| assert_eq!( |
| p(r"ab{5}c"), |
| Hir::concat(vec![ |
| Hir::char('a'), |
| Hir::repetition(hir::Repetition { |
| min: 5, |
| max: Some(5), |
| greedy: true, |
| sub: Box::new(Hir::char('b')), |
| }), |
| Hir::char('c'), |
| ]), |
| ); |
| |
| assert_eq!( |
| p(r"a{ 5 }"), |
| Hir::repetition(hir::Repetition { |
| min: 5, |
| max: Some(5), |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| assert_eq!( |
| p(r"a{ 5 , 9 }"), |
| Hir::repetition(hir::Repetition { |
| min: 5, |
| max: Some(9), |
| greedy: true, |
| sub: Box::new(Hir::char('a')), |
| }), |
| ); |
| } |
| |
| #[test] |
| fn ok_group_unnamed() { |
| assert_eq!(p("(a)"), cap(1, Hir::char('a'))); |
| assert_eq!( |
| p("(ab)"), |
| cap(1, Hir::concat(vec![Hir::char('a'), Hir::char('b')])) |
| ); |
| } |
| |
| #[test] |
| fn ok_group_named() { |
| assert_eq!(p("(?P<foo>a)"), named_cap(1, "foo", Hir::char('a'))); |
| assert_eq!(p("(?<foo>a)"), named_cap(1, "foo", Hir::char('a'))); |
| |
| assert_eq!( |
| p("(?P<foo>ab)"), |
| named_cap( |
| 1, |
| "foo", |
| Hir::concat(vec![Hir::char('a'), Hir::char('b')]) |
| ) |
| ); |
| assert_eq!( |
| p("(?<foo>ab)"), |
| named_cap( |
| 1, |
| "foo", |
| Hir::concat(vec![Hir::char('a'), Hir::char('b')]) |
| ) |
| ); |
| |
| assert_eq!(p(r"(?<a>z)"), named_cap(1, "a", Hir::char('z'))); |
| assert_eq!(p(r"(?P<a>z)"), named_cap(1, "a", Hir::char('z'))); |
| |
| assert_eq!(p(r"(?<a_1>z)"), named_cap(1, "a_1", Hir::char('z'))); |
| assert_eq!(p(r"(?P<a_1>z)"), named_cap(1, "a_1", Hir::char('z'))); |
| |
| assert_eq!(p(r"(?<a.1>z)"), named_cap(1, "a.1", Hir::char('z'))); |
| assert_eq!(p(r"(?P<a.1>z)"), named_cap(1, "a.1", Hir::char('z'))); |
| |
| assert_eq!(p(r"(?<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z'))); |
| assert_eq!(p(r"(?P<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z'))); |
| |
| assert_eq!(p(r"(?<a¾>z)"), named_cap(1, "a¾", Hir::char('z'))); |
| assert_eq!(p(r"(?P<a¾>z)"), named_cap(1, "a¾", Hir::char('z'))); |
| |
| assert_eq!(p(r"(?<名字>z)"), named_cap(1, "名字", Hir::char('z'))); |
| assert_eq!(p(r"(?P<名字>z)"), named_cap(1, "名字", Hir::char('z'))); |
| } |
| |
| #[test] |
| fn ok_class() { |
| assert_eq!(p(r"[a]"), singles(['a'])); |
| assert_eq!(p(r"[a\]]"), singles(['a', ']'])); |
| assert_eq!(p(r"[a\-z]"), singles(['a', '-', 'z'])); |
| assert_eq!(p(r"[ab]"), class([('a', 'b')])); |
| assert_eq!(p(r"[a-]"), singles(['a', '-'])); |
| assert_eq!(p(r"[-a]"), singles(['a', '-'])); |
| assert_eq!(p(r"[--a]"), singles(['a', '-'])); |
| assert_eq!(p(r"[---a]"), singles(['a', '-'])); |
| assert_eq!(p(r"[[:alnum:]]"), posix("alnum")); |
| assert_eq!(p(r"[\w]"), posix("word")); |
| assert_eq!(p(r"[a\wz]"), posix("word")); |
| assert_eq!(p(r"[\s\S]"), class([('\x00', '\u{10FFFF}')])); |
| assert_eq!(p(r"[^\s\S]"), Hir::fail()); |
| assert_eq!(p(r"[a-cx-z]"), class([('a', 'c'), ('x', 'z')])); |
| assert_eq!(p(r"[☃-⛄]"), class([('☃', '⛄')])); |
| assert_eq!(p(r"[]]"), singles([']'])); |
| assert_eq!(p(r"[]a]"), singles([']', 'a'])); |
| assert_eq!(p(r"[]\[]"), singles(['[', ']'])); |
| assert_eq!(p(r"[\[]"), singles(['['])); |
| |
| assert_eq!(p(r"(?i)[a]"), singles(['A', 'a'])); |
| assert_eq!(p(r"(?i)[A]"), singles(['A', 'a'])); |
| assert_eq!(p(r"(?i)[k]"), singles(['K', 'k'])); |
| assert_eq!(p(r"(?i)[s]"), singles(['S', 's'])); |
| assert_eq!(p(r"(?i)[β]"), singles(['β'])); |
| |
| assert_eq!(p(r"[^^]"), class([('\x00', ']'), ('_', '\u{10FFFF}')])); |
| assert_eq!( |
| p(r"[^-a]"), |
| class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')]) |
| ); |
| |
| assert_eq!( |
| p(r"[-]a]"), |
| Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')]) |
| ); |
| } |
| |
| #[test] |
| fn ok_verbatim() { |
| assert_eq!( |
| p(r"(?x)a{5,9} ?"), |
| Hir::repetition(hir::Repetition { |
| min: 5, |
| max: Some(9), |
| greedy: false, |
| sub: Box::new(Hir::char('a')), |
| }) |
| ); |
| assert_eq!(p(r"(?x)[ a]"), singles(['a'])); |
| assert_eq!( |
| p(r"(?x)[ ^ a]"), |
| class([('\x00', '`'), ('b', '\u{10FFFF}')]) |
| ); |
| assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-'])); |
| assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a'])); |
| |
| assert_eq!( |
| p(r"(?x)a b"), |
| Hir::concat(vec![Hir::char('a'), Hir::char('b')]) |
| ); |
| assert_eq!( |
| p(r"(?x)a b(?-x)a b"), |
| Hir::concat(vec![ |
| Hir::char('a'), |
| Hir::char('b'), |
| Hir::char('a'), |
| Hir::char(' '), |
| Hir::char('b'), |
| ]) |
| ); |
| assert_eq!( |
| p(r"a (?x:a )a "), |
| Hir::concat(vec![ |
| Hir::char('a'), |
| Hir::char(' '), |
| Hir::char('a'), |
| Hir::char('a'), |
| Hir::char(' '), |
| ]) |
| ); |
| assert_eq!( |
| p(r"(?x)( ?P<foo> a )"), |
| named_cap(1, "foo", Hir::char('a')), |
| ); |
| assert_eq!(p(r"(?x)( a )"), cap(1, Hir::char('a'))); |
| assert_eq!(p(r"(?x)( ?: a )"), Hir::char('a')); |
| assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53')); |
| assert_eq!(p(r"(?x)\ "), Hir::char(' ')); |
| } |
| |
| #[test] |
| fn ok_comments() { |
| let pat = "(?x) |
| # This is comment 1. |
| foo # This is comment 2. |
| # This is comment 3. |
| bar |
| # This is comment 4."; |
| assert_eq!( |
| p(pat), |
| Hir::concat(vec![ |
| Hir::char('f'), |
| Hir::char('o'), |
| Hir::char('o'), |
| Hir::char('b'), |
| Hir::char('a'), |
| Hir::char('r'), |
| ]) |
| ); |
| } |
| |
| #[test] |
| fn err_standard() { |
| assert_eq!( |
| ERR_TOO_MUCH_NESTING, |
| perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))"), |
| ); |
| // This one is tricky, because the only way it can happen is if the |
| // number of captures overflows u32. Perhaps we should allow setting a |
| // lower limit? |
| // assert_eq!(ERR_TOO_MANY_CAPTURES, perr("")); |
| assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)")); |
| assert_eq!(ERR_UNCLOSED_GROUP, perr("(")); |
| assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?")); |
| assert_eq!(ERR_UNOPENED_GROUP, perr(")")); |
| assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)")); |
| assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)")); |
| assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)")); |
| assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)")); |
| assert_eq!(ERR_EMPTY_FLAGS, perr(r"(?)")); |
| assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?P<")); |
| assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?<")); |
| assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?P<1abc>z)")); |
| assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)")); |
| assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)")); |
| assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)")); |
| assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)")); |
| assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)")); |
| assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo")); |
| assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?<foo")); |
| assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?P<>z)")); |
| assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)")); |
| assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)")); |
| assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)")); |
| assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)")); |
| assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)")); |
| assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is")); |
| assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)")); |
| assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}")); |
| assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{")); |
| assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A")); |
| assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}")); |
| assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}")); |
| assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA")); |
| assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ")); |
| assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA")); |
| assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ")); |
| assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800")); |
| assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF")); |
| assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x")); |
| assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\")); |
| assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0")); |
| assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1")); |
| assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8")); |
| assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL")); |
| assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}")); |
| assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i")); |
| assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?")); |
| assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*")); |
| assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+")); |
| assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)")); |
| assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?")); |
| assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?")); |
| assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}")); |
| assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})")); |
| assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}")); |
| assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{")); |
| assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5")); |
| assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,")); |
| assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6")); |
| assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z")); |
| assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}")); |
| assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}")); |
| assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}")); |
| assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}")); |
| assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a")); |
| assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]")); |
| assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]")); |
| assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]")); |
| assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-")); |
| assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^")); |
| assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]")); |
| assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]")); |
| assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[")); |
| assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z")); |
| assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]")); |
| assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]")); |
| assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]")); |
| assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]")); |
| assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]")); |
| assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo")); |
| assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}")); |
| assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}")); |
| assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{")); |
| assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ ")); |
| } |
| |
| #[test] |
| fn err_verbatim() { |
| // See: https://github.com/rust-lang/regex/issues/792 |
| assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]")); |
| assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a ")); |
| assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- ")); |
| assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ ")); |
| } |
| |
| // This tests a bug fix where the nest limit checker wasn't decrementing |
| // its depth during post-traversal, which causes long regexes to trip |
| // the default limit too aggressively. |
| #[test] |
| fn regression_454_nest_too_big() { |
| let pattern = r#" |
| 2(?: |
| [45]\d{3}| |
| 7(?: |
| 1[0-267]| |
| 2[0-289]| |
| 3[0-29]| |
| 4[01]| |
| 5[1-3]| |
| 6[013]| |
| 7[0178]| |
| 91 |
| )| |
| 8(?: |
| 0[125]| |
| [139][1-6]| |
| 2[0157-9]| |
| 41| |
| 6[1-35]| |
| 7[1-5]| |
| 8[1-8]| |
| 90 |
| )| |
| 9(?: |
| 0[0-2]| |
| 1[0-4]| |
| 2[568]| |
| 3[3-6]| |
| 5[5-7]| |
| 6[0167]| |
| 7[15]| |
| 8[0146-9] |
| ) |
| )\d{4} |
| "#; |
| p(pattern); |
| } |
| |
| // This tests that we treat a trailing `-` in a character class as a |
| // literal `-` even when whitespace mode is enabled and there is whitespace |
| // after the trailing `-`. |
| #[test] |
| fn regression_455_trailing_dash_ignore_whitespace() { |
| p("(?x)[ / - ]"); |
| p("(?x)[ a - ]"); |
| p("(?x)[ |
| a |
| - ] |
| "); |
| p("(?x)[ |
| a # wat |
| - ] |
| "); |
| |
| perr("(?x)[ / -"); |
| perr("(?x)[ / - "); |
| perr( |
| "(?x)[ |
| / - |
| ", |
| ); |
| perr( |
| "(?x)[ |
| / - # wat |
| ", |
| ); |
| } |
| |
| #[test] |
| fn regression_capture_indices() { |
| let got = p(r"(a|ab|c|bcd){4,10}(d*)"); |
| assert_eq!( |
| got, |
| Hir::concat(vec![ |
| Hir::repetition(hir::Repetition { |
| min: 4, |
| max: Some(10), |
| greedy: true, |
| sub: Box::new(cap( |
| 1, |
| Hir::alternation(vec![ |
| Hir::char('a'), |
| Hir::concat(vec![Hir::char('a'), Hir::char('b')]), |
| Hir::char('c'), |
| Hir::concat(vec![ |
| Hir::char('b'), |
| Hir::char('c'), |
| Hir::char('d') |
| ]), |
| ]) |
| )) |
| }), |
| cap( |
| 2, |
| Hir::repetition(hir::Repetition { |
| min: 0, |
| max: None, |
| greedy: true, |
| sub: Box::new(Hir::char('d')), |
| }) |
| ), |
| ]) |
| ); |
| } |
| } |