blob: 884fcfa35c973b1a3f24e446d9ba27f34568ac97 [file] [log] [blame]
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/*!
Defines an abstract syntax for regular expressions.
*/
use std::cmp::Ordering;
use std::error;
use std::fmt;
pub use ast::visitor::{Visitor, visit};
pub mod parse;
pub mod print;
mod visitor;
/// An error that occurred while parsing a regular expression into an abstract
/// syntax tree.
///
/// Note that note all ASTs represents a valid regular expression. For example,
/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
/// valid Unicode property name. That particular error is reported when
/// translating an AST to the high-level intermediate representation (`HIR`).
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Error {
/// The kind of error.
kind: ErrorKind,
/// The original pattern that the parser generated the error from. Every
/// span in an error is a valid range into this string.
pattern: String,
/// The span of this error.
span: Span,
}
impl Error {
/// Return the type of this error.
pub fn kind(&self) -> &ErrorKind {
&self.kind
}
/// The original pattern string in which this error occurred.
///
/// Every span reported by this error is reported in terms of this string.
pub fn pattern(&self) -> &str {
&self.pattern
}
/// Return the span at which this error occurred.
pub fn span(&self) -> &Span {
&self.span
}
/// Return an auxiliary span. This span exists only for some errors that
/// benefit from being able to point to two locations in the original
/// regular expression. For example, "duplicate" errors will have the
/// main error position set to the duplicate occurrence while its
/// auxiliary span will be set to the initial occurrence.
pub fn auxiliary_span(&self) -> Option<&Span> {
use self::ErrorKind::*;
match self.kind {
FlagDuplicate { ref original } => Some(original),
FlagRepeatedNegation { ref original, .. } => Some(original),
GroupNameDuplicate { ref original, .. } => Some(original),
_ => None,
}
}
}
/// The type of an error that occurred while building an AST.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ErrorKind {
/// The capturing group limit was exceeded.
///
/// Note that this represents a limit on the total number of capturing
/// groups in a regex and not necessarily the number of nested capturing
/// groups. That is, the nest limit can be low and it is still possible for
/// this error to occur.
CaptureLimitExceeded,
/// An invalid escape sequence was found in a character class set.
ClassEscapeInvalid,
/// An invalid character class range was found. An invalid range is any
/// range where the start is greater than the end.
ClassRangeInvalid,
/// An invalid range boundary was found in a character class. Range
/// boundaries must be a single literal codepoint, but this error indicates
/// that something else was found, such as a nested class.
ClassRangeLiteral,
/// An opening `[` was found with no corresponding closing `]`.
ClassUnclosed,
/// An empty decimal number was given where one was expected.
DecimalEmpty,
/// An invalid decimal number was given where one was expected.
DecimalInvalid,
/// A bracketed hex literal was empty.
EscapeHexEmpty,
/// A bracketed hex literal did not correspond to a Unicode scalar value.
EscapeHexInvalid,
/// An invalid hexadecimal digit was found.
EscapeHexInvalidDigit,
/// EOF was found before an escape sequence was completed.
EscapeUnexpectedEof,
/// An unrecognized escape sequence.
EscapeUnrecognized,
/// A dangling negation was used when setting flags, e.g., `i-`.
FlagDanglingNegation,
/// A flag was used twice, e.g., `i-i`.
FlagDuplicate {
/// The position of the original flag. The error position
/// points to the duplicate flag.
original: Span,
},
/// The negation operator was used twice, e.g., `-i-s`.
FlagRepeatedNegation {
/// The position of the original negation operator. The error position
/// points to the duplicate negation operator.
original: Span,
},
/// Expected a flag but got EOF, e.g., `(?`.
FlagUnexpectedEof,
/// Unrecognized flag, e.g., `a`.
FlagUnrecognized,
/// A duplicate capture name was found.
GroupNameDuplicate {
/// The position of the initial occurrence of the capture name. The
/// error position itself points to the duplicate occurrence.
original: Span,
},
/// A capture group name is empty, e.g., `(?P<>abc)`.
GroupNameEmpty,
/// An invalid character was seen for a capture group name. This includes
/// errors where the first character is a digit (even though subsequent
/// characters are allowed to be digits).
GroupNameInvalid,
/// A closing `>` could not be found for a capture group name.
GroupNameUnexpectedEof,
/// An unclosed group, e.g., `(ab`.
///
/// The span of this error corresponds to the unclosed parenthesis.
GroupUnclosed,
/// An unopened group, e.g., `ab)`.
GroupUnopened,
/// The nest limit was exceeded. The limit stored here is the limit
/// configured in the parser.
NestLimitExceeded(u32),
/// The range provided in a counted repetition operator is invalid. The
/// range is invalid if the start is greater than the end.
RepetitionCountInvalid,
/// An opening `{` was found with no corresponding closing `}`.
RepetitionCountUnclosed,
/// A repetition operator was applied to a missing sub-expression. This
/// occurs, for example, in the regex consisting of just a `*` or even
/// `(?i)*`. It is, however, possible to create a repetition operating on
/// an empty sub-expression. For example, `()*` is still considered valid.
RepetitionMissing,
/// When octal support is disabled, this error is produced when an octal
/// escape is used. The octal escape is assumed to be an invocation of
/// a backreference, which is the common case.
UnsupportedBackreference,
/// When syntax similar to PCRE's look-around is used, this error is
/// returned. Some example syntaxes that are rejected include, but are
/// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
/// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
/// error is used to improve the user experience.
UnsupportedLookAround,
/// Hints that destructuring should not be exhaustive.
///
/// This enum may grow additional variants, so this makes sure clients
/// don't count on exhaustive matching. (Otherwise, adding a new variant
/// could break existing code.)
#[doc(hidden)]
__Nonexhaustive,
}
impl error::Error for Error {
fn description(&self) -> &str {
use self::ErrorKind::*;
match self.kind {
CaptureLimitExceeded => "capture group limit exceeded",
ClassEscapeInvalid => "invalid escape sequence in character class",
ClassRangeInvalid => "invalid character class range",
ClassRangeLiteral => "invalid range boundary, must be a literal",
ClassUnclosed => "unclosed character class",
DecimalEmpty => "empty decimal literal",
DecimalInvalid => "invalid decimal literal",
EscapeHexEmpty => "empty hexadecimal literal",
EscapeHexInvalid => "invalid hexadecimal literal",
EscapeHexInvalidDigit => "invalid hexadecimal digit",
EscapeUnexpectedEof => "unexpected eof (escape sequence)",
EscapeUnrecognized => "unrecognized escape sequence",
FlagDanglingNegation => "dangling flag negation operator",
FlagDuplicate{..} => "duplicate flag",
FlagRepeatedNegation{..} => "repeated negation",
FlagUnexpectedEof => "unexpected eof (flag)",
FlagUnrecognized => "unrecognized flag",
GroupNameDuplicate{..} => "duplicate capture group name",
GroupNameEmpty => "empty capture group name",
GroupNameInvalid => "invalid capture group name",
GroupNameUnexpectedEof => "unclosed capture group name",
GroupUnclosed => "unclosed group",
GroupUnopened => "unopened group",
NestLimitExceeded(_) => "nest limit exceeded",
RepetitionCountInvalid => "invalid repetition count range",
RepetitionCountUnclosed => "unclosed counted repetition",
RepetitionMissing => "repetition operator missing expression",
UnsupportedBackreference => "backreferences are not supported",
UnsupportedLookAround => "look-around is not supported",
_ => unreachable!(),
}
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
::error::Formatter::from(self).fmt(f)
}
}
impl fmt::Display for ErrorKind {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::ErrorKind::*;
match *self {
CaptureLimitExceeded => {
write!(f, "exceeded the maximum number of \
capturing groups ({})", ::std::u32::MAX)
}
ClassEscapeInvalid => {
write!(f, "invalid escape sequence found in character class")
}
ClassRangeInvalid => {
write!(f, "invalid character class range, \
the start must be <= the end")
}
ClassRangeLiteral => {
write!(f, "invalid range boundary, must be a literal")
}
ClassUnclosed => {
write!(f, "unclosed character class")
}
DecimalEmpty => {
write!(f, "decimal literal empty")
}
DecimalInvalid => {
write!(f, "decimal literal invalid")
}
EscapeHexEmpty => {
write!(f, "hexadecimal literal empty")
}
EscapeHexInvalid => {
write!(f, "hexadecimal literal is not a Unicode scalar value")
}
EscapeHexInvalidDigit => {
write!(f, "invalid hexadecimal digit")
}
EscapeUnexpectedEof => {
write!(f, "incomplete escape sequence, \
reached end of pattern prematurely")
}
EscapeUnrecognized => {
write!(f, "unrecognized escape sequence")
}
FlagDanglingNegation => {
write!(f, "dangling flag negation operator")
}
FlagDuplicate{..} => {
write!(f, "duplicate flag")
}
FlagRepeatedNegation{..} => {
write!(f, "flag negation operator repeated")
}
FlagUnexpectedEof => {
write!(f, "expected flag but got end of regex")
}
FlagUnrecognized => {
write!(f, "unrecognized flag")
}
GroupNameDuplicate{..} => {
write!(f, "duplicate capture group name")
}
GroupNameEmpty => {
write!(f, "empty capture group name")
}
GroupNameInvalid => {
write!(f, "invalid capture group character")
}
GroupNameUnexpectedEof => {
write!(f, "unclosed capture group name")
}
GroupUnclosed => {
write!(f, "unclosed group")
}
GroupUnopened => {
write!(f, "unopened group")
}
NestLimitExceeded(limit) => {
write!(f, "exceed the maximum number of \
nested parentheses/brackets ({})", limit)
}
RepetitionCountInvalid => {
write!(f, "invalid repetition count range, \
the start must be <= the end")
}
RepetitionCountUnclosed => {
write!(f, "unclosed counted repetition")
}
RepetitionMissing => {
write!(f, "repetition operator missing expression")
}
UnsupportedBackreference => {
write!(f, "backreferences are not supported")
}
UnsupportedLookAround => {
write!(f, "look-around, including look-ahead and look-behind, \
is not supported")
}
_ => unreachable!(),
}
}
}
/// Span represents the position information of a single AST item.
///
/// All span positions are absolute byte offsets that can be used on the
/// original regular expression that was parsed.
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct Span {
/// The start byte offset.
pub start: Position,
/// The end byte offset.
pub end: Position,
}
impl fmt::Debug for Span {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Span({:?}, {:?})", self.start, self.end)
}
}
impl Ord for Span {
fn cmp(&self, other: &Span) -> Ordering {
(&self.start, &self.end).cmp(&(&other.start, &other.end))
}
}
impl PartialOrd for Span {
fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
Some(self.cmp(other))
}
}
/// A single position in a regular expression.
///
/// A position encodes one half of a span, and include the byte offset, line
/// number and column number.
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct Position {
/// The absolute offset of this position, starting at `0` from the
/// beginning of the regular expression pattern string.
pub offset: usize,
/// The line number, starting at `1`.
pub line: usize,
/// The approximate column number, starting at `1`.
pub column: usize,
}
impl fmt::Debug for Position {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"Position(o: {:?}, l: {:?}, c: {:?})",
self.offset, self.line, self.column)
}
}
impl Ord for Position {
fn cmp(&self, other: &Position) -> Ordering {
self.offset.cmp(&other.offset)
}
}
impl PartialOrd for Position {
fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Span {
/// Create a new span with the given positions.
pub fn new(start: Position, end: Position) -> Span {
Span { start: start, end: end }
}
/// Create a new span using the given position as the start and end.
pub fn splat(pos: Position) -> Span {
Span::new(pos, pos)
}
/// Create a new span by replacing the starting the position with the one
/// given.
pub fn with_start(self, pos: Position) -> Span {
Span { start: pos, ..self }
}
/// Create a new span by replacing the ending the position with the one
/// given.
pub fn with_end(self, pos: Position) -> Span {
Span { end: pos, ..self }
}
/// Returns true if and only if this span occurs on a single line.
pub fn is_one_line(&self) -> bool {
self.start.line == self.end.line
}
/// Returns true if and only if this span is empty. That is, it points to
/// a single position in the concrete syntax of a regular expression.
pub fn is_empty(&self) -> bool {
self.start.offset == self.end.offset
}
}
impl Position {
/// Create a new position with the given information.
///
/// `offset` is the absolute offset of the position, starting at `0` from
/// the beginning of the regular expression pattern string.
///
/// `line` is the line number, starting at `1`.
///
/// `column` is the approximate column number, starting at `1`.
pub fn new(offset: usize, line: usize, column: usize) -> Position {
Position { offset: offset, line: line, column: column }
}
}
/// An abstract syntax tree for a singular expression along with comments
/// found.
///
/// Comments are not stored in the tree itself to avoid complexity. Each
/// comment contains a span of precisely where it occurred in the original
/// regular expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct WithComments {
/// The actual ast.
pub ast: Ast,
/// All comments found in the original regular expression.
pub comments: Vec<Comment>,
}
/// A comment from a regular expression with an associated span.
///
/// A regular expression can only contain comments when the `x` flag is
/// enabled.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Comment {
/// The span of this comment, including the beginning `#` and ending `\n`.
pub span: Span,
/// The comment text, starting with the first character following the `#`
/// and ending with the last character preceding the `\n`.
pub comment: String,
}
/// An abstract syntax tree for a single regular expression.
///
/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
/// space proportional to the size of the `Ast`.
///
/// This type defines its own destructor that uses constant stack space and
/// heap space proportional to the size of the `Ast`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Ast {
/// An empty regex that matches everything.
Empty(Span),
/// A set of flags, e.g., `(?is)`.
Flags(SetFlags),
/// A single character literal, which includes escape sequences.
Literal(Literal),
/// The "any character" class.
Dot(Span),
/// A single zero-width assertion.
Assertion(Assertion),
/// A single character class. This includes all forms of character classes
/// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
Class(Class),
/// A repetition operator applied to an arbitrary regular expression.
Repetition(Repetition),
/// A grouped regular expression.
Group(Group),
/// An alternation of regular expressions.
Alternation(Alternation),
/// A concatenation of regular expressions.
Concat(Concat),
}
impl Ast {
/// Return the span of this abstract syntax tree.
pub fn span(&self) -> &Span {
match *self {
Ast::Empty(ref span) => span,
Ast::Flags(ref x) => &x.span,
Ast::Literal(ref x) => &x.span,
Ast::Dot(ref span) => span,
Ast::Assertion(ref x) => &x.span,
Ast::Class(ref x) => x.span(),
Ast::Repetition(ref x) => &x.span,
Ast::Group(ref x) => &x.span,
Ast::Alternation(ref x) => &x.span,
Ast::Concat(ref x) => &x.span,
}
}
/// Return true if and only if this Ast is empty.
pub fn is_empty(&self) -> bool {
match *self {
Ast::Empty(_) => true,
_ => false,
}
}
/// Returns true if and only if this AST has any (including possibly empty)
/// subexpressions.
fn has_subexprs(&self) -> bool {
match *self {
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_) => false,
Ast::Class(_)
| Ast::Repetition(_)
| Ast::Group(_)
| Ast::Alternation(_)
| Ast::Concat(_) => true,
}
}
}
/// Print a display representation of this Ast.
///
/// This does not preserve any of the original whitespace formatting that may
/// have originally been present in the concrete syntax from which this Ast
/// was generated.
///
/// This implementation uses constant stack space and heap space proportional
/// to the size of the `Ast`.
impl fmt::Display for Ast {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use ast::print::Printer;
Printer::new().print(self, f)
}
}
/// An alternation of regular expressions.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Alternation {
/// The span of this alternation.
pub span: Span,
/// The alternate regular expressions.
pub asts: Vec<Ast>,
}
impl Alternation {
/// Return this alternation as an AST.
///
/// If this alternation contains zero ASTs, then Ast::Empty is
/// returned. If this alternation contains exactly 1 AST, then the
/// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
pub fn into_ast(mut self) -> Ast {
match self.asts.len() {
0 => Ast::Empty(self.span),
1 => self.asts.pop().unwrap(),
_ => Ast::Alternation(self),
}
}
}
/// A concatenation of regular expressions.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Concat {
/// The span of this concatenation.
pub span: Span,
/// The concatenation regular expressions.
pub asts: Vec<Ast>,
}
impl Concat {
/// Return this concatenation as an AST.
///
/// If this concatenation contains zero ASTs, then Ast::Empty is
/// returned. If this concatenation contains exactly 1 AST, then the
/// corresponding AST is returned. Otherwise, Ast::Concat is returned.
pub fn into_ast(mut self) -> Ast {
match self.asts.len() {
0 => Ast::Empty(self.span),
1 => self.asts.pop().unwrap(),
_ => Ast::Concat(self),
}
}
}
/// A single literal expression.
///
/// A literal corresponds to a single Unicode scalar value. Literals may be
/// represented in their literal form, e.g., `a` or in their escaped form,
/// e.g., `\x61`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Literal {
/// The span of this literal.
pub span: Span,
/// The kind of this literal.
pub kind: LiteralKind,
/// The Unicode scalar value corresponding to this literal.
pub c: char,
}
impl Literal {
/// If this literal was written as a `\x` hex escape, then this returns
/// the corresponding byte value. Otherwise, this returns `None`.
pub fn byte(&self) -> Option<u8> {
let short_hex = LiteralKind::HexFixed(HexLiteralKind::X);
if self.c as u32 <= 255 && self.kind == short_hex {
Some(self.c as u8)
} else {
None
}
}
}
/// The kind of a single literal expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum LiteralKind {
/// The literal is written verbatim, e.g., `a` or `☃`.
Verbatim,
/// The literal is written as an escape because it is punctuation, e.g.,
/// `\*` or `\[`.
Punctuation,
/// The literal is written as an octal escape, e.g., `\141`.
Octal,
/// The literal is written as a hex code with a fixed number of digits
/// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
/// `\U00000061`.
HexFixed(HexLiteralKind),
/// The literal is written as a hex code with a bracketed number of
/// digits. The only restriction is that the bracketed hex code must refer
/// to a valid Unicode scalar value.
HexBrace(HexLiteralKind),
/// The literal is written as a specially recognized escape, e.g., `\f`
/// or `\n`.
Special(SpecialLiteralKind),
}
/// The type of a special literal.
///
/// A special literal is a special escape sequence recognized by the regex
/// parser, e.g., `\f` or `\n`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum SpecialLiteralKind {
/// Bell, spelled `\a` (`\x07`).
Bell,
/// Form feed, spelled `\f` (`\x0C`).
FormFeed,
/// Tab, spelled `\t` (`\x09`).
Tab,
/// Line feed, spelled `\n` (`\x0A`).
LineFeed,
/// Carriage return, spelled `\r` (`\x0D`).
CarriageReturn,
/// Vertical tab, spelled `\v` (`\x0B`).
VerticalTab,
/// Space, spelled `\ ` (`\x20`). Note that this can only appear when
/// parsing in verbose mode.
Space,
}
/// The type of a Unicode hex literal.
///
/// Note that all variants behave the same when used with brackets. They only
/// differ when used without brackets in the number of hex digits that must
/// follow.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum HexLiteralKind {
/// A `\x` prefix. When used without brackets, this form is limited to
/// two digits.
X,
/// A `\u` prefix. When used without brackets, this form is limited to
/// four digits.
UnicodeShort,
/// A `\U` prefix. When used without brackets, this form is limited to
/// eight digits.
UnicodeLong,
}
impl HexLiteralKind {
/// The number of digits that must be used with this literal form when
/// used without brackets. When used with brackets, there is no
/// restriction on the number of digits.
pub fn digits(&self) -> u32 {
match *self {
HexLiteralKind::X => 2,
HexLiteralKind::UnicodeShort => 4,
HexLiteralKind::UnicodeLong => 8,
}
}
}
/// A single character class expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Class {
/// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
Unicode(ClassUnicode),
/// A perl character class, e.g., `\d` or `\W`.
Perl(ClassPerl),
/// A bracketed character class set, which may contain zero or more
/// character ranges and/or zero or more nested classes. e.g.,
/// `[a-zA-Z\pL]`.
Bracketed(ClassBracketed),
}
impl Class {
/// Return the span of this character class.
pub fn span(&self) -> &Span {
match *self {
Class::Perl(ref x) => &x.span,
Class::Unicode(ref x) => &x.span,
Class::Bracketed(ref x) => &x.span,
}
}
}
/// A Perl character class.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassPerl {
/// The span of this class.
pub span: Span,
/// The kind of Perl class.
pub kind: ClassPerlKind,
/// Whether the class is negated or not. e.g., `\d` is not negated but
/// `\D` is.
pub negated: bool,
}
/// The available Perl character classes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassPerlKind {
/// Decimal numbers.
Digit,
/// Whitespace.
Space,
/// Word characters.
Word,
}
/// An ASCII character class.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassAscii {
/// The span of this class.
pub span: Span,
/// The kind of ASCII class.
pub kind: ClassAsciiKind,
/// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
/// but `[[:^alpha:]]` is.
pub negated: bool,
}
/// The available ASCII character classes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassAsciiKind {
/// `[0-9A-Za-z]`
Alnum,
/// `[A-Za-z]`
Alpha,
/// `[\x00-\x7F]`
Ascii,
/// `[ \t]`
Blank,
/// `[\x00-\x1F\x7F]`
Cntrl,
/// `[0-9]`
Digit,
/// `[!-~]`
Graph,
/// `[a-z]`
Lower,
/// `[ -~]`
Print,
/// `[!-/:-@\[-`{-~]`
Punct,
/// `[\t\n\v\f\r ]`
Space,
/// `[A-Z]`
Upper,
/// `[0-9A-Za-z_]`
Word,
/// `[0-9A-Fa-f]`
Xdigit,
}
impl ClassAsciiKind {
/// Return the corresponding ClassAsciiKind variant for the given name.
///
/// The name given should correspond to the lowercase version of the
/// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
///
/// If no variant with the corresponding name exists, then `None` is
/// returned.
pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
use self::ClassAsciiKind::*;
match name {
"alnum" => Some(Alnum),
"alpha" => Some(Alpha),
"ascii" => Some(Ascii),
"blank" => Some(Blank),
"cntrl" => Some(Cntrl),
"digit" => Some(Digit),
"graph" => Some(Graph),
"lower" => Some(Lower),
"print" => Some(Print),
"punct" => Some(Punct),
"space" => Some(Space),
"upper" => Some(Upper),
"word" => Some(Word),
"xdigit" => Some(Xdigit),
_ => None,
}
}
}
/// A Unicode character class.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassUnicode {
/// The span of this class.
pub span: Span,
/// Whether this class is negated or not.
///
/// Note: be careful when using this attribute. This specifically refers
/// to whether the class is written as `\p` or `\P`, where the latter
/// is `negated = true`. However, it also possible to write something like
/// `\P{scx!=Katakana}` which is actually equivalent to
/// `\p{scx=Katakana}` and is therefore not actually negated even though
/// `negated = true` here. To test whether this class is truly negated
/// or not, use the `is_negated` method.
pub negated: bool,
/// The kind of Unicode class.
pub kind: ClassUnicodeKind,
}
impl ClassUnicode {
/// Returns true if this class has been negated.
///
/// Note that this takes the Unicode op into account, if it's present.
/// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
pub fn is_negated(&self) -> bool {
match self.kind {
ClassUnicodeKind::NamedValue {
op: ClassUnicodeOpKind::NotEqual, ..
} => !self.negated,
_ => self.negated,
}
}
}
/// The available forms of Unicode character classes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassUnicodeKind {
/// A one letter abbreviated class, e.g., `\pN`.
OneLetter(char),
/// A binary property, general category or script. The string may be
/// empty.
Named(String),
/// A property name and an associated value.
NamedValue {
/// The type of Unicode op used to associate `name` with `value`.
op: ClassUnicodeOpKind,
/// The property name (which may be empty).
name: String,
/// The property value (which may be empty).
value: String,
},
}
/// The type of op used in a Unicode character class.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassUnicodeOpKind {
/// A property set to a specific value, e.g., `\p{scx=Katakana}`.
Equal,
/// A property set to a specific value using a colon, e.g.,
/// `\p{scx:Katakana}`.
Colon,
/// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
NotEqual,
}
impl ClassUnicodeOpKind {
/// Whether the op is an equality op or not.
pub fn is_equal(&self) -> bool {
match *self {
ClassUnicodeOpKind::Equal|ClassUnicodeOpKind::Colon => true,
_ => false,
}
}
}
/// A bracketed character class, e.g., `[a-z0-9]`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassBracketed {
/// The span of this class.
pub span: Span,
/// Whether this class is negated or not. e.g., `[a]` is not negated but
/// `[^a]` is.
pub negated: bool,
/// The type of this set. A set is either a normal union of things, e.g.,
/// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
pub kind: ClassSet,
}
/// A character class set.
///
/// This type corresponds to the internal structure of a bracketed character
/// class. That is, every bracketed character is one of two types: a union of
/// items (literals, ranges, other bracketed classes) or a tree of binary set
/// operations.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassSet {
/// An item, which can be a single literal, range, nested character class
/// or a union of items.
Item(ClassSetItem),
/// A single binary operation (i.e., &&, -- or ~~).
BinaryOp(ClassSetBinaryOp),
}
impl ClassSet {
/// Build a set from a union.
pub fn union(ast: ClassSetUnion) -> ClassSet {
ClassSet::Item(ClassSetItem::Union(ast))
}
/// Return the span of this character class set.
pub fn span(&self) -> &Span {
match *self {
ClassSet::Item(ref x) => x.span(),
ClassSet::BinaryOp(ref x) => &x.span,
}
}
/// Return true if and only if this class set is empty.
fn is_empty(&self) -> bool {
match *self {
ClassSet::Item(ClassSetItem::Empty(_)) => true,
_ => false,
}
}
}
/// A single component of a character class set.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassSetItem {
/// An empty item.
///
/// Note that a bracketed character class cannot contain a single empty
/// item. Empty items can appear when using one of the binary operators.
/// For example, `[&&]` is the intersection of two empty classes.
Empty(Span),
/// A single literal.
Literal(Literal),
/// A range between two literals.
Range(ClassSetRange),
/// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
Ascii(ClassAscii),
/// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
Unicode(ClassUnicode),
/// A perl character class, e.g., `\d` or `\W`.
Perl(ClassPerl),
/// A bracketed character class set, which may contain zero or more
/// character ranges and/or zero or more nested classes. e.g.,
/// `[a-zA-Z\pL]`.
Bracketed(Box<ClassBracketed>),
/// A union of items.
Union(ClassSetUnion),
}
impl ClassSetItem {
/// Return the span of this character class set item.
pub fn span(&self) -> &Span {
match *self {
ClassSetItem::Empty(ref span) => span,
ClassSetItem::Literal(ref x) => &x.span,
ClassSetItem::Range(ref x) => &x.span,
ClassSetItem::Ascii(ref x) => &x.span,
ClassSetItem::Perl(ref x) => &x.span,
ClassSetItem::Unicode(ref x) => &x.span,
ClassSetItem::Bracketed(ref x) => &x.span,
ClassSetItem::Union(ref x) => &x.span,
}
}
}
/// A single character class range in a set.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassSetRange {
/// The span of this range.
pub span: Span,
/// The start of this range.
pub start: Literal,
/// The end of this range.
pub end: Literal,
}
impl ClassSetRange {
/// Returns true if and only if this character class range is valid.
///
/// The only case where a range is invalid is if its start is greater than
/// its end.
pub fn is_valid(&self) -> bool {
self.start.c <= self.end.c
}
}
/// A union of items inside a character class set.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassSetUnion {
/// The span of the items in this operation. e.g., the `a-z0-9` in
/// `[^a-z0-9]`
pub span: Span,
/// The sequence of items that make up this union.
pub items: Vec<ClassSetItem>,
}
impl ClassSetUnion {
/// Push a new item in this union.
///
/// The ending position of this union's span is updated to the ending
/// position of the span of the item given. If the union is empty, then
/// the starting position of this union is set to the starting position
/// of this item.
///
/// In other words, if you only use this method to add items to a union
/// and you set the spans on each item correctly, then you should never
/// need to adjust the span of the union directly.
pub fn push(&mut self, item: ClassSetItem) {
if self.items.is_empty() {
self.span.start = item.span().start;
}
self.span.end = item.span().end;
self.items.push(item);
}
/// Return this union as a character class set item.
///
/// If this union contains zero items, then an empty union is
/// returned. If this concatenation contains exactly 1 item, then the
/// corresponding item is returned. Otherwise, ClassSetItem::Union is
/// returned.
pub fn into_item(mut self) -> ClassSetItem {
match self.items.len() {
0 => ClassSetItem::Empty(self.span),
1 => self.items.pop().unwrap(),
_ => ClassSetItem::Union(self),
}
}
}
/// A Unicode character class set operation.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassSetBinaryOp {
/// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
pub span: Span,
/// The type of this set operation.
pub kind: ClassSetBinaryOpKind,
/// The left hand side of the operation.
pub lhs: Box<ClassSet>,
/// The right hand side of the operation.
pub rhs: Box<ClassSet>,
}
/// The type of a Unicode character class set operation.
///
/// Note that this doesn't explicitly represent union since there is no
/// explicit union operator. Concatenation inside a character class corresponds
/// to the union operation.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ClassSetBinaryOpKind {
/// The intersection of two sets, e.g., `\pN&&[a-z]`.
Intersection,
/// The difference of two sets, e.g., `\pN--[0-9]`.
Difference,
/// The symmetric difference of two sets. The symmetric difference is the
/// set of elements belonging to one but not both sets.
/// e.g., `[\pL~~[:ascii:]]`.
SymmetricDifference,
}
/// A single zero-width assertion.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Assertion {
/// The span of this assertion.
pub span: Span,
/// The assertion kind, e.g., `\b` or `^`.
pub kind: AssertionKind,
}
/// An assertion kind.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum AssertionKind {
/// `^`
StartLine,
/// `$`
EndLine,
/// `\A`
StartText,
/// `\z`
EndText,
/// `\b`
WordBoundary,
/// `\B`
NotWordBoundary,
}
/// A repetition operation applied to a regular expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Repetition {
/// The span of this operation.
pub span: Span,
/// The actual operation.
pub op: RepetitionOp,
/// Whether this operation was applied greedily or not.
pub greedy: bool,
/// The regular expression under repetition.
pub ast: Box<Ast>,
}
/// The repetition operator itself.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct RepetitionOp {
/// The span of this operator. This includes things like `+`, `*?` and
/// `{m,n}`.
pub span: Span,
/// The type of operation.
pub kind: RepetitionKind,
}
/// The kind of a repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionKind {
/// `?`
ZeroOrOne,
/// `*`
ZeroOrMore,
/// `+`
OneOrMore,
/// `{m,n}`
Range(RepetitionRange),
}
/// A range repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionRange {
/// `{m}`
Exactly(u32),
/// `{m,}`
AtLeast(u32),
/// `{m,n}`
Bounded(u32, u32),
}
impl RepetitionRange {
/// Returns true if and only if this repetition range is valid.
///
/// The only case where a repetition range is invalid is if it is bounded
/// and its start is greater than its end.
pub fn is_valid(&self) -> bool {
match *self {
RepetitionRange::Bounded(s, e) if s > e => false,
_ => true,
}
}
}
/// A grouped regular expression.
///
/// This includes both capturing and non-capturing groups. This does **not**
/// include flag-only groups like `(?is)`, but does contain any group that
/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
/// `(?is:a)`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Group {
/// The span of this group.
pub span: Span,
/// The kind of this group.
pub kind: GroupKind,
/// The regular expression in this group.
pub ast: Box<Ast>,
}
impl Group {
/// If this group is non-capturing, then this returns the (possibly empty)
/// set of flags. Otherwise, `None` is returned.
pub fn flags(&self) -> Option<&Flags> {
match self.kind {
GroupKind::NonCapturing(ref flags) => Some(flags),
_ => None,
}
}
/// Returns true if and only if this group is capturing.
pub fn is_capturing(&self) -> bool {
match self.kind {
GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true,
GroupKind::NonCapturing(_) => false,
}
}
/// Returns the capture index of this group, if this is a capturing group.
///
/// This returns a capture index precisely when `is_capturing` is `true`.
pub fn capture_index(&self) -> Option<u32> {
match self.kind {
GroupKind::CaptureIndex(i) => Some(i),
GroupKind::CaptureName(ref x) => Some(x.index),
GroupKind::NonCapturing(_) => None,
}
}
}
/// The kind of a group.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum GroupKind {
/// `(a)`
CaptureIndex(u32),
/// `(?P<name>a)`
CaptureName(CaptureName),
/// `(?:a)` and `(?i:a)`
NonCapturing(Flags),
}
/// A capture name.
///
/// This corresponds to the name itself between the angle brackets in, e.g.,
/// `(?P<foo>expr)`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CaptureName {
/// The span of this capture name.
pub span: Span,
/// The capture name.
pub name: String,
/// The capture index.
pub index: u32,
}
/// A group of flags that is not applied to a particular regular expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SetFlags {
/// The span of these flags, including the grouping parentheses.
pub span: Span,
/// The actual sequence of flags.
pub flags: Flags,
}
/// A group of flags.
///
/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Flags {
/// The span of this group of flags.
pub span: Span,
/// A sequence of flag items. Each item is either a flag or a negation
/// operator.
pub items: Vec<FlagsItem>,
}
impl Flags {
/// Add the given item to this sequence of flags.
///
/// If the item was added successfully, then `None` is returned. If the
/// given item is a duplicate, then `Some(i)` is returned, where
/// `items[i].kind == item.kind`.
pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
for (i, x) in self.items.iter().enumerate() {
if x.kind == item.kind {
return Some(i);
}
}
self.items.push(item);
None
}
/// Returns the state of the given flag in this set.
///
/// If the given flag is in the set but is negated, then `Some(false)` is
/// returned.
///
/// If the given flag is in the set and is not negated, then `Some(true)`
/// is returned.
///
/// Otherwise, `None` is returned.
pub fn flag_state(&self, flag: Flag) -> Option<bool> {
let mut negated = false;
for x in &self.items {
match x.kind {
FlagsItemKind::Negation => {
negated = true;
}
FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
return Some(!negated);
}
_ => {}
}
}
None
}
}
/// A single item in a group of flags.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FlagsItem {
/// The span of this item.
pub span: Span,
/// The kind of this item.
pub kind: FlagsItemKind,
}
/// The kind of an item in a group of flags.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum FlagsItemKind {
/// A negation operator applied to all subsequent flags in the enclosing
/// group.
Negation,
/// A single flag in a group.
Flag(Flag),
}
impl FlagsItemKind {
/// Returns true if and only if this item is a negation operator.
pub fn is_negation(&self) -> bool {
match *self {
FlagsItemKind::Negation => true,
_ => false,
}
}
}
/// A single flag.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Flag {
/// `i`
CaseInsensitive,
/// `m`
MultiLine,
/// `s`
DotMatchesNewLine,
/// `U`
SwapGreed,
/// `u`
Unicode,
/// `x`
IgnoreWhitespace,
}
/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
/// space but heap space proportional to the depth of the `Ast`.
impl Drop for Ast {
fn drop(&mut self) {
use std::mem;
match *self {
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
// Classes are recursive, so they get their own Drop impl.
| Ast::Class(_) => return,
Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
Ast::Group(ref x) if !x.ast.has_subexprs() => return,
Ast::Alternation(ref x) if x.asts.is_empty() => return,
Ast::Concat(ref x) if x.asts.is_empty() => return,
_ => {}
}
let empty_span = || Span::splat(Position::new(0, 0, 0));
let empty_ast = || Ast::Empty(empty_span());
let mut stack = vec![mem::replace(self, empty_ast())];
while let Some(mut ast) = stack.pop() {
match ast {
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
// Classes are recursive, so they get their own Drop impl.
| Ast::Class(_) => {}
Ast::Repetition(ref mut x) => {
stack.push(mem::replace(&mut x.ast, empty_ast()));
}
Ast::Group(ref mut x) => {
stack.push(mem::replace(&mut x.ast, empty_ast()));
}
Ast::Alternation(ref mut x) => {
stack.extend(x.asts.drain(..));
}
Ast::Concat(ref mut x) => {
stack.extend(x.asts.drain(..));
}
}
}
}
}
/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
/// stack space but heap space proportional to the depth of the `ClassSet`.
impl Drop for ClassSet {
fn drop(&mut self) {
use std::mem;
match *self {
ClassSet::Item(ref item) => {
match *item {
ClassSetItem::Empty(_)
| ClassSetItem::Literal(_)
| ClassSetItem::Range(_)
| ClassSetItem::Ascii(_)
| ClassSetItem::Unicode(_)
| ClassSetItem::Perl(_) => return,
ClassSetItem::Bracketed(ref x) => {
if x.kind.is_empty() {
return;
}
}
ClassSetItem::Union(ref x) => {
if x.items.is_empty() {
return;
}
}
}
}
ClassSet::BinaryOp(ref op) => {
if op.lhs.is_empty() && op.rhs.is_empty() {
return;
}
}
}
let empty_span = || Span::splat(Position::new(0, 0, 0));
let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
let mut stack = vec![mem::replace(self, empty_set())];
while let Some(mut set) = stack.pop() {
match set {
ClassSet::Item(ref mut item) => {
match *item {
ClassSetItem::Empty(_)
| ClassSetItem::Literal(_)
| ClassSetItem::Range(_)
| ClassSetItem::Ascii(_)
| ClassSetItem::Unicode(_)
| ClassSetItem::Perl(_) => {}
ClassSetItem::Bracketed(ref mut x) => {
stack.push(mem::replace(&mut x.kind, empty_set()));
}
ClassSetItem::Union(ref mut x) => {
stack.extend(
x.items.drain(..).map(ClassSet::Item));
}
}
}
ClassSet::BinaryOp(ref mut op) => {
stack.push(mem::replace(&mut op.lhs, empty_set()));
stack.push(mem::replace(&mut op.rhs, empty_set()));
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
// We use a thread with an explicit stack size to test that our destructor
// for Ast can handle arbitrarily sized expressions in constant stack
// space. In case we run on a platform without threads (WASM?), we limit
// this test to Windows/Unix.
#[test]
#[cfg(any(unix, windows))]
fn no_stack_overflow_on_drop() {
use std::thread;
let run = || {
let span = || Span::splat(Position::new(0, 0, 0));
let mut ast = Ast::Empty(span());
for i in 0..200 {
ast = Ast::Group(Group {
span: span(),
kind: GroupKind::CaptureIndex(i),
ast: Box::new(ast),
});
}
assert!(!ast.is_empty());
};
// We run our test on a thread with a small stack size so we can
// force the issue more easily.
thread::Builder::new()
.stack_size(1<<10)
.spawn(run)
.unwrap()
.join()
.unwrap();
}
}