src/utf8.rs - platform/external/rust/crates/regex-syntax - Git at Google

 /*!
 Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.

 This is sub-module is useful for constructing byte based automatons that need
 to embed UTF-8 decoding. The most common use of this module is in conjunction
 with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type.

 See the documentation on the `Utf8Sequences` iterator for more details and
 an example.

 # Wait, what is this?

 This is simplest to explain with an example. Let's say you wanted to test
 whether a particular byte sequence was a Cyrillic character. One possible
 scalar value range is `[0400-04FF]`. The set of allowed bytes for this
 range can be expressed as a sequence of byte ranges:

 ```ignore
 [D0-D3][80-BF]
 ```

 This is simple enough: simply encode the boundaries, `0400` encodes to
 `D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
 corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.

 However, what if you wanted to add the Cyrillic Supplementary characters to
 your range? Your range might then become `[0400-052F]`. The same procedure
 as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
 you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
 this isn't quite correct because this range doesn't capture many characters,
 for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).

 Instead, you need multiple sequences of byte ranges:

 ```ignore
 [D0-D3][80-BF]  # matches codepoints 0400-04FF
 [D4][80-AF]     # matches codepoints 0500-052F
 ```

 This gets even more complicated if you want bigger ranges, particularly if
 they naively contain surrogate codepoints. For example, the sequence of byte
 ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:

 ```ignore
 [0-7F]
 [C2-DF][80-BF]
 [E0][A0-BF][80-BF]
 [E1-EC][80-BF][80-BF]
 [ED][80-9F][80-BF]
 [EE-EF][80-BF][80-BF]
 ```

 Note that the byte ranges above will *not* match any erroneous encoding of
 UTF-8, including encodings of surrogate codepoints.

 And, of course, for all of Unicode (`[000000-10FFFF]`):

 ```ignore
 [0-7F]
 [C2-DF][80-BF]
 [E0][A0-BF][80-BF]
 [E1-EC][80-BF][80-BF]
 [ED][80-9F][80-BF]
 [EE-EF][80-BF][80-BF]
 [F0][90-BF][80-BF][80-BF]
 [F1-F3][80-BF][80-BF][80-BF]
 [F4][80-8F][80-BF][80-BF]
 ```

 This module automates the process of creating these byte ranges from ranges of
 Unicode scalar values.

 # Lineage

 I got the idea and general implementation strategy from Russ Cox in his
 [article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
 Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
 I also got the idea from
 [Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
 which uses it for executing automata on their term index.
 */

 #![deny(missing_docs)]

 use std::char;
 use std::fmt;
 use std::slice;

 const MAX_UTF8_BYTES: usize = 4;

 /// Utf8Sequence represents a sequence of byte ranges.
 ///
 /// To match a Utf8Sequence, a candidate byte sequence must match each
 /// successive range.
 ///
 /// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
 /// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
 #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
 pub enum Utf8Sequence {
     /// One byte range.
     One(Utf8Range),
     /// Two successive byte ranges.
     Two([Utf8Range; 2]),
     /// Three successive byte ranges.
     Three([Utf8Range; 3]),
     /// Four successive byte ranges.
     Four([Utf8Range; 4]),
 }

 impl Utf8Sequence {
     /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
     /// range.
     ///
     /// This assumes that `start` and `end` have the same length.
     fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
         assert_eq!(start.len(), end.len());
         match start.len() {
             2 => Utf8Sequence::Two([
                 Utf8Range::new(start[0], end[0]),
                 Utf8Range::new(start[1], end[1]),
             ]),
             3 => Utf8Sequence::Three([
                 Utf8Range::new(start[0], end[0]),
                 Utf8Range::new(start[1], end[1]),
                 Utf8Range::new(start[2], end[2]),
             ]),
             4 => Utf8Sequence::Four([
                 Utf8Range::new(start[0], end[0]),
                 Utf8Range::new(start[1], end[1]),
                 Utf8Range::new(start[2], end[2]),
                 Utf8Range::new(start[3], end[3]),
             ]),
             n => unreachable!("invalid encoded length: {}", n),
         }
     }

     /// Returns the underlying sequence of byte ranges as a slice.
     pub fn as_slice(&self) -> &[Utf8Range] {
         use self::Utf8Sequence::*;
         match *self {
             One(ref r) => slice::from_ref(r),
             Two(ref r) => &r[..],
             Three(ref r) => &r[..],
             Four(ref r) => &r[..],
         }
     }

     /// Returns the number of byte ranges in this sequence.
     ///
     /// The length is guaranteed to be in the closed interval `[1, 4]`.
     pub fn len(&self) -> usize {
         self.as_slice().len()
     }

     /// Reverses the ranges in this sequence.
     ///
     /// For example, if this corresponds to the following sequence:
     ///
     /// ```ignore
     /// [D0-D3][80-BF]
     /// ```
     ///
     /// Then after reversal, it will be
     ///
     /// ```ignore
     /// [80-BF][D0-D3]
     /// ```
     ///
     /// This is useful when one is constructing a UTF-8 automaton to match
     /// character classes in reverse.
     pub fn reverse(&mut self) {
         match *self {
             Utf8Sequence::One(_) => {}
             Utf8Sequence::Two(ref mut x) => x.reverse(),
             Utf8Sequence::Three(ref mut x) => x.reverse(),
             Utf8Sequence::Four(ref mut x) => x.reverse(),
         }
     }

     /// Returns true if and only if a prefix of `bytes` matches this sequence
     /// of byte ranges.
     pub fn matches(&self, bytes: &[u8]) -> bool {
         if bytes.len() < self.len() {
             return false;
         }
         for (&b, r) in bytes.iter().zip(self) {
             if !r.matches(b) {
                 return false;
             }
         }
         true
     }
 }

 impl<'a> IntoIterator for &'a Utf8Sequence {
     type IntoIter = slice::Iter<'a, Utf8Range>;
     type Item = &'a Utf8Range;

     fn into_iter(self) -> Self::IntoIter {
         self.as_slice().into_iter()
     }
 }

 impl fmt::Debug for Utf8Sequence {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         use self::Utf8Sequence::*;
         match *self {
             One(ref r) => write!(f, "{:?}", r),
             Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
             Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
             Four(ref r) => {
                 write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
             }
         }
     }
 }

 /// A single inclusive range of UTF-8 bytes.
 #[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
 pub struct Utf8Range {
     /// Start of byte range (inclusive).
     pub start: u8,
     /// End of byte range (inclusive).
     pub end: u8,
 }

 impl Utf8Range {
     fn new(start: u8, end: u8) -> Self {
         Utf8Range { start, end }
     }

     /// Returns true if and only if the given byte is in this range.
     pub fn matches(&self, b: u8) -> bool {
         self.start <= b && b <= self.end
     }
 }

 impl fmt::Debug for Utf8Range {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         if self.start == self.end {
             write!(f, "[{:X}]", self.start)
         } else {
             write!(f, "[{:X}-{:X}]", self.start, self.end)
         }
     }
 }

 /// An iterator over ranges of matching UTF-8 byte sequences.
 ///
 /// The iteration represents an alternation of comprehensive byte sequences
 /// that match precisely the set of UTF-8 encoded scalar values.
 ///
 /// A byte sequence corresponds to one of the scalar values in the range given
 /// if and only if it completely matches exactly one of the sequences of byte
 /// ranges produced by this iterator.
 ///
 /// Each sequence of byte ranges matches a unique set of bytes. That is, no two
 /// sequences will match the same bytes.
 ///
 /// # Example
 ///
 /// This shows how to match an arbitrary byte sequence against a range of
 /// scalar values.
 ///
 /// ```rust
 /// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
 ///
 /// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
 ///     for range in seqs {
 ///         if range.matches(bytes) {
 ///             return true;
 ///         }
 ///     }
 ///     false
 /// }
 ///
 /// // Test the basic multilingual plane.
 /// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
 ///
 /// // UTF-8 encoding of 'a'.
 /// assert!(matches(&seqs, &[0x61]));
 /// // UTF-8 encoding of '☃' (`\u{2603}`).
 /// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
 /// // UTF-8 encoding of `\u{10348}` (outside the BMP).
 /// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
 /// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
 /// // which is invalid UTF-8, and therefore fails, despite the fact that
 /// // the corresponding codepoint (0xD800) falls in the range given.
 /// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
 /// // And fails against plain old invalid UTF-8.
 /// assert!(!matches(&seqs, &[0xFF, 0xFF]));
 /// ```
 ///
 /// If this example seems circuitous, that's because it is! It's meant to be
 /// illustrative. In practice, you could just try to decode your byte sequence
 /// and compare it with the scalar value range directly. However, this is not
 /// always possible (for example, in a byte based automaton).
 pub struct Utf8Sequences {
     range_stack: Vec<ScalarRange>,
 }

 impl Utf8Sequences {
     /// Create a new iterator over UTF-8 byte ranges for the scalar value range
     /// given.
     pub fn new(start: char, end: char) -> Self {
         let mut it = Utf8Sequences { range_stack: vec![] };
         it.push(start as u32, end as u32);
         it
     }

     /// reset resets the scalar value range.
     /// Any existing state is cleared, but resources may be reused.
     ///
     /// N.B. Benchmarks say that this method is dubious.
     #[doc(hidden)]
     pub fn reset(&mut self, start: char, end: char) {
         self.range_stack.clear();
         self.push(start as u32, end as u32);
     }

     fn push(&mut self, start: u32, end: u32) {
         self.range_stack.push(ScalarRange { start, end });
     }
 }

 struct ScalarRange {
     start: u32,
     end: u32,
 }

 impl fmt::Debug for ScalarRange {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
     }
 }

 impl Iterator for Utf8Sequences {
     type Item = Utf8Sequence;

     fn next(&mut self) -> Option<Self::Item> {
         'TOP: while let Some(mut r) = self.range_stack.pop() {
             'INNER: loop {
                 if let Some((r1, r2)) = r.split() {
                     self.push(r2.start, r2.end);
                     r.start = r1.start;
                     r.end = r1.end;
                     continue 'INNER;
                 }
                 if !r.is_valid() {
                     continue 'TOP;
                 }
                 for i in 1..MAX_UTF8_BYTES {
                     let max = max_scalar_value(i);
                     if r.start <= max && max < r.end {
                         self.push(max + 1, r.end);
                         r.end = max;
                         continue 'INNER;
                     }
                 }
                 if let Some(ascii_range) = r.as_ascii() {
                     return Some(Utf8Sequence::One(ascii_range));
                 }
                 for i in 1..MAX_UTF8_BYTES {
                     let m = (1 << (6 * i)) - 1;
                     if (r.start & !m) != (r.end & !m) {
                         if (r.start & m) != 0 {
                             self.push((r.start | m) + 1, r.end);
                             r.end = r.start | m;
                             continue 'INNER;
                         }
                         if (r.end & m) != m {
                             self.push(r.end & !m, r.end);
                             r.end = (r.end & !m) - 1;
                             continue 'INNER;
                         }
                     }
                 }
                 let mut start = [0; MAX_UTF8_BYTES];
                 let mut end = [0; MAX_UTF8_BYTES];
                 let n = r.encode(&mut start, &mut end);
                 return Some(Utf8Sequence::from_encoded_range(
                     &start[0..n],
                     &end[0..n],
                 ));
             }
         }
         None
     }
 }

 impl ScalarRange {
     /// split splits this range if it overlaps with a surrogate codepoint.
     ///
     /// Either or both ranges may be invalid.
     fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
         if self.start < 0xE000 && self.end > 0xD7FF {
             Some((
                 ScalarRange { start: self.start, end: 0xD7FF },
                 ScalarRange { start: 0xE000, end: self.end },
             ))
         } else {
             None
         }
     }

     /// is_valid returns true if and only if start <= end.
     fn is_valid(&self) -> bool {
         self.start <= self.end
     }

     /// as_ascii returns this range as a Utf8Range if and only if all scalar
     /// values in this range can be encoded as a single byte.
     fn as_ascii(&self) -> Option<Utf8Range> {
         if self.is_ascii() {
             Some(Utf8Range::new(self.start as u8, self.end as u8))
         } else {
             None
         }
     }

     /// is_ascii returns true if the range is ASCII only (i.e., takes a single
     /// byte to encode any scalar value).
     fn is_ascii(&self) -> bool {
         self.is_valid() && self.end <= 0x7f
     }

     /// encode writes the UTF-8 encoding of the start and end of this range
     /// to the corresponding destination slices, and returns the number of
     /// bytes written.
     ///
     /// The slices should have room for at least `MAX_UTF8_BYTES`.
     fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
         let cs = char::from_u32(self.start).unwrap();
         let ce = char::from_u32(self.end).unwrap();
         let ss = cs.encode_utf8(start);
         let se = ce.encode_utf8(end);
         assert_eq!(ss.len(), se.len());
         ss.len()
     }
 }

 fn max_scalar_value(nbytes: usize) -> u32 {
     match nbytes {
         1 => 0x007F,
         2 => 0x07FF,
         3 => 0xFFFF,
         4 => 0x10FFFF,
         _ => unreachable!("invalid UTF-8 byte sequence size"),
     }
 }

 #[cfg(test)]
 mod tests {
     use std::char;

     use utf8::{Utf8Range, Utf8Sequences};

     fn rutf8(s: u8, e: u8) -> Utf8Range {
         Utf8Range::new(s, e)
     }

     fn never_accepts_surrogate_codepoints(start: char, end: char) {
         for cp in 0xD800..0xE000 {
             let buf = encode_surrogate(cp);
             for r in Utf8Sequences::new(start, end) {
                 if r.matches(&buf) {
                     panic!(
                         "Sequence ({:X}, {:X}) contains range {:?}, \
                          which matches surrogate code point {:X} \
                          with encoded bytes {:?}",
                         start as u32, end as u32, r, cp, buf,
                     );
                 }
             }
         }
     }

     #[test]
     fn codepoints_no_surrogates() {
         never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
         never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
         never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
         never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
         never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
     }

     #[test]
     fn single_codepoint_one_sequence() {
         // Tests that every range of scalar values that contains a single
         // scalar value is recognized by one sequence of byte ranges.
         for i in 0x0..(0x10FFFF + 1) {
             let c = match char::from_u32(i) {
                 None => continue,
                 Some(c) => c,
             };
             let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
             assert_eq!(seqs.len(), 1);
         }
     }

     #[test]
     fn bmp() {
         use utf8::Utf8Sequence::*;

         let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
         assert_eq!(
             seqs,
             vec![
                 One(rutf8(0x0, 0x7F)),
                 Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
                 Three([
                     rutf8(0xE0, 0xE0),
                     rutf8(0xA0, 0xBF),
                     rutf8(0x80, 0xBF)
                 ]),
                 Three([
                     rutf8(0xE1, 0xEC),
                     rutf8(0x80, 0xBF),
                     rutf8(0x80, 0xBF)
                 ]),
                 Three([
                     rutf8(0xED, 0xED),
                     rutf8(0x80, 0x9F),
                     rutf8(0x80, 0xBF)
                 ]),
                 Three([
                     rutf8(0xEE, 0xEF),
                     rutf8(0x80, 0xBF),
                     rutf8(0x80, 0xBF)
                 ]),
             ]
         );
     }

     #[test]
     fn reverse() {
         use utf8::Utf8Sequence::*;

         let mut s = One(rutf8(0xA, 0xB));
         s.reverse();
         assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);

         let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
         s.reverse();
         assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);

         let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
         s.reverse();
         assert_eq!(
             s.as_slice(),
             &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
         );

         let mut s = Four([
             rutf8(0xA, 0xB),
             rutf8(0xB, 0xC),
             rutf8(0xC, 0xD),
             rutf8(0xD, 0xE),
         ]);
         s.reverse();
         assert_eq!(
             s.as_slice(),
             &[
                 rutf8(0xD, 0xE),
                 rutf8(0xC, 0xD),
                 rutf8(0xB, 0xC),
                 rutf8(0xA, 0xB)
             ]
         );
     }

     fn encode_surrogate(cp: u32) -> [u8; 3] {
         const TAG_CONT: u8 = 0b1000_0000;
         const TAG_THREE_B: u8 = 0b1110_0000;

         assert!(0xD800 <= cp && cp < 0xE000);
         let mut dst = [0; 3];
         dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B;
         dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT;
         dst[2] = (cp & 0x3F) as u8 | TAG_CONT;
         dst
     }
 }
	/*!
	Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.

	This is sub-module is useful for constructing byte based automatons that need
	to embed UTF-8 decoding. The most common use of this module is in conjunction
	with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type.

	See the documentation on the `Utf8Sequences` iterator for more details and
	an example.

	# Wait, what is this?

	This is simplest to explain with an example. Let's say you wanted to test
	whether a particular byte sequence was a Cyrillic character. One possible
	scalar value range is `[0400-04FF]`. The set of allowed bytes for this
	range can be expressed as a sequence of byte ranges:

	```ignore
	[D0-D3][80-BF]
	```

	This is simple enough: simply encode the boundaries, `0400` encodes to
	`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
	corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.

	However, what if you wanted to add the Cyrillic Supplementary characters to
	your range? Your range might then become `[0400-052F]`. The same procedure
	as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
	you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
	this isn't quite correct because this range doesn't capture many characters,
	for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).

	Instead, you need multiple sequences of byte ranges:

	```ignore
	[D0-D3][80-BF] # matches codepoints 0400-04FF
	[D4][80-AF] # matches codepoints 0500-052F
	```

	This gets even more complicated if you want bigger ranges, particularly if
	they naively contain surrogate codepoints. For example, the sequence of byte
	ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:

	```ignore
	[0-7F]
	[C2-DF][80-BF]
	[E0][A0-BF][80-BF]
	[E1-EC][80-BF][80-BF]
	[ED][80-9F][80-BF]
	[EE-EF][80-BF][80-BF]
	```

	Note that the byte ranges above will not match any erroneous encoding of
	UTF-8, including encodings of surrogate codepoints.

	And, of course, for all of Unicode (`[000000-10FFFF]`):

	```ignore
	[0-7F]
	[C2-DF][80-BF]
	[E0][A0-BF][80-BF]
	[E1-EC][80-BF][80-BF]
	[ED][80-9F][80-BF]
	[EE-EF][80-BF][80-BF]
	[F0][90-BF][80-BF][80-BF]
	[F1-F3][80-BF][80-BF][80-BF]
	[F4][80-8F][80-BF][80-BF]
	```

	This module automates the process of creating these byte ranges from ranges of
	Unicode scalar values.

	# Lineage

	I got the idea and general implementation strategy from Russ Cox in his
	[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
	Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
	I also got the idea from
	[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
	which uses it for executing automata on their term index.
	*/

	#![deny(missing_docs)]

	use std::char;
	use std::fmt;
	use std::slice;

	const MAX_UTF8_BYTES: usize = 4;

	/// Utf8Sequence represents a sequence of byte ranges.
	///
	/// To match a Utf8Sequence, a candidate byte sequence must match each
	/// successive range.
	///
	/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
	/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
	#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
	pub enum Utf8Sequence {
	/// One byte range.
	One(Utf8Range),
	/// Two successive byte ranges.
	Two([Utf8Range; 2]),
	/// Three successive byte ranges.
	Three([Utf8Range; 3]),
	/// Four successive byte ranges.
	Four([Utf8Range; 4]),
	}

	impl Utf8Sequence {
	/// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
	/// range.
	///
	/// This assumes that `start` and `end` have the same length.
	fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
	assert_eq!(start.len(), end.len());
	match start.len() {
	2 => Utf8Sequence::Two([
	Utf8Range::new(start[0], end[0]),
	Utf8Range::new(start[1], end[1]),
	]),
	3 => Utf8Sequence::Three([
	Utf8Range::new(start[0], end[0]),
	Utf8Range::new(start[1], end[1]),
	Utf8Range::new(start[2], end[2]),
	]),
	4 => Utf8Sequence::Four([
	Utf8Range::new(start[0], end[0]),
	Utf8Range::new(start[1], end[1]),
	Utf8Range::new(start[2], end[2]),
	Utf8Range::new(start[3], end[3]),
	]),
	n => unreachable!("invalid encoded length: {}", n),
	}
	}

	/// Returns the underlying sequence of byte ranges as a slice.
	pub fn as_slice(&self) -> &[Utf8Range] {
	use self::Utf8Sequence::*;
	match *self {
	One(ref r) => slice::from_ref(r),
	Two(ref r) => &r[..],
	Three(ref r) => &r[..],
	Four(ref r) => &r[..],
	}
	}

	/// Returns the number of byte ranges in this sequence.
	///
	/// The length is guaranteed to be in the closed interval `[1, 4]`.
	pub fn len(&self) -> usize {
	self.as_slice().len()
	}

	/// Reverses the ranges in this sequence.
	///
	/// For example, if this corresponds to the following sequence:
	///
	/// ```ignore
	/// [D0-D3][80-BF]
	/// ```
	///
	/// Then after reversal, it will be
	///
	/// ```ignore
	/// [80-BF][D0-D3]
	/// ```
	///
	/// This is useful when one is constructing a UTF-8 automaton to match
	/// character classes in reverse.
	pub fn reverse(&mut self) {
	match *self {
	Utf8Sequence::One(_) => {}
	Utf8Sequence::Two(ref mut x) => x.reverse(),
	Utf8Sequence::Three(ref mut x) => x.reverse(),
	Utf8Sequence::Four(ref mut x) => x.reverse(),
	}
	}

	/// Returns true if and only if a prefix of `bytes` matches this sequence
	/// of byte ranges.
	pub fn matches(&self, bytes: &[u8]) -> bool {
	if bytes.len() < self.len() {
	return false;
	}
	for (&b, r) in bytes.iter().zip(self) {
	if !r.matches(b) {
	return false;
	}
	}
	true
	}
	}

	impl<'a> IntoIterator for &'a Utf8Sequence {
	type IntoIter = slice::Iter<'a, Utf8Range>;
	type Item = &'a Utf8Range;

	fn into_iter(self) -> Self::IntoIter {
	self.as_slice().into_iter()
	}
	}

	impl fmt::Debug for Utf8Sequence {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
	use self::Utf8Sequence::*;
	match *self {
	One(ref r) => write!(f, "{:?}", r),
	Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
	Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
	Four(ref r) => {
	write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
	}
	}
	}
	}

	/// A single inclusive range of UTF-8 bytes.
	#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
	pub struct Utf8Range {
	/// Start of byte range (inclusive).
	pub start: u8,
	/// End of byte range (inclusive).
	pub end: u8,
	}

	impl Utf8Range {
	fn new(start: u8, end: u8) -> Self {
	Utf8Range { start, end }
	}

	/// Returns true if and only if the given byte is in this range.
	pub fn matches(&self, b: u8) -> bool {
	self.start <= b && b <= self.end
	}
	}

	impl fmt::Debug for Utf8Range {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
	if self.start == self.end {
	write!(f, "[{:X}]", self.start)
	} else {
	write!(f, "[{:X}-{:X}]", self.start, self.end)
	}
	}
	}

	/// An iterator over ranges of matching UTF-8 byte sequences.
	///
	/// The iteration represents an alternation of comprehensive byte sequences
	/// that match precisely the set of UTF-8 encoded scalar values.
	///
	/// A byte sequence corresponds to one of the scalar values in the range given
	/// if and only if it completely matches exactly one of the sequences of byte
	/// ranges produced by this iterator.
	///
	/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
	/// sequences will match the same bytes.
	///
	/// # Example
	///
	/// This shows how to match an arbitrary byte sequence against a range of
	/// scalar values.
	///
	/// ```rust
	/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
	///
	/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
	/// for range in seqs {
	/// if range.matches(bytes) {
	/// return true;
	/// }
	/// }
	/// false
	/// }
	///
	/// // Test the basic multilingual plane.
	/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
	///
	/// // UTF-8 encoding of 'a'.
	/// assert!(matches(&seqs, &[0x61]));
	/// // UTF-8 encoding of '☃' (`\u{2603}`).
	/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
	/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
	/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
	/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
	/// // which is invalid UTF-8, and therefore fails, despite the fact that
	/// // the corresponding codepoint (0xD800) falls in the range given.
	/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
	/// // And fails against plain old invalid UTF-8.
	/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
	/// ```
	///
	/// If this example seems circuitous, that's because it is! It's meant to be
	/// illustrative. In practice, you could just try to decode your byte sequence
	/// and compare it with the scalar value range directly. However, this is not
	/// always possible (for example, in a byte based automaton).
	pub struct Utf8Sequences {
	range_stack: Vec<ScalarRange>,
	}

	impl Utf8Sequences {
	/// Create a new iterator over UTF-8 byte ranges for the scalar value range
	/// given.
	pub fn new(start: char, end: char) -> Self {
	let mut it = Utf8Sequences { range_stack: vec![] };
	it.push(start as u32, end as u32);
	it
	}

	/// reset resets the scalar value range.
	/// Any existing state is cleared, but resources may be reused.
	///
	/// N.B. Benchmarks say that this method is dubious.
	#[doc(hidden)]
	pub fn reset(&mut self, start: char, end: char) {
	self.range_stack.clear();
	self.push(start as u32, end as u32);
	}

	fn push(&mut self, start: u32, end: u32) {
	self.range_stack.push(ScalarRange { start, end });
	}
	}

	struct ScalarRange {
	start: u32,
	end: u32,
	}

	impl fmt::Debug for ScalarRange {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
	write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
	}
	}

	impl Iterator for Utf8Sequences {
	type Item = Utf8Sequence;

	fn next(&mut self) -> Option<Self::Item> {
	'TOP: while let Some(mut r) = self.range_stack.pop() {
	'INNER: loop {
	if let Some((r1, r2)) = r.split() {
	self.push(r2.start, r2.end);
	r.start = r1.start;
	r.end = r1.end;
	continue 'INNER;
	}
	if !r.is_valid() {
	continue 'TOP;
	}
	for i in 1..MAX_UTF8_BYTES {
	let max = max_scalar_value(i);
	if r.start <= max && max < r.end {
	self.push(max + 1, r.end);
	r.end = max;
	continue 'INNER;
	}
	}
	if let Some(ascii_range) = r.as_ascii() {
	return Some(Utf8Sequence::One(ascii_range));
	}
	for i in 1..MAX_UTF8_BYTES {
	let m = (1 << (6 * i)) - 1;
	if (r.start & !m) != (r.end & !m) {
	if (r.start & m) != 0 {
	self.push((r.start \| m) + 1, r.end);
	r.end = r.start \| m;
	continue 'INNER;
	}
	if (r.end & m) != m {
	self.push(r.end & !m, r.end);
	r.end = (r.end & !m) - 1;
	continue 'INNER;
	}
	}
	}
	let mut start = [0; MAX_UTF8_BYTES];
	let mut end = [0; MAX_UTF8_BYTES];
	let n = r.encode(&mut start, &mut end);
	return Some(Utf8Sequence::from_encoded_range(
	&start[0..n],
	&end[0..n],
	));
	}
	}
	None
	}
	}

	impl ScalarRange {
	/// split splits this range if it overlaps with a surrogate codepoint.
	///
	/// Either or both ranges may be invalid.
	fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
	if self.start < 0xE000 && self.end > 0xD7FF {
	Some((
	ScalarRange { start: self.start, end: 0xD7FF },
	ScalarRange { start: 0xE000, end: self.end },
	))
	} else {
	None
	}
	}

	/// is_valid returns true if and only if start <= end.
	fn is_valid(&self) -> bool {
	self.start <= self.end
	}

	/// as_ascii returns this range as a Utf8Range if and only if all scalar
	/// values in this range can be encoded as a single byte.
	fn as_ascii(&self) -> Option<Utf8Range> {
	if self.is_ascii() {
	Some(Utf8Range::new(self.start as u8, self.end as u8))
	} else {
	None
	}
	}

	/// is_ascii returns true if the range is ASCII only (i.e., takes a single
	/// byte to encode any scalar value).
	fn is_ascii(&self) -> bool {
	self.is_valid() && self.end <= 0x7f
	}

	/// encode writes the UTF-8 encoding of the start and end of this range
	/// to the corresponding destination slices, and returns the number of
	/// bytes written.
	///
	/// The slices should have room for at least `MAX_UTF8_BYTES`.
	fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
	let cs = char::from_u32(self.start).unwrap();
	let ce = char::from_u32(self.end).unwrap();
	let ss = cs.encode_utf8(start);
	let se = ce.encode_utf8(end);
	assert_eq!(ss.len(), se.len());
	ss.len()
	}
	}

	fn max_scalar_value(nbytes: usize) -> u32 {
	match nbytes {
	1 => 0x007F,
	2 => 0x07FF,
	3 => 0xFFFF,
	4 => 0x10FFFF,
	_ => unreachable!("invalid UTF-8 byte sequence size"),
	}
	}

	#[cfg(test)]
	mod tests {
	use std::char;

	use utf8::{Utf8Range, Utf8Sequences};

	fn rutf8(s: u8, e: u8) -> Utf8Range {
	Utf8Range::new(s, e)
	}

	fn never_accepts_surrogate_codepoints(start: char, end: char) {
	for cp in 0xD800..0xE000 {
	let buf = encode_surrogate(cp);
	for r in Utf8Sequences::new(start, end) {
	if r.matches(&buf) {
	panic!(
	"Sequence ({:X}, {:X}) contains range {:?}, \
	which matches surrogate code point {:X} \
	with encoded bytes {:?}",
	start as u32, end as u32, r, cp, buf,
	);
	}
	}
	}
	}

	#[test]
	fn codepoints_no_surrogates() {
	never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
	never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
	never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
	never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
	never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
	}

	#[test]
	fn single_codepoint_one_sequence() {
	// Tests that every range of scalar values that contains a single
	// scalar value is recognized by one sequence of byte ranges.
	for i in 0x0..(0x10FFFF + 1) {
	let c = match char::from_u32(i) {
	None => continue,
	Some(c) => c,
	};
	let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
	assert_eq!(seqs.len(), 1);
	}
	}

	#[test]
	fn bmp() {
	use utf8::Utf8Sequence::*;

	let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
	assert_eq!(
	seqs,
	vec![
	One(rutf8(0x0, 0x7F)),
	Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
	Three([
	rutf8(0xE0, 0xE0),
	rutf8(0xA0, 0xBF),
	rutf8(0x80, 0xBF)
	]),
	Three([
	rutf8(0xE1, 0xEC),
	rutf8(0x80, 0xBF),
	rutf8(0x80, 0xBF)
	]),
	Three([
	rutf8(0xED, 0xED),
	rutf8(0x80, 0x9F),
	rutf8(0x80, 0xBF)
	]),
	Three([
	rutf8(0xEE, 0xEF),
	rutf8(0x80, 0xBF),
	rutf8(0x80, 0xBF)
	]),
	]
	);
	}

	#[test]
	fn reverse() {
	use utf8::Utf8Sequence::*;

	let mut s = One(rutf8(0xA, 0xB));
	s.reverse();
	assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);

	let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
	s.reverse();
	assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);

	let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
	s.reverse();
	assert_eq!(
	s.as_slice(),
	&[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
	);

	let mut s = Four([
	rutf8(0xA, 0xB),
	rutf8(0xB, 0xC),
	rutf8(0xC, 0xD),
	rutf8(0xD, 0xE),
	]);
	s.reverse();
	assert_eq!(
	s.as_slice(),
	&[
	rutf8(0xD, 0xE),
	rutf8(0xC, 0xD),
	rutf8(0xB, 0xC),
	rutf8(0xA, 0xB)
	]
	);
	}

	fn encode_surrogate(cp: u32) -> [u8; 3] {
	const TAG_CONT: u8 = 0b1000_0000;
	const TAG_THREE_B: u8 = 0b1110_0000;

	assert!(0xD800 <= cp && cp < 0xE000);
	let mut dst = [0; 3];
	dst[0] = (cp >> 12 & 0x0F) as u8 \| TAG_THREE_B;
	dst[1] = (cp >> 6 & 0x3F) as u8 \| TAG_CONT;
	dst[2] = (cp & 0x3F) as u8 \| TAG_CONT;
	dst
	}
	}