// Copyright 2015 Google Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//! Scanners for fragments of CommonMark syntax
use std::char;
use std::convert::TryInto;
use crate::entities;
use crate::parse::{Alignment, HtmlScanGuard, LinkType};
pub use crate::puncttable::{is_ascii_punctuation, is_punctuation};
use crate::strings::CowStr;
use memchr::memchr;
// Allowing arbitrary depth nested parentheses inside link destinations
// can create denial of service vulnerabilities if we're not careful.
// The simplest countermeasure is to limit their depth, which is
// explicitly allowed by the spec as long as the limit is at least 3:
// https://spec.commonmark.org/0.29/#link-destination
const LINK_MAX_NESTED_PARENS: usize = 5;
// sorted for binary search
const HTML_TAGS: [&str; 62] = [
"address",
"article",
"aside",
"base",
"basefont",
"blockquote",
"body",
"caption",
"center",
"col",
"colgroup",
"dd",
"details",
"dialog",
"dir",
"div",
"dl",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"hr",
"html",
"iframe",
"legend",
"li",
"link",
"main",
"menu",
"menuitem",
"nav",
"noframes",
"ol",
"optgroup",
"option",
"p",
"param",
"section",
"source",
"summary",
"table",
"tbody",
"td",
"tfoot",
"th",
"thead",
"title",
"tr",
"track",
"ul",
];
/// Analysis of the beginning of a line, including indentation and container
/// markers.
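/// Tabs are expanded to 4-column tab stops; a partially consumed tab is
/// tracked in `spaces_remaining`.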
#[derive(Clone)]
pub struct LineStart<'a> {
bytes: &'a [u8],
tab_start: usize,
ix: usize,
spaces_remaining: usize,
// No thematic break can occur before this offset;
// tracking it avoids rescanning the same text repeatedly.
min_hrule_offset: usize,
}
impl<'a> LineStart<'a> {
pub(crate) fn new(bytes: &[u8]) -> LineStart {
LineStart {
bytes,
tab_start: 0,
ix: 0,
spaces_remaining: 0,
min_hrule_offset: 0,
}
}
/// Try to scan a number of spaces.
///
/// Returns true if all spaces were consumed.
///
/// Note: consumes some spaces even if not successful.
pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
self.scan_space_inner(n_space) == 0
}
/// Scan a number of spaces up to a maximum.
///
/// Returns number of spaces scanned.
pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
n_space - self.scan_space_inner(n_space)
}
/// Returns unused remainder of spaces.
fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
let n_from_remaining = self.spaces_remaining.min(n_space);
self.spaces_remaining -= n_from_remaining;
n_space -= n_from_remaining;
while n_space > 0 && self.ix < self.bytes.len() {
match self.bytes[self.ix] {
b' ' => {
self.ix += 1;
n_space -= 1;
}
b'\t' => {
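// A tab advances to the next 4-column tab stop, measured from
// `tab_start`; columns beyond the requested amount are banked in
// `spaces_remaining` for later scans.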
let spaces = 4 - (self.ix - self.tab_start) % 4;
self.ix += 1;
self.tab_start = self.ix;
let n = spaces.min(n_space);
n_space -= n;
self.spaces_remaining = spaces - n;
}
_ => break,
}
}
n_space
}
/// Scan all available ASCII whitespace (not including eol).
pub(crate) fn scan_all_space(&mut self) {
self.spaces_remaining = 0;
self.ix += self.bytes[self.ix..]
.iter()
.take_while(|&&b| b == b' ' || b == b'\t')
.count();
}
/// Determine whether we're at end of line (includes end of file).
pub(crate) fn is_at_eol(&self) -> bool {
if self.ix >= self.bytes.len() {
return true;
}
let c = self.bytes[self.ix];
c == b'\r' || c == b'\n'
}
fn scan_ch(&mut self, c: u8) -> bool {
if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
self.ix += 1;
true
} else {
false
}
}
pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
let save = self.clone();
let _ = self.scan_space(3);
if self.scan_ch(b'>') {
let _ = self.scan_space(1);
true
} else {
*self = save;
false
}
}
/// Scan a list marker.
///
/// Return value is the character, the start index, and the indent in spaces.
/// For ordered list markers, the character will be one of b'.' or b')'. For
/// bullet list markers, it will be one of b'-', b'+', or b'*'.
pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, usize, usize)> {
let save = self.clone();
let indent = self.scan_space_upto(3);
if self.ix < self.bytes.len() {
let c = self.bytes[self.ix];
if c == b'-' || c == b'+' || c == b'*' {
if self.ix >= self.min_hrule_offset {
// there could be an hrule here
if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
self.min_hrule_offset = min_offset;
} else {
*self = save;
return None;
}
}
self.ix += 1;
if self.scan_space(1) || self.is_at_eol() {
return self.finish_list_marker(c, 0, indent + 2);
}
} else if c >= b'0' && c <= b'9' {
let start_ix = self.ix;
let mut ix = self.ix + 1;
let mut val = u64::from(c - b'0');
while ix < self.bytes.len() && ix - start_ix < 10 {
let c = self.bytes[ix];
ix += 1;
if c >= b'0' && c <= b'9' {
val = val * 10 + u64::from(c - b'0');
} else if c == b')' || c == b'.' {
self.ix = ix;
let val_usize = val as usize;
// This will cause some failures on 32 bit arch.
// TODO (breaking API change): should be u64, not usize.
if val_usize as u64 != val {
return None;
}
if self.scan_space(1) || self.is_at_eol() {
return self.finish_list_marker(
c,
val_usize,
indent + self.ix - start_ix,
);
} else {
break;
}
} else {
break;
}
}
}
}
*self = save;
None
}
fn finish_list_marker(
&mut self,
c: u8,
start: usize,
mut indent: usize,
) -> Option<(u8, usize, usize)> {
let save = self.clone();
// skip the rest of the line if it's blank
if scan_blank_line(&self.bytes[self.ix..]).is_some() {
return Some((c, start, indent));
}
let post_indent = self.scan_space_upto(4);
if post_indent < 4 {
indent += post_indent;
} else {
*self = save;
}
Some((c, start, indent))
}
/// Returns Some(is_checked) when a task list marker was found. Resets itself
/// to original state otherwise.
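/// For example, `[ ] ` yields `Some(false)` and `[x] ` yields `Some(true)`;
/// the closing bracket must be followed by non-newline ASCII whitespace.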
pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
let save = self.clone();
self.scan_space_upto(3);
if !self.scan_ch(b'[') {
*self = save;
return None;
}
let is_checked = match self.bytes.get(self.ix) {
Some(&c) if is_ascii_whitespace_no_nl(c) => {
self.ix += 1;
false
}
Some(b'x') | Some(b'X') => {
self.ix += 1;
true
}
_ => {
*self = save;
return None;
}
};
if !self.scan_ch(b']') {
*self = save;
return None;
}
if !self
.bytes
.get(self.ix)
.map(|&b| is_ascii_whitespace_no_nl(b))
.unwrap_or(false)
{
*self = save;
return None;
}
Some(is_checked)
}
pub(crate) fn bytes_scanned(&self) -> usize {
self.ix
}
pub(crate) fn remaining_space(&self) -> usize {
self.spaces_remaining
}
}
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
(c >= 0x09 && c <= 0x0d) || c == b' '
}
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
c == b'\t' || c == 0x0b || c == 0x0c || c == b' '
}
fn is_ascii_alpha(c: u8) -> bool {
match c {
b'a'..=b'z' | b'A'..=b'Z' => true,
_ => false,
}
}
fn is_ascii_alphanumeric(c: u8) -> bool {
match c {
b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
_ => false,
}
}
fn is_ascii_letterdigitdash(c: u8) -> bool {
c == b'-' || is_ascii_alphanumeric(c)
}
fn is_digit(c: u8) -> bool {
b'0' <= c && c <= b'9'
}
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
match c {
b'\'' | b'"' | b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => false,
_ => true,
}
}
// scan a single character
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
if !data.is_empty() && data[0] == c {
1
} else {
0
}
}
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
F: FnMut(u8) -> bool,
{
data.iter().take_while(|&&c| f(c)).count()
}
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
F: FnMut(u8) -> bool,
{
data.iter().rev().take_while(|&&c| f(c)).count()
}
pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
scan_while(data, |x| x == c)
}
// Note: this scans ASCII whitespace only, for Unicode whitespace use
// a different function.
pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
scan_while(data, is_ascii_whitespace_no_nl)
}
fn scan_attr_value_chars(data: &[u8]) -> usize {
scan_while(data, is_valid_unquoted_attr_value_char)
}
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
if bytes.is_empty() {
return Some(0);
}
match bytes[0] {
b'\n' => Some(1),
b'\r' => Some(if bytes.get(1) == Some(&b'\n') { 2 } else { 1 }),
_ => None,
}
}
pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
let i = scan_whitespace_no_nl(bytes);
scan_eol(&bytes[i..]).map(|n| i + n)
}
pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
}
// return: end byte for closing code fence, or None
// if the line is not a closing code fence
pub(crate) fn scan_closing_code_fence(
bytes: &[u8],
fence_char: u8,
n_fence_char: usize,
) -> Option<usize> {
if bytes.is_empty() {
return Some(0);
}
let mut i = 0;
let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
if num_fence_chars_found < n_fence_char {
return None;
}
i += num_fence_chars_found;
let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
i += num_trailing_spaces;
scan_eol(&bytes[i..]).map(|_| i)
}
// returned pair is (number of bytes, number of spaces)
fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
let mut spaces = 0;
let mut offset = 0;
for (i, &b) in text.iter().enumerate() {
match b {
b' ' => {
spaces += 1;
if spaces == max {
break;
}
}
b'\t' => {
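// `spaces & 3` is `spaces % 4`, so a tab advances the column to the
// next multiple of four (unless that would exceed `max`).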
let new_spaces = spaces + 4 - (spaces & 3);
if new_spaces > max {
break;
}
spaces = new_spaces;
}
_ => break,
}
offset = i;
}
(offset, spaces)
}
/// Scan hrule opening sequence.
///
/// Returns Ok(x) when it finds an hrule, where x is the
/// size of the line containing the hrule, including the trailing newline.
///
/// Returns Err(x) when it does not find an hrule, where x is
/// the offset in `bytes` before which no hrule can appear.
pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
if bytes.len() < 3 {
return Err(0);
}
let c = bytes[0];
if !(c == b'*' || c == b'-' || c == b'_') {
return Err(0);
}
let mut n = 0;
let mut i = 0;
while i < bytes.len() {
match bytes[i] {
b'\n' | b'\r' => {
i += scan_eol(&bytes[i..]).unwrap_or(0);
break;
}
c2 if c2 == c => {
n += 1;
}
b' ' | b'\t' => (),
_ => return Err(i),
}
i += 1;
}
if n >= 3 {
Ok(i)
} else {
Err(i)
}
}
/// Scan an ATX heading opening sequence.
///
/// Returns number of bytes in prefix and level.
pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<(usize, i32)> {
let level = scan_ch_repeat(data, b'#');
if level >= 1 && level <= 6 && data.get(level).cloned().map_or(true, is_ascii_whitespace) {
Some((level, level as i32))
} else {
None
}
}
/// Scan a setext heading underline.
///
/// Returns number of bytes in line (including trailing newline) and level.
pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, i32)> {
let c = *data.get(0)?;
if !(c == b'-' || c == b'=') {
return None;
}
let mut i = 1 + scan_ch_repeat(&data[1..], c);
i += scan_blank_line(&data[i..])?;
let level = if c == b'=' { 1 } else { 2 };
Some((i, level))
}
// returns number of bytes in line (including trailing
// newline) and column alignments
pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
let (mut i, spaces) = calc_indent(data, 4);
if spaces > 3 || i == data.len() {
return (0, vec![]);
}
let mut cols = vec![];
let mut active_col = Alignment::None;
let mut start_col = true;
if data[i] == b'|' {
i += 1;
}
for c in &data[i..] {
if let Some(n) = scan_eol(&data[i..]) {
i += n;
break;
}
match *c {
b' ' => (),
b':' => {
active_col = match (start_col, active_col) {
(true, Alignment::None) => Alignment::Left,
(false, Alignment::Left) => Alignment::Center,
(false, Alignment::None) => Alignment::Right,
_ => active_col,
};
start_col = false;
}
b'-' => {
start_col = false;
}
b'|' => {
start_col = true;
cols.push(active_col);
active_col = Alignment::None;
}
_ => {
cols = vec![];
start_col = true;
break;
}
}
i += 1;
}
if !start_col {
cols.push(active_col);
}
(i, cols)
}
/// Scan code fence.
///
/// Returns number of bytes scanned and the char that is repeated to make the code fence.
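/// For backtick fences, the remainder of the line (the info string) must not
/// contain another backtick.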
pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
let c = *data.get(0)?;
if !(c == b'`' || c == b'~') {
return None;
}
let i = 1 + scan_ch_repeat(&data[1..], c);
if i >= 3 {
if c == b'`' {
let suffix = &data[i..];
let next_line = i + scan_nextline(suffix);
// FIXME: make sure this is correct
if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
return None;
}
}
Some((i, c))
} else {
None
}
}
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
if data.starts_with(b"> ") {
Some(2)
} else {
None
}
}
/// Assumes the list item marker has already been scanned.
pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
let mut ix = 0;
for _ in 0..2 {
if let Some(bytes) = scan_blank_line(&data[ix..]) {
ix += bytes;
} else {
return false;
}
}
true
}
// return number of bytes scanned, delimiter, start index, and indent
pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
let mut c = *bytes.get(0)?;
let (w, start) = match c {
b'-' | b'+' | b'*' => (1, 0),
b'0'..=b'9' => {
let (length, start) = parse_decimal(bytes);
c = *bytes.get(length)?;
if !(c == b'.' || c == b')') {
return None;
}
(length + 1, start)
}
_ => {
return None;
}
};
// TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
if postindent == 0 {
scan_eol(&bytes[w..])?;
postindent += 1;
} else if postindent > 4 {
postn = 1;
postindent = 1;
}
if scan_blank_line(&bytes[w..]).is_some() {
postn = 0;
postindent = 1;
}
Some((w + postn, c, start, w + postindent))
}
// returns (number of bytes, parsed decimal)
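// Overflow makes `try_fold` bail out early via `Err`; both variants carry the
// same `(count, value)` payload, so `Ok(p) | Err(p) => p` below unifies the
// two outcomes. `parse_hex` uses the same pattern.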
fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
match bytes
.iter()
.take_while(|&&b| is_digit(b))
.try_fold((0, 0usize), |(count, acc), c| {
let digit = usize::from(c - b'0');
match acc
.checked_mul(10)
.and_then(|ten_acc| ten_acc.checked_add(digit))
{
Some(number) => Ok((count + 1, number)),
// stop early on overflow
None => Err((count, acc)),
}
}) {
Ok(p) | Err(p) => p,
}
}
// returns (number of bytes, parsed hex)
fn parse_hex(bytes: &[u8]) -> (usize, usize) {
match bytes.iter().try_fold((0, 0usize), |(count, acc), c| {
let mut c = *c;
let digit = if c >= b'0' && c <= b'9' {
usize::from(c - b'0')
} else {
// make lower case
c |= 0x20;
if c >= b'a' && c <= b'f' {
usize::from(c - b'a' + 10)
} else {
return Err((count, acc));
}
};
match acc
.checked_mul(16)
.and_then(|sixteen_acc| sixteen_acc.checked_add(digit))
{
Some(number) => Ok((count + 1, number)),
// stop early on overflow
None => Err((count, acc)),
}
}) {
Ok(p) | Err(p) => p,
}
}
fn char_from_codepoint(input: usize) -> Option<char> {
let mut codepoint = input.try_into().ok()?;
if codepoint == 0 {
codepoint = 0xFFFD;
}
char::from_u32(codepoint)
}
// Doesn't bother to check that bytes[0] == b'&'.
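// Handles numeric (`&#65;`, `&#x41;`) and named (`&amp;`) forms. Returns the
// number of bytes consumed (including the trailing `;`) and the decoded text,
// or `(0, None)` if no valid entity is found.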
pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
let mut end = 1;
if scan_ch(&bytes[end..], b'#') == 1 {
end += 1;
let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
end += 1;
parse_hex(&bytes[end..])
} else {
parse_decimal(&bytes[end..])
};
end += bytecount;
return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
(0, None)
} else if let Some(c) = char_from_codepoint(codepoint) {
(end + 1, Some(c.into()))
} else {
(0, None)
};
}
end += scan_while(&bytes[end..], is_ascii_alphanumeric);
if scan_ch(&bytes[end..], b';') == 1 {
if let Some(value) = entities::get_entity(&bytes[1..end]) {
return (end + 1, Some(value.into()));
}
}
(0, None)
}
// returns (bytes scanned, title cow)
fn scan_link_title(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
let bytes = text.as_bytes();
let open = match bytes.get(start_ix) {
Some(b @ b'\'') | Some(b @ b'\"') | Some(b @ b'(') => *b,
_ => return None,
};
let close = if open == b'(' { b')' } else { open };
let mut title = String::new();
let mut mark = start_ix + 1;
let mut i = start_ix + 1;
while i < bytes.len() {
let c = bytes[i];
if c == close {
let cow = if mark == 1 {
(i - start_ix + 1, text[mark..i].into())
} else {
title.push_str(&text[mark..i]);
(i - start_ix + 1, title.into())
};
return Some(cow);
}
if c == open {
return None;
}
// TODO: do b'\r' as well?
if c == b'&' {
if let (n, Some(value)) = scan_entity(&bytes[i..]) {
title.push_str(&text[mark..i]);
title.push_str(&value);
i += n;
mark = i;
continue;
}
}
if c == b'\\' && i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) {
title.push_str(&text[mark..i]);
i += 1;
mark = i;
}
i += 1;
}
None
}
// FIXME: we can most likely re-use other scanners
// returns (bytelength, title_str)
pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
let mut chars = text.chars().peekable();
let closing_delim = match chars.next()? {
'\'' => '\'',
'"' => '"',
'(' => ')',
_ => return None,
};
let mut bytecount = 1;
while let Some(c) = chars.next() {
match c {
'\n' => {
bytecount += 1;
let mut next = *chars.peek()?;
while is_ascii_whitespace_no_nl(next as u8) {
bytecount += chars.next()?.len_utf8();
next = *chars.peek()?;
}
if *chars.peek()? == '\n' {
// blank line - not allowed
return None;
}
}
'\\' => {
let next_char = chars.next()?;
bytecount += 1 + next_char.len_utf8();
}
c if c == closing_delim => {
return Some((bytecount + 1, &text[1..bytecount]));
}
c => {
bytecount += c.len_utf8();
}
}
}
None
}
// note: dest returned is raw, still needs to be unescaped
// TODO: check that nested parens are really not allowed for refdefs
// TODO(performance): this func should probably do its own unescaping
pub(crate) fn scan_link_dest(
data: &str,
start_ix: usize,
max_nest: usize,
) -> Option<(usize, &str)> {
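// Two forms are handled: `<...>` destinations, which end at `>` and reject
// raw `<`, newlines, and carriage returns, and bare destinations, which end
// at an ASCII space/control byte or an unmatched `)` and cap the depth of
// nested parentheses (see `LINK_MAX_NESTED_PARENS`).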
let bytes = &data.as_bytes()[start_ix..];
let mut i = scan_ch(bytes, b'<');
if i != 0 {
// pointy links
while i < bytes.len() {
match bytes[i] {
b'\n' | b'\r' | b'<' => return None,
b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
i += 1;
}
_ => {}
}
i += 1;
}
None
} else {
// non-pointy links
let mut nest = 0;
while i < bytes.len() {
match bytes[i] {
0x0..=0x20 => {
break;
}
b'(' => {
if nest > max_nest {
return None;
}
nest += 1;
}
b')' => {
if nest == 0 {
break;
}
nest -= 1;
}
b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
i += 1;
}
_ => {}
}
i += 1;
}
Some((i, &data[start_ix..(start_ix + i)]))
}
}
/// Returns next byte index, url and title.
pub(crate) fn scan_inline_link(
underlying: &str,
start_ix: usize,
) -> Option<(usize, CowStr<'_>, CowStr<'_>)> {
let mut ix = start_ix;
if scan_ch(&underlying.as_bytes()[ix..], b'(') == 0 {
return None;
}
ix += 1;
ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
let (dest_length, dest) = scan_link_dest(underlying, ix, LINK_MAX_NESTED_PARENS)?;
let dest = unescape(dest);
ix += dest_length;
ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
let title = if let Some((bytes_scanned, t)) = scan_link_title(underlying, ix) {
ix += bytes_scanned;
ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
t
} else {
"".into()
};
if scan_ch(&underlying.as_bytes()[ix..], b')') == 0 {
return None;
}
ix += 1;
Some((ix, dest, title))
}
/// Returns bytes scanned
fn scan_attribute_name(data: &[u8]) -> Option<usize> {
let (&c, tail) = data.split_first()?;
if is_ascii_alpha(c) || c == b'_' || c == b':' {
Some(
1 + scan_while(tail, |c| {
is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
}),
)
} else {
None
}
}
/// Returns bytes scanned (TODO: should it return the new offset?)
fn scan_attribute(data: &[u8], allow_newline: bool) -> Option<usize> {
let whitespace_scanner =
|c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
let mut ix = scan_attribute_name(data)?;
let n_whitespace = scan_while(&data[ix..], whitespace_scanner);
ix += n_whitespace;
if scan_ch(&data[ix..], b'=') == 1 {
ix += 1;
ix += scan_while(&data[ix..], whitespace_scanner);
ix += scan_attribute_value(&data[ix..], allow_newline)?;
} else if n_whitespace > 0 {
// Leave whitespace for next attribute.
ix -= 1;
}
Some(ix)
}
fn scan_attribute_value(data: &[u8], allow_newline: bool) -> Option<usize> {
let mut i = 0;
match *data.get(0)? {
b @ b'"' | b @ b'\'' => {
i += 1;
i += scan_while(&data[i..], |c| {
c != b && (allow_newline || c != b'\n' && c != b'\r')
});
if scan_ch(&data[i..], b) == 0 {
return None;
}
i += 1;
}
b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
return None;
}
_ => {
// unquoted attribute value
i += scan_attr_value_chars(&data[i..]);
}
}
Some(i)
}
// Remove backslash escapes and resolve entities
pub(crate) fn unescape(input: &str) -> CowStr<'_> {
let mut result = String::new();
let mut mark = 0;
let mut i = 0;
let bytes = input.as_bytes();
while i < bytes.len() {
match bytes[i] {
b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
result.push_str(&input[mark..i]);
mark = i + 1;
i += 2;
}
b'&' => match scan_entity(&bytes[i..]) {
(n, Some(value)) => {
result.push_str(&input[mark..i]);
result.push_str(&value);
i += n;
mark = i;
}
_ => i += 1,
},
b'\r' => {
result.push_str(&input[mark..i]);
i += 1;
mark = i;
}
_ => i += 1,
}
}
if mark == 0 {
input.into()
} else {
result.push_str(&input[mark..]);
result.into()
}
}
/// Assumes `data` is preceded by `<`.
pub(crate) fn scan_html_block_tag(data: &[u8]) -> (usize, &[u8]) {
let i = scan_ch(data, b'/');
let n = scan_while(&data[i..], is_ascii_alphanumeric);
// TODO: scan attributes and >
(i + n, &data[i..i + n])
}
pub(crate) fn is_html_tag(tag: &[u8]) -> bool {
HTML_TAGS
.binary_search_by(|probe| {
let probe_bytes_iter = probe.as_bytes().iter();
let tag_bytes_iter = tag.iter();
probe_bytes_iter
.zip(tag_bytes_iter)
.find_map(|(&a, &b)| {
// We can compare case insensitively because the probes are
// all lower case alpha strings.
match a.cmp(&(b | 0x20)) {
std::cmp::Ordering::Equal => None,
inequality => Some(inequality),
}
})
.unwrap_or_else(|| probe.len().cmp(&tag.len()))
})
.is_ok()
}
/// Assumes that `data` is preceded by `<`.
pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
let i = scan_html_block_inner(data, false)?;
scan_blank_line(&data[i..])?;
Some(i)
}
fn scan_html_block_inner(data: &[u8], allow_newline: bool) -> Option<usize> {
let close_tag_bytes = scan_ch(data, b'/');
let l = scan_while(&data[close_tag_bytes..], is_ascii_alpha);
if l == 0 {
return None;
}
let mut i = close_tag_bytes + l;
i += scan_while(&data[i..], is_ascii_letterdigitdash);
if close_tag_bytes == 0 {
loop {
let whitespace_scanner =
|c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
let whitespace = scan_while(&data[i..], whitespace_scanner);
i += whitespace;
if let Some(b'/') | Some(b'>') = data.get(i) {
break;
}
if whitespace == 0 {
return None;
}
i += scan_attribute(&data[i..], allow_newline)?;
}
}
i += scan_whitespace_no_nl(&data[i..]);
if close_tag_bytes == 0 {
i += scan_ch(&data[i..], b'/');
}
let c = scan_ch(&data[i..], b'>');
if c == 0 {
None
} else {
Some(i + c)
}
}
/// Returns (next_byte_offset, uri, type)
pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
scan_uri(text, start_ix)
.map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
.or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
}
/// Returns (next_byte_offset, uri)
fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
let bytes = &text.as_bytes()[start_ix..];
// scheme's first byte must be an ascii letter
if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
return None;
}
let mut i = 1;
while i < bytes.len() {
let c = bytes[i];
i += 1;
match c {
c if is_ascii_alphanumeric(c) => (),
b'.' | b'-' | b'+' => (),
b':' => break,
_ => return None,
}
}
// The scheme must be between 2 and 32 characters long and must be
// followed by a colon.
if i < 3 || i > 33 {
return None;
}
let mut ended = false;
while i < bytes.len() {
match bytes[i] {
b'\0'..=b' ' => {
ended = true;
}
b'>' | b'<' => break,
_ if ended => return None,
_ => (),
}
i += 1;
}
Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
}
/// Returns (next_byte_offset, email)
fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
// using a regex library would be convenient, but doing it by hand is not too bad
let bytes = &text.as_bytes()[start_ix..];
let mut i = 0;
while i < bytes.len() {
let c = bytes[i];
i += 1;
match c {
c if is_ascii_alphanumeric(c) => (),
b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
| b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
b'@' => break,
_ => return None,
}
}
loop {
let label_start_ix = i;
let mut fresh_label = true;
while i < bytes.len() {
match bytes[i] {
c if is_ascii_alphanumeric(c) => (),
b'-' if fresh_label => {
return None;
}
b'-' => (),
_ => break,
}
fresh_label = false;
i += 1;
}
if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
return None;
}
if scan_ch(&bytes[i..], b'.') == 0 {
break;
}
i += 1;
}
if scan_ch(&bytes[i..], b'>') == 0 {
return None;
}
Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
}
/// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
/// Returns byte offset on match.
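/// The `scan_guard` records how far a previous failed scan got, so repeated
/// attempts over the same text can bail out early instead of rescanning.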
fn scan_inline_html_comment(
bytes: &[u8],
mut ix: usize,
scan_guard: &mut HtmlScanGuard,
) -> Option<usize> {
let c = *bytes.get(ix)?;
ix += 1;
match c {
b'-' => {
let dashes = scan_ch_repeat(&bytes[ix..], b'-');
if dashes < 1 {
return None;
}
// Saw "<!--", scan comment.
ix += dashes;
if scan_ch(&bytes[ix..], b'>') == 1 {
return None;
}
while let Some(x) = memchr(b'-', &bytes[ix..]) {
ix += x + 1;
if scan_ch(&bytes[ix..], b'-') == 1 {
ix += 1;
return if scan_ch(&bytes[ix..], b'>') == 1 {
Some(ix + 1)
} else {
None
};
}
}
None
}
b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
ix += b"CDATA[".len();
ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
ix += close_brackets;
if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
scan_guard.cdata = ix;
None
} else {
Some(ix + 1)
}
}
b'A'..=b'Z' if ix > scan_guard.declaration => {
// Scan declaration.
ix += scan_while(&bytes[ix..], |c| c >= b'A' && c <= b'Z');
let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace);
if whitespace == 0 {
return None;
}
ix += whitespace;
ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
if scan_ch(&bytes[ix..], b'>') == 0 {
scan_guard.declaration = ix;
None
} else {
Some(ix + 1)
}
}
_ => None,
}
}
/// Scan processing directive, with initial "<?" already consumed.
/// Returns the next byte offset on success.
fn scan_inline_html_processing(
bytes: &[u8],
mut ix: usize,
scan_guard: &mut HtmlScanGuard,
) -> Option<usize> {
if ix <= scan_guard.processing {
return None;
}
while let Some(offset) = memchr(b'?', &bytes[ix..]) {
ix += offset + 1;
if scan_ch(&bytes[ix..], b'>') == 1 {
return Some(ix + 1);
}
}
scan_guard.processing = ix;
None
}
/// Returns the next byte offset on success.
pub(crate) fn scan_inline_html(
bytes: &[u8],
ix: usize,
scan_guard: &mut HtmlScanGuard,
) -> Option<usize> {
let c = *bytes.get(ix)?;
if c == b'!' {
scan_inline_html_comment(bytes, ix + 1, scan_guard)
} else if c == b'?' {
scan_inline_html_processing(bytes, ix + 1, scan_guard)
} else {
let i = scan_html_block_inner(&bytes[ix..], true)?;
Some(i + ix)
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn overflow_list() {
assert!(
scan_listitem(b"4444444444444444444444444444444444444444444444444444444444!").is_none()
);
}
#[test]
fn overflow_by_addition() {
assert!(scan_listitem(b"1844674407370955161615!").is_none());
}
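// A few illustrative sanity checks for the scanners above. Expected values
// follow from the CommonMark rules these scanners implement; this is a light
// sketch, not an exhaustive suite.
#[test]
fn hrule_includes_trailing_newline() {
assert_eq!(scan_hrule(b"---\n"), Ok(4));
assert_eq!(scan_hrule(b"* * *\n"), Ok(6));
assert!(scan_hrule(b"--\n").is_err());
}
#[test]
fn atx_heading_level_and_prefix() {
assert_eq!(scan_atx_heading(b"### hello"), Some((3, 3)));
assert_eq!(scan_atx_heading(b"#hello"), None);
assert_eq!(scan_atx_heading(b"####### too deep"), None);
}
#[test]
fn eol_handles_crlf() {
assert_eq!(scan_eol(b"\r\nrest"), Some(2));
assert_eq!(scan_eol(b"\nrest"), Some(1));
assert_eq!(scan_eol(b"rest"), None);
}
#[test]
fn numeric_entity_includes_semicolon() {
let (len, decoded) = scan_entity(b"&#65;");
assert_eq!(len, 5);
assert_eq!(decoded.as_ref().map(|s| &**s), Some("A"));
}
#[test]
fn html_tag_lookup_is_case_insensitive() {
assert!(is_html_tag(b"DIV"));
assert!(!is_html_tag(b"span"));
}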
}