// Copyright 2015 Google Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//! Scanners for fragments of CommonMark syntax
use std::char;
use std::convert::TryInto;
use crate::entities;
use crate::parse::{Alignment, HtmlScanGuard, LinkType};
pub use crate::puncttable::{is_ascii_punctuation, is_punctuation};
use crate::strings::CowStr;
use memchr::memchr;
// Allowing arbitrary depth nested parentheses inside link destinations
// can create denial of service vulnerabilities if we're not careful.
// The simplest countermeasure is to limit their depth, which is
// explicitly allowed by the spec as long as the limit is at least 3:
// https://spec.commonmark.org/0.29/#link-destination
const LINK_MAX_NESTED_PARENS: usize = 5;
// sorted for binary search
const HTML_TAGS: [&str; 62] = [
"address",
"article",
"aside",
"base",
"basefont",
"blockquote",
"body",
"caption",
"center",
"col",
"colgroup",
"dd",
"details",
"dialog",
"dir",
"div",
"dl",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"hr",
"html",
"iframe",
"legend",
"li",
"link",
"main",
"menu",
"menuitem",
"nav",
"noframes",
"ol",
"optgroup",
"option",
"p",
"param",
"section",
"source",
"summary",
"table",
"tbody",
"td",
"tfoot",
"th",
"thead",
"title",
"tr",
"track",
"ul",
];
/// Analysis of the beginning of a line, including indentation and container
/// markers.
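/// Tabs are expanded to 4-column tab stops; a partially consumed tab is
/// tracked in `spaces_remaining`.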
#[derive(Clone)]
pub struct LineStart<'a> {
bytes: &'a [u8],
tab_start: usize,
ix: usize,
spaces_remaining: usize,
// No thematic break can occur before this offset;
// tracking it avoids rescanning the same text repeatedly.
min_hrule_offset: usize,
}
impl<'a> LineStart<'a> {
pub(crate) fn new(bytes: &[u8]) -> LineStart {
LineStart {
bytes,
tab_start: 0,
ix: 0,
spaces_remaining: 0,
min_hrule_offset: 0,
}
}
/// Try to scan a number of spaces.
///
/// Returns true if all spaces were consumed.
///
/// Note: consumes some spaces even if not successful.
pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
self.scan_space_inner(n_space) == 0
}
/// Scan a number of spaces up to a maximum.
///
/// Returns number of spaces scanned.
pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
n_space - self.scan_space_inner(n_space)
}
/// Returns unused remainder of spaces.
fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
let n_from_remaining = self.spaces_remaining.min(n_space);
self.spaces_remaining -= n_from_remaining;
n_space -= n_from_remaining;
while n_space > 0 && self.ix < self.bytes.len() {
match self.bytes[self.ix] {
b' ' => {
self.ix += 1;
n_space -= 1;
}
b'\t' => {
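// A tab advances to the next 4-column tab stop, measured from
// `tab_start`; columns beyond the requested amount are banked in
// `spaces_remaining` for later scans.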
let spaces = 4 - (self.ix - self.tab_start) % 4;
self.ix += 1;
self.tab_start = self.ix;
let n = spaces.min(n_space);
n_space -= n;
self.spaces_remaining = spaces - n;
}
_ => break,
}
}
n_space
}
/// Scan all available ASCII whitespace (not including eol).
pub(crate) fn scan_all_space(&mut self) {
self.spaces_remaining = 0;
self.ix += self.bytes[self.ix..]
.iter()
.take_while(|&&b| b == b' ' || b == b'\t')
.count();
}
/// Determine whether we're at end of line (includes end of file).
pub(crate) fn is_at_eol(&self) -> bool {
if self.ix >= self.bytes.len() {
return true;
}
let c = self.bytes[self.ix];
c == b'\r' || c == b'\n'
}
fn scan_ch(&mut self, c: u8) -> bool {
if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
self.ix += 1;
true
} else {
false
}
}
pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
let save = self.clone();
let _ = self.scan_space(3);
if self.scan_ch(b'>') {
let _ = self.scan_space(1);
true
} else {
*self = save;
false
}
}
/// Scan a list marker.
///
/// Return value is the character, the start index, and the indent in spaces.
/// For ordered list markers, the character will be one of b'.' or b')'. For
/// bullet list markers, it will be one of b'-', b'+', or b'*'.
pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, usize, usize)> {
let save = self.clone();
let indent = self.scan_space_upto(3);
if self.ix < self.bytes.len() {
let c = self.bytes[self.ix];
if c == b'-' || c == b'+' || c == b'*' {
if self.ix >= self.min_hrule_offset {
// there could be an hrule here
if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
self.min_hrule_offset = min_offset;
} else {
*self = save;
return None;
}
}
self.ix += 1;
if self.scan_space(1) || self.is_at_eol() {
return self.finish_list_marker(c, 0, indent + 2);
}
} else if c >= b'0' && c <= b'9' {
let start_ix = self.ix;
let mut ix = self.ix + 1;
let mut val = u64::from(c - b'0');
while ix < self.bytes.len() && ix - start_ix < 10 {
let c = self.bytes[ix];
ix += 1;
if c >= b'0' && c <= b'9' {
val = val * 10 + u64::from(c - b'0');
} else if c == b')' || c == b'.' {
self.ix = ix;
let val_usize = val as usize;
// This will cause some failures on 32 bit arch.
// TODO (breaking API change): should be u64, not usize.
if val_usize as u64 != val {
return None;
}
if self.scan_space(1) || self.is_at_eol() {
return self.finish_list_marker(
c,
val_usize,
indent + self.ix - start_ix,
);
} else {
break;
}
} else {
break;
}
}
}
}
*self = save;
None
}
fn finish_list_marker(
&mut self,
c: u8,
start: usize,
mut indent: usize,
) -> Option<(u8, usize, usize)> {
let save = self.clone();
// skip the rest of the line if it's blank
if scan_blank_line(&self.bytes[self.ix..]).is_some() {
return Some((c, start, indent));
}
let post_indent = self.scan_space_upto(4);
if post_indent < 4 {
indent += post_indent;
} else {
*self = save;
}
Some((c, start, indent))
}
/// Returns Some(is_checked) when a task list marker was found. Resets itself
/// to original state otherwise.
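/// For example, `[ ] ` yields `Some(false)` and `[x] ` yields `Some(true)`;
/// the closing bracket must be followed by non-newline ASCII whitespace.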
pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
let save = self.clone();
self.scan_space_upto(3);
if !self.scan_ch(b'[') {
*self = save;
return None;
}
let is_checked = match self.bytes.get(self.ix) {
Some(&c) if is_ascii_whitespace_no_nl(c) => {
self.ix += 1;
false
}
Some(b'x') | Some(b'X') => {
self.ix += 1;
true
}
_ => {
*self = save;
return None;
}
};
if !self.scan_ch(b']') {
*self = save;
return None;
}
if !self
.bytes
.get(self.ix)
.map(|&b| is_ascii_whitespace_no_nl(b))
.unwrap_or(false)
{
*self = save;
return None;
}
Some(is_checked)
}
pub(crate) fn bytes_scanned(&self) -> usize {
self.ix
}
pub(crate) fn remaining_space(&self) -> usize {
self.spaces_remaining
}
}
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
(c >= 0x09 && c <= 0x0d) || c == b' '
}
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
c == b'\t' || c == 0x0b || c == 0x0c || c == b' '
}
fn is_ascii_alpha(c: u8) -> bool {
match c {
b'a'..=b'z' | b'A'..=b'Z' => true,
_ => false,
}
}
fn is_ascii_alphanumeric(c: u8) -> bool {
match c {
b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
_ => false,
}
}
fn is_ascii_letterdigitdash(c: u8) -> bool {
c == b'-' || is_ascii_alphanumeric(c)
}
fn is_digit(c: u8) -> bool {
b'0' <= c && c <= b'9'
}
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
match c {
b'\'' | b'"' | b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => false,
_ => true,
}
}
// scan a single character
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
if !data.is_empty() && data[0] == c {
1
} else {
0
}
}
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
F: FnMut(u8) -> bool,
{
data.iter().take_while(|&&c| f(c)).count()
}
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
F: FnMut(u8) -> bool,
{
data.iter().rev().take_while(|&&c| f(c)).count()
}
pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
scan_while(data, |x| x == c)
}
// Note: this scans ASCII whitespace only, for Unicode whitespace use
// a different function.
pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
scan_while(data, is_ascii_whitespace_no_nl)
}
fn scan_attr_value_chars(data: &[u8]) -> usize {
scan_while(data, is_valid_unquoted_attr_value_char)
}
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
if bytes.is_empty() {
return Some(0);
}
match bytes[0] {
b'\n' => Some(1),
b'\r' => Some(if bytes.get(1) == Some(&b'\n') { 2 } else { 1 }),
_ => None,
}
}
pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
let i = scan_whitespace_no_nl(bytes);
scan_eol(&bytes[i..]).map(|n| i + n)
}
pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
}
// return: end byte for closing code fence, or None
// if the line is not a closing code fence
pub(crate) fn scan_closing_code_fence(
bytes: &[u8],
fence_char: u8,
n_fence_char: usize,
) -> Option<usize> {
if bytes.is_empty() {
return Some(0);
}
let mut i = 0;
let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
if num_fence_chars_found < n_fence_char {
return None;
}
i += num_fence_chars_found;
let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
i += num_trailing_spaces;
scan_eol(&bytes[i..]).map(|_| i)
}
// returned pair is (number of bytes, number of spaces)
fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
let mut spaces = 0;
let mut offset = 0;
for (i, &b) in text.iter().enumerate() {
match b {
b' ' => {
spaces += 1;
if spaces == max {
break;
}
}
b'\t' => {
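// `spaces & 3` is `spaces % 4`, so a tab advances the column to the
// next multiple of four (unless that would exceed `max`).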
let new_spaces = spaces + 4 - (spaces & 3);
if new_spaces > max {
break;
}
spaces = new_spaces;
}
_ => break,
}
offset = i;
}
(offset, spaces)
}
/// Scan hrule opening sequence.
///
/// Returns Ok(x) when it finds an hrule, where x is the
/// size of the line containing the hrule, including the trailing newline.
///
/// Returns Err(x) when it does not find an hrule, where x is
/// the offset in `bytes` before which no hrule can appear.
pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
if bytes.len() < 3 {
return Err(0);
}
let c = bytes[0];
if !(c == b'*' || c == b'-' || c == b'_') {
return Err(0);
}
let mut n = 0;
let mut i = 0;
while i < bytes.len() {
match bytes[i] {
b'\n' | b'\r' => {
i += scan_eol(&bytes[i..]).unwrap_or(0);
break;
}
c2 if c2 == c => {
n += 1;
}
b' ' | b'\t' => (),
_ => return Err(i),
}
i += 1;
}
if n >= 3 {
Ok(i)
} else {
Err(i)
}
}
/// Scan an ATX heading opening sequence.
///
/// Returns number of bytes in prefix and level.
pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<(usize, i32)> {
let level = scan_ch_repeat(data, b'#');
if level >= 1 && level <= 6 && data.get(level).cloned().map_or(true, is_ascii_whitespace) {
Some((level, level as i32))
} else {
None
}
}
/// Scan a setext heading underline.
///
/// Returns number of bytes in line (including trailing newline) and level.
pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, i32)> {
let c = *data.get(0)?;
if !(c == b'-' || c == b'=') {
return None;
}
let mut i = 1 + scan_ch_repeat(&data[1..], c);
i += scan_blank_line(&data[i..])?;
let level = if c == b'=' { 1 } else { 2 };
Some((i, level))
}
// returns number of bytes in line (including trailing
// newline) and column alignments
pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
let (mut i, spaces) = calc_indent(data, 4);
if spaces > 3 || i == data.len() {
return (0, vec![]);
}
let mut cols = vec![];
let mut active_col = Alignment::None;
let mut start_col = true;
if data[i] == b'|' {
i += 1;
}
for c in &data[i..] {
if let Some(n) = scan_eol(&data[i..]) {
i += n;
break;
}
match *c {
b' ' => (),
b':' => {
active_col = match (start_col, active_col) {
(true, Alignment::None) => Alignment::Left,
(false, Alignment::Left) => Alignment::Center,
(false, Alignment::None) => Alignment::Right,
_ => active_col,
};
start_col = false;
}
b'-' => {
start_col = false;
}
b'|' => {
start_col = true;
cols.push(active_col);
active_col = Alignment::None;
}
_ => {
cols = vec![];
start_col = true;
break;
}
}
i += 1;
}
if !start_col {
cols.push(active_col);
}
(i, cols)
}
/// Scan code fence.
///
/// Returns number of bytes scanned and the char that is repeated to make the code fence.
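/// For backtick fences, the remainder of the line (the info string) must not
/// contain another backtick.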
pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
let c = *data.get(0)?;
if !(c == b'`' || c == b'~') {
return None;
}
let i = 1 + scan_ch_repeat(&data[1..], c);
if i >= 3 {
if c == b'`' {
let suffix = &data[i..];
let next_line = i + scan_nextline(suffix);
// FIXME: make sure this is correct
if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
return None;
}
}
Some((i, c))
} else {
None
}
}
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
if data.starts_with(b"> ") {
Some(2)
} else {
None
}
}
/// Assumes the list item marker has already been scanned.
pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
let mut ix = 0;
for _ in 0..2 {
if let Some(bytes) = scan_blank_line(&data[ix..]) {
ix += bytes;
} else {
return false;
}
}
true
}
// return number of bytes scanned, delimiter, start index, and indent
pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
let mut c = *bytes.get(0)?;
let (w, start) = match c {
b'-' | b'+' | b'*' => (1, 0),
b'0'..=b'9' => {
let (length, start) = parse_decimal(bytes);
c = *bytes.get(length)?;
if !(c == b'.' || c == b')') {
return None;
}
(length + 1, start)
}
_ => {
return None;
}
};
// TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
if postindent == 0 {
scan_eol(&bytes[w..])?;
postindent += 1;
} else if postindent > 4 {
postn = 1;
postindent = 1;
}
if scan_blank_line(&bytes[w..]).is_some() {
postn = 0;
postindent = 1;
}
Some((w + postn, c, start, w + postindent))
}
// returns (number of bytes, parsed decimal)
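// Overflow makes `try_fold` bail out early via `Err`; both variants carry the
// same `(count, value)` payload, so `Ok(p) | Err(p) => p` below unifies the
// two outcomes. `parse_hex` uses the same pattern.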
fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
match bytes
.iter()
.take_while(|&&b| is_digit(b))
.try_fold((0, 0usize), |(count, acc), c| {
let digit = usize::from(c - b'0');
match acc
.checked_mul(10)
.and_then(|ten_acc| ten_acc.checked_add(digit))
{
Some(number) => Ok((count + 1, number)),
// stop early on overflow
None => Err((count, acc)),
}
}) {
Ok(p) | Err(p) => p,
}
}
// returns (number of bytes, parsed hex)
fn parse_hex(bytes: &[u8]) -> (usize, usize) {
match bytes.iter().try_fold((0, 0usize), |(count, acc), c| {
let mut c = *c;
let digit = if c >= b'0' && c <= b'9' {
usize::from(c - b'0')
} else {
// make lower case
c |= 0x20;
if c >= b'a' && c <= b'f' {
usize::from(c - b'a' + 10)
} else {
return Err((count, acc));
}
};
match acc
.checked_mul(16)
.and_then(|sixteen_acc| sixteen_acc.checked_add(digit))
{
Some(number) => Ok((count + 1, number)),
// stop early on overflow
None => Err((count, acc)),
}
}) {
Ok(p) | Err(p) => p,
}
}
fn char_from_codepoint(input: usize) -> Option<char> {
let mut codepoint = input.try_into().ok()?;
if codepoint == 0 {
codepoint = 0xFFFD;
}
char::from_u32(codepoint)
}
// Doesn't bother to check that bytes[0] == b'&'.
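// Handles numeric (`&#65;`, `&#x41;`) and named (`&amp;`) forms. Returns the
// number of bytes consumed (including the trailing `;`) and the decoded text,
// or `(0, None)` if no valid entity is found.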
pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
let mut end = 1;
if scan_ch(&bytes[end..], b'#') == 1 {
end += 1;
let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
end += 1;
parse_hex(&bytes[end..])
} else {
parse_decimal(&bytes[end..])
};
end += bytecount;
return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
(0, None)
} else if let Some(c) = char_from_codepoint(codepoint) {
(end + 1, Some(c.into()))
} else {
(0, None)
};
}
end += scan_while(&bytes[end..], is_ascii_alphanumeric);
if scan_ch(&bytes[end..], b';') == 1 {
if let Some(value) = entities::get_entity(&bytes[1..end]) {
return (end + 1, Some(value.into()));
}
}
(0, None)
}
// returns (bytes scanned, title cow)
fn scan_link_title(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
let bytes = text.as_bytes();
let open = match bytes.get(start_ix) {
Some(b @ b'\'') | Some(b @ b'\"') | Some(b @ b'(') => *b,
_ => return None,
};
let close = if open == b'(' { b')' } else { open };
let mut title = String::new();
let mut mark = start_ix + 1;
let mut i = start_ix + 1;
while i < bytes.len() {
let c = bytes[i];
if c == close {
let cow = if mark == 1 {
(i - start_ix + 1, text[mark..i].into())
} else {
title.push_str(&text[mark..i]);
(i - start_ix + 1, title.into())
};
return Some(cow);
}
if c == open {
return None;
}
// TODO: do b'\r' as well?
if c == b'&' {
if let (n, Some(value)) = scan_entity(&bytes[i..]) {
title.push_str(&text[mark..i]);
title.push_str(&value);
i += n;
mark = i;
continue;
}
}
if c == b'\\' && i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) {
title.push_str(&text[mark..i]);
i += 1;
mark = i;
}
i += 1;
}
None
}
// FIXME: we can most likely re-use other scanners
// returns (bytelength, title_str)
pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
let mut chars = text.chars().peekable();
let closing_delim = match chars.next()? {
'\'' => '\'',
'"' => '"',
'(' => ')',
_ => return None,
};
let mut bytecount = 1;
while let Some(c) = chars.next() {
match c {
'\n' => {
bytecount += 1;
let mut next = *chars.peek()?;
while is_ascii_whitespace_no_nl(next as u8) {
bytecount += chars.next()?.len_utf8();
next = *chars.peek()?;
}
if *chars.peek()? == '\n' {
// blank line - not allowed
return None;
}
}
'\\' => {
let next_char = chars.next()?;
bytecount += 1 + next_char.len_utf8();
}
c if c == closing_delim => {
return Some((bytecount + 1, &text[1..bytecount]));
}
c => {
bytecount += c.len_utf8();
}
}
}
None
}
// note: dest returned is raw, still needs to be unescaped
// TODO: check that nested parens are really not allowed for refdefs
// TODO(performance): this func should probably do its own unescaping
pub(crate) fn scan_link_dest(
data: &str,
start_ix: usize,
max_nest: usize,
) -> Option<(usize, &str)> {
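// Two forms are handled: `<...>` destinations, which end at `>` and reject
// raw `<`, newlines, and carriage returns, and bare destinations, which end
// at an ASCII space/control byte or an unmatched `)` and cap the depth of
// nested parentheses (see `LINK_MAX_NESTED_PARENS`).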
let bytes = &data.as_bytes()[start_ix..];
let mut i = scan_ch(bytes, b'<');
if i != 0 {
// pointy links
while i < bytes.len() {
match bytes[i] {
b'\n' | b'\r' | b'<' => return None,
b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
i += 1;
}
_ => {}
}
i += 1;
}
None
} else {
// non-pointy links
let mut nest = 0;
while i < bytes.len() {
match bytes[i] {
0x0..=0x20 => {
break;
}
b'(' => {
if nest > max_nest {
return None;
}
nest += 1;
}
b')' => {
if nest == 0 {
break;
}
nest -= 1;
}
b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
i += 1;
}
_ => {}
}
i += 1;
}
Some((i, &data[start_ix..(start_ix + i)]))
}
}
/// Returns next byte index, url and title.
pub(crate) fn scan_inline_link(
underlying: &str,
start_ix: usize,
) -> Option<(usize, CowStr<'_>, CowStr<'_>)> {
let mut ix = start_ix;
if scan_ch(&underlying.as_bytes()[ix..], b'(') == 0 {
return None;
}
ix += 1;
ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
let (dest_length, dest) = scan_link_dest(underlying, ix, LINK_MAX_NESTED_PARENS)?;
let dest = unescape(dest);
ix += dest_length;
ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
let title = if let Some((bytes_scanned, t)) = scan_link_title(underlying, ix) {
ix += bytes_scanned;
ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
t
} else {
"".into()
};
if scan_ch(&underlying.as_bytes()[ix..], b')') == 0 {
return None;
}
ix += 1;
Some((ix, dest, title))
}
/// Returns bytes scanned
fn scan_attribute_name(data: &[u8]) -> Option<usize> {
let (&c, tail) = data.split_first()?;
if is_ascii_alpha(c) || c == b'_' || c == b':' {
Some(
1 + scan_while(tail, |c| {
is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
}),
)
} else {
None
}
}
/// Returns bytes scanned (TODO: should it return the new offset?)
fn scan_attribute(data: &[u8], allow_newline: bool) -> Option<usize> {
let whitespace_scanner =
|c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
let mut ix = scan_attribute_name(data)?;
let n_whitespace = scan_while(&data[ix..], whitespace_scanner);
ix += n_whitespace;
if scan_ch(&data[ix..], b'=') == 1 {
ix += 1;
ix += scan_while(&data[ix..], whitespace_scanner);
ix += scan_attribute_value(&data[ix..], allow_newline)?;
} else if n_whitespace > 0 {
// Leave whitespace for next attribute.
ix -= 1;
}
Some(ix)
}
fn scan_attribute_value(data: &[u8], allow_newline: bool) -> Option<usize> {
let mut i = 0;
match *data.get(0)? {
b @ b'"' | b @ b'\'' => {
i += 1;
i += scan_while(&data[i..], |c| {
c != b && (allow_newline || c != b'\n' && c != b'\r')
});
if scan_ch(&data[i..], b) == 0 {
return None;
}
i += 1;
}
b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
return None;
}
_ => {
// unquoted attribute value
i += scan_attr_value_chars(&data[i..]);
}
}
Some(i)
}
// Remove backslash escapes and resolve entities
pub(crate) fn unescape(input: &str) -> CowStr<'_> {
let mut result = String::new();
let mut mark = 0;
let mut i = 0;
let bytes = input.as_bytes();
while i < bytes.len() {
match bytes[i] {
b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
result.push_str(&input[mark..i]);
mark = i + 1;
i += 2;
}
b'&' => match scan_entity(&bytes[i..]) {
(n, Some(value)) => {
result.push_str(&input[mark..i]);
result.push_str(&value);
i += n;
mark = i;
}
_ => i += 1,
},
b'\r' => {
result.push_str(&input[mark..i]);
i += 1;
mark = i;
}
_ => i += 1,
}
}
if mark == 0 {
input.into()
} else {
result.push_str(&input[mark..]);
result.into()
}
}
/// Assumes `data` is preceded by `<`.
pub(crate) fn scan_html_block_tag(data: &[u8]) -> (usize, &[u8]) {
let i = scan_ch(data, b'/');
let n = scan_while(&data[i..], is_ascii_alphanumeric);
// TODO: scan attributes and >
(i + n, &data[i..i + n])
}
pub(crate) fn is_html_tag(tag: &[u8]) -> bool {
HTML_TAGS
.binary_search_by(|probe| {
let probe_bytes_iter = probe.as_bytes().iter();
let tag_bytes_iter = tag.iter();
probe_bytes_iter
.zip(tag_bytes_iter)
.find_map(|(&a, &b)| {
// We can compare case insensitively because the probes are
// all lower case alpha strings.
match a.cmp(&(b | 0x20)) {
std::cmp::Ordering::Equal => None,
inequality => Some(inequality),
}
})
.unwrap_or_else(|| probe.len().cmp(&tag.len()))
})
.is_ok()
}
/// Assumes that `data` is preceded by `<`.
pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
let i = scan_html_block_inner(data, false)?;
scan_blank_line(&data[i..])?;
Some(i)
}
fn scan_html_block_inner(data: &[u8], allow_newline: bool) -> Option<usize> {
let close_tag_bytes = scan_ch(data, b'/');
let l = scan_while(&data[close_tag_bytes..], is_ascii_alpha);
if l == 0 {
return None;
}
let mut i = close_tag_bytes + l;
i += scan_while(&data[i..], is_ascii_letterdigitdash);
if close_tag_bytes == 0 {
loop {
let whitespace_scanner =
|c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
let whitespace = scan_while(&data[i..], whitespace_scanner);
i += whitespace;
if let Some(b'/') | Some(b'>') = data.get(i) {
break;
}
if whitespace == 0 {
return None;
}
i += scan_attribute(&data[i..], allow_newline)?;
}
}
i += scan_whitespace_no_nl(&data[i..]);
if close_tag_bytes == 0 {
i += scan_ch(&data[i..], b'/');
}
let c = scan_ch(&data[i..], b'>');
if c == 0 {
None
} else {
Some(i + c)
}
}
/// Returns (next_byte_offset, uri, type)
pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
scan_uri(text, start_ix)
.map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
.or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
}
/// Returns (next_byte_offset, uri)
fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
let bytes = &text.as_bytes()[start_ix..];
// scheme's first byte must be an ascii letter
if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
return None;
}
let mut i = 1;
while i < bytes.len() {
let c = bytes[i];
i += 1;
match c {
c if is_ascii_alphanumeric(c) => (),
b'.' | b'-' | b'+' => (),
b':' => break,
_ => return None,
}
}
// The scheme must be between 2 and 32 characters long and must be
// followed by a colon.
if i < 3 || i > 33 {
return None;
}
let mut ended = false;
while i < bytes.len() {
match bytes[i] {
b'\0'..=b' ' => {
ended = true;
}
b'>' | b'<' => break,
_ if ended => return None,
_ => (),
}
i += 1;
}
Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
}
/// Returns (next_byte_offset, email)
fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
// using a regex library would be convenient, but doing it by hand is not too bad
let bytes = &text.as_bytes()[start_ix..];
let mut i = 0;
while i < bytes.len() {
let c = bytes[i];
i += 1;
match c {
c if is_ascii_alphanumeric(c) => (),
b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
| b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
b'@' => break,
_ => return None,
}
}
loop {
let label_start_ix = i;
let mut fresh_label = true;
while i < bytes.len() {
match bytes[i] {
c if is_ascii_alphanumeric(c) => (),
b'-' if fresh_label => {
return None;
}
b'-' => (),
_ => break,
}
fresh_label = false;
i += 1;
}
if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
return None;
}
if scan_ch(&bytes[i..], b'.') == 0 {
break;
}
i += 1;
}
if scan_ch(&bytes[i..], b'>') == 0 {
return None;
}
Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
}
/// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
/// Returns byte offset on match.
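/// The `scan_guard` records how far a previous failed scan got, so repeated
/// attempts over the same text can bail out early instead of rescanning.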
fn scan_inline_html_comment(
bytes: &[u8],
mut ix: usize,
scan_guard: &mut HtmlScanGuard,
) -> Option<usize> {
let c = *bytes.get(ix)?;
ix += 1;
match c {
b'-' => {
let dashes = scan_ch_repeat(&bytes[ix..], b'-');
if dashes < 1 {
return None;
}
// Saw "<!--", scan comment.
ix += dashes;
if scan_ch(&bytes[ix..], b'>') == 1 {
return None;
}
while let Some(x) = memchr(b'-', &bytes[ix..]) {
ix += x + 1;
if scan_ch(&bytes[ix..], b'-') == 1 {
ix += 1;
return if scan_ch(&bytes[ix..], b'>') == 1 {
Some(ix + 1)
} else {
None
};
}
}
None
}
b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
ix += b"CDATA[".len();
ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
ix += close_brackets;
if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
scan_guard.cdata = ix;
None
} else {
Some(ix + 1)
}
}
b'A'..=b'Z' if ix > scan_guard.declaration => {
// Scan declaration.
ix += scan_while(&bytes[ix..], |c| c >= b'A' && c <= b'Z');
let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace);
if whitespace == 0 {
return None;
}
ix += whitespace;
ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
if scan_ch(&bytes[ix..], b'>') == 0 {
scan_guard.declaration = ix;
None
} else {
Some(ix + 1)
}
}
_ => None,
}
}
/// Scan processing directive, with initial "<?" already consumed.
/// Returns the next byte offset on success.
fn scan_inline_html_processing(
bytes: &[u8],
mut ix: usize,
scan_guard: &mut HtmlScanGuard,
) -> Option<usize> {
if ix <= scan_guard.processing {
return None;
}
while let Some(offset) = memchr(b'?', &bytes[ix..]) {
ix += offset + 1;
if scan_ch(&bytes[ix..], b'>') == 1 {
return Some(ix + 1);
}
}
scan_guard.processing = ix;
None
}
/// Returns the next byte offset on success.
pub(crate) fn scan_inline_html(
bytes: &[u8],
ix: usize,
scan_guard: &mut HtmlScanGuard,
) -> Option<usize> {
let c = *bytes.get(ix)?;
if c == b'!' {
scan_inline_html_comment(bytes, ix + 1, scan_guard)
} else if c == b'?' {
scan_inline_html_processing(bytes, ix + 1, scan_guard)
} else {
let i = scan_html_block_inner(&bytes[ix..], true)?;
Some(i + ix)
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn overflow_list() {
assert!(
scan_listitem(b"4444444444444444444444444444444444444444444444444444444444!").is_none()
);
}
#[test]
fn overflow_by_addition() {
assert!(scan_listitem(b"1844674407370955161615!").is_none());
}
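// A few illustrative sanity checks for the scanners above. Expected values
// follow from the CommonMark rules these scanners implement; this is a light
// sketch, not an exhaustive suite.
#[test]
fn hrule_includes_trailing_newline() {
assert_eq!(scan_hrule(b"---\n"), Ok(4));
assert_eq!(scan_hrule(b"* * *\n"), Ok(6));
assert!(scan_hrule(b"--\n").is_err());
}
#[test]
fn atx_heading_level_and_prefix() {
assert_eq!(scan_atx_heading(b"### hello"), Some((3, 3)));
assert_eq!(scan_atx_heading(b"#hello"), None);
assert_eq!(scan_atx_heading(b"####### too deep"), None);
}
#[test]
fn eol_handles_crlf() {
assert_eq!(scan_eol(b"\r\nrest"), Some(2));
assert_eq!(scan_eol(b"\nrest"), Some(1));
assert_eq!(scan_eol(b"rest"), None);
}
#[test]
fn numeric_entity_includes_semicolon() {
let (len, decoded) = scan_entity(b"&#65;");
assert_eq!(len, 5);
assert_eq!(decoded.as_ref().map(|s| &**s), Some("A"));
}
#[test]
fn html_tag_lookup_is_case_insensitive() {
assert!(is_html_tag(b"DIV"));
assert!(!is_html_tag(b"span"));
}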
}