use crate::{
hybrid::{
dfa::{Cache, DFA},
id::{LazyStateID, OverlappingState, StateMatch},
},
nfa::thompson,
util::{
id::PatternID,
matchtypes::{HalfMatch, MatchError},
prefilter, MATCH_OFFSET,
},
};
#[inline(never)]
pub(crate) fn find_earliest_fwd(
pre: Option<&mut prefilter::Scanner>,
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
bytes: &[u8],
start: usize,
end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
// Searching with a pattern ID is always anchored, so we should never use
// a prefilter.
if pre.is_some() && pattern_id.is_none() {
find_fwd(pre, true, dfa, cache, pattern_id, bytes, start, end)
} else {
find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end)
}
}
#[inline(never)]
pub(crate) fn find_leftmost_fwd(
pre: Option<&mut prefilter::Scanner>,
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
bytes: &[u8],
start: usize,
end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
// Searching with a pattern ID is always anchored, so we should never use
// a prefilter.
if pre.is_some() && pattern_id.is_none() {
find_fwd(pre, false, dfa, cache, pattern_id, bytes, start, end)
} else {
find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end)
}
}
#[inline(always)]
fn find_fwd(
mut pre: Option<&mut prefilter::Scanner>,
earliest: bool,
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
haystack: &[u8],
start: usize,
end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
assert!(start <= end);
assert!(start <= haystack.len());
assert!(end <= haystack.len());
// Why do this? This lets 'bytes[at]' work without bounds checks below.
// It seems the assert on 'end <= haystack.len()' above is otherwise
// not enough. Why not just make 'bytes' scoped this way anyway? Well,
// 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
// for resolving look-ahead.
let bytes = &haystack[..end];
let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?;
let mut last_match = None;
let mut at = start;
if let Some(ref mut pre) = pre {
// If a prefilter doesn't report false positives, then we don't need to
// touch the DFA at all. However, since all matches include the pattern
// ID, and the prefilter infrastructure doesn't report pattern IDs, we
// limit this optimization to cases where there is exactly one pattern.
// In that case, any match must be the 0th pattern.
if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
return Ok(pre.next_candidate(bytes, at).into_option().map(
|offset| HalfMatch { pattern: PatternID::ZERO, offset },
));
} else if pre.is_effective(at) {
match pre.next_candidate(bytes, at).into_option() {
None => return Ok(None),
Some(i) => {
at = i;
}
}
}
}
while at < end {
if sid.is_tagged() {
sid = dfa
.next_state(cache, sid, bytes[at])
.map_err(|_| gave_up(at))?;
at += 1;
} else {
// SAFETY: There are two safety invariants we need to uphold
// here in the loop below: that 'sid' is a valid state ID for
// this DFA, and that 'at' is a valid index into 'bytes'. For
// the former, we rely on the invariant that next_state* and
            // start_state_forward always return a valid state ID (given a
// valid state ID in the former case), and that we are only at this
// place in the code if 'sid' is untagged. Moreover, every call to
// next_state_untagged_unchecked below is guarded by a check that
// sid is untagged. For the latter safety invariant, we always
// guard unchecked access with a check that 'at' is less than
// 'end', where 'end == bytes.len()'.
//
            // For justification: eliding these bounds checks gives us a ~10%
            // improvement in search time. This benchmark was used:
//
// regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
//
// With bounds checked: ~881.4ms. Without: ~775ms. For input, I
// used OpenSubtitles2018.raw.sample.medium.en.
let mut prev_sid = sid;
while at < end {
prev_sid = sid;
sid = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
at += 1;
if sid.is_tagged() {
break;
}
                // SAFETY: we make four unguarded accesses to 'bytes[at]'
                // below, and each is safe because 'at + 4 < end' guarantees
                // that all four indices are in bounds. Moreover, while we
                // don't check whether 'sid' is untagged directly, we know
                // it is because of the check above.
// And the unrolled loop below quits when the next state is not
// equal to the previous state.
//
// PERF: For justification for eliminating bounds checks,
// see above. For justification for the unrolling, we use
// two tests. The one above with regex '(?m)^.+$', and also
// '(?m)^.{40}$'. The former is kinda the best case for
                // unrolling, and gives a 1.67x boost primarily because the DFA
// spends most of its time munching through the input in the
// same state. But the latter pattern rarely spends time in the
// same state through subsequent transitions, so unrolling is
// pretty much always ineffective in that it craps out on the
// first 'sid != next' check below. However, without unrolling,
// search is only 1.03 times faster than with unrolling on the
// latter pattern, which we deem to be an acceptable loss in
// favor of optimizing the more common case of having a "hot"
// state somewhere in the DFA.
while at + 4 < end {
let next = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
if sid != next {
break;
}
at += 1;
let next = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
if sid != next {
break;
}
at += 1;
let next = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
if sid != next {
break;
}
at += 1;
let next = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
if sid != next {
break;
}
at += 1;
}
}
if sid.is_unknown() {
sid = dfa
.next_state(cache, prev_sid, bytes[at - 1])
.map_err(|_| gave_up(at - 1))?;
}
}
if sid.is_tagged() {
if sid.is_start() {
if let Some(ref mut pre) = pre {
if pre.is_effective(at) {
match pre.next_candidate(bytes, at).into_option() {
None => return Ok(None),
Some(i) => {
at = i;
}
}
}
}
} else if sid.is_match() {
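                // Matches are delayed by MATCH_OFFSET bytes (1 in this
                // version), so the match itself ended at 'at - MATCH_OFFSET'.
                // For example, searching for 'a' in "abc" enters the match
                // state on the transition for 'b' (when 'at' becomes 2), and
                // the end of the match is correctly reported as offset 1.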
last_match = Some(HalfMatch {
pattern: dfa.match_pattern(cache, sid, 0),
offset: at - MATCH_OFFSET,
});
if earliest {
return Ok(last_match);
}
} else if sid.is_dead() {
return Ok(last_match);
} else if sid.is_quit() {
if last_match.is_some() {
return Ok(last_match);
}
let offset = at - 1;
return Err(MatchError::Quit { byte: bytes[offset], offset });
} else {
debug_assert!(sid.is_unknown());
unreachable!("sid being unknown is a bug");
}
}
}
// We are careful to use 'haystack' here, which contains the full context
// that we might want to inspect.
Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match))
}
#[inline(never)]
pub(crate) fn find_earliest_rev(
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
bytes: &[u8],
start: usize,
end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
find_rev(true, dfa, cache, pattern_id, bytes, start, end)
}
#[inline(never)]
pub(crate) fn find_leftmost_rev(
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
bytes: &[u8],
start: usize,
end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
find_rev(false, dfa, cache, pattern_id, bytes, start, end)
}
#[inline(always)]
fn find_rev(
earliest: bool,
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
haystack: &[u8],
start: usize,
end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
assert!(start <= end);
assert!(start <= haystack.len());
assert!(end <= haystack.len());
    // Why do this? This lets 'bytes[at]' work without bounds checks below,
    // since 'at' is always less than 'end - start'. Why not just make
    // 'bytes' scoped this way anyway? Well, 'eoi_rev' (below) might actually
    // want to try to access the byte at 'start - 1' for resolving
    // look-behind.
let bytes = &haystack[start..];
let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?;
let mut last_match = None;
let mut at = end - start;
while at > 0 {
if sid.is_tagged() {
at -= 1;
sid = dfa
.next_state(cache, sid, bytes[at])
                .map_err(|_| gave_up(start + at))?;
} else {
// SAFETY: See comments in 'find_fwd' for both a safety argument
// and a justification from a performance perspective as to 1) why
// we elide bounds checks and 2) why we do a specialized version of
// unrolling below.
let mut prev_sid = sid;
while at > 0 && !sid.is_tagged() {
prev_sid = sid;
at -= 1;
while at > 3 {
let next = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
if sid != next {
break;
}
at -= 1;
let next = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
if sid != next {
break;
}
at -= 1;
let next = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
if sid != next {
break;
}
at -= 1;
let next = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
if sid != next {
break;
}
at -= 1;
}
sid = unsafe {
dfa.next_state_untagged_unchecked(
cache,
sid,
*bytes.get_unchecked(at),
)
};
}
if sid.is_unknown() {
sid = dfa
.next_state(cache, prev_sid, bytes[at])
                    .map_err(|_| gave_up(start + at))?;
}
}
if sid.is_tagged() {
if sid.is_start() {
continue;
} else if sid.is_match() {
last_match = Some(HalfMatch {
pattern: dfa.match_pattern(cache, sid, 0),
offset: start + at + MATCH_OFFSET,
});
if earliest {
return Ok(last_match);
}
} else if sid.is_dead() {
return Ok(last_match);
} else {
debug_assert!(sid.is_quit());
if last_match.is_some() {
return Ok(last_match);
}
                return Err(MatchError::Quit {
                    byte: bytes[at],
                    offset: start + at,
                });
}
}
}
Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match))
}
#[inline(never)]
pub(crate) fn find_overlapping_fwd(
pre: Option<&mut prefilter::Scanner>,
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
bytes: &[u8],
start: usize,
end: usize,
caller_state: &mut OverlappingState,
) -> Result<Option<HalfMatch>, MatchError> {
// Searching with a pattern ID is always anchored, so we should only ever
// use a prefilter when no pattern ID is given.
if pre.is_some() && pattern_id.is_none() {
find_overlapping_fwd_imp(
pre,
dfa,
cache,
pattern_id,
bytes,
start,
end,
caller_state,
)
} else {
find_overlapping_fwd_imp(
None,
dfa,
cache,
pattern_id,
bytes,
start,
end,
caller_state,
)
}
}
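
// A sketch of the caller-side protocol for the overlapping search above,
// assuming in-scope 'dfa', 'cache' and 'haystack' values and a hypothetical
// 'handle' routine (the real iterators live elsewhere in this crate). The
// crucial bits are that the same 'OverlappingState' is threaded through
// every call and that each search resumes from the previous match's end
// offset; the MATCH_OFFSET bump in the implementation below then guarantees
// forward progress, even for empty matches:
//
//     let mut state = OverlappingState::start();
//     let mut start = 0;
//     while let Some(m) = find_overlapping_fwd(
//         None, &dfa, &mut cache, None, haystack, start, haystack.len(),
//         &mut state,
//     )? {
//         handle(m);
//         start = m.offset();
//     }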
#[inline(always)]
fn find_overlapping_fwd_imp(
mut pre: Option<&mut prefilter::Scanner>,
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
bytes: &[u8],
mut start: usize,
end: usize,
caller_state: &mut OverlappingState,
) -> Result<Option<HalfMatch>, MatchError> {
assert!(start <= end);
assert!(start <= bytes.len());
assert!(end <= bytes.len());
let mut sid = match caller_state.id() {
None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?,
Some(sid) => {
if let Some(last) = caller_state.last_match() {
let match_count = dfa.match_count(cache, sid);
if last.match_index < match_count {
let m = HalfMatch {
pattern: dfa.match_pattern(
cache,
sid,
last.match_index,
),
offset: last.offset,
};
last.match_index += 1;
return Ok(Some(m));
}
}
// This is a subtle but critical detail. If the caller provides a
// non-None state ID, then it must be the case that the state ID
// corresponds to one set by this function. The state ID therefore
// corresponds to a match state, a dead state or some other state.
// However, "some other" state _only_ occurs when the input has
// been exhausted because the only way to stop before then is to
// see a match or a dead/quit state.
//
// If the input is exhausted or if it's a dead state, then
            // incrementing the starting position has no bearing on
// correctness, since the loop below will either not execute
// at all or will immediately stop due to being in a dead state.
// (Once in a dead state it is impossible to leave it.)
//
// Therefore, the only case we need to consider is when
// caller_state is a match state. In this case, since our machines
// support the ability to delay a match by a certain number of
// bytes (to support look-around), it follows that we actually
// consumed that many additional bytes on our previous search. When
// the caller resumes their search to find subsequent matches, they
// will use the ending location from the previous match as the next
            // starting point, which is `MATCH_OFFSET` bytes PRIOR to where
// we scanned to on the previous search. Therefore, we need to
// compensate by bumping `start` up by `MATCH_OFFSET` bytes.
//
// Incidentally, since MATCH_OFFSET is non-zero, this also makes
// dealing with empty matches convenient. Namely, callers needn't
// special case them when implementing an iterator. Instead, this
// ensures that forward progress is always made.
start += MATCH_OFFSET;
sid
}
};
let mut at = start;
while at < end {
let byte = bytes[at];
sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?;
at += 1;
if sid.is_tagged() {
caller_state.set_id(sid);
if sid.is_start() {
if let Some(ref mut pre) = pre {
if pre.is_effective(at) {
match pre.next_candidate(bytes, at).into_option() {
None => return Ok(None),
Some(i) => {
at = i;
}
}
}
}
} else if sid.is_match() {
let offset = at - MATCH_OFFSET;
caller_state
.set_last_match(StateMatch { match_index: 1, offset });
return Ok(Some(HalfMatch {
pattern: dfa.match_pattern(cache, sid, 0),
offset,
}));
} else if sid.is_dead() {
return Ok(None);
} else {
debug_assert!(sid.is_quit());
return Err(MatchError::Quit { byte, offset: at - 1 });
}
}
}
let result = eoi_fwd(dfa, cache, bytes, end, &mut sid);
caller_state.set_id(sid);
if let Ok(Some(ref last_match)) = result {
caller_state.set_last_match(StateMatch {
// '1' is always correct here since if we get to this point, this
// always corresponds to the first (index '0') match discovered at
// this position. So the next match to report at this position (if
// it exists) is at index '1'.
match_index: 1,
offset: last_match.offset(),
});
}
result
}
#[inline(always)]
fn init_fwd(
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
bytes: &[u8],
start: usize,
end: usize,
) -> Result<LazyStateID, MatchError> {
let sid = dfa
.start_state_forward(cache, pattern_id, bytes, start, end)
.map_err(|_| gave_up(start))?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
assert!(!sid.is_match());
Ok(sid)
}
#[inline(always)]
fn init_rev(
dfa: &DFA,
cache: &mut Cache,
pattern_id: Option<PatternID>,
bytes: &[u8],
start: usize,
end: usize,
) -> Result<LazyStateID, MatchError> {
let sid = dfa
.start_state_reverse(cache, pattern_id, bytes, start, end)
.map_err(|_| gave_up(end))?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
assert!(!sid.is_match());
Ok(sid)
}
#[inline(always)]
fn eoi_fwd(
dfa: &DFA,
cache: &mut Cache,
bytes: &[u8],
end: usize,
sid: &mut LazyStateID,
) -> Result<Option<HalfMatch>, MatchError> {
match bytes.get(end) {
Some(&b) => {
*sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?;
if sid.is_match() {
Ok(Some(HalfMatch {
pattern: dfa.match_pattern(cache, *sid, 0),
offset: end,
}))
} else {
Ok(None)
}
}
None => {
*sid = dfa
.next_eoi_state(cache, *sid)
.map_err(|_| gave_up(bytes.len()))?;
if sid.is_match() {
Ok(Some(HalfMatch {
pattern: dfa.match_pattern(cache, *sid, 0),
offset: bytes.len(),
}))
} else {
Ok(None)
}
}
}
}
#[inline(always)]
fn eoi_rev(
dfa: &DFA,
cache: &mut Cache,
bytes: &[u8],
start: usize,
state: LazyStateID,
) -> Result<Option<HalfMatch>, MatchError> {
if start > 0 {
let sid = dfa
.next_state(cache, state, bytes[start - 1])
.map_err(|_| gave_up(start))?;
if sid.is_match() {
Ok(Some(HalfMatch {
pattern: dfa.match_pattern(cache, sid, 0),
offset: start,
}))
} else {
Ok(None)
}
} else {
let sid =
dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?;
if sid.is_match() {
Ok(Some(HalfMatch {
pattern: dfa.match_pattern(cache, sid, 0),
offset: 0,
}))
} else {
Ok(None)
}
}
}
/// A convenience routine for constructing a "gave up" match error.
#[inline(always)]
fn gave_up(offset: usize) -> MatchError {
MatchError::GaveUp { offset }
}
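
// A minimal end-to-end sketch of how the search routines in this module are
// driven, assuming the crate's public 'DFA::new' and 'DFA::create_cache'
// constructors from the hybrid API. The pattern and haystack are arbitrary.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn leftmost_fwd_smoke() -> Result<(), Box<dyn std::error::Error>> {
        let dfa = DFA::new(r"[a-z]+")?;
        let mut cache = dfa.create_cache();
        let haystack = b"123 abc 456";
        // An unanchored search over the whole haystack, with no prefilter
        // and no specific pattern ID.
        let m = find_leftmost_fwd(
            None, &dfa, &mut cache, None, haystack, 0, haystack.len(),
        )?
        .expect("expected a match");
        // Forward searches report the end of a match, so 'abc' (spanning
        // 4..7) is reported with offset 7.
        assert_eq!(m.offset(), 7);
        Ok(())
    }
}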