src/sentence.rs - platform/external/rust/crates/unicode-segmentation - Git at Google

 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 use core::cmp;
 use core::iter::Filter;

 // All of the logic for forward iteration over sentences
 mod fwd {
     use crate::tables::sentence::SentenceCat;
     use core::cmp;

     // Describe a parsed part of source string as described in this table:
     // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
     #[derive(Clone, Copy, PartialEq, Eq)]
     enum StatePart {
         Sot,
         Eot,
         Other,
         CR,
         LF,
         Sep,
         ATerm,
         UpperLower,
         ClosePlus,
         SpPlus,
         STerm,
     }

     #[derive(Clone, PartialEq, Eq)]
     struct SentenceBreaksState(pub [StatePart; 4]);

     const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
         StatePart::Sot,
         StatePart::Sot,
         StatePart::Sot,
         StatePart::Sot,
     ]);

     #[derive(Clone)]
     pub struct SentenceBreaks<'a> {
         pub string: &'a str,
         pos: usize,
         state: SentenceBreaksState,
     }

     impl SentenceBreaksState {
         // Attempt to advance the internal state by one part
         // Whitespace and some punctutation will be collapsed
         fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
             let &SentenceBreaksState(parts) = self;
             let parts = match (parts[3], cat) {
                 (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
                 (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
                 _ => [
                     parts[1],
                     parts[2],
                     parts[3],
                     match cat {
                         SentenceCat::SC_CR => StatePart::CR,
                         SentenceCat::SC_LF => StatePart::LF,
                         SentenceCat::SC_Sep => StatePart::Sep,
                         SentenceCat::SC_ATerm => StatePart::ATerm,
                         SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
                         SentenceCat::SC_Close => StatePart::ClosePlus,
                         SentenceCat::SC_Sp => StatePart::SpPlus,
                         SentenceCat::SC_STerm => StatePart::STerm,
                         _ => StatePart::Other,
                     },
                 ],
             };
             SentenceBreaksState(parts)
         }

         fn end(&self) -> SentenceBreaksState {
             let &SentenceBreaksState(parts) = self;
             SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
         }

         // Helper function to check if state head matches a single `StatePart`
         fn match1(&self, part: StatePart) -> bool {
             let &SentenceBreaksState(parts) = self;
             part == parts[3]
         }

         // Helper function to check if first two `StateParts` in state match
         // the given two
         fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
             let &SentenceBreaksState(parts) = self;
             part1 == parts[2] && part2 == parts[3]
         }
     }

     // https://unicode.org/reports/tr29/#SB8
     // TODO cache this, it is currently quadratic
     fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
         let &SentenceBreaksState(parts) = state;
         let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
         if parts[idx] == StatePart::ClosePlus {
             idx -= 1
         }

         if parts[idx] == StatePart::ATerm {
             use crate::tables::sentence as se;

             for next_char in ahead.chars() {
                 //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
                 match se::sentence_category(next_char).2 {
                     se::SC_Lower => return true,
                     se::SC_OLetter
                     | se::SC_Upper
                     | se::SC_Sep
                     | se::SC_CR
                     | se::SC_LF
                     | se::SC_STerm
                     | se::SC_ATerm => return false,
                     _ => continue,
                 }
             }
         }

         false
     }

     // https://unicode.org/reports/tr29/#SB8a
     fn match_sb8a(state: &SentenceBreaksState) -> bool {
         // SATerm Close* Sp*
         let &SentenceBreaksState(parts) = state;
         let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
         if parts[idx] == StatePart::ClosePlus {
             idx -= 1
         }
         parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
     }

     // https://unicode.org/reports/tr29/#SB9
     fn match_sb9(state: &SentenceBreaksState) -> bool {
         // SATerm Close*
         let &SentenceBreaksState(parts) = state;
         let idx = if parts[3] == StatePart::ClosePlus {
             2
         } else {
             3
         };
         parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
     }

     // https://unicode.org/reports/tr29/#SB11
     fn match_sb11(state: &SentenceBreaksState) -> bool {
         // SATerm Close* Sp* ParaSep?
         let &SentenceBreaksState(parts) = state;
         let mut idx = match parts[3] {
             StatePart::Sep | StatePart::CR | StatePart::LF => 2,
             _ => 3,
         };

         if parts[idx] == StatePart::SpPlus {
             idx -= 1
         }
         if parts[idx] == StatePart::ClosePlus {
             idx -= 1
         }

         parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
     }

     impl<'a> Iterator for SentenceBreaks<'a> {
         // Returns the index of the character which follows a break
         type Item = usize;

         #[inline]
         fn size_hint(&self) -> (usize, Option<usize>) {
             let slen = self.string.len();
             // A sentence could be one character
             (cmp::min(slen, 2), Some(slen + 1))
         }

         #[inline]
         fn next(&mut self) -> Option<usize> {
             use crate::tables::sentence as se;

             for next_char in self.string[self.pos..].chars() {
                 let position_before = self.pos;
                 let state_before = self.state.clone();

                 let next_cat = se::sentence_category(next_char).2;

                 self.pos += next_char.len_utf8();
                 self.state = self.state.next(next_cat);

                 match next_cat {
                     // SB1 https://unicode.org/reports/tr29/#SB1
                     _ if state_before.match1(StatePart::Sot) => return Some(position_before),

                     // SB2 is handled when inner iterator (chars) is finished

                     // SB3 https://unicode.org/reports/tr29/#SB3
                     SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,

                     // SB4 https://unicode.org/reports/tr29/#SB4
                     _ if state_before.match1(StatePart::Sep)
                         || state_before.match1(StatePart::CR)
                         || state_before.match1(StatePart::LF) =>
                     {
                         return Some(position_before)
                     }

                     // SB5 https://unicode.org/reports/tr29/#SB5
                     SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,

                     // SB6 https://unicode.org/reports/tr29/#SB6
                     SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,

                     // SB7 https://unicode.org/reports/tr29/#SB7
                     SentenceCat::SC_Upper
                         if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
                     {
                         continue
                     }

                     // SB8 https://unicode.org/reports/tr29/#SB8
                     _ if match_sb8(&state_before, &self.string[position_before..]) => continue,

                     // SB8a https://unicode.org/reports/tr29/#SB8a
                     SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
                         if match_sb8a(&state_before) =>
                     {
                         continue
                     }

                     // SB9 https://unicode.org/reports/tr29/#SB9
                     SentenceCat::SC_Close
                     | SentenceCat::SC_Sp
                     | SentenceCat::SC_Sep
                     | SentenceCat::SC_CR
                     | SentenceCat::SC_LF
                         if match_sb9(&state_before) =>
                     {
                         continue
                     }

                     // SB10 https://unicode.org/reports/tr29/#SB10
                     SentenceCat::SC_Sp
                     | SentenceCat::SC_Sep
                     | SentenceCat::SC_CR
                     | SentenceCat::SC_LF
                         if match_sb8a(&state_before) =>
                     {
                         continue
                     }

                     // SB11 https://unicode.org/reports/tr29/#SB11
                     _ if match_sb11(&state_before) => return Some(position_before),

                     // SB998 https://unicode.org/reports/tr29/#SB998
                     _ => continue,
                 }
             }

             // SB2 https://unicode.org/reports/tr29/#SB2
             if self.state.match1(StatePart::Sot) {
                 None
             } else if self.state.match1(StatePart::Eot) {
                 None
             } else {
                 self.state = self.state.end();
                 Some(self.pos)
             }
         }
     }

     pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
         SentenceBreaks {
             string: source,
             pos: 0,
             state: INITIAL_STATE,
         }
     }
 }

 /// An iterator over the substrings of a string which, after splitting the string on
 /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
 /// contain any characters with the
 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
 /// property, or with
 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
 ///
 /// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
 /// trait. See its documentation for more.
 ///
 /// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 #[derive(Clone)]
 pub struct UnicodeSentences<'a> {
     inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
 }

 /// External iterator for a string's
 /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
 ///
 /// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
 /// trait. See its documentation for more.
 ///
 /// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 #[derive(Clone)]
 pub struct USentenceBounds<'a> {
     iter: fwd::SentenceBreaks<'a>,
     sentence_start: Option<usize>,
 }

 /// External iterator for sentence boundaries and byte offsets.
 ///
 /// This struct is created by the [`split_sentence_bound_indices`] method on the
 /// [`UnicodeSegmentation`] trait. See its documentation for more.
 ///
 /// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 #[derive(Clone)]
 pub struct USentenceBoundIndices<'a> {
     start_offset: usize,
     iter: USentenceBounds<'a>,
 }

 #[inline]
 pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
     USentenceBounds {
         iter: fwd::new_sentence_breaks(source),
         sentence_start: None,
     }
 }

 #[inline]
 pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
     USentenceBoundIndices {
         start_offset: source.as_ptr() as usize,
         iter: new_sentence_bounds(source),
     }
 }

 #[inline]
 pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
     use super::UnicodeSegmentation;
     use crate::tables::util::is_alphanumeric;

     fn has_alphanumeric(s: &&str) -> bool {
         s.chars().any(|c| is_alphanumeric(c))
     }
     let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer

     UnicodeSentences {
         inner: s.split_sentence_bounds().filter(has_alphanumeric),
     }
 }

 impl<'a> Iterator for UnicodeSentences<'a> {
     type Item = &'a str;

     #[inline]
     fn next(&mut self) -> Option<&'a str> {
         self.inner.next()
     }
 }

 impl<'a> Iterator for USentenceBounds<'a> {
     type Item = &'a str;

     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
         let (lower, upper) = self.iter.size_hint();
         (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
     }

     #[inline]
     fn next(&mut self) -> Option<&'a str> {
         if self.sentence_start == None {
             if let Some(start_pos) = self.iter.next() {
                 self.sentence_start = Some(start_pos)
             } else {
                 return None;
             }
         }

         if let Some(break_pos) = self.iter.next() {
             let start_pos = self.sentence_start.unwrap();
             let sentence = &self.iter.string[start_pos..break_pos];
             self.sentence_start = Some(break_pos);
             Some(sentence)
         } else {
             None
         }
     }
 }

 impl<'a> Iterator for USentenceBoundIndices<'a> {
     type Item = (usize, &'a str);

     #[inline]
     fn next(&mut self) -> Option<(usize, &'a str)> {
         self.iter
             .next()
             .map(|s| (s.as_ptr() as usize - self.start_offset, s))
     }

     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
         self.iter.size_hint()
     }
 }
	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
	// file at the top-level directory of this distribution and at
	// http://rust-lang.org/COPYRIGHT.
	//
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed
	// except according to those terms.

	use core::cmp;
	use core::iter::Filter;

	// All of the logic for forward iteration over sentences
	mod fwd {
	use crate::tables::sentence::SentenceCat;
	use core::cmp;

	// Describe a parsed part of source string as described in this table:
	// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
	#[derive(Clone, Copy, PartialEq, Eq)]
	enum StatePart {
	Sot,
	Eot,
	Other,
	CR,
	LF,
	Sep,
	ATerm,
	UpperLower,
	ClosePlus,
	SpPlus,
	STerm,
	}

	#[derive(Clone, PartialEq, Eq)]
	struct SentenceBreaksState(pub [StatePart; 4]);

	const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
	StatePart::Sot,
	StatePart::Sot,
	StatePart::Sot,
	StatePart::Sot,
	]);

	#[derive(Clone)]
	pub struct SentenceBreaks<'a> {
	pub string: &'a str,
	pos: usize,
	state: SentenceBreaksState,
	}

	impl SentenceBreaksState {
	// Attempt to advance the internal state by one part
	// Whitespace and some punctutation will be collapsed
	fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
	let &SentenceBreaksState(parts) = self;
	let parts = match (parts[3], cat) {
	(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
	(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
	_ => [
	parts[1],
	parts[2],
	parts[3],
	match cat {
	SentenceCat::SC_CR => StatePart::CR,
	SentenceCat::SC_LF => StatePart::LF,
	SentenceCat::SC_Sep => StatePart::Sep,
	SentenceCat::SC_ATerm => StatePart::ATerm,
	SentenceCat::SC_Upper \| SentenceCat::SC_Lower => StatePart::UpperLower,
	SentenceCat::SC_Close => StatePart::ClosePlus,
	SentenceCat::SC_Sp => StatePart::SpPlus,
	SentenceCat::SC_STerm => StatePart::STerm,
	_ => StatePart::Other,
	},
	],
	};
	SentenceBreaksState(parts)
	}

	fn end(&self) -> SentenceBreaksState {
	let &SentenceBreaksState(parts) = self;
	SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
	}

	// Helper function to check if state head matches a single `StatePart`
	fn match1(&self, part: StatePart) -> bool {
	let &SentenceBreaksState(parts) = self;
	part == parts[3]
	}

	// Helper function to check if first two `StateParts` in state match
	// the given two
	fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
	let &SentenceBreaksState(parts) = self;
	part1 == parts[2] && part2 == parts[3]
	}
	}

	// https://unicode.org/reports/tr29/#SB8
	// TODO cache this, it is currently quadratic
	fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
	let &SentenceBreaksState(parts) = state;
	let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
	if parts[idx] == StatePart::ClosePlus {
	idx -= 1
	}

	if parts[idx] == StatePart::ATerm {
	use crate::tables::sentence as se;

	for next_char in ahead.chars() {
	//( ¬(OLetter \| Upper \| Lower \| ParaSep \| SATerm) )* Lower
	match se::sentence_category(next_char).2 {
	se::SC_Lower => return true,
	se::SC_OLetter
	\| se::SC_Upper
	\| se::SC_Sep
	\| se::SC_CR
	\| se::SC_LF
	\| se::SC_STerm
	\| se::SC_ATerm => return false,
	_ => continue,
	}
	}
	}

	false
	}

	// https://unicode.org/reports/tr29/#SB8a
	fn match_sb8a(state: &SentenceBreaksState) -> bool {
	// SATerm Close* Sp*
	let &SentenceBreaksState(parts) = state;
	let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
	if parts[idx] == StatePart::ClosePlus {
	idx -= 1
	}
	parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
	}

	// https://unicode.org/reports/tr29/#SB9
	fn match_sb9(state: &SentenceBreaksState) -> bool {
	// SATerm Close*
	let &SentenceBreaksState(parts) = state;
	let idx = if parts[3] == StatePart::ClosePlus {
	2
	} else {
	3
	};
	parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
	}

	// https://unicode.org/reports/tr29/#SB11
	fn match_sb11(state: &SentenceBreaksState) -> bool {
	// SATerm Close* Sp* ParaSep?
	let &SentenceBreaksState(parts) = state;
	let mut idx = match parts[3] {
	StatePart::Sep \| StatePart::CR \| StatePart::LF => 2,
	_ => 3,
	};

	if parts[idx] == StatePart::SpPlus {
	idx -= 1
	}
	if parts[idx] == StatePart::ClosePlus {
	idx -= 1
	}

	parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
	}

	impl<'a> Iterator for SentenceBreaks<'a> {
	// Returns the index of the character which follows a break
	type Item = usize;

	#[inline]
	fn size_hint(&self) -> (usize, Option<usize>) {
	let slen = self.string.len();
	// A sentence could be one character
	(cmp::min(slen, 2), Some(slen + 1))
	}

	#[inline]
	fn next(&mut self) -> Option<usize> {
	use crate::tables::sentence as se;

	for next_char in self.string[self.pos..].chars() {
	let position_before = self.pos;
	let state_before = self.state.clone();

	let next_cat = se::sentence_category(next_char).2;

	self.pos += next_char.len_utf8();
	self.state = self.state.next(next_cat);

	match next_cat {
	// SB1 https://unicode.org/reports/tr29/#SB1
	_ if state_before.match1(StatePart::Sot) => return Some(position_before),

	// SB2 is handled when inner iterator (chars) is finished

	// SB3 https://unicode.org/reports/tr29/#SB3
	SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,

	// SB4 https://unicode.org/reports/tr29/#SB4
	_ if state_before.match1(StatePart::Sep)
	\|\| state_before.match1(StatePart::CR)
	\|\| state_before.match1(StatePart::LF) =>
	{
	return Some(position_before)
	}

	// SB5 https://unicode.org/reports/tr29/#SB5
	SentenceCat::SC_Extend \| SentenceCat::SC_Format => self.state = state_before,

	// SB6 https://unicode.org/reports/tr29/#SB6
	SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,

	// SB7 https://unicode.org/reports/tr29/#SB7
	SentenceCat::SC_Upper
	if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
	{
	continue
	}

	// SB8 https://unicode.org/reports/tr29/#SB8
	_ if match_sb8(&state_before, &self.string[position_before..]) => continue,

	// SB8a https://unicode.org/reports/tr29/#SB8a
	SentenceCat::SC_SContinue \| SentenceCat::SC_STerm \| SentenceCat::SC_ATerm
	if match_sb8a(&state_before) =>
	{
	continue
	}

	// SB9 https://unicode.org/reports/tr29/#SB9
	SentenceCat::SC_Close
	\| SentenceCat::SC_Sp
	\| SentenceCat::SC_Sep
	\| SentenceCat::SC_CR
	\| SentenceCat::SC_LF
	if match_sb9(&state_before) =>
	{
	continue
	}

	// SB10 https://unicode.org/reports/tr29/#SB10
	SentenceCat::SC_Sp
	\| SentenceCat::SC_Sep
	\| SentenceCat::SC_CR
	\| SentenceCat::SC_LF
	if match_sb8a(&state_before) =>
	{
	continue
	}

	// SB11 https://unicode.org/reports/tr29/#SB11
	_ if match_sb11(&state_before) => return Some(position_before),

	// SB998 https://unicode.org/reports/tr29/#SB998
	_ => continue,
	}
	}

	// SB2 https://unicode.org/reports/tr29/#SB2
	if self.state.match1(StatePart::Sot) {
	None
	} else if self.state.match1(StatePart::Eot) {
	None
	} else {
	self.state = self.state.end();
	Some(self.pos)
	}
	}
	}

	pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
	SentenceBreaks {
	string: source,
	pos: 0,
	state: INITIAL_STATE,
	}
	}
	}

	/// An iterator over the substrings of a string which, after splitting the string on
	/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
	/// contain any characters with the
	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
	/// property, or with
	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
	///
	/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
	/// trait. See its documentation for more.
	///
	/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
	#[derive(Clone)]
	pub struct UnicodeSentences<'a> {
	inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
	}

	/// External iterator for a string's
	/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
	///
	/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
	/// trait. See its documentation for more.
	///
	/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
	#[derive(Clone)]
	pub struct USentenceBounds<'a> {
	iter: fwd::SentenceBreaks<'a>,
	sentence_start: Option<usize>,
	}

	/// External iterator for sentence boundaries and byte offsets.
	///
	/// This struct is created by the [`split_sentence_bound_indices`] method on the
	/// [`UnicodeSegmentation`] trait. See its documentation for more.
	///
	/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
	#[derive(Clone)]
	pub struct USentenceBoundIndices<'a> {
	start_offset: usize,
	iter: USentenceBounds<'a>,
	}

	#[inline]
	pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
	USentenceBounds {
	iter: fwd::new_sentence_breaks(source),
	sentence_start: None,
	}
	}

	#[inline]
	pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
	USentenceBoundIndices {
	start_offset: source.as_ptr() as usize,
	iter: new_sentence_bounds(source),
	}
	}

	#[inline]
	pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
	use super::UnicodeSegmentation;
	use crate::tables::util::is_alphanumeric;

	fn has_alphanumeric(s: &&str) -> bool {
	s.chars().any(\|c\| is_alphanumeric(c))
	}
	let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer

	UnicodeSentences {
	inner: s.split_sentence_bounds().filter(has_alphanumeric),
	}
	}

	impl<'a> Iterator for UnicodeSentences<'a> {
	type Item = &'a str;

	#[inline]
	fn next(&mut self) -> Option<&'a str> {
	self.inner.next()
	}
	}

	impl<'a> Iterator for USentenceBounds<'a> {
	type Item = &'a str;

	#[inline]
	fn size_hint(&self) -> (usize, Option<usize>) {
	let (lower, upper) = self.iter.size_hint();
	(cmp::max(0, lower - 1), upper.map(\|u\| cmp::max(0, u - 1)))
	}

	#[inline]
	fn next(&mut self) -> Option<&'a str> {
	if self.sentence_start == None {
	if let Some(start_pos) = self.iter.next() {
	self.sentence_start = Some(start_pos)
	} else {
	return None;
	}
	}

	if let Some(break_pos) = self.iter.next() {
	let start_pos = self.sentence_start.unwrap();
	let sentence = &self.iter.string[start_pos..break_pos];
	self.sentence_start = Some(break_pos);
	Some(sentence)
	} else {
	None
	}
	}
	}

	impl<'a> Iterator for USentenceBoundIndices<'a> {
	type Item = (usize, &'a str);

	#[inline]
	fn next(&mut self) -> Option<(usize, &'a str)> {
	self.iter
	.next()
	.map(\|s\| (s.as_ptr() as usize - self.start_offset, s))
	}

	#[inline]
	fn size_hint(&self) -> (usize, Option<usize>) {
	self.iter.size_hint()
	}
	}