blob: 6e9c049d9dcb6e30b493e6416db488c822923bf8 [file] [log] [blame]
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use core::iter::Filter;
use tables::word::WordCat;
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
pub struct UnicodeWords<'a> {
inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
impl<'a> Iterator for UnicodeWords<'a> {
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> { self.inner.next() }
}
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
}
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
#[derive(Clone)]
pub struct UWordBounds<'a> {
string: &'a str,
cat: Option<WordCat>,
catb: Option<WordCat>,
}
/// External iterator for word boundaries and byte offsets.
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
start_offset: usize,
iter: UWordBounds<'a>,
}
impl<'a> UWordBoundIndices<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bound_indices();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for UWordBoundIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
// state machine for word boundary rules
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum UWordBoundsState {
Start,
Letter,
HLetter,
Numeric,
Katakana,
ExtendNumLet,
Regional(RegionalState),
FormatExtend(FormatExtendType),
Zwj,
Emoji,
WSegSpace,
}
// subtypes for FormatExtend state in UWordBoundsState
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum FormatExtendType {
AcceptAny,
AcceptNone,
RequireLetter,
RequireHLetter,
AcceptQLetter,
RequireNumeric,
}
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum RegionalState {
Half,
Full,
Unknown,
}
fn is_emoji(ch: char) -> bool {
use tables::emoji;
emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
}
impl<'a> Iterator for UWordBounds<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
use self::UWordBoundsState::*;
use self::FormatExtendType::*;
use tables::word as wd;
if self.string.len() == 0 {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = 0;
let mut saveidx = 0;
let mut state = Start;
let mut cat = wd::WC_Any;
let mut savecat = wd::WC_Any;
// Whether or not the previous category was ZWJ
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
let mut prev_zwj;
// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices() {
idx = curr;
prev_zwj = cat == wd::WC_ZWJ;
// if there's a category cached, grab it
cat = match self.cat {
None => wd::word_category(ch),
_ => self.cat.take().unwrap()
};
take_cat = true;
// handle rule WB4
// just skip all format, extend, and zwj chars
// note that Start is a special case: if there's a bunch of Format | Extend
// characters at the beginning of a block of text, dump them out as one unit.
//
// (This is not obvious from the wording of UAX#29, but if you look at the
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
// then the "correct" interpretation of WB4 becomes apparent.)
if state != Start {
match cat {
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
skipped_format_extend = true;
continue
}
_ => {}
}
}
// rule WB3c
// WB4 makes all ZWJs collapse into the previous state
// but you can still be in a Zwj state if you started with Zwj
//
// This means that an EP + Zwj will collapse into EP, which is wrong,
// since EP+EP is not a boundary but EP+ZWJ+EP is
//
// Thus, we separately keep track of whether or not the last character
// was a ZWJ. This is an additional bit of state tracked outside of the
// state enum; the state enum represents the last non-zwj state encountered.
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
// however we are in the previous state for the purposes of all other rules.
if prev_zwj {
if is_emoji(ch) {
state = Emoji;
continue;
}
}
// Don't use `continue` in this match without updating `cat`
state = match state {
Start if cat == wd::WC_CR => {
idx += match self.get_next_cat(idx) {
Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
_ => 0
};
break; // rule WB3a
},
Start => match cat {
wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
wd::WC_Katakana => Katakana, // rule WB13, WB13a
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
wd::WC_ZWJ => Zwj, // rule WB3c
wd::WC_WSegSpace => WSegSpace, // rule WB3d
_ => {
if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
state = FormatExtend(AcceptNone);
self.cat = Some(ncat);
continue;
}
}
break; // rule WB999
}
},
WSegSpace => match cat {
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Zwj => {
// We already handle WB3c above.
take_curr = false;
break;
}
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Double_Quote if state == HLetter => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireHLetter) // rule WB7b
},
wd::WC_Single_Quote if state == HLetter => {
FormatExtend(AcceptQLetter) // rule WB7a
},
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireLetter) // rule WB6
},
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB10
wd::WC_Hebrew_Letter => HLetter, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireNumeric) // rule WB12
},
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB13b
wd::WC_Numeric => Numeric, // rule WB13b
wd::WC_Katakana => Katakana, // rule WB13b
_ => {
take_curr = false;
break;
}
},
Regional(RegionalState::Full) => {
// if it reaches here we've gone too far,
// a full flag can only compose with ZWJ/Extend/Format
// proceeding it.
take_curr = false;
break;
}
Regional(RegionalState::Half) => match cat {
wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
_ => {
take_curr = false;
break;
}
},
Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
Emoji => {
// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
take_curr = false;
break;
},
FormatExtend(t) => match t { // handle FormatExtends depending on what type
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
AcceptNone | AcceptQLetter => {
take_curr = false; // emit all the Format|Extend characters
take_cat = false;
break;
},
_ => break // rewind (in if statement below)
}
}
}
if let FormatExtend(t) = state {
// we were looking for something and didn't find it; we have to back up
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
idx = saveidx;
cat = savecat;
take_curr = false;
}
}
self.cat = if take_curr {
idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
None
} else if take_cat {
Some(cat)
} else {
None
};
let retstr = &self.string[..idx];
self.string = &self.string[idx..];
Some(retstr)
}
}
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
use self::UWordBoundsState::*;
use self::FormatExtendType::*;
use tables::word as wd;
if self.string.len() == 0 {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = self.string.len();
idx -= self.string.chars().next_back().unwrap().len_utf8();
let mut previdx = idx;
let mut saveidx = idx;
let mut state = Start;
let mut savestate = Start;
let mut cat = wd::WC_Any;
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices().rev() {
previdx = idx;
idx = curr;
// if there's a category cached, grab it
cat = match self.catb {
None => wd::word_category(ch),
_ => self.catb.take().unwrap()
};
take_cat = true;
// backward iterator over word boundaries. Mostly the same as the forward
// iterator, with two weirdnesses:
// (1) If we encounter a single quote in the Start state, we have to check for a
// Hebrew Letter immediately before it.
// (2) Format and Extend char handling takes some gymnastics.
if cat == wd::WC_Extend
|| cat == wd::WC_Format
|| (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
// fold in that case
if match state {
FormatExtend(_) | Start => false,
_ => true
} {
saveidx = previdx;
savestate = state;
state = FormatExtend(AcceptNone);
}
if state != Start {
continue;
}
} else if state == FormatExtend(AcceptNone) {
// finished a scan of some Format|Extend chars, restore previous state
state = savestate;
previdx = saveidx;
take_cat = false;
skipped_format_extend = true;
}
// Don't use `continue` in this match without updating `catb`
state = match state {
Start | FormatExtend(AcceptAny) => match cat {
_ if is_emoji(ch) => Zwj,
wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
wd::WC_Katakana => Katakana, // rule WB13, WB13b
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
// rule WB4:
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
wd::WC_Single_Quote => {
saveidx = idx;
FormatExtend(AcceptQLetter) // rule WB7a
},
wd::WC_WSegSpace => WSegSpace,
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
if state == Start {
if cat == wd::WC_LF {
idx -= match self.get_prev_cat(idx) {
Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
_ => 0
};
}
} else {
take_curr = false;
}
break; // rule WB3a
},
_ => break // rule WB999
},
Zwj => match cat { // rule WB3c
wd::WC_ZWJ => {
FormatExtend(AcceptAny)
}
_ => {
take_curr = false;
break;
}
},
WSegSpace => match cat { // rule WB3d
wd::WC_WSegSpace if !skipped_format_extend => {
WSegSpace
}
_ => {
take_curr = false;
break;
}
},
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_Double_Quote if state == HLetter => {
saveidx = previdx;
FormatExtend(RequireHLetter) // rule WB7c
},
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireLetter) // rule WB7
},
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB9
wd::WC_Hebrew_Letter => HLetter, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireNumeric) // rule WB11
},
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB13a
wd::WC_Numeric => Numeric, // rule WB13a
wd::WC_Katakana => Katakana, // rule WB13a
_ => {
take_curr = false;
break;
}
},
Regional(mut regional_state) => match cat {
// rule WB13c
wd::WC_Regional_Indicator => {
if regional_state == RegionalState::Unknown {
let count = self.string[..previdx]
.chars().rev()
.map(|c| wd::word_category(c))
.filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
.take_while(|&c| c == wd::WC_Regional_Indicator)
.count();
regional_state = if count % 2 == 0 {
RegionalState::Full
} else {
RegionalState::Half
};
}
if regional_state == RegionalState::Full {
take_curr = false;
break;
} else {
Regional(RegionalState::Full)
}
}
_ => {
take_curr = false;
break;
}
},
Emoji => {
if is_emoji(ch) { // rule WB3c
Zwj
} else {
take_curr = false;
break;
}
},
FormatExtend(t) => match t {
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
_ => break // backtrack will happens
}
}
}
if let FormatExtend(t) = state {
// if we required something but didn't find it, backtrack
if t == RequireLetter || t == RequireHLetter ||
t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
previdx = saveidx;
take_cat = false;
take_curr = false;
}
}
self.catb = if take_curr {
None
} else {
idx = previdx;
if take_cat {
Some(cat)
} else {
None
}
};
let retstr = &self.string[idx..];
self.string = &self.string[..idx];
Some(retstr)
}
}
impl<'a> UWordBounds<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bounds();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.string
}
#[inline]
fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
use tables::word as wd;
let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
if nidx < self.string.len() {
let nch = self.string[nidx..].chars().next().unwrap();
Some(wd::word_category(nch))
} else {
None
}
}
#[inline]
fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
use tables::word as wd;
if idx > 0 {
let nch = self.string[..idx].chars().next_back().unwrap();
Some(wd::word_category(nch))
} else {
None
}
}
}
#[inline]
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
UWordBounds { string: s, cat: None, catb: None }
}
#[inline]
pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) }
}
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
use super::UnicodeSegmentation;
use tables::util::is_alphanumeric;
fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
}