blob: 9c4b771feca5055ec12400e9954bc7a60da5f2bb [file] [log] [blame]
use std::sync::Arc;
use automaton::AlwaysMatch;
use error::Error;
use raw::{self, VERSION, Builder, Bound, Fst, Stream, Output};
use stream::Streamer;
const TEXT: &'static str = include_str!("./../../data/words-100000");
pub fn fst_set<I, S>(ss: I) -> Fst
where I: IntoIterator<Item=S>, S: AsRef<[u8]> {
let mut bfst = Builder::memory();
let mut ss: Vec<Vec<u8>> =
ss.into_iter().map(|s| s.as_ref().to_vec()).collect();
ss.sort();
for s in ss.iter().into_iter() {
bfst.add(s).unwrap();
}
let fst = Fst::from_bytes(bfst.into_inner().unwrap()).unwrap();
ss.dedup();
assert_eq!(fst.len(), ss.len());
fst
}
pub fn fst_map<I, S>(ss: I) -> Fst
where I: IntoIterator<Item=(S, u64)>, S: AsRef<[u8]> {
let mut bfst = Builder::memory();
let mut ss: Vec<(Vec<u8>, u64)> =
ss.into_iter().map(|(s, o)| (s.as_ref().to_vec(), o)).collect();
ss.sort();
ss.dedup();
for (s, o) in ss.into_iter() {
bfst.insert(s, o).unwrap();
}
Fst::from_bytes(bfst.into_inner().unwrap()).unwrap()
}
pub fn fst_inputs(fst: &Fst) -> Vec<Vec<u8>> {
let mut words = vec![];
let mut rdr = fst.stream();
while let Some((word, _)) = rdr.next() {
words.push(word.to_vec());
}
words
}
pub fn fst_inputs_outputs(fst: &Fst) -> Vec<(Vec<u8>, u64)> {
let mut words = vec![];
let mut rdr = fst.stream();
while let Some((word, out)) = rdr.next() {
words.push((word.to_vec(), out.value()));
}
words
}
macro_rules! test_set {
($name:ident, $($s:expr),+) => {
#[test]
fn $name() {
let mut items = vec![$($s),*];
let fst = fst_set(&items);
let mut rdr = fst.stream();
items.sort();
items.dedup();
for item in &items {
assert_eq!(rdr.next().unwrap().0, item.as_bytes());
}
assert_eq!(rdr.next(), None);
for item in &items {
assert!(fst.get(item).is_some());
}
}
}
}
macro_rules! test_set_fail {
($name:ident, $($s:expr),+) => {
#[test]
#[should_panic]
fn $name() {
let mut bfst = Builder::memory();
$(bfst.add($s).unwrap();)*
}
}
}
test_set!(fst_set_only_empty, "");
test_set!(fst_set_one, "a");
test_set!(fst_set_dupe_empty, "", "");
test_set!(fst_set_dupe1, "a", "a");
test_set!(fst_set_dupe2, "a", "b", "b");
test_set!(fst_set_two1, "a", "b");
test_set!(fst_set_two2, "a", "ab");
test_set!(fst_set_jan, "jam", "jbm", "jcm", "jdm", "jem", "jfm", "jgm");
test_set_fail!(fst_set_order1, "b", "a");
test_set_fail!(fst_set_order2, "a", "b", "c", "a");
#[test]
fn fst_set_100000() {
let words: Vec<Vec<u8>> = TEXT.lines()
.map(|s| s.as_bytes().to_vec())
.collect();
let fst = fst_set(words.clone());
assert_eq!(words, fst_inputs(&fst));
for word in &words {
assert!(fst.get(word).is_some(),
"failed to find word: {}",
::std::str::from_utf8(word).unwrap());
}
}
macro_rules! test_map {
($name:ident, $($s:expr, $o:expr),+) => {
#[test]
fn $name() {
let fst = fst_map(vec![$(($s, $o)),*]);
let mut rdr = fst.stream();
$({
let (s, o) = rdr.next().unwrap();
assert_eq!((s, o.value()), ($s.as_bytes(), $o));
})*
assert_eq!(rdr.next(), None);
$({
assert_eq!(fst.get($s.as_bytes()), Some(Output::new($o)));
})*
}
}
}
macro_rules! test_map_fail {
($name:ident, $($s:expr, $o:expr),+) => {
#[test]
#[should_panic]
fn $name() {
let mut bfst = Builder::memory();
$(bfst.insert($s, $o).unwrap();)*
}
}
}
test_map!(fst_map_only_empty1, "", 0);
test_map!(fst_map_only_empty2, "", 100);
test_map!(fst_map_only_empty3, "", 9999999999);
test_map!(fst_map_one1, "a", 0);
test_map!(fst_map_one2, "a", 100);
test_map!(fst_map_one3, "a", 999999999);
test_map!(fst_map_two, "a", 1, "b", 2);
test_map!(fst_map_many1, "a", 34786, "ab", 26);
test_map!(
fst_map_many2,
"a", 34786, "ab", 26, "abc", 58976, "abcd", 25,
"z", 58, "zabc", 6798
);
test_map!(fst_map_many3, "a", 1, "ab", 0, "abc", 0);
test_map_fail!(fst_map_dupe_empty, "", 0, "", 0);
test_map_fail!(fst_map_dupe1, "a", 0, "a", 0);
test_map_fail!(fst_map_dupe2, "a", 0, "b", 0, "b", 0);
test_map_fail!(fst_map_order1, "b", 0, "a", 0);
test_map_fail!(fst_map_order2, "a", 0, "b", 0, "c", 0, "a", 0);
#[test]
fn fst_map_100000_increments() {
let words: Vec<(Vec<u8>, u64)> =
TEXT.lines()
.enumerate()
.map(|(i, s)| (s.as_bytes().to_vec(), i as u64))
.collect();
let fst = fst_map(words.clone());
assert_eq!(words, fst_inputs_outputs(&fst));
for &(ref word, out) in &words {
assert_eq!(fst.get(word), Some(Output::new(out)));
}
}
#[test]
fn fst_map_100000_lengths() {
let words: Vec<(Vec<u8>, u64)> =
TEXT.lines()
.map(|s| (s.as_bytes().to_vec(), s.len() as u64))
.collect();
let fst = fst_map(words.clone());
assert_eq!(words, fst_inputs_outputs(&fst));
for &(ref word, out) in &words {
assert_eq!(fst.get(word), Some(Output::new(out)));
}
}
#[test]
fn invalid_version() {
match Fst::from_bytes(vec![0; 32]) {
Err(Error::Fst(raw::Error::Version { got, .. })) => assert_eq!(got, 0),
Err(err) => panic!("expected version error, got {:?}", err),
Ok(_) => panic!("expected version error, got FST"),
}
}
#[test]
fn invalid_version_crate_too_old() {
use byteorder::{ByteOrder, LittleEndian};
let mut buf = vec![0; 32];
LittleEndian::write_u64(&mut buf, VERSION + 1);
match Fst::from_bytes(buf) {
Err(Error::Fst(raw::Error::Version { got, .. })) => {
assert_eq!(got, VERSION + 1);
}
Err(err) => panic!("expected version error, got {:?}", err),
Ok(_) => panic!("expected version error, got FST"),
}
}
#[test]
fn invalid_format() {
match Fst::from_bytes(vec![0; 0]) {
Err(Error::Fst(raw::Error::Format)) => {}
Err(err) => panic!("expected format error, got {:?}", err),
Ok(_) => panic!("expected format error, got FST"),
}
}
#[test]
fn fst_set_zero() {
let fst = fst_set::<_, String>(vec![]);
let mut rdr = fst.stream();
assert_eq!(rdr.next(), None);
}
macro_rules! test_range {
(
$name:ident,
min: $min:expr,
max: $max:expr,
imin: $imin:expr,
imax: $imax:expr,
$($s:expr),*
) => {
#[test]
fn $name() {
let items: Vec<&'static str> = vec![$($s),*];
let items: Vec<_> =
items.into_iter().enumerate()
.map(|(i, k)| (k, i as u64)).collect();
let fst = fst_map(items.clone());
let mut rdr = Stream::new(&fst, AlwaysMatch, $min, $max);
for i in $imin..$imax {
assert_eq!(rdr.next().unwrap(),
(items[i].0.as_bytes(), Output::new(items[i].1)));
}
assert_eq!(rdr.next(), None);
}
}
}
test_range! {
fst_range_empty_1,
min: Bound::Unbounded, max: Bound::Unbounded,
imin: 0, imax: 0,
}
test_range! {
fst_range_empty_2,
min: Bound::Unbounded, max: Bound::Unbounded,
imin: 0, imax: 1,
""
}
test_range! {
fst_range_empty_3,
min: Bound::Included(vec![]), max: Bound::Unbounded,
imin: 0, imax: 1,
""
}
test_range! {
fst_range_empty_4,
min: Bound::Excluded(vec![]), max: Bound::Unbounded,
imin: 0, imax: 0,
""
}
test_range! {
fst_range_empty_5,
min: Bound::Included(vec![]), max: Bound::Unbounded,
imin: 0, imax: 2,
"", "a"
}
test_range! {
fst_range_empty_6,
min: Bound::Excluded(vec![]), max: Bound::Unbounded,
imin: 1, imax: 2,
"", "a"
}
test_range! {
fst_range_empty_7,
min: Bound::Unbounded, max: Bound::Unbounded,
imin: 0, imax: 2,
"", "a"
}
test_range! {
fst_range_empty_8,
min: Bound::Unbounded, max: Bound::Included(vec![]),
imin: 0, imax: 1,
""
}
test_range! {
fst_range_empty_9,
min: Bound::Unbounded, max: Bound::Excluded(vec![]),
imin: 0, imax: 0,
""
}
test_range! {
fst_range_empty_10,
min: Bound::Unbounded, max: Bound::Included(vec![]),
imin: 0, imax: 1,
"", "a"
}
test_range! {
fst_range_empty_11,
min: Bound::Included(vec![]), max: Bound::Included(vec![]),
imin: 0, imax: 1,
""
}
test_range! {
fst_range_1,
min: Bound::Included(vec![b'a']), max: Bound::Included(vec![b'z']),
imin: 0, imax: 4,
"a", "b", "y", "z"
}
test_range! {
fst_range_2,
min: Bound::Excluded(vec![b'a']), max: Bound::Included(vec![b'y']),
imin: 1, imax: 3,
"a", "b", "y", "z"
}
test_range! {
fst_range_3,
min: Bound::Excluded(vec![b'a']), max: Bound::Excluded(vec![b'y']),
imin: 1, imax: 2,
"a", "b", "y", "z"
}
test_range! {
fst_range_4,
min: Bound::Unbounded, max: Bound::Unbounded,
imin: 0, imax: 4,
"a", "b", "y", "z"
}
test_range! {
fst_range_5,
min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded,
imin: 0, imax: 0,
"a", "ab", "abc", "abcd", "abcde"
}
test_range! {
fst_range_6,
min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded,
imin: 5, imax: 6,
"a", "ab", "abc", "abcd", "abcde", "abe"
}
test_range! {
fst_range_7,
min: Bound::Excluded(b"abd".to_vec()), max: Bound::Unbounded,
imin: 5, imax: 6,
"a", "ab", "abc", "abcd", "abcde", "abe"
}
test_range! {
fst_range_8,
min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded,
imin: 5, imax: 6,
"a", "ab", "abc", "abcd", "abcde", "xyz"
}
test_range! {
fst_range_9,
min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()),
imin: 0, imax: 5,
"a", "ab", "abc", "abcd", "abcde", "abe"
}
test_range! {
fst_range_10,
min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()),
imin: 0, imax: 6,
"a", "ab", "abc", "abcd", "abcde", "abd"
}
test_range! {
fst_range_11,
min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()),
imin: 0, imax: 6,
"a", "ab", "abc", "abcd", "abcde", "abd", "abdx"
}
test_range! {
fst_range_12,
min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()),
imin: 0, imax: 5,
"a", "ab", "abc", "abcd", "abcde", "abe"
}
test_range! {
fst_range_13,
min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()),
imin: 0, imax: 5,
"a", "ab", "abc", "abcd", "abcde", "abd"
}
test_range! {
fst_range_14,
min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()),
imin: 0, imax: 5,
"a", "ab", "abc", "abcd", "abcde", "abd", "abdx"
}
test_range! {
fst_range_15,
min: Bound::Included(vec![b'd']), max: Bound::Included(vec![b'c']),
imin: 0, imax: 0,
"a", "b", "c", "d", "e", "f"
}
test_range! {
fst_range_16,
min: Bound::Included(vec![b'c']), max: Bound::Included(vec![b'c']),
imin: 2, imax: 3,
"a", "b", "c", "d", "e", "f"
}
test_range! {
fst_range_17,
min: Bound::Excluded(vec![b'c']), max: Bound::Excluded(vec![b'c']),
imin: 0, imax: 0,
"a", "b", "c", "d", "e", "f"
}
test_range! {
fst_range_18,
min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'c']),
imin: 0, imax: 0,
"a", "b", "c", "d", "e", "f"
}
test_range! {
fst_range_19,
min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'd']),
imin: 2, imax: 3,
"a", "b", "c", "d", "e", "f"
}
#[test]
fn one_vec_multiple_fsts() {
let mut bfst1 = Builder::memory();
bfst1.add(b"bar").unwrap();
bfst1.add(b"baz").unwrap();
let bytes = bfst1.into_inner().unwrap();
let fst1_len = bytes.len();
let mut bfst2 = Builder::new(bytes).unwrap();
bfst2.add(b"bar").unwrap();
bfst2.add(b"foo").unwrap();
let bytes = Arc::new(bfst2.into_inner().unwrap());
let fst1 = Fst::from_shared_bytes(bytes.clone(), 0, fst1_len).unwrap();
let fst2 = Fst::from_shared_bytes(
bytes.clone(), fst1_len, bytes.len() - fst1_len).unwrap();
assert_eq!(fst_inputs(&fst1), vec![b"bar".to_vec(), b"baz".to_vec()]);
assert_eq!(fst_inputs(&fst2), vec![b"bar".to_vec(), b"foo".to_vec()]);
}
#[test]
fn bytes_written() {
let mut bfst1 = Builder::memory();
bfst1.add(b"bar").unwrap();
bfst1.add(b"baz").unwrap();
let counted_len = bfst1.bytes_written();
let bytes = bfst1.into_inner().unwrap();
let fst1_len = bytes.len() as u64;
let footer_size = 24;
assert_eq!(counted_len + footer_size, fst1_len);
}