//! Intended to be compatible with <https://github.com/MihaiValentin/lunr-languages>. Each supported
//! language has a trimmer, a stop word filter, and a stemmer. Most users will not need to use
//! these modules directly.
/// Generates a `trimmer` pipeline function.
///
/// `$reg` is spliced into two patterns with `concat!`, so it must be a string literal holding
/// the *contents* of a regex character class (the characters that count as part of a word).
/// The generated function removes the leading run and the trailing run of characters outside
/// that class. It always returns `Some`, even if nothing is left of the token.
// NOTE(review): the lint is presumably silenced because no module invokes this macro when every
// optional language feature is disabled -- confirm against the language modules.
#[allow(unused_macros)]
macro_rules! make_trimmer {
    ($reg:expr) => {
        /// Strips leading and trailing non-word characters from `token`.
        pub fn trimmer(token: String) -> Option<String> {
            use regex::Regex;
            lazy_static! {
                // Both patterns are assembled at compile time from a literal, so a failure
                // here is a programming error in the calling module, not a runtime condition.
                static ref START: Regex = Regex::new(concat!("^[^", $reg, "]+"))
                    .expect("trimmer character class must form a valid regex");
                static ref END: Regex = Regex::new(concat!("[^", $reg, "]+$"))
                    .expect("trimmer character class must form a valid regex");
            }
            // Trim the front first, then trim the back of what remains.
            let token = START.replace(&token, "");
            Some(END.replace(&token, "").into())
        }
    };
}
/// Generates a `stop_word_filter` pipeline function.
///
/// `$words` is an expression yielding a collection of `&'static str` that supports `.iter()`
/// (for example an array or slice of string literals). The words are loaded into a `HashSet`
/// once, on first use, and every later call is a single set lookup.
macro_rules! make_stop_word_filter {
    ($words:expr) => {
        /// Returns `None` if `token` is a stop word, otherwise hands the token back unchanged.
        pub fn stop_word_filter(token: String) -> Option<String> {
            use std::collections::HashSet;
            lazy_static! {
                static ref WORDS: HashSet<&'static str> = {
                    let words = $words;
                    words.iter().cloned().collect()
                };
            }
            if WORDS.contains(token.as_str()) {
                None
            } else {
                Some(token)
            }
        }
    };
}
/// Generates a `stemmer` pipeline function backed by the `rust_stemmers` crate.
///
/// `$lang` is the expression passed to `Stemmer::create`. The stemmer is built once, on first
/// use, and shared by all later calls. The generated function always returns `Some`.
#[cfg(feature = "rust-stemmers")]
macro_rules! make_stemmer {
    ($lang:expr) => {
        /// Reduces `token` to its stem.
        pub fn stemmer(token: String) -> Option<String> {
            // NOTE(review): `Algorithm` is not named in this body; it is presumably imported so
            // that a `$lang` argument written as `Algorithm::...` resolves at the expansion
            // site -- confirm against the callers before removing it.
            use rust_stemmers::{Algorithm, Stemmer};
            lazy_static! {
                static ref STEMMER: Stemmer = Stemmer::create($lang);
            }
            Some(STEMMER.stem(&token).into())
        }
    };
}
/// Used to configure the `Index` for a specific lanugage. | |
#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter)] | |
pub enum Language { | |
English, | |
#[cfg(feature = "da")] | |
Danish, | |
#[cfg(feature = "du")] | |
Dutch, | |
#[cfg(feature = "fi")] | |
Finnish, | |
#[cfg(feature = "fr")] | |
French, | |
#[cfg(feature = "de")] | |
German, | |
#[cfg(feature = "it")] | |
Italian, | |
#[cfg(feature = "pt")] | |
Portuguese, | |
#[cfg(feature = "ro")] | |
Romanian, | |
#[cfg(feature = "ru")] | |
Russian, | |
#[cfg(feature = "es")] | |
Spanish, | |
#[cfg(feature = "sv")] | |
Swedish, | |
#[cfg(feature = "tr")] | |
Turkish, | |
#[doc(hidden)] | |
#[strum(disabled = "true")] | |
__NonExhaustive, | |
} | |
impl Language { | |
/// Returns the `Language` for the given two-character [ISO 639-1][iso] language code if the | |
/// language is supported. Returns `None` if not supported. | |
/// | |
/// *Note:* | |
/// | |
/// The ISO 639-1 code for Dutch is "nl". However "du" is used for the module name | |
/// and pipeline suffix in order to match lunr-languages. | |
/// | |
/// [iso]: https://en.wikipedia.org/wiki/ISO_639-1 | |
pub fn from_code(code: &str) -> Option<Language> { | |
match code.to_ascii_lowercase().as_str() { | |
"en" => Some(Language::English), | |
#[cfg(feature = "da")] | |
"da" => Some(Language::Danish), | |
#[cfg(feature = "du")] | |
"nl" => Some(Language::Dutch), | |
#[cfg(feature = "fi")] | |
"fi" => Some(Language::Finnish), | |
#[cfg(feature = "fr")] | |
"fr" => Some(Language::French), | |
#[cfg(feature = "de")] | |
"de" => Some(Language::German), | |
#[cfg(feature = "it")] | |
"it" => Some(Language::Italian), | |
#[cfg(feature = "pt")] | |
"pt" => Some(Language::Portuguese), | |
#[cfg(feature = "ro")] | |
"ro" => Some(Language::Romanian), | |
#[cfg(feature = "ru")] | |
"ru" => Some(Language::Russian), | |
#[cfg(feature = "es")] | |
"es" => Some(Language::Spanish), | |
#[cfg(feature = "sv")] | |
"sv" => Some(Language::Swedish), | |
#[cfg(feature = "tr")] | |
"tr" => Some(Language::Turkish), | |
_ => None, | |
} | |
} | |
/// Returns the two-character [ISO 639-1][iso] language code for the `Language`. | |
/// | |
/// *Note:* | |
/// | |
/// The ISO 639-1 code for Dutch is "nl". However "du" is used for the module name | |
/// and pipeline suffix in order to match lunr-languages. | |
/// | |
/// [iso]: https://en.wikipedia.org/wiki/ISO_639-1 | |
pub fn to_code(&self) -> &'static str { | |
match *self { | |
Language::English => "en", | |
#[cfg(feature = "da")] | |
Language::Danish => "da", | |
#[cfg(feature = "du")] | |
Language::Dutch => "nl", | |
#[cfg(feature = "fi")] | |
Language::Finnish => "fi", | |
#[cfg(feature = "fr")] | |
Language::French => "fr", | |
#[cfg(feature = "de")] | |
Language::German => "de", | |
#[cfg(feature = "it")] | |
Language::Italian => "it", | |
#[cfg(feature = "pt")] | |
Language::Portuguese => "pt", | |
#[cfg(feature = "ro")] | |
Language::Romanian => "ro", | |
#[cfg(feature = "ru")] | |
Language::Russian => "ru", | |
#[cfg(feature = "es")] | |
Language::Spanish => "es", | |
#[cfg(feature = "sv")] | |
Language::Swedish => "sv", | |
#[cfg(feature = "tr")] | |
Language::Turkish => "tr", | |
_ => panic!("Don't use the __NonExhaustive variant!"), | |
} | |
} | |
/// Creates a pipeline for the [`Language`](../lang/enum.Language.html). | |
pub fn make_pipeline(&self) -> ::pipeline::Pipeline { | |
match *self { | |
Language::English => ::lang::en::make_pipeline(), | |
#[cfg(feature = "da")] | |
Language::Danish => ::lang::da::make_pipeline(), | |
#[cfg(feature = "du")] | |
Language::Dutch => ::lang::du::make_pipeline(), | |
#[cfg(feature = "fi")] | |
Language::Finnish => ::lang::fi::make_pipeline(), | |
#[cfg(feature = "fr")] | |
Language::French => ::lang::fr::make_pipeline(), | |
#[cfg(feature = "de")] | |
Language::German => ::lang::de::make_pipeline(), | |
#[cfg(feature = "it")] | |
Language::Italian => ::lang::it::make_pipeline(), | |
#[cfg(feature = "pt")] | |
Language::Portuguese => ::lang::pt::make_pipeline(), | |
#[cfg(feature = "ro")] | |
Language::Romanian => ::lang::ro::make_pipeline(), | |
#[cfg(feature = "ru")] | |
Language::Russian => ::lang::ru::make_pipeline(), | |
#[cfg(feature = "es")] | |
Language::Spanish => ::lang::es::make_pipeline(), | |
#[cfg(feature = "sv")] | |
Language::Swedish => ::lang::sv::make_pipeline(), | |
#[cfg(feature = "tr")] | |
Language::Turkish => ::lang::tr::make_pipeline(), | |
_ => panic!("Dont use the `__NonExhaustive` variant!"), | |
} | |
} | |
} | |
pub mod en; | |
#[cfg(feature = "da")] | |
pub mod da; | |
#[cfg(feature = "de")] | |
pub mod de; | |
#[cfg(feature = "du")] | |
pub mod du; | |
#[cfg(feature = "es")] | |
pub mod es; | |
#[cfg(feature = "fi")] | |
pub mod fi; | |
#[cfg(feature = "fr")] | |
pub mod fr; | |
#[cfg(feature = "it")] | |
pub mod it; | |
#[cfg(feature = "pt")] | |
pub mod pt; | |
#[cfg(feature = "ro")] | |
pub mod ro; | |
#[cfg(feature = "ru")] | |
pub mod ru; | |
#[cfg(feature = "sv")] | |
pub mod sv; | |
#[cfg(feature = "tr")] | |
pub mod tr; |