blob: 7608502b9fff962d15f2fb496c781245add0a4aa [file] [log] [blame]
//! Intended to be compatible with <https://github.com/MihaiValentin/lunr-languages>. Each supported
//! language has a trimmer, a stop word filter, and a stemmer. Most users will not need to use
//! these modules directly.
// NOTE(review): the lint is presumably silenced because some feature combinations
// never invoke this macro — confirm before removing the attribute.
#[allow(unused_macros)]
// Generates `pub fn trimmer(token: String) -> Option<String>` in the invoking module.
//
// `$reg` is spliced into a regex character class via `concat!`, so it must be a literal
// listing the characters that count as part of a word. The generated function strips the
// leading and the trailing run of characters outside that class and always returns `Some`,
// even when nothing is left of the token.
macro_rules! make_trimmer {
    ($reg:expr) => {
        pub fn trimmer(token: String) -> Option<String> {
            use regex::Regex;
            // Both patterns are compiled once, on first use, and then shared.
            lazy_static! {
                static ref LEADING: Regex = Regex::new(concat!("^[^", $reg, "]+")).unwrap();
                static ref TRAILING: Regex = Regex::new(concat!("[^", $reg, "]+$")).unwrap();
            }
            // Each pattern is anchored, so a single `replace` covers the whole run.
            let head_trimmed = LEADING.replace(&token, "");
            let fully_trimmed = TRAILING.replace(&head_trimmed, "");
            Some(String::from(fully_trimmed))
        }
    };
}
// Generates `pub fn stop_word_filter(token: String) -> Option<String>` in the invoking
// module.
//
// `$words` is an expression yielding a collection of `&'static str` (it must support
// `.len()` and `.iter()`). The generated function returns `None` when the token is one of
// those words and hands the token back unchanged otherwise.
macro_rules! make_stop_word_filter {
    ($words:expr) => {
        pub fn stop_word_filter(token: String) -> Option<String> {
            use std::collections::HashSet;
            // The lookup set is built once, on first use, from the supplied word list.
            lazy_static! {
                static ref WORDS: HashSet<&'static str> = {
                    let list = $words;
                    let mut lookup = HashSet::with_capacity(list.len());
                    lookup.extend(list.iter().cloned());
                    lookup
                };
            }
            let is_stop_word = WORDS.contains(token.as_str());
            if is_stop_word {
                return None;
            }
            Some(token)
        }
    };
}
// Generates `pub fn stemmer(token: String) -> Option<String>` in the invoking module.
// Only defined when the `rust-stemmers` feature is enabled.
//
// `$lang` is the argument forwarded to `Stemmer::create`. The generated function always
// returns `Some`, wrapping whatever `Stemmer::stem` produces for the token.
#[cfg(feature = "rust-stemmers")]
macro_rules! make_stemmer {
    ($lang:expr) => {
        pub fn stemmer(token: String) -> Option<String> {
            // `Algorithm` is imported presumably so that `$lang` can be written as
            // `Algorithm::...` at the call site — confirm against the invocations.
            use rust_stemmers::{Algorithm, Stemmer};
            // The stemmer is created once, on first use, and then shared.
            lazy_static! {
                static ref STEM_ENGINE: Stemmer = Stemmer::create($lang);
            }
            let stem = STEM_ENGINE.stem(&token);
            Some(String::from(stem))
        }
    };
}
/// Used to configure the `Index` for a specific language.
///
/// `English` is always available. Every other variant exists only when the Cargo feature
/// named in its `#[cfg(feature = "...")]` attribute is enabled. Note that Dutch is gated
/// behind the feature `du`, not `nl`.
///
/// Besides the standard derives, the enum derives `EnumString`, `ToString` and `EnumIter`
/// (presumably from the `strum` crate, given the `#[strum(...)]` attribute below).
#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter)]
pub enum Language {
/// Always compiled in; not behind a feature flag.
English,
#[cfg(feature = "da")]
Danish,
#[cfg(feature = "du")]
Dutch,
#[cfg(feature = "fi")]
Finnish,
#[cfg(feature = "fr")]
French,
#[cfg(feature = "de")]
German,
#[cfg(feature = "it")]
Italian,
#[cfg(feature = "pt")]
Portuguese,
#[cfg(feature = "ro")]
Romanian,
#[cfg(feature = "ru")]
Russian,
#[cfg(feature = "es")]
Spanish,
#[cfg(feature = "sv")]
Swedish,
#[cfg(feature = "tr")]
Turkish,
// Sentinel variant: hidden from the generated docs and marked `disabled` for the strum
// derives. It is not a real language; `to_code` and `make_pipeline` panic when given it.
// NOTE(review): presumably present so downstream `match`es need a wildcard arm — confirm.
#[doc(hidden)]
#[strum(disabled = "true")]
__NonExhaustive,
}
impl Language {
    /// Returns the `Language` for the given two-character [ISO 639-1][iso] language code if the
    /// language is supported. Returns `None` if not supported.
    ///
    /// The comparison ignores ASCII case, so `"EN"` and `"en"` are equivalent. A language whose
    /// Cargo feature is disabled is treated as unsupported.
    ///
    /// *Note:*
    ///
    /// The ISO 639-1 code for Dutch is "nl". However "du" is used for the module name
    /// and pipeline suffix in order to match lunr-languages. Only "nl" is accepted here.
    ///
    /// [iso]: https://en.wikipedia.org/wiki/ISO_639-1
    pub fn from_code(code: &str) -> Option<Language> {
        match code.to_ascii_lowercase().as_str() {
            "en" => Some(Language::English),
            #[cfg(feature = "da")]
            "da" => Some(Language::Danish),
            #[cfg(feature = "du")]
            "nl" => Some(Language::Dutch),
            #[cfg(feature = "fi")]
            "fi" => Some(Language::Finnish),
            #[cfg(feature = "fr")]
            "fr" => Some(Language::French),
            #[cfg(feature = "de")]
            "de" => Some(Language::German),
            #[cfg(feature = "it")]
            "it" => Some(Language::Italian),
            #[cfg(feature = "pt")]
            "pt" => Some(Language::Portuguese),
            #[cfg(feature = "ro")]
            "ro" => Some(Language::Romanian),
            #[cfg(feature = "ru")]
            "ru" => Some(Language::Russian),
            #[cfg(feature = "es")]
            "es" => Some(Language::Spanish),
            #[cfg(feature = "sv")]
            "sv" => Some(Language::Swedish),
            #[cfg(feature = "tr")]
            "tr" => Some(Language::Turkish),
            _ => None,
        }
    }

    /// Returns the two-character [ISO 639-1][iso] language code for the `Language`.
    ///
    /// *Note:*
    ///
    /// The ISO 639-1 code for Dutch is "nl". However "du" is used for the module name
    /// and pipeline suffix in order to match lunr-languages.
    ///
    /// # Panics
    ///
    /// Panics if called on the hidden `__NonExhaustive` variant.
    ///
    /// [iso]: https://en.wikipedia.org/wiki/ISO_639-1
    pub fn to_code(&self) -> &'static str {
        match *self {
            Language::English => "en",
            #[cfg(feature = "da")]
            Language::Danish => "da",
            #[cfg(feature = "du")]
            Language::Dutch => "nl",
            #[cfg(feature = "fi")]
            Language::Finnish => "fi",
            #[cfg(feature = "fr")]
            Language::French => "fr",
            #[cfg(feature = "de")]
            Language::German => "de",
            #[cfg(feature = "it")]
            Language::Italian => "it",
            #[cfg(feature = "pt")]
            Language::Portuguese => "pt",
            #[cfg(feature = "ro")]
            Language::Romanian => "ro",
            #[cfg(feature = "ru")]
            Language::Russian => "ru",
            #[cfg(feature = "es")]
            Language::Spanish => "es",
            #[cfg(feature = "sv")]
            Language::Swedish => "sv",
            #[cfg(feature = "tr")]
            Language::Turkish => "tr",
            // Named explicitly (no wildcard) so that adding a variant without an arm here
            // is a compile error rather than a runtime panic.
            Language::__NonExhaustive => panic!("Don't use the __NonExhaustive variant!"),
        }
    }

    /// Creates a pipeline for the [`Language`](../lang/enum.Language.html).
    ///
    /// Delegates to the `make_pipeline` function of the matching language module.
    ///
    /// # Panics
    ///
    /// Panics if called on the hidden `__NonExhaustive` variant.
    pub fn make_pipeline(&self) -> ::pipeline::Pipeline {
        match *self {
            Language::English => ::lang::en::make_pipeline(),
            #[cfg(feature = "da")]
            Language::Danish => ::lang::da::make_pipeline(),
            #[cfg(feature = "du")]
            Language::Dutch => ::lang::du::make_pipeline(),
            #[cfg(feature = "fi")]
            Language::Finnish => ::lang::fi::make_pipeline(),
            #[cfg(feature = "fr")]
            Language::French => ::lang::fr::make_pipeline(),
            #[cfg(feature = "de")]
            Language::German => ::lang::de::make_pipeline(),
            #[cfg(feature = "it")]
            Language::Italian => ::lang::it::make_pipeline(),
            #[cfg(feature = "pt")]
            Language::Portuguese => ::lang::pt::make_pipeline(),
            #[cfg(feature = "ro")]
            Language::Romanian => ::lang::ro::make_pipeline(),
            #[cfg(feature = "ru")]
            Language::Russian => ::lang::ru::make_pipeline(),
            #[cfg(feature = "es")]
            Language::Spanish => ::lang::es::make_pipeline(),
            #[cfg(feature = "sv")]
            Language::Swedish => ::lang::sv::make_pipeline(),
            #[cfg(feature = "tr")]
            Language::Turkish => ::lang::tr::make_pipeline(),
            // Named explicitly (no wildcard) so that adding a variant without an arm here
            // is a compile error rather than a runtime panic.
            Language::__NonExhaustive => panic!("Don't use the `__NonExhaustive` variant!"),
        }
    }
}
// Per-language submodules. `en` is always compiled; each of the others is compiled only
// when the Cargo feature of the same name is enabled. `Language::make_pipeline` calls the
// `make_pipeline` function of the matching module.
// NOTE(review): these declarations come after the `make_*` macros; `macro_rules!` textual
// scoping requires that order if the submodules invoke those macros — keep them below.
pub mod en;
#[cfg(feature = "da")]
pub mod da;
#[cfg(feature = "de")]
pub mod de;
#[cfg(feature = "du")]
pub mod du;
#[cfg(feature = "es")]
pub mod es;
#[cfg(feature = "fi")]
pub mod fi;
#[cfg(feature = "fr")]
pub mod fr;
#[cfg(feature = "it")]
pub mod it;
#[cfg(feature = "pt")]
pub mod pt;
#[cfg(feature = "ro")]
pub mod ro;
#[cfg(feature = "ru")]
pub mod ru;
#[cfg(feature = "sv")]
pub mod sv;
#[cfg(feature = "tr")]
pub mod tr;