#[cfg(feature = "search-lindera")]
use lindera_tantivy::tokenizer::LinderaTokenizer;
use std::str::CharIndices;

use tantivy::tokenizer::*;

#[derive(Clone, Copy)]
pub enum TokenizerKind {
    Simple,
    Ngram,
    Whitespace,
    #[cfg(feature = "search-lindera")]
    Lindera,
}

impl From<TokenizerKind> for TextAnalyzer {
    fn from(tokenizer: TokenizerKind) -> TextAnalyzer {
        use TokenizerKind::*;

        match tokenizer {
            Simple => TextAnalyzer::from(SimpleTokenizer)
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser),
            Ngram => TextAnalyzer::from(NgramTokenizer::new(2, 8, false)).filter(LowerCaser),
            Whitespace => TextAnalyzer::from(WhitespaceTokenizer).filter(LowerCaser),
            #[cfg(feature = "search-lindera")]
            Lindera => {
                TextAnalyzer::from(LinderaTokenizer::new("decompose", "")).filter(LowerCaser)
            }
        }
    }
}
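
// A hedged sketch, not part of the original source: a small test exercising the
// TokenizerKind -> TextAnalyzer conversion above. The module name, sample text,
// and expected tokens are illustrative assumptions, not taken from the project.
#[cfg(test)]
mod analyzer_conversion_tests {
    use super::*;

    #[test]
    fn simple_analyzer_lowercases_and_splits_on_punctuation() {
        let analyzer = TextAnalyzer::from(TokenizerKind::Simple);
        let mut stream = analyzer.token_stream("Hello, World!");
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        // SimpleTokenizer splits on non-alphanumeric characters and LowerCaser lowercases.
        assert_eq!(tokens, vec!["hello", "world"]);
    }
}
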
/// Tokenizes text by splitting on whitespace. Essentially a copy of Tantivy's SimpleTokenizer,
/// except it does not split on punctuation.
#[derive(Clone)]
pub struct WhitespaceTokenizer;

pub struct WhitespaceTokenStream<'a> {
    text: &'a str,
    chars: CharIndices<'a>,
    token: Token,
}

impl Tokenizer for WhitespaceTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::from(WhitespaceTokenStream {
            text,
            chars: text.char_indices(),
            token: Token::default(),
        })
    }
}

impl<'a> WhitespaceTokenStream<'a> {
    // search for the end of the current token.
    fn search_token_end(&mut self) -> usize {
        (&mut self.chars)
            .filter(|&(_, ref c)| c.is_whitespace())
            .map(|(offset, _)| offset)
            .next()
            .unwrap_or(self.text.len())
    }
}

impl<'a> TokenStream for WhitespaceTokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.token.text.clear();
        self.token.position = self.token.position.wrapping_add(1);

        // Skip over whitespace; the first non-whitespace character begins the next token.
        loop {
            match self.chars.next() {
                Some((offset_from, c)) => {
                    if !c.is_whitespace() {
                        let offset_to = self.search_token_end();
                        self.token.offset_from = offset_from;
                        self.token.offset_to = offset_to;
                        self.token.text.push_str(&self.text[offset_from..offset_to]);
                        return true;
                    }
                }
                // No characters left: end of the token stream.
                None => {
                    return false;
                }
            }
        }
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
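
// A hedged sketch, not part of the original source: a test illustrating the doc
// comment on WhitespaceTokenizer above. Unlike SimpleTokenizer, this tokenizer
// splits only on whitespace, so punctuation stays attached to the surrounding word.
// The module name and sample text are illustrative assumptions.
#[cfg(test)]
mod whitespace_tokenizer_tests {
    use super::*;

    #[test]
    fn keeps_punctuation_attached_to_words() {
        let mut stream = WhitespaceTokenizer.token_stream("Hello, world! foo-bar");
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        assert_eq!(tokens, vec!["Hello,", "world!", "foo-bar"]);
    }
}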