#[cfg(feature = "search-lindera")]
use lindera_tantivy::tokenizer::LinderaTokenizer;
use std::str::CharIndices;

use tantivy::tokenizer::*;

#[derive(Clone, Copy)]
pub enum TokenizerKind {
    Simple,
    Ngram,
    Whitespace,
    #[cfg(feature = "search-lindera")]
    Lindera,
}

impl From<TokenizerKind> for TextAnalyzer {
    fn from(tokenizer: TokenizerKind) -> TextAnalyzer {
        use TokenizerKind::*;

        match tokenizer {
            Simple => TextAnalyzer::from(SimpleTokenizer)
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser),
            Ngram => TextAnalyzer::from(NgramTokenizer::new(2, 8, false)).filter(LowerCaser),
            Whitespace => TextAnalyzer::from(WhitespaceTokenizer).filter(LowerCaser),
            #[cfg(feature = "search-lindera")]
            Lindera => {
                TextAnalyzer::from(LinderaTokenizer::new("decompose", "")).filter(LowerCaser)
            }
        }
    }
}
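
// A hedged sketch, not part of the original source: a small test exercising the
// TokenizerKind -> TextAnalyzer conversion above. The module name, sample text,
// and expected tokens are illustrative assumptions, not taken from the project.
#[cfg(test)]
mod analyzer_conversion_tests {
    use super::*;

    #[test]
    fn simple_analyzer_lowercases_and_splits_on_punctuation() {
        let analyzer = TextAnalyzer::from(TokenizerKind::Simple);
        let mut stream = analyzer.token_stream("Hello, World!");
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        // SimpleTokenizer splits on non-alphanumeric characters and LowerCaser lowercases.
        assert_eq!(tokens, vec!["hello", "world"]);
    }
}
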
/// Tokenizes text by splitting on whitespace. Essentially a copy of Tantivy's SimpleTokenizer,
/// except it does not split on punctuation.
#[derive(Clone)]
pub struct WhitespaceTokenizer;

pub struct WhitespaceTokenStream<'a> {
    text: &'a str,
    chars: CharIndices<'a>,
    token: Token,
}

impl Tokenizer for WhitespaceTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::from(WhitespaceTokenStream {
            text,
            chars: text.char_indices(),
            token: Token::default(),
        })
    }
}

impl<'a> WhitespaceTokenStream<'a> {
    // search for the end of the current token.
    fn search_token_end(&mut self) -> usize {
        (&mut self.chars)
            .filter(|&(_, ref c)| c.is_whitespace())
            .map(|(offset, _)| offset)
            .next()
            .unwrap_or(self.text.len())
    }
}

impl<'a> TokenStream for WhitespaceTokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.token.text.clear();
        self.token.position = self.token.position.wrapping_add(1);

        // Skip over whitespace; the first non-whitespace character begins the next token.
        loop {
            match self.chars.next() {
                Some((offset_from, c)) => {
                    if !c.is_whitespace() {
                        let offset_to = self.search_token_end();
                        self.token.offset_from = offset_from;
                        self.token.offset_to = offset_to;
                        self.token.text.push_str(&self.text[offset_from..offset_to]);
                        return true;
                    }
                }
                // No characters left: end of the token stream.
                None => {
                    return false;
                }
            }
        }
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
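
// A hedged sketch, not part of the original source: a test illustrating the doc
// comment on WhitespaceTokenizer above. Unlike SimpleTokenizer, this tokenizer
// splits only on whitespace, so punctuation stays attached to the surrounding word.
// The module name and sample text are illustrative assumptions.
#[cfg(test)]
mod whitespace_tokenizer_tests {
    use super::*;

    #[test]
    fn keeps_punctuation_attached_to_words() {
        let mut stream = WhitespaceTokenizer.token_stream("Hello, world! foo-bar");
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        assert_eq!(tokens, vec!["Hello,", "world!", "foo-bar"]);
    }
}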