Plume/plume-models/src/search/tokenizer.rs

use std::str::CharIndices;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

/// Tokenizer that splits text on whitespace only.
///
/// Pretty much a copy of Tantivy's `SimpleTokenizer`, except that punctuation does not end a
/// token: every maximal run of non-whitespace characters (as decided by
/// [`char::is_whitespace`]) becomes one token, so something like `user@domain` stays whole.
///
/// The type carries no state, so it is cheap to clone or to build with `Default`.
#[derive(Clone, Debug, Default)]
pub struct WhitespaceTokenizer;

/// Token stream handed out by `WhitespaceTokenizer::token_stream`.
///
/// Walks the borrowed input once, exposing one token per run of non-whitespace characters.
pub struct WhitespaceTokenStream<'a> {
    /// The full input being tokenized; token text is sliced out of it by byte offset.
    text: &'a str,
    /// Cursor over `text` yielding `(byte offset, char)` pairs. It is advanced both when
    /// looking for the start of a token and when looking for its end.
    chars: CharIndices<'a>,
    /// The current token; its text is cleared and refilled on every `advance` call.
    token: Token,
}

impl<'a> Tokenizer<'a> for WhitespaceTokenizer {
    type TokenStreamImpl = WhitespaceTokenStream<'a>;

    /// Builds a fresh stream over `text`.
    ///
    /// The stream borrows `text` for its whole lifetime and starts with a default `Token`;
    /// no token is available until `advance` has been called and returned `true`.
    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
        let chars = text.char_indices();
        let token = Token::default();
        WhitespaceTokenStream { text, chars, token }
    }
}
impl<'a> WhitespaceTokenStream<'a> {
    // search for the end of the current token.
    fn search_token_end(&mut self) -> usize {
        (&mut self.chars)
            .filter(|&(_, ref c)| c.is_whitespace())
            .map(|(offset, _)| offset)
            .next()
            .unwrap_or_else(|| self.text.len())
    }
}

impl<'a> TokenStream for WhitespaceTokenStream<'a> {
    /// Moves to the next whitespace-delimited token.
    ///
    /// Returns `true` when a token was found; its text, byte offsets and position are then
    /// available through `token`. Returns `false` once only whitespace (or nothing) remains.
    /// The position counter is bumped on every call, including the final one that returns
    /// `false`.
    fn advance(&mut self) -> bool {
        self.token.text.clear();
        // Wrapping add: presumably `Token::default()` starts at `usize::MAX` so that the
        // first token lands on position 0 -- confirm against Tantivy's `Token`.
        self.token.position = self.token.position.wrapping_add(1);

        // Skip any leading whitespace; the first non-whitespace character opens the token.
        let start = match self.chars.find(|&(_, ch)| !ch.is_whitespace()) {
            Some((offset, _)) => offset,
            None => return false,
        };
        let end = self.search_token_end();

        self.token.offset_from = start;
        self.token.offset_to = end;
        self.token.text.push_str(&self.text[start..end]);
        true
    }

    /// Borrows the token produced by the last successful `advance`.
    fn token(&self) -> &Token {
        &self.token
    }

    /// Mutably borrows the current token.
    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
Add a search engine into Plume (#324) * Add search engine to the model Add a Tantivy based search engine to the model Implement most required functions for it * Implement indexing and plm subcommands Implement indexation on insert, update and delete Modify func args to get the indexer where required Add subcommand to initialize, refill and unlock search db * Move to a new threadpool engine allowing scheduling * Autocommit search index every half an hour * Implement front part of search Add default fields for search Add new routes and templates for search and result Implement FromFormValue for Page to reuse it on search result pagination Add optional query parameters to paginate template's macro Update to newer rocket_csrf, don't get csrf token on GET forms * Handle process termination to release lock Handle process termination Add tests to search * Add proper support for advanced search Add an advanced search form to /search, in template and route Modify Tantivy schema, add new tokenizer for some properties Create new String query parser Create Tantivy query AST from our own * Split search.rs, add comment and tests Split search.rs into multiple submodules Add comments and tests for Query Make user@domain be treated as one could assume 2018-12-02 17:37:51 +01:00			`use std::str::CharIndices;`
			`use tantivy::tokenizer::{Token, TokenStream, Tokenizer};`

			`/// Tokenize the text by splitting on whitespaces. Pretty much a copy of Tantivy's SimpleTokenizer,`
			`/// but not splitting on punctuation`
			`#[derive(Clone)]`
			`pub struct WhitespaceTokenizer;`

			`pub struct WhitespaceTokenStream<'a> {`
			`text: &'a str,`
			`chars: CharIndices<'a>,`
			`token: Token,`
			`}`

			`impl<'a> Tokenizer<'a> for WhitespaceTokenizer {`
			`type TokenStreamImpl = WhitespaceTokenStream<'a>;`

			`fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {`
			`WhitespaceTokenStream {`
			`text,`
			`chars: text.char_indices(),`
			`token: Token::default(),`
			`}`
			`}`
			`}`
			`impl<'a> WhitespaceTokenStream<'a> {`
			`// search for the end of the current token.`
			`fn search_token_end(&mut self) -> usize {`
			`(&mut self.chars)`
			`.filter(\|&(_, ref c)\| c.is_whitespace())`
			`.map(\|(offset, _)\| offset)`
			`.next()`
			`.unwrap_or_else(\|\| self.text.len())`
			`}`
			`}`

			`impl<'a> TokenStream for WhitespaceTokenStream<'a> {`
			`fn advance(&mut self) -> bool {`
			`self.token.text.clear();`
			`self.token.position = self.token.position.wrapping_add(1);`

			`loop {`
			`match self.chars.next() {`
			`Some((offset_from, c)) => {`
			`if !c.is_whitespace() {`
			`let offset_to = self.search_token_end();`
			`self.token.offset_from = offset_from;`
			`self.token.offset_to = offset_to;`
			`self.token.text.push_str(&self.text[offset_from..offset_to]);`
			`return true;`
			`}`
			`}`
			`None => {`
			`return false;`
			`}`
			`}`
			`}`
			`}`

			`fn token(&self) -> &Token {`
			`&self.token`
			`}`

			`fn token_mut(&mut self) -> &mut Token {`
			`&mut self.token`
			`}`
			`}`