use instance::Instance;
use posts::Post;
use tags::Tag;
use Connection;

use chrono::Datelike;
use itertools::Itertools;
use std::{cmp, fs::create_dir_all, path::Path, sync::Mutex};
use tantivy::{
    collector::TopDocs, directory::MmapDirectory, schema::*, tokenizer::*, Index, IndexWriter, Term,
};
use whatlang::{detect as detect_lang, Lang};

use super::tokenizer;
use search::query::PlumeQuery;
use Result;
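
/// Errors that can occur while creating, opening, or editing the search index.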
#[derive(Debug)]
pub enum SearcherError {
    IndexCreationError,
    WriteLockAcquisitionError,
    IndexOpeningError,
    IndexEditionError,
}
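
/// A full-text search index over posts, backed by Tantivy.
///
/// A minimal usage sketch (marked `ignore` since it assumes a live database
/// `Connection`, an existing `Post`, and a `PlumeQuery` built elsewhere):
///
/// ```ignore
/// let searcher = Searcher::open(&"search_index")?;
/// searcher.add_document(&conn, &post)?;
/// searcher.commit();
/// let results = searcher.search_document(&conn, query, (0, 10));
/// ```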
pub struct Searcher {
    index: Index,
    writer: Mutex<Option<IndexWriter>>,
}

impl Searcher {
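    /// Builds the Tantivy schema used by `create` and `open`: a stored and indexed
    /// `post_id`, an indexed `creation_date`, whitespace-tokenized tag-like fields
    /// (`instance`, `author`, `tag`), content fields indexed with frequencies and
    /// positions (`blog`, `content`, `subtitle`, `title`), and n-gram indexed
    /// property fields (`lang`, `license`).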
    pub fn schema() -> Schema {
        let tag_indexing = TextOptions::default().set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("whitespace_tokenizer")
                .set_index_option(IndexRecordOption::Basic),
        );

        let content_indexing = TextOptions::default().set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("content_tokenizer")
                .set_index_option(IndexRecordOption::WithFreqsAndPositions),
        );

        let property_indexing = TextOptions::default().set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("property_tokenizer")
                .set_index_option(IndexRecordOption::WithFreqsAndPositions),
        );

        let mut schema_builder = SchemaBuilder::default();

        schema_builder.add_i64_field("post_id", INT_STORED | INT_INDEXED);
        schema_builder.add_i64_field("creation_date", INT_INDEXED);

        schema_builder.add_text_field("instance", tag_indexing.clone());
        schema_builder.add_text_field("author", tag_indexing.clone());
        schema_builder.add_text_field("tag", tag_indexing);

        schema_builder.add_text_field("blog", content_indexing.clone());
        schema_builder.add_text_field("content", content_indexing.clone());
        schema_builder.add_text_field("subtitle", content_indexing.clone());
        schema_builder.add_text_field("title", content_indexing);

        schema_builder.add_text_field("lang", property_indexing.clone());
        schema_builder.add_text_field("license", property_indexing);

        schema_builder.build()
    }
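
    /// Creates the index directory and a new Tantivy index at `path`, registers the
    /// tokenizers referenced by the schema, and acquires the index writer.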
    pub fn create(path: &AsRef<Path>) -> Result<Self> {
        let whitespace_tokenizer = tokenizer::WhitespaceTokenizer.filter(LowerCaser);

        let content_tokenizer = SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser);

        let property_tokenizer = NgramTokenizer::new(2, 8, false).filter(LowerCaser);

        let schema = Self::schema();

        create_dir_all(path).map_err(|_| SearcherError::IndexCreationError)?;
        let index = Index::create(
            MmapDirectory::open(path).map_err(|_| SearcherError::IndexCreationError)?,
            schema,
        )
        .map_err(|_| SearcherError::IndexCreationError)?;

        {
            let tokenizer_manager = index.tokenizers();
            tokenizer_manager.register("whitespace_tokenizer", whitespace_tokenizer);
            tokenizer_manager.register("content_tokenizer", content_tokenizer);
            tokenizer_manager.register("property_tokenizer", property_tokenizer);
        } // to please the borrow checker

        Ok(Self {
            writer: Mutex::new(Some(
                index
                    .writer(50_000_000)
                    .map_err(|_| SearcherError::WriteLockAcquisitionError)?,
            )),
            index,
        })
    }
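
    /// Opens the existing index at `path`, re-registers the tokenizers, and
    /// garbage-collects files no longer referenced by the index before taking
    /// the write lock.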
    pub fn open(path: &AsRef<Path>) -> Result<Self> {
        let whitespace_tokenizer = tokenizer::WhitespaceTokenizer.filter(LowerCaser);

        let content_tokenizer = SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser);

        let property_tokenizer = NgramTokenizer::new(2, 8, false).filter(LowerCaser);

        let index =
            Index::open(MmapDirectory::open(path).map_err(|_| SearcherError::IndexOpeningError)?)
                .map_err(|_| SearcherError::IndexOpeningError)?;

        {
            let tokenizer_manager = index.tokenizers();
            tokenizer_manager.register("whitespace_tokenizer", whitespace_tokenizer);
            tokenizer_manager.register("content_tokenizer", content_tokenizer);
            tokenizer_manager.register("property_tokenizer", property_tokenizer);
        } // to please the borrow checker

        let mut writer = index
            .writer(50_000_000)
            .map_err(|_| SearcherError::WriteLockAcquisitionError)?;
        writer
            .garbage_collect_files()
            .map_err(|_| SearcherError::IndexEditionError)?;

        Ok(Self {
            writer: Mutex::new(Some(writer)),
            index,
        })
    }
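
    /// Builds one index document from a post: the authors' fully-qualified names,
    /// the instance domain, tags, blog title, content, subtitle, title, detected
    /// language (falling back to English), and license.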
    pub fn add_document(&self, conn: &Connection, post: &Post) -> Result<()> {
        let schema = self.index.schema();

        let post_id = schema.get_field("post_id").unwrap();
        let creation_date = schema.get_field("creation_date").unwrap();

        let instance = schema.get_field("instance").unwrap();
        let author = schema.get_field("author").unwrap();
        let tag = schema.get_field("tag").unwrap();

        let blog_name = schema.get_field("blog").unwrap();
        let content = schema.get_field("content").unwrap();
        let subtitle = schema.get_field("subtitle").unwrap();
        let title = schema.get_field("title").unwrap();

        let lang = schema.get_field("lang").unwrap();
        let license = schema.get_field("license").unwrap();

        let mut writer = self.writer.lock().unwrap();
        let writer = writer.as_mut().unwrap();
        writer.add_document(doc!(
            post_id => i64::from(post.id),
            author => post.get_authors(conn)?.into_iter().map(|u| u.fqn).join(" "),
            creation_date => i64::from(post.creation_date.num_days_from_ce()),
            instance => Instance::get(conn, post.get_blog(conn)?.instance_id)?.public_domain.clone(),
            tag => Tag::for_post(conn, post.id)?.into_iter().map(|t| t.tag).join(" "),
            blog_name => post.get_blog(conn)?.title,
            content => post.content.get().clone(),
            subtitle => post.subtitle.clone(),
            title => post.title.clone(),
            lang => detect_lang(post.content.get()).and_then(|i| if i.is_reliable() { Some(i.lang()) } else { None }).unwrap_or(Lang::Eng).name(),
            license => post.license.clone(),
        ));
        Ok(())
    }
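
    /// Deletes every indexed document whose `post_id` term matches the given post.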
    pub fn delete_document(&self, post: &Post) {
        let schema = self.index.schema();
        let post_id = schema.get_field("post_id").unwrap();

        let doc_id = Term::from_field_i64(post_id, i64::from(post.id));
        let mut writer = self.writer.lock().unwrap();
        let writer = writer.as_mut().unwrap();
        writer.delete_term(doc_id);
    }
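
    /// Re-indexes a post by deleting its current document and adding it again.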
    pub fn update_document(&self, conn: &Connection, post: &Post) -> Result<()> {
        self.delete_document(post);
        self.add_document(conn, post)
    }
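
    /// Runs `query` against the index, collecting up to `max` top hits, skipping
    /// the first `min`, and resolving the remaining ids to `Post`s, dropping any
    /// that no longer exist in the database.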
    pub fn search_document(
        &self,
        conn: &Connection,
        query: PlumeQuery,
        (min, max): (i32, i32),
    ) -> Vec<Post> {
        let schema = self.index.schema();
        let post_id = schema.get_field("post_id").unwrap();

        let collector = TopDocs::with_limit(cmp::max(1, max) as usize);

        let searcher = self.index.searcher();
        let res = searcher.search(&query.into_query(), &collector).unwrap();

        res.get(min as usize..)
            .unwrap_or(&[])
            .iter()
            .filter_map(|(_, doc_add)| {
                let doc = searcher.doc(*doc_add).ok()?;
                let id = doc.get_first(post_id)?;
                Post::get(conn, id.i64_value() as i32).ok()
                // the borrow checker doesn't let us use filter_map or and_then here
            })
            .collect()
    }
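
    /// Commits pending index changes and reloads the searchers so new documents
    /// become visible to subsequent queries.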
    pub fn commit(&self) {
        let mut writer = self.writer.lock().unwrap();
        writer.as_mut().unwrap().commit().unwrap();
        self.index.load_searchers().unwrap();
    }
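
    /// Drops the index writer, releasing Tantivy's write lock on the index directory.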
    pub fn drop_writer(&self) {
        self.writer.lock().unwrap().take();
    }
}