80a4dae8bd
- Use `Result` as much as possible
- Display errors instead of panicking

TODO (maybe in another PR? this one is already quite big):

- Find a way to merge Ructe/ErrorPage types, so that we can have routes returning `Result<X, ErrorPage>` instead of panicking when we have an `Error`
- Display more details about the error, to make it easier to debug

(Sorry, this isn't going to be fun to review: the diff is huge, but they are always the same changes.)

206 lines · 8.0 KiB · Rust

use instance::Instance;
use posts::Post;
use tags::Tag;
use Connection;

use chrono::Datelike;
use itertools::Itertools;
use tantivy::{
    collector::TopCollector, directory::MmapDirectory,
    schema::*, tokenizer::*, Index, IndexWriter, Term,
};
use whatlang::{detect as detect_lang, Lang};
use std::{cmp, fs::create_dir_all, path::Path, sync::Mutex};

use search::query::PlumeQuery;
use super::tokenizer;
use Result;

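/// Errors the search index can report. As per the commit message above, they
/// are surfaced through `Result` so callers can display them instead of
/// panicking.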
#[derive(Debug)]
pub enum SearcherError {
    IndexCreationError,
    WriteLockAcquisitionError,
    IndexOpeningError,
    IndexEditionError,
}

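/// A full-text index of posts, backed by tantivy.
///
/// tantivy allows a single `IndexWriter` per index; keeping it in a
/// `Mutex<Option<_>>` lets methods share it and lets `drop_writer` release
/// it on shutdown.
///
/// A minimal usage sketch (the index path, `conn` and `post` below are
/// illustrative, not defined in this file):
///
/// ```rust,ignore
/// let searcher = Searcher::open(&"search_index")?;
/// searcher.add_document(&conn, &post)?;
/// searcher.commit(); // changes only become searchable after a commit
/// ```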
pub struct Searcher {
    index: Index,
    writer: Mutex<Option<IndexWriter>>,
}

impl Searcher {
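    /// Builds the schema shared by `create` and `open`. Three indexing
    /// profiles are defined:
    /// - tag-like fields (instance, author, tag): whitespace-split terms;
    /// - content fields (blog, content, subtitle, title): plain text with
    ///   term frequencies and positions;
    /// - property fields (lang, license): 2- to 8-character n-grams, so
    ///   short values match on partial input.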
    pub fn schema() -> Schema {
        let tag_indexing = TextOptions::default()
            .set_indexing_options(TextFieldIndexing::default()
                .set_tokenizer("whitespace_tokenizer")
                .set_index_option(IndexRecordOption::Basic));

        let content_indexing = TextOptions::default()
            .set_indexing_options(TextFieldIndexing::default()
                .set_tokenizer("content_tokenizer")
                .set_index_option(IndexRecordOption::WithFreqsAndPositions));

        let property_indexing = TextOptions::default()
            .set_indexing_options(TextFieldIndexing::default()
                .set_tokenizer("property_tokenizer")
                .set_index_option(IndexRecordOption::WithFreqsAndPositions));

        let mut schema_builder = SchemaBuilder::default();

        schema_builder.add_i64_field("post_id", INT_STORED | INT_INDEXED);
        schema_builder.add_i64_field("creation_date", INT_INDEXED);

        schema_builder.add_text_field("instance", tag_indexing.clone());
        schema_builder.add_text_field("author", tag_indexing.clone());
        schema_builder.add_text_field("tag", tag_indexing);

        schema_builder.add_text_field("blog", content_indexing.clone());
        schema_builder.add_text_field("content", content_indexing.clone());
        schema_builder.add_text_field("subtitle", content_indexing.clone());
        schema_builder.add_text_field("title", content_indexing);

        schema_builder.add_text_field("lang", property_indexing.clone());
        schema_builder.add_text_field("license", property_indexing);

        schema_builder.build()
    }

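    /// Creates a new index at `path` (creating the directory if needed) and
    /// registers the three custom tokenizers the schema refers to. The
    /// writer gets a 50 MB heap (`index.writer(50_000_000)`) before it
    /// flushes segments to disk.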
    pub fn create(path: &AsRef<Path>) -> Result<Self> {
        let whitespace_tokenizer = tokenizer::WhitespaceTokenizer
            .filter(LowerCaser);

        let content_tokenizer = SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser);

        let property_tokenizer = NgramTokenizer::new(2, 8, false)
            .filter(LowerCaser);

        let schema = Self::schema();

        create_dir_all(path).map_err(|_| SearcherError::IndexCreationError)?;
        let index = Index::create(
            MmapDirectory::open(path).map_err(|_| SearcherError::IndexCreationError)?,
            schema,
        ).map_err(|_| SearcherError::IndexCreationError)?;

        {
            let tokenizer_manager = index.tokenizers();
            tokenizer_manager.register("whitespace_tokenizer", whitespace_tokenizer);
            tokenizer_manager.register("content_tokenizer", content_tokenizer);
            tokenizer_manager.register("property_tokenizer", property_tokenizer);
        } // end the borrow of `index` so it can be moved into `Self`
        Ok(Self {
            writer: Mutex::new(Some(index.writer(50_000_000).map_err(|_| SearcherError::WriteLockAcquisitionError)?)),
            index,
        })
    }

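    /// Opens an existing index at `path`. The tokenizers are not persisted
    /// with the index, so they are registered again, and segment files left
    /// behind by a previous run are garbage-collected.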
    pub fn open(path: &AsRef<Path>) -> Result<Self> {
        let whitespace_tokenizer = tokenizer::WhitespaceTokenizer
            .filter(LowerCaser);

        let content_tokenizer = SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser);

        let property_tokenizer = NgramTokenizer::new(2, 8, false)
            .filter(LowerCaser);

        let index = Index::open(MmapDirectory::open(path).map_err(|_| SearcherError::IndexOpeningError)?)
            .map_err(|_| SearcherError::IndexOpeningError)?;

        {
            let tokenizer_manager = index.tokenizers();
            tokenizer_manager.register("whitespace_tokenizer", whitespace_tokenizer);
            tokenizer_manager.register("content_tokenizer", content_tokenizer);
            tokenizer_manager.register("property_tokenizer", property_tokenizer);
        } // end the borrow of `index` so it can be moved into `Self`
        let mut writer = index.writer(50_000_000).map_err(|_| SearcherError::WriteLockAcquisitionError)?;
        writer.garbage_collect_files().map_err(|_| SearcherError::IndexEditionError)?;
        Ok(Self {
            writer: Mutex::new(Some(writer)),
            index,
        })
    }

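    /// Indexes `post`. The `get_field` calls can safely unwrap because every
    /// field name comes from `Self::schema()`. The post's language is
    /// detected from its content, falling back to English when detection is
    /// not reliable.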
    pub fn add_document(&self, conn: &Connection, post: &Post) -> Result<()> {
        let schema = self.index.schema();

        let post_id = schema.get_field("post_id").unwrap();
        let creation_date = schema.get_field("creation_date").unwrap();

        let instance = schema.get_field("instance").unwrap();
        let author = schema.get_field("author").unwrap();
        let tag = schema.get_field("tag").unwrap();

        let blog_name = schema.get_field("blog").unwrap();
        let content = schema.get_field("content").unwrap();
        let subtitle = schema.get_field("subtitle").unwrap();
        let title = schema.get_field("title").unwrap();

        let lang = schema.get_field("lang").unwrap();
        let license = schema.get_field("license").unwrap();

        let mut writer = self.writer.lock().unwrap();
        let writer = writer.as_mut().unwrap();
        writer.add_document(doc!(
            post_id => i64::from(post.id),
            author => post.get_authors(conn)?.into_iter().map(|u| u.get_fqn(conn)).join(" "),
            creation_date => i64::from(post.creation_date.num_days_from_ce()),
            instance => Instance::get(conn, post.get_blog(conn)?.instance_id)?.public_domain.clone(),
            tag => Tag::for_post(conn, post.id)?.into_iter().map(|t| t.tag).join(" "),
            blog_name => post.get_blog(conn)?.title,
            content => post.content.get().clone(),
            subtitle => post.subtitle.clone(),
            title => post.title.clone(),
            lang => detect_lang(post.content.get()).and_then(|i| if i.is_reliable() { Some(i.lang()) } else { None }).unwrap_or(Lang::Eng).name(),
            license => post.license.clone(),
        ));
        Ok(())
    }

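    /// Queues the deletion of every document whose `post_id` matches; like
    /// additions, this only takes effect at the next `commit`.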
    pub fn delete_document(&self, post: &Post) {
        let schema = self.index.schema();
        let post_id = schema.get_field("post_id").unwrap();

        let doc_id = Term::from_field_i64(post_id, i64::from(post.id));
        let mut writer = self.writer.lock().unwrap();
        let writer = writer.as_mut().unwrap();
        writer.delete_term(doc_id);
    }

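    /// Reindexes `post` by deleting its old document and adding a new one.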
    pub fn update_document(&self, conn: &Connection, post: &Post) -> Result<()> {
        self.delete_document(post);
        self.add_document(conn, post)
    }

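    /// Runs `query` and returns the matching posts between ranks `min` and
    /// `max`: the top `max` results are collected, then the first `min` are
    /// skipped (offset/limit pagination).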
    pub fn search_document(&self, conn: &Connection, query: PlumeQuery, (min, max): (i32, i32)) -> Vec<Post> {
        let schema = self.index.schema();
        let post_id = schema.get_field("post_id").unwrap();

        let mut collector = TopCollector::with_limit(cmp::max(1, max) as usize);

        let searcher = self.index.searcher();
        searcher.search(&query.into_query(), &mut collector).unwrap();

        collector.docs().get(min as usize..).unwrap_or(&[])
            .into_iter()
            .filter_map(|doc_add| {
                let doc = searcher.doc(*doc_add).ok()?;
                let id = doc.get_first(post_id)?;
                Post::get(conn, id.i64_value() as i32).ok()
                // the borrow checker won't let us chain these calls with and_then
            })
            .collect()
    }

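    /// Persists pending additions and deletions, then reloads the searchers
    /// so the changes become visible to `search_document`.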
    pub fn commit(&self) {
        let mut writer = self.writer.lock().unwrap();
        writer.as_mut().unwrap().commit().unwrap();
        self.index.load_searchers().unwrap();
    }

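    /// Drops the `IndexWriter`, releasing tantivy's writer lock so the index
    /// can be opened for writing again (e.g. after a restart).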
    pub fn drop_writer(&self) {
        self.writer.lock().unwrap().take();
    }
}