use crate::search::searcher::Searcher;
use chrono::{naive::NaiveDate, offset::Utc, Datelike};
use std::{cmp, ops::Bound};
use tantivy::{query::*, schema::*, Term};

// Generate the setter functions for the advanced search
macro_rules! gen_func {
    ( $($field:ident),*; strip: $($strip:ident),* ) => {
        $( // most fields go here; this is the default behaviour
            pub fn $field(&mut self, mut val: &str, occur: Option<Occur>) -> &mut Self {
                if !val.trim_matches(&[' ', '"', '+', '-'][..]).is_empty() {
                    let occur = if let Some(occur) = occur {
                        occur
                    } else {
                        if val.get(0..1).map(|v| v == "+").unwrap_or(false) {
                            val = &val[1..];
                            Occur::Must
                        } else if val.get(0..1).map(|v| v == "-").unwrap_or(false) {
                            val = &val[1..];
                            Occur::MustNot
                        } else {
                            Occur::Should
                        }
                    };
                    self.$field.push((occur, val.trim_matches(&[' ', '"'][..]).to_owned()));
                }
                self
            }
        )*
        $( // blog and author go here; a leading '@' is stripped
            pub fn $strip(&mut self, mut val: &str, occur: Option<Occur>) -> &mut Self {
                if !val.trim_matches(&[' ', '"', '+', '-'][..]).is_empty() {
                    let occur = if let Some(occur) = occur {
                        occur
                    } else {
                        if val.get(0..1).map(|v| v == "+").unwrap_or(false) {
                            val = &val[1..];
                            Occur::Must
                        } else if val.get(0..1).map(|v| v == "-").unwrap_or(false) {
                            val = &val[1..];
                            Occur::MustNot
                        } else {
                            Occur::Should
                        }
                    };
                    self.$strip.push((occur, val.trim_matches(&[' ', '"', '@'][..]).to_owned()));
                }
                self
            }
        )*
    }
}
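
// Illustrative sketch of what the generated setters do (not actual expanded code):
// `q.title("+hello", None)` resolves the leading '+' to `Occur::Must` and stores
// `(Occur::Must, "hello")`; `q.author("@jane", None)` uses the `strip` variant,
// which also trims the leading '@' and stores `(Occur::Should, "jane")`.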
// Generate the parser for an advanced query string
macro_rules! gen_parser {
    ( $self:ident, $query:ident, $occur:ident; normal: $($field:ident),*; date: $($date:ident),*) => {
        $( // most fields go here
            if $query.starts_with(concat!(stringify!($field), ':')) {
                let new_query = &$query[concat!(stringify!($field), ':').len()..];
                let (token, rest) = Self::get_first_token(new_query);
                $query = rest;
                $self.$field(token, Some($occur));
            } else
        )*
        $( // dates (before/after) go here
            if $query.starts_with(concat!(stringify!($date), ':')) {
                let new_query = &$query[concat!(stringify!($date), ':').len()..];
                let (token, rest) = Self::get_first_token(new_query);
                $query = rest;
                if let Ok(token) = NaiveDate::parse_from_str(token, "%Y-%m-%d") {
                    $self.$date(&token);
                }
            } else
        )* // tokens without a 'fieldname:' prefix are bare words, searched in title, subtitle and content
        {
            let (token, rest) = Self::get_first_token($query);
            $query = rest;
            $self.text(token, Some($occur));
        }
    }
}
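
// Illustrative sketch of how the generated parser dispatches a query string:
// parsing `title:"Hello world" after:2021-01-01 rust` sends `"Hello world"` to
// `self.title(..)`, the date to `self.after(..)`, and the bare word `rust`
// (no `field:` prefix) to `self.text(..)`, which is later matched against
// title, subtitle and content.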
// Generate the to_string, giving back a textual query from a PlumeQuery
macro_rules! gen_to_string {
    ( $self:ident, $result:ident; normal: $($field:ident),*; date: $($date:ident),*) => {
        $(
            for (occur, val) in &$self.$field {
                if val.contains(' ') {
                    $result.push_str(&format!("{}{}:\"{}\" ", Self::occur_to_str(*occur), stringify!($field), val));
                } else {
                    $result.push_str(&format!("{}{}:{} ", Self::occur_to_str(*occur), stringify!($field), val));
                }
            }
        )*
        $(
            for val in &$self.$date {
                $result.push_str(&format!("{}:{} ", stringify!($date), NaiveDate::from_num_days_from_ce(*val as i32).format("%Y-%m-%d")));
            }
        )*
    }
}
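
// Illustrative sketch of the output format: a query holding `(Occur::Must, "hello")`
// in `title` and a `before` bound of 2021-01-01 serializes to
// `+title:hello before:2021-01-01`.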
// Convert a PlumeQuery to Tantivy's Query
macro_rules! gen_to_query {
    ( $self:ident, $result:ident; normal: $($normal:ident),*; oneoff: $($oneoff:ident),*) => {
        $( // classic fields
            for (occur, token) in $self.$normal {
                $result.push((occur, Self::token_to_query(&token, stringify!($normal))));
            }
        )*
        $( // fields where more than one Must makes no sense in general, so it is treated as "must be one of these" instead.
            // Those fields are instance, author, blog, lang and license
            let mut subresult = Vec::new();
            for (occur, token) in $self.$oneoff {
                match occur {
                    Occur::Must => subresult.push((Occur::Should, Self::token_to_query(&token, stringify!($oneoff)))),
                    occur => $result.push((occur, Self::token_to_query(&token, stringify!($oneoff)))),
                }
            }
            if !subresult.is_empty() {
                $result.push((Occur::Must, Box::new(BooleanQuery::from(subresult))));
            }
        )*
    }
}
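
// Illustrative sketch of the "oneoff" behaviour: two Must entries on `lang`
// (e.g. `+lang:en +lang:fr`) are demoted to Should inside a nested BooleanQuery
// which is itself pushed with Must, so a match on at least one of them is
// required rather than a match on all of them.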
#[derive(Default)]
pub struct PlumeQuery {
    text: Vec<(Occur, String)>,
    title: Vec<(Occur, String)>,
    subtitle: Vec<(Occur, String)>,
    content: Vec<(Occur, String)>,
    tag: Vec<(Occur, String)>,
    instance: Vec<(Occur, String)>,
    author: Vec<(Occur, String)>,
    blog: Vec<(Occur, String)>,
    lang: Vec<(Occur, String)>,
    license: Vec<(Occur, String)>,
    before: Option<i64>,
    after: Option<i64>,
}

impl PlumeQuery {
    /// Create a new empty Query
    pub fn new() -> Self {
        Default::default()
    }

    /// Parse a query string into this Query
    pub fn parse_query(&mut self, query: &str) -> &mut Self {
        self.from_str_req(query.trim())
    }

    /// Convert this Query to a Tantivy Query
    pub fn into_query(self) -> BooleanQuery {
        let mut result: Vec<(Occur, Box<dyn Query>)> = Vec::new();
        gen_to_query!(self, result; normal: title, subtitle, content, tag;
                      oneoff: instance, author, blog, lang, license);

        for (occur, token) in self.text {
            // text entries need to be added as multiple Terms
            match occur {
                Occur::Must => {
                    // a Must means the token must be in at least one of title, subtitle or content, not in all three
                    let subresult = vec![
                        (Occur::Should, Self::token_to_query(&token, "title")),
                        (Occur::Should, Self::token_to_query(&token, "subtitle")),
                        (Occur::Should, Self::token_to_query(&token, "content")),
                    ];

                    result.push((Occur::Must, Box::new(BooleanQuery::from(subresult))));
                }
                occur => {
                    result.push((occur, Self::token_to_query(&token, "title")));
                    result.push((occur, Self::token_to_query(&token, "subtitle")));
                    result.push((occur, Self::token_to_query(&token, "content")));
                }
            }
        }

        if self.before.is_some() || self.after.is_some() {
            // if at least one range bound is provided
            let after = self
                .after
                .unwrap_or_else(|| i64::from(NaiveDate::from_ymd(2000, 1, 1).num_days_from_ce()));
            let before = self
                .before
                .unwrap_or_else(|| i64::from(Utc::today().num_days_from_ce()));
            let field = Searcher::schema().get_field("creation_date").unwrap();
            let range =
                RangeQuery::new_i64_bounds(field, Bound::Included(after), Bound::Included(before));
            result.push((Occur::Must, Box::new(range)));
        }

        result.into()
    }
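
    // Usage sketch (hypothetical call site): fill the query, then convert it
    // into a BooleanQuery for Tantivy to execute.
    //     let mut q = PlumeQuery::new();
    //     q.parse_query("tag:plume -lang:fr");
    //     let tantivy_query = q.into_query();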

    // Generate most of the setter functions
    gen_func!(text, title, subtitle, content, tag, instance, lang, license; strip: author, blog);

    // Documents newer than the provided date will be ignored
    pub fn before<D: Datelike>(&mut self, date: &D) -> &mut Self {
        let before = self
            .before
            .unwrap_or_else(|| i64::from(Utc::today().num_days_from_ce()));
        self.before = Some(cmp::min(before, i64::from(date.num_days_from_ce())));
        self
    }

    // Documents older than the provided date will be ignored
    pub fn after<D: Datelike>(&mut self, date: &D) -> &mut Self {
        let after = self
            .after
            .unwrap_or_else(|| i64::from(NaiveDate::from_ymd(2000, 1, 1).num_days_from_ce()));
        self.after = Some(cmp::max(after, i64::from(date.num_days_from_ce())));
        self
    }
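
    // Usage sketch: restrict results to the year 2021, using chrono's NaiveDate:
    //     q.after(&NaiveDate::from_ymd(2021, 1, 1))
    //      .before(&NaiveDate::from_ymd(2021, 12, 31));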

    // Split a string into a first token and the rest
    pub fn get_first_token(mut query: &str) -> (&str, &str) {
        query = query.trim();
        if query.is_empty() {
            ("", "")
        } else if query.get(0..1).map(|v| v == "\"").unwrap_or(false) {
            if let Some(index) = query[1..].find('"') {
                query.split_at(index + 2)
            } else {
                (query, "")
            }
        } else if query
            .get(0..2)
            .map(|v| v == "+\"" || v == "-\"")
            .unwrap_or(false)
        {
            if let Some(index) = query[2..].find('"') {
                query.split_at(index + 3)
            } else {
                (query, "")
            }
        } else if let Some(index) = query.find(' ') {
            query.split_at(index)
        } else {
            (query, "")
        }
    }
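
    // Examples of the tokenisation above (illustrative):
    //     get_first_token("foo bar")          -> ("foo", " bar")
    //     get_first_token("\"foo bar\" baz")  -> ("\"foo bar\"", " baz")
    //     get_first_token("+\"foo bar\"")     -> ("+\"foo bar\"", "")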

    // Map each Occur state to a prefix
    fn occur_to_str(occur: Occur) -> &'static str {
        match occur {
            Occur::Should => "",
            Occur::Must => "+",
            Occur::MustNot => "-",
        }
    }

    // Recursive parser for the query string.
    // Allow this clippy lint for now, until someone figures out how to
    // refactor this better.
    #[allow(clippy::wrong_self_convention)]
    fn from_str_req(&mut self, mut query: &str) -> &mut Self {
        query = query.trim_start();
        if query.is_empty() {
            return self;
        }

        let occur = if query.get(0..1).map(|v| v == "+").unwrap_or(false) {
            query = &query[1..];
            Occur::Must
        } else if query.get(0..1).map(|v| v == "-").unwrap_or(false) {
            query = &query[1..];
            Occur::MustNot
        } else {
            Occur::Should
        };
        gen_parser!(self, query, occur; normal: title, subtitle, content, tag,
                    instance, author, blog, lang, license;
                    date: after, before);
        self.from_str_req(query)
    }

    // Map a token and its field to a query
    fn token_to_query(token: &str, field_name: &str) -> Box<dyn Query> {
        let token = token.to_lowercase();
        let token = token.as_str();
        let field = Searcher::schema().get_field(field_name).unwrap();
        if token.contains('@') && (field_name == "author" || field_name == "blog") {
            let pos = token.find('@').unwrap();
            let user_term = Term::from_field_text(field, &token[..pos]);
            let instance_term = Term::from_field_text(
                Searcher::schema().get_field("instance").unwrap(),
                &token[pos + 1..],
            );
            Box::new(BooleanQuery::from(vec![
                (
                    Occur::Must,
                    Box::new(TermQuery::new(
                        user_term,
                        if field_name == "author" {
                            IndexRecordOption::Basic
                        } else {
                            IndexRecordOption::WithFreqsAndPositions
                        },
                    )) as Box<dyn Query + 'static>,
                ),
                (
                    Occur::Must,
                    Box::new(TermQuery::new(instance_term, IndexRecordOption::Basic)),
                ),
            ]))
        } else if token.contains(' ') {
            // phrase query
            match field_name {
                "instance" | "author" | "tag" =>
                // phrase queries are not available on these fields, so treat it as multiple Term queries
                {
                    Box::new(BooleanQuery::from(
                        token
                            .split_whitespace()
                            .map(|token| {
                                let term = Term::from_field_text(field, token);
                                (
                                    Occur::Should,
                                    Box::new(TermQuery::new(term, IndexRecordOption::Basic))
                                        as Box<dyn Query + 'static>,
                                )
                            })
                            .collect::<Vec<_>>(),
                    ))
                }
                _ => Box::new(PhraseQuery::new(
                    token
                        .split_whitespace()
                        .map(|token| Term::from_field_text(field, token))
                        .collect(),
                )),
            }
        } else {
            // Term Query
            let term = Term::from_field_text(field, token);
            let index_option = match field_name {
                "instance" | "author" | "tag" => IndexRecordOption::Basic,
                _ => IndexRecordOption::WithFreqsAndPositions,
            };
            Box::new(TermQuery::new(term, index_option))
        }
    }
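
    // Illustrative sketch: `token_to_query("jane@example.org", "author")` yields a
    // BooleanQuery requiring both the author term "jane" and the instance term
    // "example.org"; `token_to_query("hello world", "title")` yields a PhraseQuery,
    // while the same token on "tag" falls back to a Should-combined set of
    // TermQueries because phrase queries are not available on that field.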
}
impl std::str::FromStr for PlumeQuery {
    type Err = !;

    /// Create a new Query from &str
    /// Same as doing
    /// ```rust
    /// # extern crate plume_models;
    /// # use plume_models::search::Query;
    /// let mut q = Query::new();
    /// q.parse_query("some query");
    /// ```
    fn from_str(query: &str) -> Result<PlumeQuery, !> {
        let mut res: PlumeQuery = Default::default();

        res.from_str_req(query.trim());
        Ok(res)
    }
}

impl ToString for PlumeQuery {
    fn to_string(&self) -> String {
        let mut result = String::new();
        for (occur, val) in &self.text {
            if val.contains(' ') {
                result.push_str(&format!("{}\"{}\" ", Self::occur_to_str(*occur), val));
            } else {
                result.push_str(&format!("{}{} ", Self::occur_to_str(*occur), val));
            }
        }

        gen_to_string!(self, result; normal: title, subtitle, content, tag,
                       instance, author, blog, lang, license;
                       date: before, after);

        result.pop(); // remove trailing ' '
        result
    }
}
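
// Minimal sketch of a round-trip check for the parser and the ToString impl;
// it only exercises in-memory state, so no Tantivy index is needed. The output
// order follows gen_to_string! (text first, then title, ..., license, dates).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_and_serialize_round_trip() {
        let mut q = PlumeQuery::new();
        q.parse_query("+title:hello -tag:world author:@jane");
        // The author setter strips the leading '@'.
        assert_eq!(q.to_string(), "+title:hello -tag:world author:jane");
    }
}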