Plume/plume-common/src/utils.rs

545 lines
20 KiB
Rust
Raw Normal View History

2022-01-06 22:12:15 +01:00
use heck::ToUpperCamelCase;
use openssl::rand::rand_bytes;
2020-12-28 14:15:45 +01:00
use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, LinkType, Options, Parser, Tag};
use regex_syntax::is_word_character;
use rocket::http::uri::Uri;
use std::collections::HashSet;
2021-01-01 23:10:10 +01:00
use syntect::html::{ClassStyle, ClassedHTMLGenerator};
use syntect::parsing::SyntaxSet;
2018-04-23 12:54:37 +02:00
/// Generates an hexadecimal representation of 32 bytes of random data
pub fn random_hex() -> String {
2019-03-20 17:56:17 +01:00
let mut bytes = [0; 32];
rand_bytes(&mut bytes).expect("Error while generating client id");
2019-03-20 17:56:17 +01:00
bytes
.iter()
.fold(String::new(), |res, byte| format!("{}{:x}", res, byte))
}
/// Remove non alphanumeric characters and CamelCase a string
pub fn make_actor_id(name: &str) -> String {
2022-01-06 22:12:15 +01:00
name.to_upper_camel_case()
.chars()
.filter(|c| c.is_alphanumeric())
.collect()
2018-04-23 12:54:37 +02:00
}
2018-04-23 16:25:39 +02:00
/**
* Percent-encode characters which are not allowed in IRI path segments.
*
* Intended to be used for generating Post ap_url.
*/
pub fn iri_percent_encode_seg(segment: &str) -> String {
segment.chars().map(iri_percent_encode_seg_char).collect()
}
/// Encodes a single character for use in an IRI path segment.
///
/// Alphanumeric characters, the punctuation listed below and the listed
/// non-ASCII ranges are returned unchanged; every other character is
/// percent-encoded with `Uri::percent_encode`.
pub fn iri_percent_encode_seg_char(c: char) -> String {
    // NOTE(review): the non-ASCII ranges look modelled on the `ucschar` rule
    // of RFC 3987 but do not match it exactly — confirm whether the
    // differences are intentional.
    let allowed_verbatim = c.is_alphanumeric()
        || matches!(
            c,
            '-' | '.'
                | '_'
                | '~'
                | '\u{A0}'..='\u{D7FF}'
                | '\u{20000}'..='\u{2FFFD}'
                | '\u{30000}'..='\u{3FFFD}'
                | '\u{40000}'..='\u{4FFFD}'
                | '\u{50000}'..='\u{5FFFD}'
                | '\u{60000}'..='\u{6FFFD}'
                | '\u{70000}'..='\u{7FFFD}'
                | '\u{80000}'..='\u{8FFFD}'
                | '\u{90000}'..='\u{9FFFD}'
                | '\u{A0000}'..='\u{AFFFD}'
                | '\u{B0000}'..='\u{BFFFD}'
                | '\u{C0000}'..='\u{CFFFD}'
                | '\u{D0000}'..='\u{DFFFD}'
                | '\u{E0000}'..='\u{EFFFD}'
                | '!'
                | '$'
                | '&'
                | '\''
                | '('
                | ')'
                | '*'
                | '+'
                | ','
                | ';'
                | '='
                | ':'
                | '@'
        );

    let raw = c.to_string();
    if allowed_verbatim {
        return raw;
    }
    Uri::percent_encode(&raw).to_string()
}
2018-10-20 16:38:16 +02:00
/// State of the character scanner that `md_to_html` runs over each text node
/// to find `@mentions` and `#hashtags`.
#[derive(Debug)]
enum State {
/// The characters following an `@` are being collected as a mention.
Mention,
/// The characters following a `#` are being collected as a hashtag.
Hashtag,
/// The previous character was alphanumeric plain text; an `@` or `#` seen
/// in this state is treated as ordinary text.
Word,
/// Plain text in which an `@` or `#` may start a mention or hashtag.
Ready,
}
2020-01-21 07:02:03 +01:00
fn to_inline(tag: Tag<'_>) -> Tag<'_> {
match tag {
2020-12-27 20:38:23 +01:00
Tag::Heading(_) | Tag::Table(_) | Tag::TableHead | Tag::TableRow | Tag::TableCell => {
Tag::Paragraph
}
2020-12-27 20:38:23 +01:00
Tag::Image(typ, url, title) => Tag::Link(typ, url, title),
t => t,
}
}
/// Text gathered by `highlight_code` while it is inside a fenced code block
/// that names a language.
struct HighlighterContext {
// Text chunks of the code block, in the order their events were received.
content: Vec<String>,
}
2021-01-15 16:59:07 +01:00
#[allow(clippy::unnecessary_wraps)]
fn highlight_code<'a>(
context: &mut Option<HighlighterContext>,
evt: Event<'a>,
) -> Option<Vec<Event<'a>>> {
match evt {
2020-12-27 20:38:23 +01:00
Event::Start(Tag::CodeBlock(kind)) => {
match &kind {
CodeBlockKind::Fenced(lang) if !lang.is_empty() => {
*context = Some(HighlighterContext { content: vec![] });
2020-12-28 14:15:45 +01:00
}
2020-12-27 20:38:23 +01:00
_ => {}
}
2020-12-27 20:38:23 +01:00
Some(vec![Event::Start(Tag::CodeBlock(kind))])
}
2020-12-27 20:38:23 +01:00
Event::End(Tag::CodeBlock(kind)) => {
let mut result = vec![];
if let Some(ctx) = context.take() {
2020-12-27 20:38:23 +01:00
let lang = if let CodeBlockKind::Fenced(lang) = &kind {
if lang.is_empty() {
unreachable!();
} else {
lang
}
} else {
unreachable!();
};
let syntax_set = SyntaxSet::load_defaults_newlines();
2021-11-27 23:53:13 +01:00
let syntax = syntax_set.find_syntax_by_token(lang).unwrap_or_else(|| {
syntax_set
2021-11-27 23:53:13 +01:00
.find_syntax_by_name(lang)
.unwrap_or_else(|| syntax_set.find_syntax_plain_text())
});
2021-01-01 23:10:10 +01:00
let mut html = ClassedHTMLGenerator::new_with_class_style(
2021-11-27 23:53:13 +01:00
syntax,
2021-01-01 23:10:10 +01:00
&syntax_set,
ClassStyle::Spaced,
);
for line in ctx.content {
html.parse_html_for_line_which_includes_newline(&line);
}
let q = html.finalize();
result.push(Event::Html(q.into()));
}
2020-12-27 20:38:23 +01:00
result.push(Event::End(Tag::CodeBlock(kind)));
*context = None;
Some(result)
}
Event::Text(t) => {
if let Some(mut c) = context.take() {
c.content.push(t.to_string());
*context = Some(c);
Some(vec![])
} else {
Some(vec![Event::Text(t)])
}
}
_ => Some(vec![evt]),
}
}
2021-01-15 16:59:07 +01:00
#[allow(clippy::unnecessary_wraps)]
fn flatten_text<'a>(state: &mut Option<String>, evt: Event<'a>) -> Option<Vec<Event<'a>>> {
let (s, res) = match evt {
Event::Text(txt) => match state.take() {
Some(mut prev_txt) => {
prev_txt.push_str(&txt);
(Some(prev_txt), vec![])
}
2020-12-27 20:38:23 +01:00
None => (Some(txt.into_string()), vec![]),
},
e => match state.take() {
2020-12-27 20:38:23 +01:00
Some(prev) => (None, vec![Event::Text(CowStr::Boxed(prev.into())), e]),
None => (None, vec![e]),
},
};
*state = s;
Some(res)
}
2021-01-15 16:59:07 +01:00
/// Rewrites tags to their inline equivalents when inline rendering is on.
///
/// Used as an `Iterator::scan` callback whose state is a stack of the
/// rewritten opening tags plus the `inline` flag. Each start tag is converted
/// with `to_inline` and pushed; each end tag is replaced by the tag popped
/// from the stack (or kept as is when the stack is empty), so that start and
/// end tags stay matched. When the flag is off, events pass through untouched.
// The `Option` wrapper is required by the callback signature of `scan`.
#[allow(clippy::unnecessary_wraps)]
fn inline_tags<'a>(
    (state, inline): &mut (Vec<Tag<'a>>, bool),
    evt: Event<'a>,
) -> Option<Event<'a>> {
    if !*inline {
        return Some(evt);
    }

    Some(match evt {
        Event::Start(opening) => {
            let rewritten = to_inline(opening);
            state.push(rewritten.clone());
            Event::Start(rewritten)
        }
        Event::End(closing) => Event::End(state.pop().unwrap_or(closing)),
        other => other,
    })
}
/// Callback used by `process_image`: it receives the integer parsed from an
/// image destination and returns the URL to use plus an optional content
/// warning, or `None` to leave the image event unchanged.
pub type MediaProcessor<'a> = Box<dyn 'a + Fn(i32) -> Option<(String, Option<String>)>>;
fn process_image<'a, 'b>(
evt: Event<'a>,
inline: bool,
processor: &Option<MediaProcessor<'b>>,
) -> Event<'a> {
if let Some(ref processor) = *processor {
match evt {
2020-12-27 20:38:23 +01:00
Event::Start(Tag::Image(typ, id, title)) => {
if let Some((url, cw)) = id.parse::<i32>().ok().and_then(processor.as_ref()) {
if let (Some(cw), false) = (cw, inline) {
// there is a cw, and where are not inline
2020-12-28 14:15:45 +01:00
Event::Html(CowStr::Boxed(
format!(
r#"<label for="postcontent-cw-{id}">
<input type="checkbox" id="postcontent-cw-{id}" checked="checked" class="cw-checkbox">
<span class="cw-container">
<span class="cw-text">
{cw}
</span>
<img src="{url}" alt=""#,
2020-12-28 14:15:45 +01:00
id = random_hex(),
cw = cw,
url = url
)
.into(),
))
} else {
2020-12-27 20:38:23 +01:00
Event::Start(Tag::Image(typ, CowStr::Boxed(url.into()), title))
}
} else {
2020-12-27 20:38:23 +01:00
Event::Start(Tag::Image(typ, id, title))
}
}
2020-12-27 20:38:23 +01:00
Event::End(Tag::Image(typ, id, title)) => {
if let Some((url, cw)) = id.parse::<i32>().ok().and_then(processor.as_ref()) {
if inline || cw.is_none() {
2020-12-27 20:38:23 +01:00
Event::End(Tag::Image(typ, CowStr::Boxed(url.into()), title))
} else {
2020-12-27 20:38:23 +01:00
Event::Html(CowStr::Borrowed(
r#""/>
</span>
</label>"#,
))
}
} else {
2020-12-27 20:38:23 +01:00
Event::End(Tag::Image(typ, id, title))
}
}
e => e,
}
} else {
evt
}
}
/// Position flags maintained by `md_to_html` while it walks the event stream;
/// mentions and hashtags are only detected when both flags are false.
#[derive(Default, Debug)]
struct DocumentContext {
// True between the start and end events of a code block.
in_code: bool,
// True between the start and end events of a link.
in_link: bool,
}
2018-10-20 16:38:16 +02:00
/// Returns (HTML, mentions, hashtags)
pub fn md_to_html<'a>(
md: &str,
base_url: Option<&str>,
inline: bool,
media_processor: Option<MediaProcessor<'a>>,
) -> (String, HashSet<String>, HashSet<String>) {
let base_url = if let Some(base_url) = base_url {
format!("https://{}/", base_url)
} else {
"/".to_owned()
};
let parser = Parser::new_ext(md, Options::all());
2018-06-20 20:22:34 +02:00
2020-01-21 07:02:03 +01:00
let (parser, mentions, hashtags): (Vec<Event<'_>>, Vec<String>, Vec<String>) = parser
// Flatten text because pulldown_cmark break #hashtag in two individual text elements
.scan(None, flatten_text)
.flatten()
.scan(None, highlight_code)
.flatten()
.map(|evt| process_image(evt, inline, &media_processor))
// Ignore headings, images, and tables if inline = true
.scan((vec![], inline), inline_tags)
.scan(&mut DocumentContext::default(), |ctx, evt| match evt {
2020-12-27 20:38:23 +01:00
Event::Start(Tag::CodeBlock(_)) => {
ctx.in_code = true;
2020-01-12 13:24:41 +01:00
Some((vec![evt], vec![], vec![]))
}
2020-12-27 20:38:23 +01:00
Event::End(Tag::CodeBlock(_)) => {
ctx.in_code = false;
Some((vec![evt], vec![], vec![]))
}
2020-12-27 20:38:23 +01:00
Event::Start(Tag::Link(_, _, _)) => {
ctx.in_link = true;
Some((vec![evt], vec![], vec![]))
}
2020-12-27 20:38:23 +01:00
Event::End(Tag::Link(_, _, _)) => {
ctx.in_link = false;
2020-01-12 13:24:41 +01:00
Some((vec![evt], vec![], vec![]))
}
2019-03-20 17:56:17 +01:00
Event::Text(txt) => {
let (evts, _, _, _, new_mentions, new_hashtags) = txt.chars().fold(
(vec![], State::Ready, String::new(), 0, vec![], vec![]),
|(mut events, state, mut text_acc, n, mut mentions, mut hashtags), c| {
match state {
State::Mention => {
let char_matches = c.is_alphanumeric() || "@.-_".contains(c);
if char_matches && (n < (txt.chars().count() - 1)) {
text_acc.push(c);
(events, State::Mention, text_acc, n + 1, mentions, hashtags)
} else {
if char_matches {
text_acc.push(c)
}
let mention = text_acc;
let link = Tag::Link(
2020-12-27 20:38:23 +01:00
LinkType::Inline,
format!("{}@/{}/", base_url, &mention).into(),
2021-11-27 23:53:13 +01:00
mention.clone().into(),
2019-03-20 17:56:17 +01:00
);
2018-10-20 16:38:16 +02:00
2019-03-20 17:56:17 +01:00
mentions.push(mention.clone());
events.push(Event::Start(link.clone()));
2021-11-27 23:53:13 +01:00
events.push(Event::Text(format!("@{}", &mention).into()));
2019-03-20 17:56:17 +01:00
events.push(Event::End(link));
2018-10-20 16:38:16 +02:00
2019-03-20 17:56:17 +01:00
(
events,
State::Ready,
c.to_string(),
n + 1,
mentions,
hashtags,
)
}
}
2019-03-20 17:56:17 +01:00
State::Hashtag => {
let char_matches = c == '-' || is_word_character(c);
2019-03-20 17:56:17 +01:00
if char_matches && (n < (txt.chars().count() - 1)) {
text_acc.push(c);
(events, State::Hashtag, text_acc, n + 1, mentions, hashtags)
} else {
if char_matches {
text_acc.push(c);
}
let hashtag = text_acc;
let link = Tag::Link(
2020-12-27 20:38:23 +01:00
LinkType::Inline,
2020-12-01 00:38:58 +01:00
format!("{}tag/{}", base_url, &hashtag).into(),
2019-03-20 17:56:17 +01:00
hashtag.to_owned().into(),
);
2018-10-20 16:38:16 +02:00
2019-03-20 17:56:17 +01:00
hashtags.push(hashtag.clone());
events.push(Event::Start(link.clone()));
events.push(Event::Text(format!("#{}", &hashtag).into()));
events.push(Event::End(link));
2018-10-20 16:38:16 +02:00
2019-03-20 17:56:17 +01:00
(
events,
State::Ready,
c.to_string(),
n + 1,
mentions,
hashtags,
)
}
2018-10-20 16:38:16 +02:00
}
2019-03-20 17:56:17 +01:00
State::Ready => {
if !ctx.in_code && !ctx.in_link && c == '@' {
2019-03-20 17:56:17 +01:00
events.push(Event::Text(text_acc.into()));
(
events,
State::Mention,
String::new(),
n + 1,
mentions,
hashtags,
)
} else if !ctx.in_code && !ctx.in_link && c == '#' {
2019-03-20 17:56:17 +01:00
events.push(Event::Text(text_acc.into()));
(
events,
State::Hashtag,
String::new(),
n + 1,
mentions,
hashtags,
)
} else if c.is_alphanumeric() {
text_acc.push(c);
if n >= (txt.chars().count() - 1) {
// Add the text after at the end, even if it is not followed by a mention.
events.push(Event::Text(text_acc.clone().into()))
}
(events, State::Word, text_acc, n + 1, mentions, hashtags)
} else {
text_acc.push(c);
if n >= (txt.chars().count() - 1) {
// Add the text after at the end, even if it is not followed by a mention.
events.push(Event::Text(text_acc.clone().into()))
}
(events, State::Ready, text_acc, n + 1, mentions, hashtags)
}
2018-10-20 16:38:16 +02:00
}
2019-03-20 17:56:17 +01:00
State::Word => {
text_acc.push(c);
if c.is_alphanumeric() {
if n >= (txt.chars().count() - 1) {
// Add the text after at the end, even if it is not followed by a mention.
events.push(Event::Text(text_acc.clone().into()))
}
(events, State::Word, text_acc, n + 1, mentions, hashtags)
} else {
if n >= (txt.chars().count() - 1) {
// Add the text after at the end, even if it is not followed by a mention.
events.push(Event::Text(text_acc.clone().into()))
}
(events, State::Ready, text_acc, n + 1, mentions, hashtags)
}
2018-10-20 16:38:16 +02:00
}
2018-06-20 22:58:11 +02:00
}
2019-03-20 17:56:17 +01:00
},
);
2020-01-12 13:24:41 +01:00
Some((evts, new_mentions, new_hashtags))
2019-03-20 17:56:17 +01:00
}
2020-01-12 13:24:41 +01:00
_ => Some((vec![evt], vec![], vec![])),
2019-03-20 17:56:17 +01:00
})
.fold(
(vec![], vec![], vec![]),
|(mut parser, mut mention, mut hashtag), (mut p, mut m, mut h)| {
parser.append(&mut p);
mention.append(&mut m);
hashtag.append(&mut h);
(parser, mention, hashtag)
},
);
let parser = parser.into_iter();
let mentions = mentions.into_iter().map(|m| String::from(m.trim()));
let hashtags = hashtags.into_iter().map(|h| String::from(h.trim()));
2018-06-20 20:22:34 +02:00
// TODO: fetch mentionned profiles in background, if needed
let mut buf = String::new();
html::push_html(&mut buf, parser);
(buf, mentions.collect(), hashtags.collect())
}
2018-07-18 18:35:50 +02:00
2022-01-06 21:36:39 +01:00
pub fn escape(string: &str) -> askama_escape::Escaped<askama_escape::Html> {
askama_escape::escape(string, askama_escape::Html)
}
2018-07-18 18:35:50 +02:00
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the owned set that `md_to_html` returns from borrowed names.
    fn set_of(names: &[&str]) -> HashSet<String> {
        names.iter().map(|name| name.to_string()).collect()
    }

    #[test]
    fn test_mentions() {
        let cases: Vec<(&str, Vec<&str>)> = vec![
            ("nothing", vec![]),
            ("@mention", vec!["mention"]),
            ("@mention@instance.tld", vec!["mention@instance.tld"]),
            ("@many @mentions", vec!["many", "mentions"]),
            ("@start with a mentions", vec!["start"]),
            ("mention at @end", vec!["end"]),
            ("between parenthesis (@test)", vec!["test"]),
            ("with some punctuation @test!", vec!["test"]),
            (" @spaces ", vec!["spaces"]),
            ("@is_a@mention", vec!["is_a@mention"]),
            ("not_a@mention", vec![]),
            ("`@helo`", vec![]),
            ("```\n@hello\n```", vec![]),
            ("[@atmark in link](https://example.org/)", vec![]),
        ];

        for (md, expected) in cases {
            let (_, found, _) = md_to_html(md, None, false, None);
            assert_eq!(found, set_of(&expected));
        }
    }

    #[test]
    fn test_hashtags() {
        let cases: Vec<(&str, Vec<&str>)> = vec![
            ("nothing", vec![]),
            ("#hashtag", vec!["hashtag"]),
            ("#many #hashtags", vec!["many", "hashtags"]),
            ("#start with a hashtag", vec!["start"]),
            ("hashtag at #end", vec!["end"]),
            ("between parenthesis (#test)", vec!["test"]),
            ("with some punctuation #test!", vec!["test"]),
            (" #spaces ", vec!["spaces"]),
            ("not_a#hashtag", vec![]),
            ("#نرمافزار_آزاد", vec!["نرمافزار_آزاد"]),
            ("[#hash in link](https://example.org/)", vec![]),
            ("#zwsp\u{200b}inhash", vec!["zwsp"]),
        ];

        for (md, expected) in cases {
            let (_, _, found) = md_to_html(md, None, false, None);
            assert_eq!(found, set_of(&expected));
        }
    }

    #[test]
    fn test_iri_percent_encode_seg() {
        let cases = vec![
            ("including whitespace", "including%20whitespace"),
            ("%20", "%2520"),
            ("é", "é"),
            ("空白入り 日本語", "空白入り%20日本語"),
        ];

        for (segment, expected) in cases {
            assert_eq!(&iri_percent_encode_seg(segment), expected);
        }
    }

    #[test]
    fn test_inline() {
        let (block_html, _, _) = md_to_html("# Hello", None, false, None);
        assert_eq!(block_html, String::from("<h1 dir=\"auto\">Hello</h1>\n"));

        let (inline_html, _, _) = md_to_html("# Hello", None, true, None);
        assert_eq!(inline_html, String::from("<p dir=\"auto\">Hello</p>\n"));
    }
}