use openssl::rand::rand_bytes; use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, LinkType, Options, Parser, Tag}; use regex_syntax::is_word_character; use rocket::http::uri::Uri; use std::collections::HashSet; use syntect::html::{ClassStyle, ClassedHTMLGenerator}; use syntect::parsing::SyntaxSet; /// Generates an hexadecimal representation of 32 bytes of random data pub fn random_hex() -> String { let mut bytes = [0; 32]; rand_bytes(&mut bytes).expect("Error while generating client id"); bytes .iter() .fold(String::new(), |res, byte| format!("{}{:x}", res, byte)) } /** * Percent-encode characters which are not allowed in IRI path segments. * * Intended to be used for generating Post ap_url. */ pub fn iri_percent_encode_seg(segment: &str) -> String { segment.chars().map(iri_percent_encode_seg_char).collect() } pub fn iri_percent_encode_seg_char(c: char) -> String { if c.is_alphanumeric() { c.to_string() } else { match c { '-' | '.' | '_' | '~' | '\u{A0}'..='\u{D7FF}' | '\u{20000}'..='\u{2FFFD}' | '\u{30000}'..='\u{3FFFD}' | '\u{40000}'..='\u{4FFFD}' | '\u{50000}'..='\u{5FFFD}' | '\u{60000}'..='\u{6FFFD}' | '\u{70000}'..='\u{7FFFD}' | '\u{80000}'..='\u{8FFFD}' | '\u{90000}'..='\u{9FFFD}' | '\u{A0000}'..='\u{AFFFD}' | '\u{B0000}'..='\u{BFFFD}' | '\u{C0000}'..='\u{CFFFD}' | '\u{D0000}'..='\u{DFFFD}' | '\u{E0000}'..='\u{EFFFD}' | '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' | ':' | '@' => c.to_string(), _ => { let s = c.to_string(); Uri::percent_encode(&s).to_string() } } } } #[derive(Debug)] enum State { Mention, Hashtag, Word, Ready, } fn to_inline(tag: Tag<'_>) -> Tag<'_> { match tag { Tag::Heading(_) | Tag::Table(_) | Tag::TableHead | Tag::TableRow | Tag::TableCell => { Tag::Paragraph } Tag::Image(typ, url, title) => Tag::Link(typ, url, title), t => t, } } struct HighlighterContext { content: Vec, } #[allow(clippy::unnecessary_wraps)] fn highlight_code<'a>( context: &mut Option, evt: Event<'a>, ) -> Option>> { match evt { Event::Start(Tag::CodeBlock(kind)) => { match &kind { CodeBlockKind::Fenced(lang) if !lang.is_empty() => { *context = Some(HighlighterContext { content: vec![] }); } _ => {} } Some(vec![Event::Start(Tag::CodeBlock(kind))]) } Event::End(Tag::CodeBlock(kind)) => { let mut result = vec![]; if let Some(ctx) = context.take() { let lang = if let CodeBlockKind::Fenced(lang) = &kind { if lang.is_empty() { unreachable!(); } else { lang } } else { unreachable!(); }; let syntax_set = SyntaxSet::load_defaults_newlines(); let syntax = syntax_set.find_syntax_by_token(lang).unwrap_or_else(|| { syntax_set .find_syntax_by_name(lang) .unwrap_or_else(|| syntax_set.find_syntax_plain_text()) }); let mut html = ClassedHTMLGenerator::new_with_class_style( syntax, &syntax_set, ClassStyle::Spaced, ); for line in ctx.content { html.parse_html_for_line_which_includes_newline(&line); } let q = html.finalize(); result.push(Event::Html(q.into())); } result.push(Event::End(Tag::CodeBlock(kind))); *context = None; Some(result) } Event::Text(t) => { if let Some(mut c) = context.take() { c.content.push(t.to_string()); *context = Some(c); Some(vec![]) } else { Some(vec![Event::Text(t)]) } } _ => Some(vec![evt]), } } #[allow(clippy::unnecessary_wraps)] fn flatten_text<'a>(state: &mut Option, evt: Event<'a>) -> Option>> { let (s, res) = match evt { Event::Text(txt) => match state.take() { Some(mut prev_txt) => { prev_txt.push_str(&txt); (Some(prev_txt), vec![]) } None => (Some(txt.into_string()), vec![]), }, e => match state.take() { Some(prev) => (None, vec![Event::Text(CowStr::Boxed(prev.into())), e]), None => (None, vec![e]), }, }; *state = s; Some(res) } #[allow(clippy::unnecessary_wraps)] fn inline_tags<'a>( (state, inline): &mut (Vec>, bool), evt: Event<'a>, ) -> Option> { if *inline { let new_evt = match evt { Event::Start(t) => { let tag = to_inline(t); state.push(tag.clone()); Event::Start(tag) } Event::End(t) => match state.pop() { Some(other) => Event::End(other), None => Event::End(t), }, e => e, }; Some(new_evt) } else { Some(evt) } } pub type MediaProcessor<'a> = Box Option<(String, Option)>>; fn process_image<'a, 'b>( evt: Event<'a>, inline: bool, processor: &Option>, ) -> Event<'a> { if let Some(ref processor) = *processor { match evt { Event::Start(Tag::Image(typ, id, title)) => { if let Some((url, cw)) = id.parse::().ok().and_then(processor.as_ref()) { if let (Some(cw), false) = (cw, inline) { // there is a cw, and where are not inline Event::Html(CowStr::Boxed( format!( r#""#, )) } } else { Event::End(Tag::Image(typ, id, title)) } } e => e, } } else { evt } } #[derive(Default, Debug)] struct DocumentContext { in_code: bool, in_link: bool, } /// Returns (HTML, mentions, hashtags) pub fn md_to_html<'a>( md: &str, base_url: Option<&str>, inline: bool, media_processor: Option>, ) -> (String, HashSet, HashSet) { let base_url = if let Some(base_url) = base_url { format!("https://{}/", base_url) } else { "/".to_owned() }; let parser = Parser::new_ext(md, Options::all()); let (parser, mentions, hashtags): (Vec>, Vec, Vec) = parser // Flatten text because pulldown_cmark break #hashtag in two individual text elements .scan(None, flatten_text) .flatten() .scan(None, highlight_code) .flatten() .map(|evt| process_image(evt, inline, &media_processor)) // Ignore headings, images, and tables if inline = true .scan((vec![], inline), inline_tags) .scan(&mut DocumentContext::default(), |ctx, evt| match evt { Event::Start(Tag::CodeBlock(_)) => { ctx.in_code = true; Some((vec![evt], vec![], vec![])) } Event::End(Tag::CodeBlock(_)) => { ctx.in_code = false; Some((vec![evt], vec![], vec![])) } Event::Start(Tag::Link(_, _, _)) => { ctx.in_link = true; Some((vec![evt], vec![], vec![])) } Event::End(Tag::Link(_, _, _)) => { ctx.in_link = false; Some((vec![evt], vec![], vec![])) } Event::Text(txt) => { let (evts, _, _, _, new_mentions, new_hashtags) = txt.chars().fold( (vec![], State::Ready, String::new(), 0, vec![], vec![]), |(mut events, state, mut text_acc, n, mut mentions, mut hashtags), c| { match state { State::Mention => { let char_matches = c.is_alphanumeric() || "@.-_".contains(c); if char_matches && (n < (txt.chars().count() - 1)) { text_acc.push(c); (events, State::Mention, text_acc, n + 1, mentions, hashtags) } else { if char_matches { text_acc.push(c) } let mention = text_acc; let link = Tag::Link( LinkType::Inline, format!("{}@/{}/", base_url, &mention).into(), mention.clone().into(), ); mentions.push(mention.clone()); events.push(Event::Start(link.clone())); events.push(Event::Text(format!("@{}", &mention).into())); events.push(Event::End(link)); ( events, State::Ready, c.to_string(), n + 1, mentions, hashtags, ) } } State::Hashtag => { let char_matches = c == '-' || is_word_character(c); if char_matches && (n < (txt.chars().count() - 1)) { text_acc.push(c); (events, State::Hashtag, text_acc, n + 1, mentions, hashtags) } else { if char_matches { text_acc.push(c); } let hashtag = text_acc; let link = Tag::Link( LinkType::Inline, format!("{}tag/{}", base_url, &hashtag).into(), hashtag.to_owned().into(), ); hashtags.push(hashtag.clone()); events.push(Event::Start(link.clone())); events.push(Event::Text(format!("#{}", &hashtag).into())); events.push(Event::End(link)); ( events, State::Ready, c.to_string(), n + 1, mentions, hashtags, ) } } State::Ready => { if !ctx.in_code && !ctx.in_link && c == '@' { events.push(Event::Text(text_acc.into())); ( events, State::Mention, String::new(), n + 1, mentions, hashtags, ) } else if !ctx.in_code && !ctx.in_link && c == '#' { events.push(Event::Text(text_acc.into())); ( events, State::Hashtag, String::new(), n + 1, mentions, hashtags, ) } else if c.is_alphanumeric() { text_acc.push(c); if n >= (txt.chars().count() - 1) { // Add the text after at the end, even if it is not followed by a mention. events.push(Event::Text(text_acc.clone().into())) } (events, State::Word, text_acc, n + 1, mentions, hashtags) } else { text_acc.push(c); if n >= (txt.chars().count() - 1) { // Add the text after at the end, even if it is not followed by a mention. events.push(Event::Text(text_acc.clone().into())) } (events, State::Ready, text_acc, n + 1, mentions, hashtags) } } State::Word => { text_acc.push(c); if c.is_alphanumeric() { if n >= (txt.chars().count() - 1) { // Add the text after at the end, even if it is not followed by a mention. events.push(Event::Text(text_acc.clone().into())) } (events, State::Word, text_acc, n + 1, mentions, hashtags) } else { if n >= (txt.chars().count() - 1) { // Add the text after at the end, even if it is not followed by a mention. events.push(Event::Text(text_acc.clone().into())) } (events, State::Ready, text_acc, n + 1, mentions, hashtags) } } } }, ); Some((evts, new_mentions, new_hashtags)) } _ => Some((vec![evt], vec![], vec![])), }) .fold( (vec![], vec![], vec![]), |(mut parser, mut mention, mut hashtag), (mut p, mut m, mut h)| { parser.append(&mut p); mention.append(&mut m); hashtag.append(&mut h); (parser, mention, hashtag) }, ); let parser = parser.into_iter(); let mentions = mentions.into_iter().map(|m| String::from(m.trim())); let hashtags = hashtags.into_iter().map(|h| String::from(h.trim())); // TODO: fetch mentionned profiles in background, if needed let mut buf = String::new(); html::push_html(&mut buf, parser); (buf, mentions.collect(), hashtags.collect()) } pub fn escape(string: &str) -> askama_escape::Escaped { askama_escape::escape(string, askama_escape::Html) } #[cfg(test)] mod tests { use super::*; #[test] fn test_mentions() { let tests = vec![ ("nothing", vec![]), ("@mention", vec!["mention"]), ("@mention@instance.tld", vec!["mention@instance.tld"]), ("@many @mentions", vec!["many", "mentions"]), ("@start with a mentions", vec!["start"]), ("mention at @end", vec!["end"]), ("between parenthesis (@test)", vec!["test"]), ("with some punctuation @test!", vec!["test"]), (" @spaces ", vec!["spaces"]), ("@is_a@mention", vec!["is_a@mention"]), ("not_a@mention", vec![]), ("`@helo`", vec![]), ("```\n@hello\n```", vec![]), ("[@atmark in link](https://example.org/)", vec![]), ]; for (md, mentions) in tests { assert_eq!( md_to_html(md, None, false, None).1, mentions .into_iter() .map(|s| s.to_string()) .collect::>() ); } } #[test] fn test_hashtags() { let tests = vec![ ("nothing", vec![]), ("#hashtag", vec!["hashtag"]), ("#many #hashtags", vec!["many", "hashtags"]), ("#start with a hashtag", vec!["start"]), ("hashtag at #end", vec!["end"]), ("between parenthesis (#test)", vec!["test"]), ("with some punctuation #test!", vec!["test"]), (" #spaces ", vec!["spaces"]), ("not_a#hashtag", vec![]), ("#نرم‌افزار_آزاد", vec!["نرم‌افزار_آزاد"]), ("[#hash in link](https://example.org/)", vec![]), ("#zwsp\u{200b}inhash", vec!["zwsp"]), ]; for (md, mentions) in tests { assert_eq!( md_to_html(md, None, false, None).2, mentions .into_iter() .map(|s| s.to_string()) .collect::>() ); } } #[test] fn test_iri_percent_encode_seg() { assert_eq!( &iri_percent_encode_seg("including whitespace"), "including%20whitespace" ); assert_eq!(&iri_percent_encode_seg("%20"), "%2520"); assert_eq!(&iri_percent_encode_seg("é"), "é"); assert_eq!( &iri_percent_encode_seg("空白入り 日本語"), "空白入り%20日本語" ); } #[test] fn test_inline() { assert_eq!( md_to_html("# Hello", None, false, None).0, String::from("

Hello

\n") ); assert_eq!( md_to_html("# Hello", None, true, None).0, String::from("

Hello

\n") ); } }