use std::path::Path; use futures::future::OptionFuture; use html_minifier::HTMLMinifier; use nipper::Document; use url::Url; use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES}; use crate::Download; /// Stores configuration for the self_contained_html function // TODO: write a builder pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str> where E: std::error::Error, D: Download + Send, S1: AsRef, S2: AsRef, { /// the downloader that will be used to retrieve ressources on the page pub downloader: Option<&'t D>, /// Base url for downloading ressources, it probably the pub base_url: Option<&'t Url>, pub elements_to_remove: &'t [S1], pub styles_to_add: &'t [S2], } impl<'t, E, D> Default for Config<'t, E, D> where E: std::error::Error, D: Download + Send, { fn default() -> Self { Self { downloader: None, base_url: None, elements_to_remove: &[], styles_to_add: &[], } } } impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2> where E: std::error::Error, D: Download + Send, S1: AsRef, S2: AsRef, { /// Makes an html page self-contained /// /// The `downloader` must implement `Download` and is used to download ressources that are /// needed to make this page self-contained such as stylesheets or images. /// /// The function also removes all scripts on the page pub async fn run(&self, html: impl AsRef) -> String { //TODO: don't panic let base_url = self.base_url.expect("Base url not defined"); let downloader = self.downloader.expect("Downloader not defined"); // TODO: split/refactor this function : // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? // - put each modification (ex: style in the `foreach`) in functions, maybe using // (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42 // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure let (style_urls, html) = { let document = Document::from(html.as_ref()); // ---- Remove scripts ---- // document.select("script").remove(); for event in EVENT_HANDLERS { document .select(format!("[{}]", event).as_str()) .remove_attr(event); } for rel in LINK_REL_EXTERNAL_RESOURCES { document .select(format!("link[rel=\"{}\"]", rel).as_str()) .remove(); } // ---- Replace stylesheets ---- // let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); let styles_url = stylesheets .iter() .map(|style_link| { if let Some(src) = style_link.attr("href") { base_url.join(src.as_ref()).ok() } else { None } }) .collect::>(); (styles_url, String::from(document.html())) }; let style_urls = style_urls.into_iter().map(|style_url| { OptionFuture::from( style_url.map(|s| async move { downloader.download(&s).await.unwrap() }), ) }); let downloaded_styles = futures::future::join_all(style_urls).await; let html = { let document = Document::from(&html); let styles = document.select("link[href][rel=\"stylesheet\"]"); styles .iter() .zip(downloaded_styles.iter()) .for_each(|(mut style_link, inner_css)| { if let Some(Some(inner_css)) = inner_css { let css = String::from_utf8(inner_css.to_vec()).unwrap(); let media_query = style_link.attr("media"); let css = match media_query { Some(media_query) => { format!("", media_query, css) } None => format!("", css), }; style_link.replace_with_html(css); } else { style_link.remove(); } }); String::from(document.html()) }; // ---- Replace imgs ---- // let image_urls = { let document = Document::from(&html); let imgs = document.select("img:not([src^=\"data:\"])"); imgs.iter() .map(|image| { if let Some(src) = image.attr("src") { base_url.join(src.as_ref()).ok() } else { None } }) .collect::>() }; let downloaded_images = image_urls.into_iter().map(|image_url| { OptionFuture::from(image_url.map(|url| async move { let data = downloader.download(&url).await.unwrap(); (url, data) })) }); let downloaded_images = futures::future::join_all(downloaded_images).await; let html = { let document = Document::from(&html); let imgs = document.select("img:not([src^=\"data:\"])"); imgs.iter() .zip(downloaded_images.iter()) .for_each(|(mut img, data)| { if let Some((url, Some(data))) = data { let data = base64::encode(data); //TODO: use an extension hashmap let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); } else { img.remove() } }); // ---- Remove unwanted html elements ----- // for element in self.elements_to_remove { document.select(element.as_ref()).remove(); } // ---- Add additional styles ---- // for style in self.styles_to_add { document .select("head") .append_html(format!("\n\n", style.as_ref())); } String::from(document.html()) }; // ---- output ---- // let mut minifier = HTMLMinifier::new(); minifier.digest(html.as_str()).unwrap(); String::from_utf8(minifier.get_html().into()).unwrap() } } #[cfg(test)] mod tests { // TODO: reduce boilerplate, DRY use super::*; use std::fs::File; use std::io::prelude::*; use anyhow::Result; use async_trait::async_trait; use bytes::Bytes; use indoc::{formatdoc, indoc}; use crate::errors; fn init() { let _ = env_logger::builder().is_test(true).try_init(); } // TODO: the Dummy,Css and Png Downloaders don't really test the async scenario as // they don't use futures : they don't call await. // They should be testing the async scenario struct DummyDownloader; #[async_trait] impl Download for DummyDownloader { type Error = errors::Error; async fn download(&self, _file_link: &Url) -> errors::Result> { Ok(Some(Bytes::from(""))) } } #[tokio::test] async fn remove_scripts() -> Result<()> { let html = ""; let base_url = Url::parse("http://example.com")?; let downloader = DummyDownloader {}; assert_eq!( Config { downloader: Some(&downloader), base_url: Some(&base_url), ..Default::default() } .run(html) .await, "" ); Ok(()) } #[tokio::test] async fn remove_onevent_handlers() -> Result<()> { init(); let downloader = DummyDownloader {}; let html = |onevent| { formatdoc! {" ", onevent } }; let base_url = Url::parse("http://example.com")?; for s in EVENT_HANDLERS { assert_eq!( Config { downloader: Some(&downloader), base_url: Some(&base_url), ..Default::default() }.run(html(s)).await, "\n\n\n\n" ); } Ok(()) } #[tokio::test] async fn remove_link_with_external_ressource() -> Result<()> { init(); let downloader = DummyDownloader {}; let html = |onevent| { formatdoc! {" ", onevent } }; let base_url = Url::parse("http://example.com")?; for s in LINK_REL_EXTERNAL_RESOURCES { assert_eq!( Config { downloader: Some(&downloader), base_url: Some(&base_url), ..Default::default() } .run(html(s)) .await, "\n\n\n" ); } Ok(()) } struct CssDownloader; #[async_trait] impl Download for CssDownloader { type Error = errors::Error; async fn download(&self, _file_link: &Url) -> errors::Result> { Ok(Some( indoc! {" section#warning { color: red; }"} .into(), )) } } #[tokio::test] async fn download_css() -> Result<()> { let downloader = CssDownloader {}; let html = indoc! {" "}; // FIXME: find why minify doesn't minify let wanted_html = indoc! {" "}; let mut minifier = HTMLMinifier::new(); minifier.digest(wanted_html)?; let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; assert_eq!( Config { downloader: Some(&downloader), base_url: Some(&base_url), ..Default::default() } .run(html) .await, minified ); Ok(()) } #[tokio::test] async fn download_css_with_media_query() -> Result<()> { let downloader = CssDownloader {}; let html = indoc! {" "}; let wanted_html = indoc! {" "}; let mut minifier = HTMLMinifier::new(); minifier.digest(wanted_html)?; let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; assert_eq!( Config { downloader: Some(&downloader), base_url: Some(&base_url), ..Default::default() } .run(html) .await, minified ); Ok(()) } struct PngDownloader; #[async_trait] impl Download for PngDownloader { type Error = errors::Error; async fn download(&self, _file_link: &Url) -> errors::Result> { let image_path = Path::new("test_data/home.png"); let mut image_file = File::open(&image_path).unwrap(); let mut image_buf: Vec = vec![]; image_file.read_to_end(&mut image_buf).unwrap(); Ok(Some(image_buf.into())) } } #[tokio::test] async fn download_image_png() -> Result<()> { let downloader = PngDownloader {}; let html = indoc! {" \"an "}; let wanted_html = indoc! {" \"an "}; let mut minifier = HTMLMinifier::new(); minifier.digest(wanted_html)?; let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; assert_eq!( Config { downloader: Some(&downloader), base_url: Some(&base_url), ..Default::default() } .run(html) .await, minified ); Ok(()) } #[tokio::test] async fn remove_css_selectors() -> Result<()> { let html = indoc! {"
The header
The articlesocial media button
a placeholder>
"}; let wanted_html = indoc! {"
The article
"}; let base_url = Url::parse("http://example.com")?; let downloader = DummyDownloader {}; let mut minifier = HTMLMinifier::new(); minifier.digest(wanted_html)?; let minified = String::from_utf8(minifier.get_html().into())?; assert_eq!( Config { downloader: Some(&downloader), base_url: Some(&base_url), elements_to_remove: &["header", ".placeholder", "article > span.huge"], ..Default::default() } .run(html) .await, minified ); Ok(()) } #[tokio::test] async fn add_style() -> Result<()> { let html = indoc! {" The body "}; let wanted_html = indoc! {" The body "}; let style_to_add = indoc! {" body { margin: 3em; } "}; let base_url = Url::parse("http://example.com")?; let downloader = DummyDownloader {}; let mut minifier = HTMLMinifier::new(); minifier.digest(wanted_html)?; let minified = String::from_utf8(minifier.get_html().into())?; assert_eq!( Config { downloader: Some(&downloader), base_url: Some(&base_url), styles_to_add: &[style_to_add], ..Default::default() } .run(html) .await, minified ); Ok(()) } }