crieur/crieur-retrieve/src/tools/self_contained_html.rs
koalp e34edf0b21
All checks were successful
continuous-integration/drone/push Build is passing
fix: keep media queries in ref styles
Previously, media queries weren't kept when downloading styles from ref
tags.

It has been fixed so that the media attribute is kept when creating style
tags from ref tags.
2021-05-22 04:41:08 +02:00

562 lines
17 KiB
Rust

use std::path::Path;
use futures::future::OptionFuture;
use html_minifier::HTMLMinifier;
use nipper::Document;
use url::Url;
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download;
/// Stores configuration for the self_contained_html function
// TODO: write a builder
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
where
    E: std::error::Error,
    D: Download<Error = E> + Send,
    S1: AsRef<str>,
    S2: AsRef<str>,
{
    /// The downloader that will be used to retrieve resources on the page
    /// (stylesheets, images). Must be set before calling `run`, which panics otherwise.
    pub downloader: Option<&'t D>,
    /// Base URL against which relative resource links are resolved
    /// (presumably the URL the page was fetched from — TODO confirm).
    /// Must be set before calling `run`, which panics otherwise.
    pub base_url: Option<&'t Url>,
    /// CSS selectors of elements to remove from the page before output
    pub elements_to_remove: &'t [S1],
    /// Raw CSS snippets appended to `<head>` as `<style>` tags
    pub styles_to_add: &'t [S2],
}
impl<'t, E, D> Default for Config<'t, E, D>
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
fn default() -> Self {
Self {
downloader: None,
base_url: None,
elements_to_remove: &[],
styles_to_add: &[],
}
}
}
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
where
    E: std::error::Error,
    D: Download<Error = E> + Send,
    S1: AsRef<str>,
    S2: AsRef<str>,
{
    /// Makes an html page self-contained
    ///
    /// The `downloader` must implement `Download` and is used to download resources that are
    /// needed to make this page self-contained such as stylesheets or images.
    ///
    /// The function also removes all scripts on the page, inlines stylesheets as
    /// `<style>` tags (preserving their `media` attribute), inlines images as base64
    /// `data:` URLs, removes the configured elements, appends the configured styles,
    /// and finally minifies the result.
    ///
    /// # Panics
    ///
    /// Panics if `base_url` or `downloader` is `None`, if any download fails or
    /// returns non-UTF-8 CSS, if an image URL has no file extension, or if
    /// minification fails.
    pub async fn run(&self, html: impl AsRef<str>) -> String {
        //TODO: don't panic
        let base_url = self.base_url.expect("Base url not defined");
        let downloader = self.downloader.expect("Downloader not defined");
        // TODO: split/refactor this function :
        // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
        // - put each modification (ex: style in the `foreach`) in functions, maybe using
        //   (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
        // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
        //
        // NOTE(review): the function alternates between sync DOM passes and async
        // download passes, serializing the Document to a String between them —
        // presumably because the nipper `Document` cannot be held across `.await`
        // points; TODO confirm.
        let (style_urls, html) = {
            let document = Document::from(html.as_ref());
            // ---- Remove scripts ----
            //
            document.select("script").remove();
            // Strip every inline `on*` event-handler attribute listed in EVENT_HANDLERS.
            for event in EVENT_HANDLERS {
                document
                    .select(format!("[{}]", event).as_str())
                    .remove_attr(event);
            }
            // Drop <link> tags whose rel would trigger an external fetch
            // (the rel values come from LINK_REL_EXTERNAL_RESOURCES).
            for rel in LINK_REL_EXTERNAL_RESOURCES {
                document
                    .select(format!("link[rel=\"{}\"]", rel).as_str())
                    .remove();
            }
            // ---- Replace stylesheets ----
            //
            // First pass: only collect the (resolved) stylesheet URLs; the actual
            // replacement happens after the downloads below. Unresolvable hrefs
            // become `None` and the corresponding <link> is removed later.
            let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
            let styles_url = stylesheets
                .iter()
                .map(|style_link| {
                    if let Some(src) = style_link.attr("href") {
                        base_url.join(src.as_ref()).ok()
                    } else {
                        None
                    }
                })
                .collect::<Vec<_>>();
            (styles_url, String::from(document.html()))
        };
        // Download all stylesheets concurrently; `None` URLs yield `None` results.
        let style_urls = style_urls.into_iter().map(|style_url| {
            OptionFuture::from(
                style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
            )
        });
        let downloaded_styles = futures::future::join_all(style_urls).await;
        let html = {
            let document = Document::from(&html);
            // Re-select the same <link> tags and zip them with the downloaded CSS.
            // NOTE(review): this relies on the selection order being identical to
            // the first pass — presumably guaranteed by document order; TODO confirm.
            let styles = document.select("link[href][rel=\"stylesheet\"]");
            styles
                .iter()
                .zip(downloaded_styles.iter())
                .for_each(|(mut style_link, inner_css)| {
                    if let Some(Some(inner_css)) = inner_css {
                        let css = String::from_utf8(inner_css.to_vec()).unwrap();
                        // Keep the media query (e.g. media="print") on the
                        // generated <style> tag so the CSS still applies
                        // under the same conditions.
                        let media_query = style_link.attr("media");
                        let css = match media_query {
                            Some(media_query) => {
                                format!("<style media=\"{}\">{}</style>", media_query, css)
                            }
                            None => format!("<style>{}</style>", css),
                        };
                        style_link.replace_with_html(css);
                    } else {
                        // Download failed or href was unresolvable: drop the link.
                        style_link.remove();
                    }
                });
            String::from(document.html())
        };
        // ---- Replace imgs ----
        //
        // Same two-pass scheme as the stylesheets: collect URLs, download, replace.
        // Images whose src already is a data: URL are skipped by the selector.
        let image_urls = {
            let document = Document::from(&html);
            let imgs = document.select("img:not([src^=\"data:\"])");
            imgs.iter()
                .map(|image| {
                    if let Some(src) = image.attr("src") {
                        base_url.join(src.as_ref()).ok()
                    } else {
                        None
                    }
                })
                .collect::<Vec<_>>()
        };
        let downloaded_images = image_urls.into_iter().map(|image_url| {
            OptionFuture::from(image_url.map(|url| async move {
                let data = downloader.download(&url).await.unwrap();
                // Keep the URL alongside the bytes: its path extension is used
                // below to guess the image MIME subtype.
                (url, data)
            }))
        });
        let downloaded_images = futures::future::join_all(downloaded_images).await;
        let html = {
            let document = Document::from(&html);
            let imgs = document.select("img:not([src^=\"data:\"])");
            imgs.iter()
                .zip(downloaded_images.iter())
                .for_each(|(mut img, data)| {
                    if let Some((url, Some(data))) = data {
                        let data = base64::encode(data);
                        //TODO: use an extension hashmap
                        // NOTE(review): the MIME subtype is taken verbatim from the
                        // file extension (e.g. "png" -> image/png); wrong for
                        // extensions like "jpg" (image/jpeg) — hence the TODO above.
                        let extension =
                            Path::new(url.path()).extension().unwrap().to_str().unwrap();
                        img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
                    } else {
                        img.remove()
                    }
                });
            // ---- Remove unwanted html elements -----
            //
            for element in self.elements_to_remove {
                document.select(element.as_ref()).remove();
            }
            // ---- Add additional styles ----
            //
            for style in self.styles_to_add {
                document
                    .select("head")
                    .append_html(format!("\n<style>{}</style>\n", style.as_ref()));
            }
            String::from(document.html())
        };
        // ---- output ----
        //
        // Minify the final document before returning it.
        let mut minifier = HTMLMinifier::new();
        minifier.digest(html.as_str()).unwrap();
        String::from_utf8(minifier.get_html().into()).unwrap()
    }
}
#[cfg(test)]
mod tests {
    // TODO: reduce boilerplate, DRY
    use super::*;

    use std::fs::File;
    use std::io::prelude::*;

    use anyhow::Result;
    use async_trait::async_trait;
    use bytes::Bytes;
    use indoc::{formatdoc, indoc};

    use crate::errors;

    /// Initializes the test logger; a second call fails and is deliberately ignored.
    fn init() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    // TODO: the Dummy,Css and Png Downloaders don't really test the async scenario as
    // they don't use futures : they don't call await.
    // They should be testing the async scenario

    /// Downloader stub that returns an empty byte buffer for every URL.
    struct DummyDownloader;
    #[async_trait]
    impl Download for DummyDownloader {
        type Error = errors::Error;
        async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
            Ok(Some(Bytes::from("")))
        }
    }

    /// `run` must strip `<script>` elements from the page.
    #[tokio::test]
    async fn remove_scripts() -> Result<()> {
        let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
        let base_url = Url::parse("http://example.com")?;
        let downloader = DummyDownloader {};
        assert_eq!(
            Config {
                downloader: Some(&downloader),
                base_url: Some(&base_url),
                ..Default::default()
            }
            .run(html)
            .await,
            "<html><head></head><body></body></html>"
        );
        Ok(())
    }

    /// Every inline `on*` event-handler attribute in EVENT_HANDLERS must be removed.
    #[tokio::test]
    async fn remove_onevent_handlers() -> Result<()> {
        init();
        let downloader = DummyDownloader {};
        let html = |onevent| {
            formatdoc! {"
                <html>
                <head>
                </head>
                <body>
                <button class=\"activate\" {}=\"let id = id => id\">button</button>
                </body>
                </html>",
                onevent
            }
        };
        let base_url = Url::parse("http://example.com")?;
        for s in EVENT_HANDLERS {
            assert_eq!(
                Config {
                    downloader: Some(&downloader),
                    base_url: Some(&base_url),
                    ..Default::default()
                }.run(html(s)).await,
                "<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
            );
        }
        Ok(())
    }

    /// `<link>` tags whose rel points at an external resource must be removed.
    #[tokio::test]
    async fn remove_link_with_external_ressource() -> Result<()> {
        init();
        let downloader = DummyDownloader {};
        let html = |onevent| {
            formatdoc! {"
                <html>
                <head>
                <link rel=\"{}\" href=\"https://example.org/script.js\">
                </head>
                <body>
                </body>
                </html>",
                onevent
            }
        };
        let base_url = Url::parse("http://example.com")?;
        for s in LINK_REL_EXTERNAL_RESOURCES {
            assert_eq!(
                Config {
                    downloader: Some(&downloader),
                    base_url: Some(&base_url),
                    ..Default::default()
                }
                .run(html(s))
                .await,
                "<html><head>\n</head>\n<body>\n</body></html>"
            );
        }
        Ok(())
    }

    /// Downloader stub that returns a fixed CSS snippet for every URL.
    struct CssDownloader;
    #[async_trait]
    impl Download for CssDownloader {
        type Error = errors::Error;
        async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
            Ok(Some(
                indoc! {"
                    section#warning {
                    color: red;
                    }"}
                .into(),
            ))
        }
    }

    /// A stylesheet `<link>` must be replaced by an inline `<style>` with the
    /// downloaded CSS.
    #[tokio::test]
    async fn download_css() -> Result<()> {
        let downloader = CssDownloader {};
        let html = indoc! {"
            <html>
            <head>
            <link rel=\"stylesheet\" href=\"main.css\">
            </head>
            <body>
            </body>
            </html>
        "};
        // FIXME: find why minify doesn't minify
        let wanted_html = indoc! {"
            <html><head>
            <style>
            section#warning {
            color: red;
            }
            </style>
            </head>
            <body>
            </body></html>
        "};
        let mut minifier = HTMLMinifier::new();
        minifier.digest(wanted_html)?;
        let minified = String::from_utf8(minifier.get_html().into())?;
        let base_url = Url::parse("http://example.com")?;
        assert_eq!(
            Config {
                downloader: Some(&downloader),
                base_url: Some(&base_url),
                ..Default::default()
            }
            .run(html)
            .await,
            minified
        );
        Ok(())
    }

    /// Regression test for the fixed bug: the `media` attribute of a stylesheet
    /// `<link>` must be carried over to the generated `<style>` tag.
    #[tokio::test]
    async fn download_css_with_media_query() -> Result<()> {
        let downloader = CssDownloader {};
        let html = indoc! {"
            <html>
            <head>
            <link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
            </head>
            <body>
            </body>
            </html>
        "};
        let wanted_html = indoc! {"
            <html><head>
            <style media=\"print\">
            section#warning {
            color: red;
            }
            </style>
            </head>
            <body>
            </body></html>
        "};
        let mut minifier = HTMLMinifier::new();
        minifier.digest(wanted_html)?;
        let minified = String::from_utf8(minifier.get_html().into())?;
        let base_url = Url::parse("http://example.com")?;
        assert_eq!(
            Config {
                downloader: Some(&downloader),
                base_url: Some(&base_url),
                ..Default::default()
            }
            .run(html)
            .await,
            minified
        );
        Ok(())
    }

    /// Downloader stub that serves a small PNG fixture from test_data/.
    struct PngDownloader;
    #[async_trait]
    impl Download for PngDownloader {
        type Error = errors::Error;
        async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
            let image_path = Path::new("test_data/home.png");
            let mut image_file = File::open(&image_path).unwrap();
            let mut image_buf: Vec<u8> = vec![];
            image_file.read_to_end(&mut image_buf).unwrap();
            Ok(Some(image_buf.into()))
        }
    }

    /// An `<img>` src must be rewritten to a base64 `data:image/png` URL.
    #[tokio::test]
    async fn download_image_png() -> Result<()> {
        let downloader = PngDownloader {};
        let html = indoc! {"
            <html>
            <head></head>
            <body>
            <img src=\"home.png\" alt=\"an home\" />
            </body>
            </html>
        "};
        let wanted_html = indoc! {"
            <html><head></head>
            <body>
            <img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAABm\
            JLR0QA/wD/AP+gvaeTAAAAh0lEQVQ4jc2RQQqAIBBFn12ioNNE544gWhStc+FNWtVmBDFHsjZ9+DBffeOgo\
            KsBVmCRukgNYIFTbGVtlDzk4DqCwyZhTqoFXAJO+RN8a1ADewF8CvPqZm8nLNsL2HutgEN70Qc6TBDUr1Fk\
            AKrMgU4OGaDPdlEmMFFO7ucmeKR/NZgLuMkXFxHZVhLI8sXeAAAAAElFTkSuQmCC\" alt=\"an home\">
            </body></html>
        "};
        let mut minifier = HTMLMinifier::new();
        minifier.digest(wanted_html)?;
        let minified = String::from_utf8(minifier.get_html().into())?;
        let base_url = Url::parse("http://example.com")?;
        assert_eq!(
            Config {
                downloader: Some(&downloader),
                base_url: Some(&base_url),
                ..Default::default()
            }
            .run(html)
            .await,
            minified
        );
        Ok(())
    }

    /// Elements matching the configured CSS selectors must be removed.
    #[tokio::test]
    async fn remove_css_selectors() -> Result<()> {
        let html = indoc! {"
            <html>
            <head></head>
            <body>
            <header>The header</header>
            <article>The article<span class=\"huge\">social media button</span></article>
            <div class=\"placeholder\">a placeholder></div>
            </body>
            </html>
        "};
        let wanted_html = indoc! {"
            <html><head></head>
            <body>
            <article>The article</article>
            </body></html>
        "};
        let base_url = Url::parse("http://example.com")?;
        let downloader = DummyDownloader {};
        let mut minifier = HTMLMinifier::new();
        minifier.digest(wanted_html)?;
        let minified = String::from_utf8(minifier.get_html().into())?;
        assert_eq!(
            Config {
                downloader: Some(&downloader),
                base_url: Some(&base_url),
                elements_to_remove: &["header", ".placeholder", "article > span.huge"],
                ..Default::default()
            }
            .run(html)
            .await,
            minified
        );
        Ok(())
    }

    /// Configured styles must be appended to `<head>` as `<style>` tags.
    #[tokio::test]
    async fn add_style() -> Result<()> {
        let html = indoc! {"
            <html>
            <head>
            <meta charset=\"UTF-8\">
            </head>
            <body>
            The body
            </body>
            </html>
        "};
        let wanted_html = indoc! {"
            <html><head>
            <meta charset=\"UTF-8\">
            <style>
            body {
            margin: 3em;
            }
            </style>
            </head>
            <body>
            The body
            </body></html>
        "};
        let style_to_add = indoc! {"
            body {
            margin: 3em;
            }
        "};
        let base_url = Url::parse("http://example.com")?;
        let downloader = DummyDownloader {};
        let mut minifier = HTMLMinifier::new();
        minifier.digest(wanted_html)?;
        let minified = String::from_utf8(minifier.get_html().into())?;
        assert_eq!(
            Config {
                downloader: Some(&downloader),
                base_url: Some(&base_url),
                styles_to_add: &[style_to_add],
                ..Default::default()
            }
            .run(html)
            .await,
            minified
        );
        Ok(())
    }
}