fix: keep media queries in ref styles
All checks were successful
continuous-integration/drone/push Build is passing

Previously, media queries weren't kept when downloading styles from ref
tags.

This has been fixed so that the media attribute is kept when creating style
tags from ref tags.
This commit is contained in:
koalp 2021-05-22 04:25:50 +02:00
parent 40ebc1ddea
commit e34edf0b21
Signed by: koalp
GPG Key ID: 35B21047DEB09A81
7 changed files with 126 additions and 39 deletions

50
Cargo.lock generated
View File

@ -662,9 +662,9 @@ dependencies = [
[[package]] [[package]]
name = "futures" name = "futures"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
dependencies = [ dependencies = [
"futures-channel", "futures-channel",
"futures-core", "futures-core",
@ -677,9 +677,9 @@ dependencies = [
[[package]] [[package]]
name = "futures-channel" name = "futures-channel"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
dependencies = [ dependencies = [
"futures-core", "futures-core",
"futures-sink", "futures-sink",
@ -687,15 +687,15 @@ dependencies = [
[[package]] [[package]]
name = "futures-core" name = "futures-core"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"
[[package]] [[package]]
name = "futures-executor" name = "futures-executor"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
dependencies = [ dependencies = [
"futures-core", "futures-core",
"futures-task", "futures-task",
@ -704,9 +704,9 @@ dependencies = [
[[package]] [[package]]
name = "futures-io" name = "futures-io"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"
[[package]] [[package]]
name = "futures-locks" name = "futures-locks"
@ -719,10 +719,11 @@ dependencies = [
[[package]] [[package]]
name = "futures-macro" name = "futures-macro"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
dependencies = [ dependencies = [
"autocfg 1.0.1",
"proc-macro-hack", "proc-macro-hack",
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -731,15 +732,15 @@ dependencies = [
[[package]] [[package]]
name = "futures-sink" name = "futures-sink"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"
[[package]] [[package]]
name = "futures-task" name = "futures-task"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"
[[package]] [[package]]
name = "futures-timer" name = "futures-timer"
@ -753,10 +754,11 @@ dependencies = [
[[package]] [[package]]
name = "futures-util" name = "futures-util"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
dependencies = [ dependencies = [
"autocfg 1.0.1",
"futures-channel", "futures-channel",
"futures-core", "futures-core",
"futures-io", "futures-io",
@ -895,9 +897,9 @@ dependencies = [
[[package]] [[package]]
name = "html-minifier" name = "html-minifier"
version = "3.0.12" version = "3.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af" checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf"
dependencies = [ dependencies = [
"cow-utils", "cow-utils",
"educe", "educe",
@ -1284,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
[[package]] [[package]]
name = "minifier" name = "minifier"
version = "0.0.40" version = "0.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9" checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08"
dependencies = [ dependencies = [
"macro-utils", "macro-utils",
] ]
@ -2659,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]] [[package]]
name = "tokio" name = "tokio"
version = "1.5.0" version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
dependencies = [ dependencies = [
"autocfg 1.0.1", "autocfg 1.0.1",
"bytes", "bytes",

View File

@ -22,5 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
dotenv = "0.15.0" dotenv = "0.15.0"
env_logger = "0.8.3" env_logger = "0.8.3"
log = "0.4.14" log = "0.4.14"
tokio = { version = "1.5.0", features = ["full"] } tokio = { version = "1.6.0", features = ["full"] }
tracing-subscriber = "0.2.18" tracing-subscriber = "0.2.18"

View File

@ -16,14 +16,14 @@ hyper-rustls = "0.22.1"
cookie = "0.15.0" cookie = "0.15.0"
lol_html = "0.3.0" lol_html = "0.3.0"
indoc = "1.0.3" indoc = "1.0.3"
html-minifier = "3.0.12" html-minifier = "3.0.13"
bytes = "1.0.1" bytes = "1.0.1"
base64 = "0.13.0" base64 = "0.13.0"
futures = "0.3.14" futures = "0.3.15"
derive_builder = "0.10.2" derive_builder = "0.10.2"
nipper = "0.1.9" nipper = "0.1.9"
log = "0.4.14" log = "0.4.14"
env_logger = "0.8.3" env_logger = "0.8.3"
[dev-dependencies] [dev-dependencies]
tokio = "1.5.0" tokio = "1.6.0"

View File

@ -1,6 +1,7 @@
use anyhow::{anyhow, bail, Result}; use anyhow::{anyhow, bail, Result};
use async_trait::async_trait; use async_trait::async_trait;
use cookie::Cookie; use cookie::Cookie;
use indoc::indoc;
use url::Host; use url::Host;
use crate::newspaper::{Metadata, Newspaper}; use crate::newspaper::{Metadata, Newspaper};
@ -81,9 +82,42 @@ impl Newspaper for CourrierInternational {
None => bail!("404 not found"), None => bail!("404 not found"),
}; };
let elements_to_remove = &[
// navigation elements
"header.site-header",
"footer.site-footer",
// Social buttons
"#toolbox-share",
".toolbox-share",
".toolbox-print",
".toolbox-respond",
".toolbox-zen",
".toolbox-newsletter",
".toolbox-offer",
".box-article-offer-friend-abo",
// unused services
".article-aside",
".article-secondary",
".article-subject-readmore",
// misc
".element-invisible",
".gptcontainer",
];
// FIXME: it doesn't work because the aside is in the article body
//
let toolbox_style = indoc! {"
aside.article-toolbox {
position: sticky;
top: 1em;
}
"};
let single_page_html = tools::self_contained_html::Config { let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader), downloader: Some(&downloader),
base_url: Some(&url), base_url: Some(&url),
elements_to_remove,
styles_to_add: &[toolbox_style],
..Default::default() ..Default::default()
} }
.run(&html) .run(&html)

View File

@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
}; };
// TODO: Move to const // TODO: Move to const
let elements_to_remove = [ let elements_to_remove = &[
// header // header
".fb-root", ".fb-root",
".skipLinks", ".skipLinks",
@ -107,7 +107,7 @@ impl Newspaper for Mediapart {
let single_page_html = tools::self_contained_html::Config { let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader), downloader: Some(&downloader),
base_url: Some(&url), base_url: Some(&url),
elements_to_remove: &elements_to_remove, elements_to_remove,
..Default::default() ..Default::default()
} }
.run(&html) .run(&html)

View File

@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
}; };
// TODO: Move to const // TODO: Move to const
let elements_to_remove = [ let elements_to_remove = &[
// navigation elements // navigation elements
"#tout-en-haut.preentete", "#tout-en-haut.preentete",
"#entete.connecte", "#entete.connecte",
@ -110,7 +110,7 @@ impl Newspaper for MondeDiplo {
let single_page_html = tools::self_contained_html::Config { let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader), downloader: Some(&downloader),
base_url: Some(&url), base_url: Some(&url),
elements_to_remove: &elements_to_remove, elements_to_remove,
..Default::default() ..Default::default()
} }
.run(&html) .run(&html)

View File

@ -59,6 +59,8 @@ where
let downloader = self.downloader.expect("Downloader not defined"); let downloader = self.downloader.expect("Downloader not defined");
// TODO: split/refactor this function : // TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - put each modification (ex: style in the `foreach`) in functions, maybe using
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
let (style_urls, html) = { let (style_urls, html) = {
let document = Document::from(html.as_ref()); let document = Document::from(html.as_ref());
@ -84,9 +86,8 @@ where
let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets let styles_url = stylesheets
.iter() .iter()
.map(|stylesheet| { .map(|style_link| {
if let Some(src) = stylesheet.attr("href") { if let Some(src) = style_link.attr("href") {
//TODO: does it work with absolute urls ?
base_url.join(src.as_ref()).ok() base_url.join(src.as_ref()).ok()
} else { } else {
None None
@ -110,13 +111,19 @@ where
styles styles
.iter() .iter()
.zip(downloaded_styles.iter()) .zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| { .for_each(|(mut style_link, inner_css)| {
if let Some(Some(inner_css)) = inner_css { if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap(); let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css); let media_query = style_link.attr("media");
stylesheet.replace_with_html(css); let css = match media_query {
Some(media_query) => {
format!("<style media=\"{}\">{}</style>", media_query, css)
}
None => format!("<style>{}</style>", css),
};
style_link.replace_with_html(css);
} else { } else {
stylesheet.remove(); style_link.remove();
} }
}); });
String::from(document.html()) String::from(document.html())
@ -192,6 +199,7 @@ where
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
// TODO: reduce boilerplate, DRY
use super::*; use super::*;
@ -362,6 +370,49 @@ mod tests {
Ok(()) Ok(())
} }
#[tokio::test]
async fn download_css_with_media_query() -> Result<()> {
let downloader = CssDownloader {};
let html = indoc! {"
<html>
<head>
<link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
</head>
<body>
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head>
<style media=\"print\">
section#warning {
color: red;
}
</style>
</head>
<body>
</body></html>
"};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
}
struct PngDownloader; struct PngDownloader;
#[async_trait] #[async_trait]
impl Download for PngDownloader { impl Download for PngDownloader {