fix: keep media queries in ref styles
All checks were successful
continuous-integration/drone/push Build is passing

Previously, media queries weren't keep when downloading styles from ref
tags.

It have been fixed so that media attribute are kept when creating style
tags from ref tags.
This commit is contained in:
koalp 2021-05-22 04:25:50 +02:00
parent 40ebc1ddea
commit e34edf0b21
Signed by: koalp
GPG Key ID: 35B21047DEB09A81
7 changed files with 126 additions and 39 deletions

50
Cargo.lock generated
View File

@ -662,9 +662,9 @@ dependencies = [
[[package]]
name = "futures"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
dependencies = [
"futures-channel",
"futures-core",
@ -677,9 +677,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
dependencies = [
"futures-core",
"futures-sink",
@ -687,15 +687,15 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"
[[package]]
name = "futures-executor"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
dependencies = [
"futures-core",
"futures-task",
@ -704,9 +704,9 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"
[[package]]
name = "futures-locks"
@ -719,10 +719,11 @@ dependencies = [
[[package]]
name = "futures-macro"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
dependencies = [
"autocfg 1.0.1",
"proc-macro-hack",
"proc-macro2",
"quote",
@ -731,15 +732,15 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"
[[package]]
name = "futures-task"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"
[[package]]
name = "futures-timer"
@ -753,10 +754,11 @@ dependencies = [
[[package]]
name = "futures-util"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
dependencies = [
"autocfg 1.0.1",
"futures-channel",
"futures-core",
"futures-io",
@ -895,9 +897,9 @@ dependencies = [
[[package]]
name = "html-minifier"
version = "3.0.12"
version = "3.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af"
checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf"
dependencies = [
"cow-utils",
"educe",
@ -1284,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
[[package]]
name = "minifier"
version = "0.0.40"
version = "0.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9"
checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08"
dependencies = [
"macro-utils",
]
@ -2659,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.5.0"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
dependencies = [
"autocfg 1.0.1",
"bytes",

View File

@ -22,5 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
dotenv = "0.15.0"
env_logger = "0.8.3"
log = "0.4.14"
tokio = { version = "1.5.0", features = ["full"] }
tokio = { version = "1.6.0", features = ["full"] }
tracing-subscriber = "0.2.18"

View File

@ -16,14 +16,14 @@ hyper-rustls = "0.22.1"
cookie = "0.15.0"
lol_html = "0.3.0"
indoc = "1.0.3"
html-minifier = "3.0.12"
html-minifier = "3.0.13"
bytes = "1.0.1"
base64 = "0.13.0"
futures = "0.3.14"
futures = "0.3.15"
derive_builder = "0.10.2"
nipper = "0.1.9"
log = "0.4.14"
env_logger = "0.8.3"
[dev-dependencies]
tokio = "1.5.0"
tokio = "1.6.0"

View File

@ -1,6 +1,7 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use indoc::indoc;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
@ -81,9 +82,42 @@ impl Newspaper for CourrierInternational {
None => bail!("404 not found"),
};
let elements_to_remove = &[
// navigation elements
"header.site-header",
"footer.site-footer",
// Social buttons
"#toolbox-share",
".toolbox-share",
".toolbox-print",
".toolbox-respond",
".toolbox-zen",
".toolbox-newsletter",
".toolbox-offer",
".box-article-offer-friend-abo",
// unused services
".article-aside",
".article-secondary",
".article-subject-readmore",
// misc
".element-invisible",
".gptcontainer",
];
// FIXME: it doesn't work because the aside is in the article body
//
let toolbox_style = indoc! {"
aside.article-toolbox {
position: sticky;
top: 1em;
}
"};
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove,
styles_to_add: &[toolbox_style],
..Default::default()
}
.run(&html)

View File

@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
};
// TODO: Move to const
let elements_to_remove = [
let elements_to_remove = &[
// header
".fb-root",
".skipLinks",
@ -107,7 +107,7 @@ impl Newspaper for Mediapart {
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
elements_to_remove,
..Default::default()
}
.run(&html)

View File

@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
};
// TODO: Move to const
let elements_to_remove = [
let elements_to_remove = &[
// navigation elements
"#tout-en-haut.preentete",
"#entete.connecte",
@ -110,7 +110,7 @@ impl Newspaper for MondeDiplo {
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
elements_to_remove,
..Default::default()
}
.run(&html)

View File

@ -59,6 +59,8 @@ where
let downloader = self.downloader.expect("Downloader not defined");
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - put each modification (ex: style in the `foreach`) in functions, maybe using
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
let (style_urls, html) = {
let document = Document::from(html.as_ref());
@ -84,9 +86,8 @@ where
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|stylesheet| {
if let Some(src) = stylesheet.attr("href") {
//TODO: does it work with absolute urls ?
.map(|style_link| {
if let Some(src) = style_link.attr("href") {
base_url.join(src.as_ref()).ok()
} else {
None
@ -110,13 +111,19 @@ where
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| {
.for_each(|(mut style_link, inner_css)| {
if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
let media_query = style_link.attr("media");
let css = match media_query {
Some(media_query) => {
format!("<style media=\"{}\">{}</style>", media_query, css)
}
None => format!("<style>{}</style>", css),
};
style_link.replace_with_html(css);
} else {
stylesheet.remove();
style_link.remove();
}
});
String::from(document.html())
@ -192,6 +199,7 @@ where
#[cfg(test)]
mod tests {
// TODO: reduce boilerplate, DRY
use super::*;
@ -362,6 +370,49 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn download_css_with_media_query() -> Result<()> {
let downloader = CssDownloader {};
let html = indoc! {"
<html>
<head>
<link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
</head>
<body>
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head>
<style media=\"print\">
section#warning {
color: red;
}
</style>
</head>
<body>
</body></html>
"};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
}
struct PngDownloader;
#[async_trait]
impl Download for PngDownloader {