fix: keep media queries in ref styles
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
Previously, media queries weren't kept when downloading styles from ref tags. This has been fixed so that the media attribute is kept when creating style tags from ref tags.
This commit is contained in:
parent
40ebc1ddea
commit
e34edf0b21
50
Cargo.lock
generated
50
Cargo.lock
generated
@ -662,9 +662,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
|
||||
checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
@ -677,9 +677,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
|
||||
checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
@ -687,15 +687,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
|
||||
checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
|
||||
checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
@ -704,9 +704,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
|
||||
checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-locks"
|
||||
@ -719,10 +719,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
|
||||
checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
|
||||
dependencies = [
|
||||
"autocfg 1.0.1",
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -731,15 +732,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
|
||||
checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
|
||||
checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"
|
||||
|
||||
[[package]]
|
||||
name = "futures-timer"
|
||||
@ -753,10 +754,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
|
||||
checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
|
||||
dependencies = [
|
||||
"autocfg 1.0.1",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
@ -895,9 +897,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "html-minifier"
|
||||
version = "3.0.12"
|
||||
version = "3.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af"
|
||||
checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf"
|
||||
dependencies = [
|
||||
"cow-utils",
|
||||
"educe",
|
||||
@ -1284,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||
|
||||
[[package]]
|
||||
name = "minifier"
|
||||
version = "0.0.40"
|
||||
version = "0.0.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9"
|
||||
checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08"
|
||||
dependencies = [
|
||||
"macro-utils",
|
||||
]
|
||||
@ -2659,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.5.0"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
|
||||
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
|
||||
dependencies = [
|
||||
"autocfg 1.0.1",
|
||||
"bytes",
|
||||
|
@ -22,5 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
|
||||
dotenv = "0.15.0"
|
||||
env_logger = "0.8.3"
|
||||
log = "0.4.14"
|
||||
tokio = { version = "1.5.0", features = ["full"] }
|
||||
tokio = { version = "1.6.0", features = ["full"] }
|
||||
tracing-subscriber = "0.2.18"
|
||||
|
@ -16,14 +16,14 @@ hyper-rustls = "0.22.1"
|
||||
cookie = "0.15.0"
|
||||
lol_html = "0.3.0"
|
||||
indoc = "1.0.3"
|
||||
html-minifier = "3.0.12"
|
||||
html-minifier = "3.0.13"
|
||||
bytes = "1.0.1"
|
||||
base64 = "0.13.0"
|
||||
futures = "0.3.14"
|
||||
futures = "0.3.15"
|
||||
derive_builder = "0.10.2"
|
||||
nipper = "0.1.9"
|
||||
log = "0.4.14"
|
||||
env_logger = "0.8.3"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = "1.5.0"
|
||||
tokio = "1.6.0"
|
||||
|
@ -1,6 +1,7 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use async_trait::async_trait;
|
||||
use cookie::Cookie;
|
||||
use indoc::indoc;
|
||||
use url::Host;
|
||||
|
||||
use crate::newspaper::{Metadata, Newspaper};
|
||||
@ -81,9 +82,42 @@ impl Newspaper for CourrierInternational {
|
||||
None => bail!("404 not found"),
|
||||
};
|
||||
|
||||
let elements_to_remove = &[
|
||||
// navigation elements
|
||||
"header.site-header",
|
||||
"footer.site-footer",
|
||||
// Social buttons
|
||||
"#toolbox-share",
|
||||
".toolbox-share",
|
||||
".toolbox-print",
|
||||
".toolbox-respond",
|
||||
".toolbox-zen",
|
||||
".toolbox-newsletter",
|
||||
".toolbox-offer",
|
||||
".box-article-offer-friend-abo",
|
||||
// unused services
|
||||
".article-aside",
|
||||
".article-secondary",
|
||||
".article-subject-readmore",
|
||||
// misc
|
||||
".element-invisible",
|
||||
".gptcontainer",
|
||||
];
|
||||
|
||||
// FIXME: it doesn't work because the aside is in the article body
|
||||
//
|
||||
let toolbox_style = indoc! {"
|
||||
aside.article-toolbox {
|
||||
position: sticky;
|
||||
top: 1em;
|
||||
}
|
||||
"};
|
||||
|
||||
let single_page_html = tools::self_contained_html::Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&url),
|
||||
elements_to_remove,
|
||||
styles_to_add: &[toolbox_style],
|
||||
..Default::default()
|
||||
}
|
||||
.run(&html)
|
||||
|
@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
|
||||
};
|
||||
|
||||
// TODO: Move to const
|
||||
let elements_to_remove = [
|
||||
let elements_to_remove = &[
|
||||
// header
|
||||
".fb-root",
|
||||
".skipLinks",
|
||||
@ -107,7 +107,7 @@ impl Newspaper for Mediapart {
|
||||
let single_page_html = tools::self_contained_html::Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&url),
|
||||
elements_to_remove: &elements_to_remove,
|
||||
elements_to_remove,
|
||||
..Default::default()
|
||||
}
|
||||
.run(&html)
|
||||
|
@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
|
||||
};
|
||||
|
||||
// TODO: Move to const
|
||||
let elements_to_remove = [
|
||||
let elements_to_remove = &[
|
||||
// navigation elements
|
||||
"#tout-en-haut.preentete",
|
||||
"#entete.connecte",
|
||||
@ -110,7 +110,7 @@ impl Newspaper for MondeDiplo {
|
||||
let single_page_html = tools::self_contained_html::Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&url),
|
||||
elements_to_remove: &elements_to_remove,
|
||||
elements_to_remove,
|
||||
..Default::default()
|
||||
}
|
||||
.run(&html)
|
||||
|
@ -59,6 +59,8 @@ where
|
||||
let downloader = self.downloader.expect("Downloader not defined");
|
||||
// TODO: split/refactor this function :
|
||||
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
|
||||
// - put each modification (ex: style in the `foreach`) in functions, maybe using
|
||||
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
|
||||
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
|
||||
let (style_urls, html) = {
|
||||
let document = Document::from(html.as_ref());
|
||||
@ -84,9 +86,8 @@ where
|
||||
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
|
||||
let styles_url = stylesheets
|
||||
.iter()
|
||||
.map(|stylesheet| {
|
||||
if let Some(src) = stylesheet.attr("href") {
|
||||
//TODO: does it work with absolute urls ?
|
||||
.map(|style_link| {
|
||||
if let Some(src) = style_link.attr("href") {
|
||||
base_url.join(src.as_ref()).ok()
|
||||
} else {
|
||||
None
|
||||
@ -110,13 +111,19 @@ where
|
||||
styles
|
||||
.iter()
|
||||
.zip(downloaded_styles.iter())
|
||||
.for_each(|(mut stylesheet, inner_css)| {
|
||||
.for_each(|(mut style_link, inner_css)| {
|
||||
if let Some(Some(inner_css)) = inner_css {
|
||||
let css = String::from_utf8(inner_css.to_vec()).unwrap();
|
||||
let css = format!("<style>{}</style>", css);
|
||||
stylesheet.replace_with_html(css);
|
||||
let media_query = style_link.attr("media");
|
||||
let css = match media_query {
|
||||
Some(media_query) => {
|
||||
format!("<style media=\"{}\">{}</style>", media_query, css)
|
||||
}
|
||||
None => format!("<style>{}</style>", css),
|
||||
};
|
||||
style_link.replace_with_html(css);
|
||||
} else {
|
||||
stylesheet.remove();
|
||||
style_link.remove();
|
||||
}
|
||||
});
|
||||
String::from(document.html())
|
||||
@ -192,6 +199,7 @@ where
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
// TODO: reduce boilerplate, DRY
|
||||
|
||||
use super::*;
|
||||
|
||||
@ -362,6 +370,49 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_css_with_media_query() -> Result<()> {
|
||||
let downloader = CssDownloader {};
|
||||
|
||||
let html = indoc! {"
|
||||
<html>
|
||||
<head>
|
||||
<link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
||||
"};
|
||||
|
||||
let wanted_html = indoc! {"
|
||||
<html><head>
|
||||
<style media=\"print\">
|
||||
section#warning {
|
||||
color: red;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
</body></html>
|
||||
"};
|
||||
let mut minifier = HTMLMinifier::new();
|
||||
minifier.digest(wanted_html)?;
|
||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
assert_eq!(
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
..Default::default()
|
||||
}
|
||||
.run(html)
|
||||
.await,
|
||||
minified
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct PngDownloader;
|
||||
#[async_trait]
|
||||
impl Download for PngDownloader {
|
||||
|
Loading…
Reference in New Issue
Block a user