From e34edf0b21946826a3fb664e7dce2864222b0e52 Mon Sep 17 00:00:00 2001 From: koalp Date: Sat, 22 May 2021 04:25:50 +0200 Subject: [PATCH] fix: keep media queries in ref styles Previously, media queries weren't kept when downloading styles from ref tags. This has been fixed so that media attributes are kept when creating style tags from ref tags. --- Cargo.lock | 50 +++++++------- Cargo.toml | 2 +- crieur-retrieve/Cargo.toml | 6 +- .../src/newspapers/courrier_international.rs | 34 ++++++++++ crieur-retrieve/src/newspapers/mediapart.rs | 4 +- .../src/newspapers/monde_diplomatique.rs | 4 +- .../src/tools/self_contained_html.rs | 65 +++++++++++++++++-- 7 files changed, 126 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 52af574..95f5f16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -662,9 +662,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" +checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27" dependencies = [ "futures-channel", "futures-core", @@ -677,9 +677,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" +checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" dependencies = [ "futures-core", "futures-sink", @@ -687,15 +687,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" +checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" [[package]] name = "futures-executor" -version = "0.3.14" +version = "0.3.15" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" +checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79" dependencies = [ "futures-core", "futures-task", @@ -704,9 +704,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" +checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" [[package]] name = "futures-locks" @@ -719,10 +719,11 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" +checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" dependencies = [ + "autocfg 1.0.1", "proc-macro-hack", "proc-macro2", "quote", @@ -731,15 +732,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" +checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" [[package]] name = "futures-task" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" +checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" [[package]] name = "futures-timer" @@ -753,10 +754,11 @@ dependencies = [ [[package]] name = "futures-util" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" +checksum = 
"feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" dependencies = [ + "autocfg 1.0.1", "futures-channel", "futures-core", "futures-io", @@ -895,9 +897,9 @@ dependencies = [ [[package]] name = "html-minifier" -version = "3.0.12" +version = "3.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af" +checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf" dependencies = [ "cow-utils", "educe", @@ -1284,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "minifier" -version = "0.0.40" +version = "0.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9" +checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08" dependencies = [ "macro-utils", ] @@ -2659,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" +checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" dependencies = [ "autocfg 1.0.1", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 6c58b28..6d1c92b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,5 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"} dotenv = "0.15.0" env_logger = "0.8.3" log = "0.4.14" -tokio = { version = "1.5.0", features = ["full"] } +tokio = { version = "1.6.0", features = ["full"] } tracing-subscriber = "0.2.18" diff --git a/crieur-retrieve/Cargo.toml b/crieur-retrieve/Cargo.toml index 6b6578b..6c3c947 100644 --- a/crieur-retrieve/Cargo.toml +++ b/crieur-retrieve/Cargo.toml @@ -16,14 +16,14 @@ hyper-rustls = "0.22.1" cookie = "0.15.0" lol_html = "0.3.0" 
indoc = "1.0.3" -html-minifier = "3.0.12" +html-minifier = "3.0.13" bytes = "1.0.1" base64 = "0.13.0" -futures = "0.3.14" +futures = "0.3.15" derive_builder = "0.10.2" nipper = "0.1.9" log = "0.4.14" env_logger = "0.8.3" [dev-dependencies] -tokio = "1.5.0" +tokio = "1.6.0" diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs index cd6799e..7263d11 100644 --- a/crieur-retrieve/src/newspapers/courrier_international.rs +++ b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -1,6 +1,7 @@ use anyhow::{anyhow, bail, Result}; use async_trait::async_trait; use cookie::Cookie; +use indoc::indoc; use url::Host; use crate::newspaper::{Metadata, Newspaper}; @@ -81,9 +82,42 @@ impl Newspaper for CourrierInternational { None => bail!("404 not found"), }; + let elements_to_remove = &[ + // navigation elements + "header.site-header", + "footer.site-footer", + // Social buttons + "#toolbox-share", + ".toolbox-share", + ".toolbox-print", + ".toolbox-respond", + ".toolbox-zen", + ".toolbox-newsletter", + ".toolbox-offer", + ".box-article-offer-friend-abo", + // unused services + ".article-aside", + ".article-secondary", + ".article-subject-readmore", + // misc + ".element-invisible", + ".gptcontainer", + ]; + + // FIXME: it doesn't work because the aside is in the article body + // + let toolbox_style = indoc! 
{" + aside.article-toolbox { + position: sticky; + top: 1em; + } + "}; + let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), + elements_to_remove, + styles_to_add: &[toolbox_style], ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs index 2586e08..40f17c1 100644 --- a/crieur-retrieve/src/newspapers/mediapart.rs +++ b/crieur-retrieve/src/newspapers/mediapart.rs @@ -86,7 +86,7 @@ impl Newspaper for Mediapart { }; // TODO: Move to const - let elements_to_remove = [ + let elements_to_remove = &[ // header ".fb-root", ".skipLinks", @@ -107,7 +107,7 @@ impl Newspaper for Mediapart { let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), - elements_to_remove: &elements_to_remove, + elements_to_remove, ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs index 8348dae..e3a12df 100644 --- a/crieur-retrieve/src/newspapers/monde_diplomatique.rs +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo { }; // TODO: Move to const - let elements_to_remove = [ + let elements_to_remove = &[ // navigation elements "#tout-en-haut.preentete", "#entete.connecte", @@ -110,7 +110,7 @@ impl Newspaper for MondeDiplo { let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), - elements_to_remove: &elements_to_remove, + elements_to_remove, ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 2e9ea2e..e04234f 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -59,6 +59,8 @@ where let downloader = 
self.downloader.expect("Downloader not defined"); // TODO: split/refactor this function : // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? + // - put each modification (ex: style in the `foreach`) in functions, maybe using + // (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42 // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure let (style_urls, html) = { let document = Document::from(html.as_ref()); @@ -84,9 +86,8 @@ where let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); let styles_url = stylesheets .iter() - .map(|stylesheet| { - if let Some(src) = stylesheet.attr("href") { - //TODO: does it work with absolute urls ? + .map(|style_link| { + if let Some(src) = style_link.attr("href") { base_url.join(src.as_ref()).ok() } else { None @@ -110,13 +111,19 @@ where styles .iter() .zip(downloaded_styles.iter()) - .for_each(|(mut stylesheet, inner_css)| { + .for_each(|(mut style_link, inner_css)| { if let Some(Some(inner_css)) = inner_css { let css = String::from_utf8(inner_css.to_vec()).unwrap(); - let css = format!("", css); - stylesheet.replace_with_html(css); + let media_query = style_link.attr("media"); + let css = match media_query { + Some(media_query) => { + format!("", media_query, css) + } + None => format!("", css), + }; + style_link.replace_with_html(css); } else { - stylesheet.remove(); + style_link.remove(); } }); String::from(document.html()) @@ -192,6 +199,7 @@ where #[cfg(test)] mod tests { + // TODO: reduce boilerplate, DRY use super::*; @@ -362,6 +370,49 @@ mod tests { Ok(()) } + #[tokio::test] + async fn download_css_with_media_query() -> Result<()> { + let downloader = CssDownloader {}; + + let html = indoc! {" + + + + + + + + "}; + + let wanted_html = indoc! 
{" + + + + + + "}; + let mut minifier = HTMLMinifier::new(); + minifier.digest(wanted_html)?; + let minified = String::from_utf8(minifier.get_html().into())?; + + let base_url = Url::parse("http://example.com")?; + assert_eq!( + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, + minified + ); + Ok(()) + } + struct PngDownloader; #[async_trait] impl Download for PngDownloader {