diff --git a/Cargo.lock b/Cargo.lock index 52af574..95f5f16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -662,9 +662,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" +checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27" dependencies = [ "futures-channel", "futures-core", @@ -677,9 +677,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" +checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" dependencies = [ "futures-core", "futures-sink", @@ -687,15 +687,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" +checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" [[package]] name = "futures-executor" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" +checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79" dependencies = [ "futures-core", "futures-task", @@ -704,9 +704,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" +checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" [[package]] name = "futures-locks" @@ -719,10 +719,11 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" +checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" dependencies = [ + "autocfg 1.0.1", "proc-macro-hack", "proc-macro2", "quote", @@ -731,15 +732,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" +checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" [[package]] name = "futures-task" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" +checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" [[package]] name = "futures-timer" @@ -753,10 +754,11 @@ dependencies = [ [[package]] name = "futures-util" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" +checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" dependencies = [ + "autocfg 1.0.1", "futures-channel", "futures-core", "futures-io", @@ -895,9 +897,9 @@ dependencies = [ [[package]] name = "html-minifier" -version = "3.0.12" +version = "3.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af" +checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf" dependencies = [ "cow-utils", "educe", @@ -1284,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "minifier" -version = "0.0.40" +version = "0.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9" +checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08" dependencies = [ "macro-utils", ] @@ -2659,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" +checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" dependencies = [ "autocfg 1.0.1", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 6c58b28..6d1c92b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,5 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"} dotenv = "0.15.0" env_logger = "0.8.3" log = "0.4.14" -tokio = { version = "1.5.0", features = ["full"] } +tokio = { version = "1.6.0", features = ["full"] } tracing-subscriber = "0.2.18" diff --git a/crieur-retrieve/Cargo.toml b/crieur-retrieve/Cargo.toml index 6b6578b..6c3c947 100644 --- a/crieur-retrieve/Cargo.toml +++ b/crieur-retrieve/Cargo.toml @@ -16,14 +16,14 @@ hyper-rustls = "0.22.1" cookie = "0.15.0" lol_html = "0.3.0" indoc = "1.0.3" -html-minifier = "3.0.12" +html-minifier = "3.0.13" bytes = "1.0.1" base64 = "0.13.0" -futures = "0.3.14" +futures = "0.3.15" derive_builder = "0.10.2" nipper = "0.1.9" log = "0.4.14" env_logger = "0.8.3" [dev-dependencies] -tokio = "1.5.0" +tokio = "1.6.0" diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs index cd6799e..7263d11 100644 --- a/crieur-retrieve/src/newspapers/courrier_international.rs +++ b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -1,6 +1,7 @@ use anyhow::{anyhow, bail, Result}; use async_trait::async_trait; use cookie::Cookie; +use indoc::indoc; use url::Host; use crate::newspaper::{Metadata, Newspaper}; @@ -81,9 +82,42 @@ impl Newspaper for CourrierInternational { None => bail!("404 not found"), }; + let elements_to_remove = &[ + // navigation elements + "header.site-header", + "footer.site-footer", + // Social buttons + "#toolbox-share", + ".toolbox-share", + ".toolbox-print", + ".toolbox-respond", + ".toolbox-zen", + ".toolbox-newsletter", + ".toolbox-offer", + ".box-article-offer-friend-abo", + // unused services + ".article-aside", + ".article-secondary", + ".article-subject-readmore", + // misc + ".element-invisible", + ".gptcontainer", + ]; + + // FIXME: it doesn't work because the aside is in the article body + // + let toolbox_style = indoc! {" + aside.article-toolbox { + position: sticky; + top: 1em; + } + "}; + let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), + elements_to_remove, + styles_to_add: &[toolbox_style], ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs index 2586e08..40f17c1 100644 --- a/crieur-retrieve/src/newspapers/mediapart.rs +++ b/crieur-retrieve/src/newspapers/mediapart.rs @@ -86,7 +86,7 @@ impl Newspaper for Mediapart { }; // TODO: Move to const - let elements_to_remove = [ + let elements_to_remove = &[ // header ".fb-root", ".skipLinks", @@ -107,7 +107,7 @@ impl Newspaper for Mediapart { let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), - elements_to_remove: &elements_to_remove, + elements_to_remove, ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs index 8348dae..e3a12df 100644 --- a/crieur-retrieve/src/newspapers/monde_diplomatique.rs +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo { }; // TODO: Move to const - let elements_to_remove = [ + let elements_to_remove = &[ // navigation elements "#tout-en-haut.preentete", "#entete.connecte", @@ -110,7 +110,7 @@ impl Newspaper for MondeDiplo { let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), - elements_to_remove: &elements_to_remove, + elements_to_remove, ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 2e9ea2e..e04234f 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -59,6 +59,8 @@ where let downloader = self.downloader.expect("Downloader not defined"); // TODO: split/refactor this function : // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? + // - put each modification (ex: style in the `foreach`) in functions, maybe using + // (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42 // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure let (style_urls, html) = { let document = Document::from(html.as_ref()); @@ -84,9 +86,8 @@ where let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); let styles_url = stylesheets .iter() - .map(|stylesheet| { - if let Some(src) = stylesheet.attr("href") { - //TODO: does it work with absolute urls ? + .map(|style_link| { + if let Some(src) = style_link.attr("href") { base_url.join(src.as_ref()).ok() } else { None @@ -110,13 +111,19 @@ where styles .iter() .zip(downloaded_styles.iter()) - .for_each(|(mut stylesheet, inner_css)| { + .for_each(|(mut style_link, inner_css)| { if let Some(Some(inner_css)) = inner_css { let css = String::from_utf8(inner_css.to_vec()).unwrap(); - let css = format!("", css); - stylesheet.replace_with_html(css); + let media_query = style_link.attr("media"); + let css = match media_query { + Some(media_query) => { + format!("", media_query, css) + } + None => format!("", css), + }; + style_link.replace_with_html(css); } else { - stylesheet.remove(); + style_link.remove(); } }); String::from(document.html()) @@ -192,6 +199,7 @@ where #[cfg(test)] mod tests { + // TODO: reduce boilerplate, DRY use super::*; @@ -362,6 +370,49 @@ mod tests { Ok(()) } + #[tokio::test] + async fn download_css_with_media_query() -> Result<()> { + let downloader = CssDownloader {}; + + let html = indoc! {" + +
+ + + + + + "}; + + let wanted_html = indoc! {" + + + + + + "}; + let mut minifier = HTMLMinifier::new(); + minifier.digest(wanted_html)?; + let minified = String::from_utf8(minifier.get_html().into())?; + + let base_url = Url::parse("http://example.com")?; + assert_eq!( + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, + minified + ); + Ok(()) + } + struct PngDownloader; #[async_trait] impl Download for PngDownloader {