From 970f510cd115543832d89c20bcecd244749200cf Mon Sep 17 00:00:00 2001 From: koalp Date: Sat, 8 May 2021 03:23:27 +0200 Subject: [PATCH 1/6] feat: add retrieval from le monde diplomatique Add retrieval from le monde diplomatique Previously, 404 pages were injected in the document when downloading styles Now, the downloader returns None when documents are not found --- Cargo.lock | 209 +++++++++++++----- Cargo.toml | 1 + README.md | 3 +- crieur-retrieve/Cargo.toml | 10 +- crieur-retrieve/src/article_location.rs | 27 ++- crieur-retrieve/src/newspapers/mediapart.rs | 7 +- crieur-retrieve/src/newspapers/mod.rs | 1 + .../src/newspapers/monde_diplomatique.rs | 135 +++++++++++ crieur-retrieve/src/tools/download.rs | 28 ++- .../src/tools/self_contained_html.rs | 6 +- .../reference/newspaper_configuration.md | 13 ++ examples/cli_downloader.rs | 23 +- 12 files changed, 375 insertions(+), 88 deletions(-) create mode 100644 crieur-retrieve/src/newspapers/monde_diplomatique.rs diff --git a/Cargo.lock b/Cargo.lock index 25be411..992c8ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -74,13 +74,22 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e" [[package]] name = "aho-corasick" -version = "0.7.15" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anyhow" version = "1.0.40" @@ -213,6 +222,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "winapi", +] + [[package]] name = "cipher" version = "0.2.5" @@ -285,10 +306,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" [[package]] -name = "cpuid-bool" -version = "0.1.2" +name = "cpufeatures" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" +checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4" +dependencies = [ + "libc", +] [[package]] name = "cpuid-bool" @@ -307,6 +331,7 @@ dependencies = [ "env_logger", "log", "tokio", + "tracing-subscriber", ] [[package]] @@ -821,9 +846,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc018e188373e2777d0ef2467ebff62a08e66c3f5857b23c8fbec3018210dc00" +checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" dependencies = [ "bytes", "fnv", @@ -877,9 +902,9 @@ dependencies = [ [[package]] name = "html-minifier" -version = "3.0.10" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "108452631307790510cde91282fc706ae70076bd68200add8638773f06d5e122" +checksum 
= "9876ed2cad2fd2f78bad42688bcac8a87cebce9b2381c5b05efc30b0c8429af0" dependencies = [ "cow-utils", "educe", @@ -914,9 +939,9 @@ dependencies = [ [[package]] name = "http-body" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737" +checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" dependencies = [ "bytes", "http", @@ -1055,9 +1080,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "js-sys" -version = "0.3.50" +version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c" +checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062" dependencies = [ "wasm-bindgen", ] @@ -1091,9 +1116,9 @@ checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e" [[package]] name = "lock_api" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176" +checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb" dependencies = [ "scopeguard", ] @@ -1158,6 +1183,15 @@ dependencies = [ "tendril", ] +[[package]] +name = "matchers" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +dependencies = [ + "regex-automata", +] + [[package]] name = "matches" version = "0.1.8" @@ -1254,9 +1288,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" [[package]] name = "memchr" -version = "2.3.4" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "mime" @@ -1585,7 +1619,7 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd" dependencies = [ - "cpuid-bool 0.2.0", + "cpuid-bool", "opaque-debug", "universal-hash", ] @@ -1856,18 +1890,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2" +checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc" dependencies = [ "bitflags", ] [[package]] name = "regex" -version = "1.4.6" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ "aho-corasick", "memchr", @@ -1875,10 +1909,20 @@ dependencies = [ ] [[package]] -name = "regex-syntax" -version = "0.6.23" +name = "regex-automata" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" +checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" +dependencies = [ + "byteorder", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "reqwest" @@ -2319,17 +2363,26 @@ checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d" [[package]] name = "sha2" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de" +checksum = "d8f6b75b17576b792bef0db1bcc4b8b8bcdf9506744cf34b974195487af6cff2" dependencies = [ "block-buffer", "cfg-if 1.0.0", - "cpuid-bool 0.1.2", + "cpufeatures", "digest", "opaque-debug", ] +[[package]] +name = "sharded-slab" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79c719719ee05df97490f80a45acfc99e5a30ce98a1e4fb67aee422745ae14e3" +dependencies = [ + "lazy_static", +] + [[package]] name = "signal-hook-registry" version = "1.3.0" @@ -2491,9 +2544,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2" [[package]] name = "syn" -version = "1.0.71" +version = "1.0.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373" +checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82" dependencies = [ "proc-macro2", "quote", @@ -2558,6 +2611,15 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd" +dependencies = [ + "once_cell", +] + [[package]] name = "time" version = "0.2.26" @@ -2684,9 +2746,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" [[package]] name = "tracing" -version = "0.1.25" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f" +checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" dependencies = [ "cfg-if 1.0.0", "pin-project-lite", @@ -2707,9 +2769,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" +checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052" dependencies = [ "lazy_static", ] @@ -2724,6 +2786,49 @@ dependencies = [ "tracing", ] +[[package]] +name = "tracing-log" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa5553bf0883ba7c9cbe493b085c29926bd41b66afc31ff72cf17ff4fb60dcd5" +dependencies = [ + "ansi_term", + "chrono", + "lazy_static", + "matchers", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec 1.6.1", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + 
"tracing-serde", +] + [[package]] name = "try-lock" version = "0.2.3" @@ -2756,9 +2861,9 @@ dependencies = [ [[package]] name = "unicode-xid" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" [[package]] name = "unindent" @@ -2784,9 +2889,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.2.1" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" dependencies = [ "form_urlencoded", "idna", @@ -2846,9 +2951,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasm-bindgen" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9" +checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd" dependencies = [ "cfg-if 1.0.0", "serde", @@ -2858,9 +2963,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae" +checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900" dependencies = [ "bumpalo", "lazy_static", @@ -2873,9 +2978,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.23" +version = "0.4.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea" +checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1" dependencies = [ "cfg-if 1.0.0", "js-sys", @@ -2885,9 +2990,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f" +checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2895,9 +3000,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c" +checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97" dependencies = [ "proc-macro2", "quote", @@ -2908,15 +3013,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489" +checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f" [[package]] name = "web-sys" -version = "0.3.50" +version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be" +checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582" dependencies = [ "js-sys", "wasm-bindgen", diff 
--git a/Cargo.toml b/Cargo.toml index 458df77..6c58b28 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,3 +23,4 @@ dotenv = "0.15.0" env_logger = "0.8.3" log = "0.4.14" tokio = { version = "1.5.0", features = ["full"] } +tracing-subscriber = "0.2.18" diff --git a/README.md b/README.md index 455d599..8a77713 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -Tools to retrieve articles from multiple newspaper you subscribed to. +Tools to retrieve articles from multiple newspapers you are subscribed to, all +from the same place. **This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work !** diff --git a/crieur-retrieve/Cargo.toml b/crieur-retrieve/Cargo.toml index 7b35a3c..089aea1 100644 --- a/crieur-retrieve/Cargo.toml +++ b/crieur-retrieve/Cargo.toml @@ -8,19 +8,19 @@ publish = false [dependencies] anyhow = "1.0.40" -async-trait = "0.1.48" +async-trait = "0.1.50" thiserror = "1.0.24" -url = "2.2.1" -hyper = { version = "0.14.5", features = ["full"] } +url = "2.2.2" +hyper = { version = "0.14.7", features = ["full"] } hyper-rustls = "0.22.1" cookie = "0.15.0" lol_html = "0.3.0" indoc = "1.0.3" -html-minifier = "3.0.9" +html-minifier = "3.0.11" bytes = "1.0.1" base64 = "0.13.0" futures = "0.3.14" -derive_builder = "0.10.0" +derive_builder = "0.10.2" nipper = "0.1.9" log = "0.4.14" env_logger = "0.8.3" diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs index d6a177a..a0476b3 100644 --- a/crieur-retrieve/src/article_location.rs +++ b/crieur-retrieve/src/article_location.rs @@ -2,12 +2,12 @@ use std::boxed::Box; use std::convert::TryInto; use std::env; -use anyhow::anyhow; use log::info; use url::{Host, Url}; use crate::newspaper::Newspaper; use crate::newspapers::mediapart::{self, Mediapart}; +use crate::newspapers::monde_diplomatique::{self, MondeDiplo}; /// Enumerate all errors that can be encountered when using ArticleLocation #[derive(thiserror::Error, Debug)] @@ -33,6 +33,7 @@ type Newspapers = Vec<Box<dyn Newspaper>>; pub type Result = core::result::Result; fn default_newpapers() -> Result { + // TODO: the same thing is written too many times: how to DRY this? let config_key = "MEDIAPART_COOKIE".to_string(); let mpruiid = env::var(&config_key) .map_err(|_| Error::Misconfiguration(config_key))? .into(); let mediapart = Mediapart::builder() .login(mediapart::Login::MPRUUID(mpruiid)) .build()?; - Ok(vec![Box::new(mediapart)]) + let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string(); + let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string(); + let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string(); + + let lmd_a_m = env::var(&lmd_a_m) + .map_err(|_| Error::Misconfiguration(lmd_a_m))? + .into(); + let phpsessid = env::var(&phpsessid) + .map_err(|_| Error::Misconfiguration(phpsessid))? + .into(); + let spip_session = env::var(&spip_session) + .map_err(|_| Error::Misconfiguration(spip_session))?
+ .into(); + + let monde_diplo = MondeDiplo::builder() + .login(monde_diplomatique::Login::Cookies { + lmd_a_m, + phpsessid, + spip_session, + }) + .build()?; + + Ok(vec![Box::new(mediapart), Box::new(monde_diplo)]) } #[derive(Default)] diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs index 0933b6e..24a3933 100644 --- a/crieur-retrieve/src/newspapers/mediapart.rs +++ b/crieur-retrieve/src/newspapers/mediapart.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, bail, Result}; use async_trait::async_trait; use cookie::Cookie; use url::Host; @@ -80,7 +80,10 @@ impl Newspaper for Mediapart { let downloader = Downloader { cookies }; let body = downloader.download(&url).await?; - let html = String::from_utf8(body.to_vec())?; + let html = match body { + Some(body) => String::from_utf8(body.to_vec())?, + None => bail!("404 not found"), + }; // TODO: Move to const let element_to_remove = [ diff --git a/crieur-retrieve/src/newspapers/mod.rs b/crieur-retrieve/src/newspapers/mod.rs index 7f44529..1cc9356 100644 --- a/crieur-retrieve/src/newspapers/mod.rs +++ b/crieur-retrieve/src/newspapers/mod.rs @@ -1 +1,2 @@ pub mod mediapart; +pub mod monde_diplomatique; diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs new file mode 100644 index 0000000..abf5187 --- /dev/null +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -0,0 +1,135 @@ +use anyhow::{anyhow, bail, Result}; +use async_trait::async_trait; +use cookie::Cookie; +use url::Host; + +use crate::newspaper::{Metadata, Newspaper}; +use crate::tools; +use crate::Url; +use crate::{Download, Downloader}; + +pub enum Login { + Username(String, String), + Cookies { + lmd_a_m: String, + phpsessid: String, + spip_session: String, + }, +} + +#[derive(Debug, Clone, Default)] +pub struct MondeDiplo { + login_cookies: Vec<(String, String)>, +} + +fn str_to_host<S: Into<String>>(host: S) -> Host { + Host::Domain(host.into()) +} + +#[derive(Debug, Clone, Default)] +pub struct Builder { + login_cookies: Option<Vec<(String, String)>>, +} + +impl Builder { + pub fn login(&mut self, login: Login) -> &mut Self { + self.login_cookies = match login { + Login::Username(_username, _password) => { + unimplemented!("login using username and password not implemented") + } + Login::Cookies { + lmd_a_m, + phpsessid, + spip_session, + } => Some(vec![ + ("lmd_a_m".into(), lmd_a_m), + ("PHPSESSID".into(), phpsessid), + ("spip_session".into(), spip_session), + ]), + }; + self + } + + pub fn build(&self) -> Result<MondeDiplo> { + match &self.login_cookies { + Some(login_cookies) => Ok(MondeDiplo { + login_cookies: login_cookies.clone(), + }), + None => Err(anyhow!("You have to log in to access this newspaper")), + } + } +} + +#[async_trait] +impl Newspaper for MondeDiplo { + fn metadata(&self) -> Metadata { + Metadata::builder() + .hosts(vec![ + str_to_host("monde-diplomatique.fr"), + str_to_host("www.monde-diplomatique.fr"), + ]) + .lower_case_name("monde-diplomatique") + .name("Le Monde Diplomatique") + .build() + .unwrap_or_default() + } + + async fn retrieve_html(&self, url: &Url) -> Result<String> { + let cookies = self + .login_cookies + .iter() + .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish()) + .collect::<Vec<_>>(); + //let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1) + // .secure(true) + // .finish(); + //let cookies = vec![cookie]; + + // TODO: replace by builder + let downloader = Downloader { cookies }; + + let body = 
downloader.download(&url).await?; + let html = match body { + Some(body) => String::from_utf8(body.to_vec())?, + None => bail!("404 not found"), + }; + + // TODO: Move to const + let element_to_remove = [ + // navigation elements + "#tout-en-haut.preentete", + "#entete.connecte", + "#navigation", + "#pied", + ".bloc-connexion", + // unused features + "#ecouter", + // Social buttons + ".actions-article", + "#partage", + // misc + "noscript", + ]; + + let single_page_html = + tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + Ok(single_page_html) + } + + fn new() -> Self { + Self { + ..Default::default() + } + } + + async fn has_complete_access(&self) -> bool { + // TODO: check if we are logged using the cookie + true + } +} + +impl MondeDiplo { + pub fn builder() -> Builder { + Builder::default() + } +} diff --git a/crieur-retrieve/src/tools/download.rs b/crieur-retrieve/src/tools/download.rs index e00f77f..fef1df4 100644 --- a/crieur-retrieve/src/tools/download.rs +++ b/crieur-retrieve/src/tools/download.rs @@ -4,7 +4,7 @@ use anyhow::Result; use async_trait::async_trait; use bytes::Bytes; use cookie::Cookie; -use hyper::{header, Body, Client, Method, Request}; +use hyper::{header, Body, Client, Method, Request, StatusCode}; use thiserror::Error; use url::Url; @@ -22,7 +22,9 @@ pub trait Download { type Error: StdError; /// Downloads a file from an url and returns the result as bytes - async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>; + /// + /// If the file is not found, returns None + async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>; } /// Store several cookies @@ -36,7 +38,8 @@ pub struct Downloader<'c> { impl<'c> Download for Downloader<'c> { type Error = DownloadError; - async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> { + async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> { + log::info!("downloading url {:?}", file_link); let https = hyper_rustls::HttpsConnector::with_native_roots(); let client: Client<_, hyper::Body> = Client::builder().build(https); @@ -44,14 +47,25 @@ impl<'c> Download for Downloader<'c> { .method(Method::GET) .uri(file_link.as_str()); - for cookie in &self.cookies { - req = req.header(header::COOKIE, cookie.to_string()); - } + req = req.header( + header::COOKIE, + self.cookies + .iter() + .map(Cookie::to_string) + .collect::<Vec<_>>() + .join(";"), + ); + log::info!("headers : {:?}", req.headers_ref()); let req = req.body(Body::empty())?; let resp = client.request(req).await?; - let body = hyper::body::to_bytes(resp).await?; + let body = match resp.status() { + StatusCode::OK => Some(hyper::body::to_bytes(resp).await?), + StatusCode::NOT_FOUND => None, + // TODO: enhance this by handling more error codes + _ => None, + }; Ok(body) } } diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 7283a0e..32b8c85 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -77,7 +77,7 @@ where .iter() .zip(downloaded_styles.iter()) .for_each(|(mut stylesheet, inner_css)| { - if let Some(inner_css) = inner_css { + if let Some(Some(inner_css)) = inner_css { let css = String::from_utf8(inner_css.to_vec()).unwrap(); let css = format!("<style>{}</style>", css); stylesheet.replace_with_html(css); @@ -120,10 +120,12 @@ where imgs.iter() .zip(downloaded_images.iter()) .for_each(|(mut img, data)| { - if let Some((url, data)) = data { + if let Some((url, Some(data))) = data { let data = base64::encode(data); let
extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); + } else { + img.remove() + } }); // ---- Remove unwanted html elements ----- diff --git a/documentation/reference/newspaper_configuration.md b/documentation/reference/newspaper_configuration.md index 8658087..0fe3c9a 100644 --- a/documentation/reference/newspaper_configuration.md +++ b/documentation/reference/newspaper_configuration.md @@ -8,3 +8,16 @@ The newspapers are configured using environment variables MEDIAPART_COOKIE : sets the `MPRUUID` cookie, used to log in + +# Le Monde Diplomatique + +All cookies are mandatory to log in + +MONDE_DIPLO_LMD_A_M +: sets the `lmd_a_m` cookie + +MONDE_DIPLO_PHPSESSID +: sets the `PHPSESSID` cookie + +MONDE_DIPLO_SPIP_SESSION +: sets the `spip_session` cookie diff --git a/examples/cli_downloader.rs b/examples/cli_downloader.rs index 4a4eefe..84d1815 100644 --- a/examples/cli_downloader.rs +++ b/examples/cli_downloader.rs @@ -2,38 +2,27 @@ use std::convert::TryInto; use std::env; use anyhow::Result; -use crieur_retrieve::{ - newspaper::Newspaper, - newspapers::mediapart::{self, Mediapart}, - ArticleLocation, Url, -}; +use crieur_retrieve::{ArticleLocation, Url}; use dotenv::dotenv; use log::info; #[tokio::main] async fn main() -> Result<()> { dotenv().ok(); - env_logger::init(); + tracing_subscriber::fmt() + .with_writer(std::io::stderr) + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .init(); let url = match env::args().nth(1) { Some(url) => Url::parse(&url)?, None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?, }; - // TODO: remove this in favor of default newspapers - - let mpruiid = env::var("MEDIAPART_COOKIE")?.into(); - let mediapart = Mediapart::builder() - .login(mediapart::Login::MPRUUID(mpruiid)) - .build()?; - info!("Trying to download article from {}", url); // TODO: shorten this, maybe an helper function ? - let article_location = ArticleLocation::builder() - .url(url)? - .newspaper(mediapart) - .build()?; + let article_location = ArticleLocation::builder().url(url)?.build()?; let article_str = article_location.retrieve_html().await?; From cee0af6c3c3c0b8e997a201cbf52a4a9d221b10b Mon Sep 17 00:00:00 2001 From: koalp Date: Thu, 13 May 2021 20:30:27 +0200 Subject: [PATCH 2/6] fix: only select images that have non-data src Previously, when the image URL contained data, the code tried to parse the URL and failed instead of keeping the data. It has been fixed so that images whose URL starts with 'data:' are not modified.
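
For illustration, a minimal standalone sketch of the selection this fix relies on, using the same nipper API as the codebase (this snippet is not part of the patch):

    use nipper::Document;

    fn main() {
        let html = r#"<img src="data:image/png;base64,AAAA"><img src="/logo.png">"#;
        let document = Document::from(html);
        // Only /logo.png matches: images whose src already starts with
        // "data:" are excluded by the selector itself, so they are never
        // joined against the base url or downloaded again.
        for img in document.select("img:not([src^=\"data:\"])").iter() {
            println!("{:?}", img.attr("src"));
        }
    }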
--- crieur-retrieve/src/tools/self_contained_html.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 32b8c85..44c3e11 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -92,7 +92,7 @@ where // let image_urls = { let document = Document::from(&html); - let imgs = document.select("img"); + let imgs = document.select("img:not([src^=\"data:\"])"); imgs.iter() .map(|image| { @@ -115,7 +115,7 @@ where let html = { let document = Document::from(&html); - let imgs = document.select("img"); + let imgs = document.select("img:not([src^=\"data:\"])"); imgs.iter() .zip(downloaded_images.iter()) From 5d0872b4d97eb1bd440eed810245e1847f36a095 Mon Sep 17 00:00:00 2001 From: koalp Date: Thu, 13 May 2021 20:29:36 +0200 Subject: [PATCH 3/6] feat: add retrieval from courrier international Retrieval of articles from Courrier international has been added --- Cargo.lock | 24 +--- crieur-chatbot/src/handlers/html.rs | 1 - crieur-retrieve/Cargo.toml | 3 +- crieur-retrieve/src/article_location.rs | 22 +++- .../src/newspapers/courrier_international.rs | 111 ++++++++++++++++++ crieur-retrieve/src/newspapers/mod.rs | 1 + .../src/newspapers/monde_diplomatique.rs | 4 - crieur-retrieve/src/tools/download.rs | 5 +- .../src/tools/self_contained_html.rs | 16 +-- .../reference/newspaper_configuration.md | 8 ++ justfile | 28 +++-- 11 files changed, 176 insertions(+), 47 deletions(-) create mode 100644 crieur-retrieve/src/newspapers/courrier_international.rs diff --git a/Cargo.lock b/Cargo.lock index 992c8ee..52af574 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -362,7 +362,6 @@ dependencies = [ "hyper", "hyper-rustls", "indoc", - "itertools", "log", "lol_html", "nipper", @@ -594,12 +593,6 @@ dependencies = [ "syn", ] -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - [[package]] name = "encoding_rs" version = "0.8.28" @@ -902,9 +895,9 @@ dependencies = [ [[package]] name = "html-minifier" -version = "3.0.11" +version = "3.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9876ed2cad2fd2f78bad42688bcac8a87cebce9b2381c5b05efc30b0c8429af0" +checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af" dependencies = [ "cow-utils", "educe", @@ -1063,15 +1056,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" -[[package]] -name = "itertools" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" -dependencies = [ - "either", -] - [[package]] name = "itoa" version = "0.4.7" @@ -1300,9 +1284,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "minifier" -version = "0.0.39" +version = "0.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6cdf618de5c9c98d4a7b2e0d1f1e44f82a19196cfd94040bb203621c25d28d98" +checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9" dependencies = [ "macro-utils", ] diff --git a/crieur-chatbot/src/handlers/html.rs b/crieur-chatbot/src/handlers/html.rs index f8a73fd..0b514df 100644 ---
a/crieur-chatbot/src/handlers/html.rs +++ b/crieur-chatbot/src/handlers/html.rs @@ -113,7 +113,6 @@ impl EventHandler for Html { } else { return; }; - info!("sending file"); match msg_body.split(' ').collect::>().as_slice() { ["!html", url, ..] => send_article(*url, room).await, diff --git a/crieur-retrieve/Cargo.toml b/crieur-retrieve/Cargo.toml index 089aea1..6b6578b 100644 --- a/crieur-retrieve/Cargo.toml +++ b/crieur-retrieve/Cargo.toml @@ -16,7 +16,7 @@ hyper-rustls = "0.22.1" cookie = "0.15.0" lol_html = "0.3.0" indoc = "1.0.3" -html-minifier = "3.0.11" +html-minifier = "3.0.12" bytes = "1.0.1" base64 = "0.13.0" futures = "0.3.14" @@ -24,7 +24,6 @@ derive_builder = "0.10.2" nipper = "0.1.9" log = "0.4.14" env_logger = "0.8.3" -itertools = "0.10.0" [dev-dependencies] tokio = "1.5.0" diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs index a0476b3..ebe97b1 100644 --- a/crieur-retrieve/src/article_location.rs +++ b/crieur-retrieve/src/article_location.rs @@ -6,6 +6,7 @@ use log::info; use url::{Host, Url}; use crate::newspaper::Newspaper; +use crate::newspapers::courrier_international::{self, CourrierInternational}; use crate::newspapers::mediapart::{self, Mediapart}; use crate::newspapers::monde_diplomatique::{self, MondeDiplo}; @@ -65,7 +66,25 @@ fn default_newpapers() -> Result { }) .build()?; - Ok(vec![Box::new(mediapart), Box::new(monde_diplo)]) + let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string(); + let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string(); + + let lmd_a_m = env::var(&lmd_a_m) + .map_err(|_| Error::Misconfiguration(lmd_a_m))? + .into(); + let ssess = env::var(&ssess) + .map_err(|_| Error::Misconfiguration(ssess))? + .into(); + + let courrier_international = CourrierInternational::builder() + .login(courrier_international::Login::Cookies { lmd_a_m, ssess }) + .build()?; + + Ok(vec![ + Box::new(mediapart), + Box::new(monde_diplo), + Box::new(courrier_international), + ]) } #[derive(Default)] @@ -149,7 +168,6 @@ impl ArticleLocation { } pub async fn retrieve_html(&self) -> Result { - info!("It will download from {}", self.url); // TODO: modify when retrieve_html returns a specific Error type Ok(self.newspaper.retrieve_html(&self.url).await?) 
} diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs new file mode 100644 index 0000000..911b9e9 --- /dev/null +++ b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -0,0 +1,111 @@ +use anyhow::{anyhow, bail, Result}; +use async_trait::async_trait; +use cookie::Cookie; +use url::Host; + +use crate::newspaper::{Metadata, Newspaper}; +use crate::tools; +use crate::Url; +use crate::{Download, Downloader}; + +pub enum Login { + Username(String, String), + Cookies { lmd_a_m: String, ssess: String }, +} + +#[derive(Debug, Clone, Default)] +pub struct CourrierInternational { + login_cookies: Vec<(String, String)>, +} + +fn str_to_host<S: Into<String>>(host: S) -> Host { + Host::Domain(host.into()) +} + +#[derive(Debug, Clone, Default)] +pub struct Builder { + login_cookies: Option<Vec<(String, String)>>, +} + +impl Builder { + pub fn login(&mut self, login: Login) -> &mut Self { + self.login_cookies = match login { + Login::Username(_username, _password) => { + unimplemented!("login using username and password not implemented") + } + Login::Cookies { lmd_a_m, ssess } => Some(vec![ + ("lmd_a_m".into(), lmd_a_m), + ("SSESS862c7003d721c672d39f161b1456b890".into(), ssess), + ]), + }; + self + } + + pub fn build(&self) -> Result<CourrierInternational> { + match &self.login_cookies { + Some(login_cookies) => Ok(CourrierInternational { + login_cookies: login_cookies.clone(), + }), + None => Err(anyhow!("You have to log in to access this newspaper")), + } + } } + +#[async_trait] +impl Newspaper for CourrierInternational { + fn metadata(&self) -> Metadata { + Metadata::builder() + .hosts(vec![ + str_to_host("courrierinternational.com"), + str_to_host("www.courrierinternational.com"), + ]) + .lower_case_name("courrier-international") + .name("Courrier international") + .build() + .unwrap_or_default() + } + + async fn retrieve_html(&self, url: &Url) -> Result<String> { + let cookies = self + .login_cookies + .iter() + .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish()) + .collect::<Vec<_>>(); + + // TODO: replace by builder + let downloader = Downloader { cookies }; + + let body = downloader.download(&url).await?; + let html = match body { + Some(body) => String::from_utf8(body.to_vec())?, + None => bail!("404 not found"), + }; + + // TODO: Move to const + let element_to_remove = [ + // navigation elements + "#entete.connecte", + ]; + + let single_page_html = + tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + Ok(single_page_html) + } + + fn new() -> Self { + Self { + ..Default::default() + } + } + + async fn has_complete_access(&self) -> bool { + // TODO: check if we are logged using the cookie + true + } +} + +impl CourrierInternational { + pub fn builder() -> Builder { + Builder::default() + } +} diff --git a/crieur-retrieve/src/newspapers/mod.rs b/crieur-retrieve/src/newspapers/mod.rs index 1cc9356..d07c868 100644 --- a/crieur-retrieve/src/newspapers/mod.rs +++ b/crieur-retrieve/src/newspapers/mod.rs @@ -1,2 +1,3 @@ +pub mod courrier_international; pub mod mediapart; pub mod monde_diplomatique; diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs index abf5187..041737f 100644 --- a/crieur-retrieve/src/newspapers/monde_diplomatique.rs +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -80,10 +80,6 @@ impl Newspaper for MondeDiplo { .iter() .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish()) .collect::<Vec<_>>(); - //let cookie = Cookie::build(&self.login_cookie.0,
&self.login_cookie.1) - // .secure(true) - // .finish(); - //let cookies = vec![cookie]; // TODO: replace by builder let downloader = Downloader { cookies }; diff --git a/crieur-retrieve/src/tools/download.rs b/crieur-retrieve/src/tools/download.rs index fef1df4..ff5096b 100644 --- a/crieur-retrieve/src/tools/download.rs +++ b/crieur-retrieve/src/tools/download.rs @@ -39,7 +39,7 @@ impl<'c> Download for Downloader<'c> { type Error = DownloadError; async fn download(&self, file_link: &Url) -> Result, Self::Error> { - log::info!("downloading url {:?}", file_link); + log::debug!("downloading url {:?}", file_link); let https = hyper_rustls::HttpsConnector::with_native_roots(); let client: Client<_, hyper::Body> = Client::builder().build(https); @@ -55,11 +55,12 @@ impl<'c> Download for Downloader<'c> { .collect::>() .join(";"), ); - log::info!("headers : {:?}", req.headers_ref()); + log::debug!("headers : {:?}", req.headers_ref()); let req = req.body(Body::empty())?; let resp = client.request(req).await?; + log::debug!("Response status : {:?}", resp.status()); let body = match resp.status() { StatusCode::OK => Some(hyper::body::to_bytes(resp).await?), StatusCode::NOT_FOUND => None, diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 44c3e11..bc25211 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -170,8 +170,8 @@ mod tests { #[async_trait] impl Download for DummyDownloader { type Error = errors::Error; - async fn download(&self, _file_link: &Url) -> errors::Result { - Ok(Bytes::from("")) + async fn download(&self, _file_link: &Url) -> errors::Result> { + Ok(Some(Bytes::from(""))) } } @@ -248,12 +248,14 @@ mod tests { #[async_trait] impl Download for CssDownloader { type Error = errors::Error; - async fn download(&self, _file_link: &Url) -> errors::Result { - Ok(indoc! {" + async fn download(&self, _file_link: &Url) -> errors::Result> { + Ok(Some( + indoc! {" section#warning { color: red; }"} - .into()) + .into(), + )) } } @@ -300,12 +302,12 @@ mod tests { #[async_trait] impl Download for PngDownloader { type Error = errors::Error; - async fn download(&self, _file_link: &Url) -> errors::Result { + async fn download(&self, _file_link: &Url) -> errors::Result> { let image_path = Path::new("test_data/home.png"); let mut image_file = File::open(&image_path).unwrap(); let mut image_buf: Vec = vec![]; image_file.read_to_end(&mut image_buf).unwrap(); - Ok(image_buf.into()) + Ok(Some(image_buf.into())) } } diff --git a/documentation/reference/newspaper_configuration.md b/documentation/reference/newspaper_configuration.md index 0fe3c9a..fa7ab2c 100644 --- a/documentation/reference/newspaper_configuration.md +++ b/documentation/reference/newspaper_configuration.md @@ -21,3 +21,11 @@ MONDE_DIPLO_PHPSESSID MONDE_DIPLO_SPIP_SESSION : sets the `spip_session` cookie + +# Courrier international + +COURRIER_INTERNATIONAL_LMD_A_M +: sets the `lmd_a_m` cookie + +COURRIER_INTERNATIONAL_SSESS +: sets the `ssess` cookie diff --git a/justfile b/justfile index a28b9d5..3b60ba7 100644 --- a/justfile +++ b/justfile @@ -1,19 +1,29 @@ @build: - cargo build + cargo build + +@build-container: + podman build --file ./containers/chatbot.containerfile -t crieur-chatbot . 
@clean: - cargo clean + cargo clean @run: - cargo run + cargo run + +@test: + cargo test --all + +@clippy: + cargo clippy + +@fmt: + cargo fmt + +@simulate-ci: fmt clippy test -@container: - podman build --file ./containers/chatbot.containerfile -t crieur-chatbot . @audit: - cargo audit + cargo audit @crev: - cargo crev verify - -@verify: audit crev + cargo crev verify From 6e091a32fc111a7c9f46886c4399915f4830d0a1 Mon Sep 17 00:00:00 2001 From: koalp Date: Mon, 17 May 2021 20:18:32 +0200 Subject: [PATCH 4/6] chore: use a config struct for self_contained_html Previously, self_contained_html was a function taking all parameters as arguments. As new optional parameters are being added, the function had too many arguments, and every usage of the function would have to be modified each time an argument was added. Therefore, it has been moved to a configuration structure with a `run` function taking only one argument, the html string. --- crieur-chatbot/src/chatbot.rs | 10 +- crieur-chatbot/src/handlers/html.rs | 5 +- crieur-retrieve/src/article_location.rs | 28 +- .../src/newspapers/courrier_international.rs | 12 +- crieur-retrieve/src/newspapers/mediapart.rs | 16 +- .../src/newspapers/monde_diplomatique.rs | 12 +- crieur-retrieve/src/tools/mod.rs | 3 +- .../src/tools/self_contained_html.rs | 331 +++++++++++------- documentation/design/scope.md | 3 +- 9 files changed, 239 insertions(+), 181 deletions(-) diff --git a/crieur-chatbot/src/chatbot.rs b/crieur-chatbot/src/chatbot.rs index 137b9ef..85018b2 100644 --- a/crieur-chatbot/src/chatbot.rs +++ b/crieur-chatbot/src/chatbot.rs @@ -2,15 +2,7 @@ use std::convert::TryInto; use anyhow::Result; -use matrix_sdk::{ - self, async_trait, - events::{ - room::message::{MessageEventContent, MessageType, TextMessageEventContent}, - AnyMessageEventContent, SyncMessageEvent, - }, - room::Room, - Client, ClientConfig, EventHandler, SyncSettings, -}; +use matrix_sdk::{self, Client, SyncSettings}; use crate::Html; diff --git a/crieur-chatbot/src/handlers/html.rs b/crieur-chatbot/src/handlers/html.rs index 0b514df..3d5bf61 100644 --- a/crieur-chatbot/src/handlers/html.rs +++ b/crieur-chatbot/src/handlers/html.rs @@ -1,7 +1,6 @@ use std::convert::TryInto; -use std::env; -use log::{error, info}; +use log::error; use matrix_sdk::{ self, async_trait, events::{ @@ -9,7 +8,7 @@ use matrix_sdk::{ AnyMessageEventContent, SyncMessageEvent, }, room::Room, - Client, ClientConfig, EventHandler, SyncSettings, + EventHandler, }; use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url}; diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs index ebe97b1..2062c0b 100644 --- a/crieur-retrieve/src/article_location.rs +++ b/crieur-retrieve/src/article_location.rs @@ -2,7 +2,6 @@ use std::boxed::Box; use std::convert::TryInto; use std::env; -use log::info; use url::{Host, Url}; use crate::newspaper::Newspaper; @@ -36,27 +35,20 @@ pub type Result = core::result::Result; fn default_newpapers() -> Result { // TODO: the same thing is written too many times: how to DRY this? let config_key = "MEDIAPART_COOKIE".to_string(); - let mpruiid = env::var(&config_key) - .map_err(|_| Error::Misconfiguration(config_key))?
- .into(); + let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?; let mediapart = Mediapart::builder() - .login(mediapart::Login::MPRUUID(mpruiid)) + .login(mediapart::Login::Mpruuid(mpruiid)) .build()?; let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string(); let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string(); let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string(); - let lmd_a_m = env::var(&lmd_a_m) - .map_err(|_| Error::Misconfiguration(lmd_a_m))? - .into(); - let phpsessid = env::var(&phpsessid) - .map_err(|_| Error::Misconfiguration(phpsessid))? - .into(); - let spip_session = env::var(&spip_session) - .map_err(|_| Error::Misconfiguration(spip_session))? - .into(); + let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?; + let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?; + let spip_session = + env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?; let monde_diplo = MondeDiplo::builder() .login(monde_diplomatique::Login::Cookies { @@ -69,12 +61,8 @@ fn default_newpapers() -> Result { let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string(); let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string(); - let lmd_a_m = env::var(&lmd_a_m) - .map_err(|_| Error::Misconfiguration(lmd_a_m))? - .into(); - let ssess = env::var(&ssess) - .map_err(|_| Error::Misconfiguration(ssess))? - .into(); + let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?; + let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?; let courrier_international = CourrierInternational::builder() .login(courrier_international::Login::Cookies { lmd_a_m, ssess }) diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs index 911b9e9..187e5db 100644 --- a/crieur-retrieve/src/newspapers/courrier_international.rs +++ b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -82,13 +82,19 @@ impl Newspaper for CourrierInternational { }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = [ // navigation elements "#entete.connecte", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove: &elements_to_remove, + ..Default::default() + } + .run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs index 24a3933..2586e08 100644 --- a/crieur-retrieve/src/newspapers/mediapart.rs +++ b/crieur-retrieve/src/newspapers/mediapart.rs @@ -10,7 +10,7 @@ use crate::{Download, Downloader}; pub enum Login { Username(String, String), - MPRUUID(String), + Mpruuid(String), } #[derive(Debug, Clone, Default)] @@ -33,7 +33,7 @@ impl Builder { Login::Username(_username, _password) => { unimplemented!("login using username and passwond not implemented") } - Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)), + Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)), }; self } @@ -86,7 +86,7 @@ impl Newspaper for Mediapart { }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = [ // header ".fb-root", ".skipLinks", @@ -104,8 +104,14 @@ impl Newspaper for Mediapart { "aside.cc-modal", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, 
&element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove: &elements_to_remove, + ..Default::default() + } + .run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs index 041737f..8348dae 100644 --- a/crieur-retrieve/src/newspapers/monde_diplomatique.rs +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo { }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = [ // navigation elements "#tout-en-haut.preentete", "#entete.connecte", @@ -107,8 +107,14 @@ impl Newspaper for MondeDiplo { "noscript", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove: &elements_to_remove, + ..Default::default() + } + .run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/tools/mod.rs b/crieur-retrieve/src/tools/mod.rs index 59381b1..80f159e 100644 --- a/crieur-retrieve/src/tools/mod.rs +++ b/crieur-retrieve/src/tools/mod.rs @@ -1,5 +1,4 @@ mod download; -mod self_contained_html; +pub mod self_contained_html; pub use download::{Download, DownloadError, Downloader}; -pub use self_contained_html::self_contained_html; diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index bc25211..2a5ddac 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -8,140 +8,177 @@ use url::Url; use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES}; use crate::Download; -/// Makes an html page self-contained -/// -/// The `downloader` must implement `Download` and is used to download ressources that are -/// needed to make this page self-contained such as stylesheets or images. -/// -/// The function also removes all scripts on the page -pub async fn self_contained_html( - html: impl AsRef<str>, - downloader: &D, - base_url: &Url, - elements_to_remove: &[impl AsRef<str>], -) -> String +/// Stores configuration for the self_contained_html function +// TODO: write a builder +pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str> +where + E: std::error::Error, + D: Download + Send, + S1: AsRef<str>, + S2: AsRef<str>, +{ + /// The downloader that will be used to retrieve resources on the page + pub downloader: Option<&'t D>, + /// Base URL for downloading resources; it is probably the URL of the page itself + pub base_url: Option<&'t Url>, + pub elements_to_remove: &'t [S1], + pub styles_to_add: &'t [S2], +} + +impl<'t, E, D> Default for Config<'t, E, D> where E: std::error::Error, D: Download + Send, { - // TODO: split/refactor this function : - // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? - // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure - // - ¿ should be function of a trait ? or only of the configuration struct ?
- let (style_urls, html) = { - let document = Document::from(html.as_ref()); - - // ---- Remove scripts ---- - // - document.select("script").remove(); - - for event in EVENT_HANDLERS { - document - .select(format!("[{}]", event).as_str()) - .remove_attr(event); + fn default() -> Self { + Self { + downloader: None, + base_url: None, + elements_to_remove: &[], + styles_to_add: &[], } + } +} - for rel in LINK_REL_EXTERNAL_RESOURCES { - document - .select(format!("link[rel=\"{}\"]", rel).as_str()) - .remove(); - } +impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2> +where + E: std::error::Error, + D: Download + Send, + S1: AsRef, + S2: AsRef, +{ + /// Makes an html page self-contained + /// + /// The `downloader` must implement `Download` and is used to download ressources that are + /// needed to make this page self-contained such as stylesheets or images. + /// + /// The function also removes all scripts on the page + pub async fn run(&self, html: impl AsRef) -> String { + //TODO: don't panic + let base_url = self.base_url.expect("Base url not defined"); + let downloader = self.downloader.expect("Downloader not defined"); + // TODO: split/refactor this function : + // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? + // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure + let (style_urls, html) = { + let document = Document::from(html.as_ref()); - // ---- Replace stylesheets ---- + // ---- Remove scripts ---- + // + document.select("script").remove(); + + for event in EVENT_HANDLERS { + document + .select(format!("[{}]", event).as_str()) + .remove_attr(event); + } + + for rel in LINK_REL_EXTERNAL_RESOURCES { + document + .select(format!("link[rel=\"{}\"]", rel).as_str()) + .remove(); + } + + // ---- Replace stylesheets ---- + // + let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); + let styles_url = stylesheets + .iter() + .map(|stylesheet| { + if let Some(src) = stylesheet.attr("href") { + //TODO: does it work with absolute urls ? + base_url.join(src.as_ref()).ok() + } else { + None + } + }) + .collect::>(); + (styles_url, String::from(document.html())) + }; + + let style_urls = style_urls.into_iter().map(|style_url| { + OptionFuture::from( + style_url.map(|s| async move { downloader.download(&s).await.unwrap() }), + ) + }); + let downloaded_styles = futures::future::join_all(style_urls).await; + + let html = { + let document = Document::from(&html); + let styles = document.select("link[href][rel=\"stylesheet\"]"); + + styles + .iter() + .zip(downloaded_styles.iter()) + .for_each(|(mut stylesheet, inner_css)| { + if let Some(Some(inner_css)) = inner_css { + let css = String::from_utf8(inner_css.to_vec()).unwrap(); + let css = format!("", css); + stylesheet.replace_with_html(css); + } else { + stylesheet.remove(); + } + }); + String::from(document.html()) + }; + + // ---- Replace imgs ---- // - let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); - let styles_url = stylesheets - .iter() - .map(|stylesheet| { - if let Some(src) = stylesheet.attr("href") { - //TODO: does it work with absolute urls ? 
- base_url.join(src.as_ref()).ok() - } else { - None - } - }) - .collect::>(); - (styles_url, String::from(document.html())) - }; + let image_urls = { + let document = Document::from(&html); + let imgs = document.select("img:not([src^=\"data:\"])"); - let style_urls = style_urls.into_iter().map(|style_url| { - OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() })) - }); - let downloaded_styles = futures::future::join_all(style_urls).await; + imgs.iter() + .map(|image| { + if let Some(src) = image.attr("src") { + base_url.join(src.as_ref()).ok() + } else { + None + } + }) + .collect::>() + }; - let html = { - let document = Document::from(&html); - let styles = document.select("link[href][rel=\"stylesheet\"]"); + let downloaded_images = image_urls.into_iter().map(|image_url| { + OptionFuture::from(image_url.map(|url| async move { + let data = downloader.download(&url).await.unwrap(); + (url, data) + })) + }); + let downloaded_images = futures::future::join_all(downloaded_images).await; - styles - .iter() - .zip(downloaded_styles.iter()) - .for_each(|(mut stylesheet, inner_css)| { - if let Some(Some(inner_css)) = inner_css { - let css = String::from_utf8(inner_css.to_vec()).unwrap(); - let css = format!("", css); - stylesheet.replace_with_html(css); - } else { - stylesheet.remove(); - } - }); - String::from(document.html()) - }; + let html = { + let document = Document::from(&html); + let imgs = document.select("img:not([src^=\"data:\"])"); - // ---- Replace imgs ---- - // - let image_urls = { - let document = Document::from(&html); - let imgs = document.select("img:not([src^=\"data:\"])"); + imgs.iter() + .zip(downloaded_images.iter()) + .for_each(|(mut img, data)| { + if let Some((url, Some(data))) = data { + let data = base64::encode(data); + //TODO: use an extension hashmap + let extension = + Path::new(url.path()).extension().unwrap().to_str().unwrap(); + img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); + } else { + img.remove() + } + }); + // ---- Remove unwanted html elements ----- + // + for element in self.elements_to_remove { + document.select(element.as_ref()).remove(); + } + String::from(document.html()) + }; - imgs.iter() - .map(|image| { - if let Some(src) = image.attr("src") { - base_url.join(src.as_ref()).ok() - } else { - None - } - }) - .collect::>() - }; - - let downloaded_images = image_urls.into_iter().map(|image_url| { - OptionFuture::from(image_url.map(|url| async move { - let data = downloader.download(&url).await.unwrap(); - (url, data) - })) - }); - let downloaded_images = futures::future::join_all(downloaded_images).await; - - let html = { - let document = Document::from(&html); - let imgs = document.select("img:not([src^=\"data:\"])"); - - imgs.iter() - .zip(downloaded_images.iter()) - .for_each(|(mut img, data)| { - if let Some((url, Some(data))) = data { - let data = base64::encode(data); - let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); - img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); - } else { - img.remove() - } - }); - // ---- Remove unwanted html elements ----- + // ---- output ---- // - for element in elements_to_remove { - document.select(element.as_ref()).remove(); - } - String::from(document.html()) - }; + let mut minifier = HTMLMinifier::new(); + minifier.digest(html.as_str()).unwrap(); - // ---- output ---- - // - let mut minifier = HTMLMinifier::new(); - minifier.digest(html.as_str()).unwrap(); - - 
String::from_utf8(minifier.get_html().into()).unwrap() + String::from_utf8(minifier.get_html().into()).unwrap() + } } #[cfg(test)] @@ -180,9 +217,14 @@ mod tests { let html = ""; let base_url = Url::parse("http://example.com")?; let downloader = DummyDownloader {}; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, "" ); Ok(()) @@ -206,10 +248,13 @@ mod tests { }; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; for s in EVENT_HANDLERS { assert_eq!( - self_contained_html(html(s), &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + }.run(html(s)).await, "\n\n\n\n" ); } @@ -234,10 +279,15 @@ mod tests { }; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; for s in LINK_REL_EXTERNAL_RESOURCES { assert_eq!( - self_contained_html(html(s), &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html(s)) + .await, "\n\n\n" ); } @@ -290,9 +340,14 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, minified ); Ok(()) @@ -337,9 +392,14 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, minified ); Ok(()) @@ -372,12 +432,13 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; assert_eq!( - self_contained_html( - html, - &downloader, - &base_url, - &["header", ".placeholder", "article > span.huge"] - ) + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + elements_to_remove: &["header", ".placeholder", "article > span.huge"], + ..Default::default() + } + .run(html) .await, minified ); diff --git a/documentation/design/scope.md b/documentation/design/scope.md index 6417bf7..cca4047 100644 --- a/documentation/design/scope.md +++ b/documentation/design/scope.md @@ -37,7 +37,8 @@ frame "backend" { newspaper -> retrieval_tools: uses to implement - article_location --> article_repr :uses + article_location --> article_repr: uses + retrieval_tools -up-> article_repr: uses auto_retrieve --> rss: watches auto_retrieve --> article_location From 40ebc1ddea28334f5db31135736223b940f57353 Mon Sep 17 00:00:00 2001 From: koalp Date: Wed, 19 May 2021 04:09:44 +0200 Subject: [PATCH 5/6] feat: allow to inject styles --- .../src/newspapers/courrier_international.rs | 7 --- .../src/tools/self_contained_html.rs | 63 +++++++++++++++++++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs index 187e5db..cd6799e 100644 --- a/crieur-retrieve/src/newspapers/courrier_international.rs +++ 
b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -81,16 +81,9 @@ impl Newspaper for CourrierInternational { None => bail!("404 not found"), }; - // TODO: Move to const - let elements_to_remove = [ - // navigation elements - "#entete.connecte", - ]; - let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), - elements_to_remove: &elements_to_remove, ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 2a5ddac..2e9ea2e 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -169,6 +169,15 @@ where for element in self.elements_to_remove { document.select(element.as_ref()).remove(); } + + // ---- Add additional styles ---- + // + for style in self.styles_to_add { + document + .select("head") + .append_html(format!("\n<style>{}</style>\n", style.as_ref())); + } + String::from(document.html()) }; @@ -444,4 +453,58 @@ mod tests { ); Ok(()) } + + #[tokio::test] + async fn add_style() -> Result<()> { + let html = indoc! {" + + + + + + The body + + + "}; + + let wanted_html = indoc! {" + + + + + + The body + + "}; + + let style_to_add = indoc! {" + body { + margin: 3em; + } + "}; + + let base_url = Url::parse("http://example.com")?; + let downloader = DummyDownloader {}; + + let mut minifier = HTMLMinifier::new(); + minifier.digest(wanted_html)?; + let minified = String::from_utf8(minifier.get_html().into())?; + + assert_eq!( + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + styles_to_add: &[style_to_add], + ..Default::default() + } + .run(html) + .await, + minified + ); + Ok(()) + } } From e34edf0b21946826a3fb664e7dce2864222b0e52 Mon Sep 17 00:00:00 2001 From: koalp Date: Sat, 22 May 2021 04:25:50 +0200 Subject: [PATCH 6/6] fix: keep media queries in linked styles Previously, media queries weren't kept when downloading styles from link tags. This has been fixed so that the media attribute is kept when creating style tags from link tags.
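
To make the change concrete, here is a minimal, self-contained sketch of inlining a linked stylesheet while carrying its media attribute over, using the same nipper calls as self_contained_html.rs. This is not the patch code itself: downloading is elided (the CSS is passed in directly) and the sample HTML and CSS literals are invented for illustration.

    use nipper::Document;

    // Replace every stylesheet <link> with an inline <style> tag, keeping the
    // media attribute so print-only or screen-only rules keep their scope.
    fn inline_stylesheet(html: &str, css: &str) -> String {
        let document = Document::from(html);
        document
            .select("link[href][rel=\"stylesheet\"]")
            .iter()
            .for_each(|mut link| {
                let style = match link.attr("media") {
                    Some(media) => format!("<style media=\"{}\">{}</style>", media, css),
                    None => format!("<style>{}</style>", css),
                };
                link.replace_with_html(style);
            });
        String::from(document.html())
    }

    fn main() {
        let html = r#"<html><head><link rel="stylesheet" href="print.css" media="print"></head><body></body></html>"#;
        // Prints a document whose <style> tag still carries media="print".
        println!("{}", inline_stylesheet(html, "body { margin: 0; }"));
    }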
--- Cargo.lock | 50 +++++++------- Cargo.toml | 2 +- crieur-retrieve/Cargo.toml | 6 +- .../src/newspapers/courrier_international.rs | 34 ++++++++++ crieur-retrieve/src/newspapers/mediapart.rs | 4 +- .../src/newspapers/monde_diplomatique.rs | 4 +- .../src/tools/self_contained_html.rs | 65 +++++++++++++++++-- 7 files changed, 126 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 52af574..95f5f16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -662,9 +662,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" +checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27" dependencies = [ "futures-channel", "futures-core", @@ -677,9 +677,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" +checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" dependencies = [ "futures-core", "futures-sink", @@ -687,15 +687,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" +checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" [[package]] name = "futures-executor" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" +checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79" dependencies = [ "futures-core", "futures-task", @@ -704,9 +704,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" +checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" [[package]] name = "futures-locks" @@ -719,10 +719,11 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" +checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" dependencies = [ + "autocfg 1.0.1", "proc-macro-hack", "proc-macro2", "quote", @@ -731,15 +732,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" +checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" [[package]] name = "futures-task" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" +checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" [[package]] name = "futures-timer" @@ -753,10 +754,11 @@ dependencies = [ [[package]] name = "futures-util" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" +checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" dependencies = [ + "autocfg 1.0.1", "futures-channel", "futures-core", "futures-io", @@ -895,9 +897,9 @@ dependencies = [ [[package]] name = "html-minifier" -version = "3.0.12" +version = "3.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af" +checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf" dependencies = [ "cow-utils", "educe", @@ -1284,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "minifier" -version = "0.0.40" +version = "0.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9" +checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08" dependencies = [ "macro-utils", ] @@ -2659,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" +checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" dependencies = [ "autocfg 1.0.1", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 6c58b28..6d1c92b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,5 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"} dotenv = "0.15.0" env_logger = "0.8.3" log = "0.4.14" -tokio = { version = "1.5.0", features = ["full"] } +tokio = { version = "1.6.0", features = ["full"] } tracing-subscriber = "0.2.18" diff --git a/crieur-retrieve/Cargo.toml b/crieur-retrieve/Cargo.toml index 6b6578b..6c3c947 100644 --- a/crieur-retrieve/Cargo.toml +++ b/crieur-retrieve/Cargo.toml @@ -16,14 +16,14 @@ hyper-rustls = "0.22.1" cookie = "0.15.0" lol_html = "0.3.0" indoc = "1.0.3" -html-minifier = "3.0.12" +html-minifier = "3.0.13" bytes = "1.0.1" base64 = "0.13.0" -futures = "0.3.14" +futures = "0.3.15" derive_builder = "0.10.2" nipper = "0.1.9" log = "0.4.14" env_logger = "0.8.3" [dev-dependencies] -tokio = "1.5.0" +tokio = "1.6.0" diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs index cd6799e..7263d11 100644 --- a/crieur-retrieve/src/newspapers/courrier_international.rs +++ b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -1,6 +1,7 @@ use anyhow::{anyhow, bail, Result}; use async_trait::async_trait; use cookie::Cookie; +use indoc::indoc; use url::Host; use crate::newspaper::{Metadata, Newspaper}; @@ -81,9 +82,42 @@ impl Newspaper for CourrierInternational { None => bail!("404 not found"), }; + let elements_to_remove = &[ + // navigation elements + "header.site-header", + "footer.site-footer", + // Social buttons + "#toolbox-share", + ".toolbox-share", + ".toolbox-print", + ".toolbox-respond", + ".toolbox-zen", + ".toolbox-newsletter", + ".toolbox-offer", + ".box-article-offer-friend-abo", + // unused services + ".article-aside", + ".article-secondary", + ".article-subject-readmore", + // misc + ".element-invisible", + ".gptcontainer", + ]; + + // FIXME: it doesn't work because the aside is in the article body + // + let toolbox_style = indoc! 
{" + aside.article-toolbox { + position: sticky; + top: 1em; + } + "}; + let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), + elements_to_remove, + styles_to_add: &[toolbox_style], ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs index 2586e08..40f17c1 100644 --- a/crieur-retrieve/src/newspapers/mediapart.rs +++ b/crieur-retrieve/src/newspapers/mediapart.rs @@ -86,7 +86,7 @@ impl Newspaper for Mediapart { }; // TODO: Move to const - let elements_to_remove = [ + let elements_to_remove = &[ // header ".fb-root", ".skipLinks", @@ -107,7 +107,7 @@ impl Newspaper for Mediapart { let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), - elements_to_remove: &elements_to_remove, + elements_to_remove, ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs index 8348dae..e3a12df 100644 --- a/crieur-retrieve/src/newspapers/monde_diplomatique.rs +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo { }; // TODO: Move to const - let elements_to_remove = [ + let elements_to_remove = &[ // navigation elements "#tout-en-haut.preentete", "#entete.connecte", @@ -110,7 +110,7 @@ impl Newspaper for MondeDiplo { let single_page_html = tools::self_contained_html::Config { downloader: Some(&downloader), base_url: Some(&url), - elements_to_remove: &elements_to_remove, + elements_to_remove, ..Default::default() } .run(&html) diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 2e9ea2e..e04234f 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -59,6 +59,8 @@ where let downloader = self.downloader.expect("Downloader not defined"); // TODO: split/refactor this function : // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? + // - put each modification (ex: style in the `foreach`) in functions, maybe using + // (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42 // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure let (style_urls, html) = { let document = Document::from(html.as_ref()); @@ -84,9 +86,8 @@ where let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); let styles_url = stylesheets .iter() - .map(|stylesheet| { - if let Some(src) = stylesheet.attr("href") { - //TODO: does it work with absolute urls ? 
+ .map(|style_link| { + if let Some(src) = style_link.attr("href") { base_url.join(src.as_ref()).ok() - } else { - None - } - }) @@ -110,13 +111,19 @@ styles .iter() .zip(downloaded_styles.iter()) - .for_each(|(mut stylesheet, inner_css)| { + .for_each(|(mut style_link, inner_css)| { if let Some(Some(inner_css)) = inner_css { let css = String::from_utf8(inner_css.to_vec()).unwrap(); - let css = format!("<style>{}</style>", css); - stylesheet.replace_with_html(css); + let media_query = style_link.attr("media"); + let css = match media_query { + Some(media_query) => { + format!("<style media=\"{}\">{}</style>", media_query, css) + } + None => format!("<style>{}</style>", css), + }; + style_link.replace_with_html(css); } else { - stylesheet.remove(); + style_link.remove(); } }); String::from(document.html()) }; @@ -192,6 +199,7 @@ where #[cfg(test)] mod tests { + // TODO: reduce boilerplate, DRY use super::*; @@ -362,6 +370,49 @@ Ok(()) } + #[tokio::test] + async fn download_css_with_media_query() -> Result<()> { + let downloader = CssDownloader {}; + + let html = indoc! {" + + + + + + + + "}; + + let wanted_html = indoc! {" + + + + + + "}; + let mut minifier = HTMLMinifier::new(); + minifier.digest(wanted_html)?; + let minified = String::from_utf8(minifier.get_html().into())?; + + let base_url = Url::parse("http://example.com")?; + assert_eq!( + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, + minified + ); + Ok(()) + } + struct PngDownloader; #[async_trait] impl Download for PngDownloader {