From 970f510cd115543832d89c20bcecd244749200cf Mon Sep 17 00:00:00 2001 From: koalp Date: Sat, 8 May 2021 03:23:27 +0200 Subject: [PATCH] feat: add retrieval from le monde diplomatique Add retrieval from le monde diplomatique Previously, 404 pages were injected in the document when downloading styles Now, the downloader returns None when documents are not found --- Cargo.lock | 209 +++++++++++++----- Cargo.toml | 1 + README.md | 3 +- crieur-retrieve/Cargo.toml | 10 +- crieur-retrieve/src/article_location.rs | 27 ++- crieur-retrieve/src/newspapers/mediapart.rs | 7 +- crieur-retrieve/src/newspapers/mod.rs | 1 + .../src/newspapers/monde_diplomatique.rs | 135 +++++++++++ crieur-retrieve/src/tools/download.rs | 28 ++- .../src/tools/self_contained_html.rs | 6 +- .../reference/newspaper_configuration.md | 13 ++ examples/cli_downloader.rs | 23 +- 12 files changed, 375 insertions(+), 88 deletions(-) create mode 100644 crieur-retrieve/src/newspapers/monde_diplomatique.rs diff --git a/Cargo.lock b/Cargo.lock index 25be411..992c8ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -74,13 +74,22 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e" [[package]] name = "aho-corasick" -version = "0.7.15" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anyhow" version = "1.0.40" @@ -213,6 +222,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "winapi", +] + [[package]] name = "cipher" version = "0.2.5" @@ -285,10 +306,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" [[package]] -name = "cpuid-bool" -version = "0.1.2" +name = "cpufeatures" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" +checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4" +dependencies = [ + "libc", +] [[package]] name = "cpuid-bool" @@ -307,6 +331,7 @@ dependencies = [ "env_logger", "log", "tokio", + "tracing-subscriber", ] [[package]] @@ -821,9 +846,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc018e188373e2777d0ef2467ebff62a08e66c3f5857b23c8fbec3018210dc00" +checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" dependencies = [ "bytes", "fnv", @@ -877,9 +902,9 @@ dependencies = [ [[package]] name = "html-minifier" -version = "3.0.10" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "108452631307790510cde91282fc706ae70076bd68200add8638773f06d5e122" +checksum = "9876ed2cad2fd2f78bad42688bcac8a87cebce9b2381c5b05efc30b0c8429af0" dependencies = [ "cow-utils", "educe", @@ -914,9 +939,9 @@ dependencies = [ [[package]] name = "http-body" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737" +checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" dependencies = [ "bytes", "http", @@ -1055,9 +1080,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "js-sys" -version = "0.3.50" +version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c" +checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062" dependencies = [ "wasm-bindgen", ] @@ -1091,9 +1116,9 @@ checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e" [[package]] name = "lock_api" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176" +checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb" dependencies = [ "scopeguard", ] @@ -1158,6 +1183,15 @@ dependencies = [ "tendril", ] +[[package]] +name = "matchers" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +dependencies = [ + "regex-automata", +] + [[package]] name = "matches" version = "0.1.8" @@ -1254,9 +1288,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" [[package]] name = "memchr" -version = "2.3.4" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "mime" @@ -1585,7 +1619,7 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd" dependencies = [ - "cpuid-bool 0.2.0", + "cpuid-bool", "opaque-debug", "universal-hash", ] @@ -1856,18 +1890,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2" +checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc" dependencies = [ "bitflags", ] [[package]] name = "regex" -version = "1.4.6" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ "aho-corasick", "memchr", @@ -1875,10 +1909,20 @@ dependencies = [ ] [[package]] -name = "regex-syntax" -version = "0.6.23" +name = "regex-automata" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" +checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" +dependencies = [ + "byteorder", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "reqwest" @@ -2319,17 +2363,26 @@ checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d" [[package]] name = "sha2" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de" +checksum = "d8f6b75b17576b792bef0db1bcc4b8b8bcdf9506744cf34b974195487af6cff2" dependencies = [ "block-buffer", "cfg-if 1.0.0", - "cpuid-bool 0.1.2", + "cpufeatures", "digest", "opaque-debug", ] +[[package]] +name = "sharded-slab" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79c719719ee05df97490f80a45acfc99e5a30ce98a1e4fb67aee422745ae14e3" +dependencies = [ + "lazy_static", +] + [[package]] name = "signal-hook-registry" version = "1.3.0" @@ -2491,9 +2544,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2" [[package]] name = "syn" -version = "1.0.71" +version = "1.0.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373" +checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82" dependencies = [ "proc-macro2", "quote", @@ -2558,6 +2611,15 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd" +dependencies = [ + "once_cell", +] + [[package]] name = "time" version = "0.2.26" @@ -2684,9 +2746,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" [[package]] name = "tracing" -version = "0.1.25" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f" +checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" dependencies = [ "cfg-if 1.0.0", "pin-project-lite", @@ -2707,9 +2769,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" +checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052" dependencies = [ "lazy_static", ] @@ -2724,6 +2786,49 @@ dependencies = [ "tracing", ] +[[package]] +name = "tracing-log" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa5553bf0883ba7c9cbe493b085c29926bd41b66afc31ff72cf17ff4fb60dcd5" +dependencies = [ + "ansi_term", + "chrono", + "lazy_static", + "matchers", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec 1.6.1", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + [[package]] name = "try-lock" version = "0.2.3" @@ -2756,9 +2861,9 @@ dependencies = [ [[package]] name = "unicode-xid" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" [[package]] name = "unindent" @@ -2784,9 +2889,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.2.1" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" dependencies = [ "form_urlencoded", "idna", @@ -2846,9 +2951,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasm-bindgen" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9" +checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd" dependencies = [ "cfg-if 1.0.0", "serde", @@ -2858,9 +2963,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae" +checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900" dependencies = [ "bumpalo", "lazy_static", @@ -2873,9 +2978,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.23" +version = "0.4.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea" +checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1" dependencies = [ "cfg-if 1.0.0", "js-sys", @@ -2885,9 +2990,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f" +checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2895,9 +3000,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c" +checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97" dependencies = [ "proc-macro2", "quote", @@ -2908,15 +3013,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489" +checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f" [[package]] name = "web-sys" -version = "0.3.50" +version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be" +checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 458df77..6c58b28 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,3 +23,4 @@ dotenv = "0.15.0" env_logger = "0.8.3" log = "0.4.14" tokio = { version = "1.5.0", features = ["full"] } +tracing-subscriber = "0.2.18" diff --git a/README.md b/README.md index 455d599..8a77713 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -Tools to retrieve articles from multiple newspaper you subscribed to. +Tools to retrieve articles from multiple newspaper you subscribed to, all from +the same place. **This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work !** diff --git a/crieur-retrieve/Cargo.toml b/crieur-retrieve/Cargo.toml index 7b35a3c..089aea1 100644 --- a/crieur-retrieve/Cargo.toml +++ b/crieur-retrieve/Cargo.toml @@ -8,19 +8,19 @@ publish = false [dependencies] anyhow = "1.0.40" -async-trait = "0.1.48" +async-trait = "0.1.50" thiserror = "1.0.24" -url = "2.2.1" -hyper = { version = "0.14.5", features = ["full"] } +url = "2.2.2" +hyper = { version = "0.14.7", features = ["full"] } hyper-rustls = "0.22.1" cookie = "0.15.0" lol_html = "0.3.0" indoc = "1.0.3" -html-minifier = "3.0.9" +html-minifier = "3.0.11" bytes = "1.0.1" base64 = "0.13.0" futures = "0.3.14" -derive_builder = "0.10.0" +derive_builder = "0.10.2" nipper = "0.1.9" log = "0.4.14" env_logger = "0.8.3" diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs index d6a177a..a0476b3 100644 --- a/crieur-retrieve/src/article_location.rs +++ b/crieur-retrieve/src/article_location.rs @@ -2,12 +2,12 @@ use std::boxed::Box; use std::convert::TryInto; use std::env; -use anyhow::anyhow; use log::info; use url::{Host, Url}; use crate::newspaper::Newspaper; use crate::newspapers::mediapart::{self, Mediapart}; +use crate::newspapers::monde_diplomatique::{self, MondeDiplo}; /// Enumerate all errors that can be encountered when using ArticleLocation #[derive(thiserror::Error, Debug)] @@ -33,6 +33,7 @@ type Newspapers = Vec>; pub type Result = core::result::Result; fn default_newpapers() -> Result { + // TODO: same thing is written too much times : how to DRY ? let config_key = "MEDIAPART_COOKIE".to_string(); let mpruiid = env::var(&config_key) .map_err(|_| Error::Misconfiguration(config_key))? @@ -42,7 +43,29 @@ fn default_newpapers() -> Result { .login(mediapart::Login::MPRUUID(mpruiid)) .build()?; - Ok(vec![Box::new(mediapart)]) + let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string(); + let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string(); + let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string(); + + let lmd_a_m = env::var(&lmd_a_m) + .map_err(|_| Error::Misconfiguration(lmd_a_m))? + .into(); + let phpsessid = env::var(&phpsessid) + .map_err(|_| Error::Misconfiguration(phpsessid))? + .into(); + let spip_session = env::var(&spip_session) + .map_err(|_| Error::Misconfiguration(spip_session))? + .into(); + + let monde_diplo = MondeDiplo::builder() + .login(monde_diplomatique::Login::Cookies { + lmd_a_m, + phpsessid, + spip_session, + }) + .build()?; + + Ok(vec![Box::new(mediapart), Box::new(monde_diplo)]) } #[derive(Default)] diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs index 0933b6e..24a3933 100644 --- a/crieur-retrieve/src/newspapers/mediapart.rs +++ b/crieur-retrieve/src/newspapers/mediapart.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, bail, Result}; use async_trait::async_trait; use cookie::Cookie; use url::Host; @@ -80,7 +80,10 @@ impl Newspaper for Mediapart { let downloader = Downloader { cookies }; let body = downloader.download(&url).await?; - let html = String::from_utf8(body.to_vec())?; + let html = match body { + Some(body) => String::from_utf8(body.to_vec())?, + None => bail!("404 not found"), + }; // TODO: Move to const let element_to_remove = [ diff --git a/crieur-retrieve/src/newspapers/mod.rs b/crieur-retrieve/src/newspapers/mod.rs index 7f44529..1cc9356 100644 --- a/crieur-retrieve/src/newspapers/mod.rs +++ b/crieur-retrieve/src/newspapers/mod.rs @@ -1 +1,2 @@ pub mod mediapart; +pub mod monde_diplomatique; diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs new file mode 100644 index 0000000..abf5187 --- /dev/null +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -0,0 +1,135 @@ +use anyhow::{anyhow, bail, Result}; +use async_trait::async_trait; +use cookie::Cookie; +use url::Host; + +use crate::newspaper::{Metadata, Newspaper}; +use crate::tools; +use crate::Url; +use crate::{Download, Downloader}; + +pub enum Login { + Username(String, String), + Cookies { + lmd_a_m: String, + phpsessid: String, + spip_session: String, + }, +} + +#[derive(Debug, Clone, Default)] +pub struct MondeDiplo { + login_cookies: Vec<(String, String)>, +} + +fn str_to_host>(host: S) -> Host { + Host::Domain(host.into()) +} + +#[derive(Debug, Clone, Default)] +pub struct Builder { + login_cookies: Option>, +} + +impl Builder { + pub fn login(&mut self, login: Login) -> &mut Self { + self.login_cookies = match login { + Login::Username(_username, _password) => { + unimplemented!("login using username and passwond not implemented") + } + Login::Cookies { + lmd_a_m, + phpsessid, + spip_session, + } => Some(vec![ + ("lmd_a_m".into(), lmd_a_m), + ("PHPSESSID".into(), phpsessid), + ("spip_session".into(), spip_session), + ]), + }; + self + } + + pub fn build(&self) -> Result { + match &self.login_cookies { + Some(login_cookies) => Ok(MondeDiplo { + login_cookies: login_cookies.clone(), + }), + None => Err(anyhow!("You have to log in to access this newspaper")), + } + } +} + +#[async_trait] +impl Newspaper for MondeDiplo { + fn metadata(&self) -> Metadata { + Metadata::builder() + .hosts(vec![ + str_to_host("monde-diplomatique.fr"), + str_to_host("www.monde-diplomatique.fr"), + ]) + .lower_case_name("monde-diplomatique") + .name("Le Monde Diplomatique") + .build() + .unwrap_or_default() + } + + async fn retrieve_html(&self, url: &Url) -> Result { + let cookies = self + .login_cookies + .iter() + .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish()) + .collect::>(); + //let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1) + // .secure(true) + // .finish(); + //let cookies = vec![cookie]; + + // TODO: replace by builder + let downloader = Downloader { cookies }; + + let body = downloader.download(&url).await?; + let html = match body { + Some(body) => String::from_utf8(body.to_vec())?, + None => bail!("404 not found"), + }; + + // TODO: Move to const + let element_to_remove = [ + // navigation elements + "#tout-en-haut.preentete", + "#entete.connecte", + "#navigation", + "#pied", + ".bloc-connexion", + // unused features + "#ecouter", + // Social buttons + ".actions-article", + "#partage", + // misc + "noscript", + ]; + + let single_page_html = + tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + Ok(single_page_html) + } + + fn new() -> Self { + Self { + ..Default::default() + } + } + + async fn has_complete_access(&self) -> bool { + // TODO: check if we are logged using the cookie + true + } +} + +impl MondeDiplo { + pub fn builder() -> Builder { + Builder::default() + } +} diff --git a/crieur-retrieve/src/tools/download.rs b/crieur-retrieve/src/tools/download.rs index e00f77f..fef1df4 100644 --- a/crieur-retrieve/src/tools/download.rs +++ b/crieur-retrieve/src/tools/download.rs @@ -4,7 +4,7 @@ use anyhow::Result; use async_trait::async_trait; use bytes::Bytes; use cookie::Cookie; -use hyper::{header, Body, Client, Method, Request}; +use hyper::{header, Body, Client, Method, Request, StatusCode}; use thiserror::Error; use url::Url; @@ -22,7 +22,9 @@ pub trait Download { type Error: StdError; /// Downloads a file from an url and returns the result as bytes - async fn download(&self, file_link: &Url) -> Result; + /// + /// If the file is not found, returns None + async fn download(&self, file_link: &Url) -> Result, Self::Error>; } /// Store several cookies @@ -36,7 +38,8 @@ pub struct Downloader<'c> { impl<'c> Download for Downloader<'c> { type Error = DownloadError; - async fn download(&self, file_link: &Url) -> Result { + async fn download(&self, file_link: &Url) -> Result, Self::Error> { + log::info!("downloading url {:?}", file_link); let https = hyper_rustls::HttpsConnector::with_native_roots(); let client: Client<_, hyper::Body> = Client::builder().build(https); @@ -44,14 +47,25 @@ impl<'c> Download for Downloader<'c> { .method(Method::GET) .uri(file_link.as_str()); - for cookie in &self.cookies { - req = req.header(header::COOKIE, cookie.to_string()); - } + req = req.header( + header::COOKIE, + self.cookies + .iter() + .map(Cookie::to_string) + .collect::>() + .join(";"), + ); + log::info!("headers : {:?}", req.headers_ref()); let req = req.body(Body::empty())?; let resp = client.request(req).await?; - let body = hyper::body::to_bytes(resp).await?; + let body = match resp.status() { + StatusCode::OK => Some(hyper::body::to_bytes(resp).await?), + StatusCode::NOT_FOUND => None, + // TODO: enhance this by handling more error codes + _ => None, + }; Ok(body) } } diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 7283a0e..32b8c85 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -77,7 +77,7 @@ where .iter() .zip(downloaded_styles.iter()) .for_each(|(mut stylesheet, inner_css)| { - if let Some(inner_css) = inner_css { + if let Some(Some(inner_css)) = inner_css { let css = String::from_utf8(inner_css.to_vec()).unwrap(); let css = format!("", css); stylesheet.replace_with_html(css); @@ -120,10 +120,12 @@ where imgs.iter() .zip(downloaded_images.iter()) .for_each(|(mut img, data)| { - if let Some((url, data)) = data { + if let Some((url, Some(data))) = data { let data = base64::encode(data); let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); + } else { + img.remove() } }); // ---- Remove unwanted html elements ----- diff --git a/documentation/reference/newspaper_configuration.md b/documentation/reference/newspaper_configuration.md index 8658087..0fe3c9a 100644 --- a/documentation/reference/newspaper_configuration.md +++ b/documentation/reference/newspaper_configuration.md @@ -8,3 +8,16 @@ The newspapers are configured using environment variables MEDIAPART_COOKIE : sets the `MPRUUID` cookie, used to log in + +# Le Monde Diplomatique + +All cookies are mandatory to log in + +MONDE_DIPLO_LMD_A_M +: sets the `lmd_a_m` cookie + +MONDE_DIPLO_PHPSESSID +: sets the `PHPSESSID` cookie + +MONDE_DIPLO_SPIP_SESSION +: sets the `spip_session` cookie diff --git a/examples/cli_downloader.rs b/examples/cli_downloader.rs index 4a4eefe..84d1815 100644 --- a/examples/cli_downloader.rs +++ b/examples/cli_downloader.rs @@ -2,38 +2,27 @@ use std::convert::TryInto; use std::env; use anyhow::Result; -use crieur_retrieve::{ - newspaper::Newspaper, - newspapers::mediapart::{self, Mediapart}, - ArticleLocation, Url, -}; +use crieur_retrieve::{ArticleLocation, Url}; use dotenv::dotenv; use log::info; #[tokio::main] async fn main() -> Result<()> { dotenv().ok(); - env_logger::init(); + tracing_subscriber::fmt() + .with_writer(std::io::stderr) + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .init(); let url = match env::args().nth(1) { Some(url) => Url::parse(&url)?, None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?, }; - // TODO: remove this in favor of default newspapers - - let mpruiid = env::var("MEDIAPART_COOKIE")?.into(); - let mediapart = Mediapart::builder() - .login(mediapart::Login::MPRUUID(mpruiid)) - .build()?; - info!("Trying to download article from {}", url); // TODO: shorten this, maybe an helper function ? - let article_location = ArticleLocation::builder() - .url(url)? - .newspaper(mediapart) - .build()?; + let article_location = ArticleLocation::builder().url(url)?.build()?; let article_str = article_location.retrieve_html().await?;