feat: add retrieval from le monde diplomatique

Add retrieval from le monde diplomatique

Previously, 404 pages were injected into the document when downloading
styles.
Now, the downloader returns None when documents are not found.
This commit is contained in:
koalp 2021-05-08 03:23:27 +02:00
parent 8afd74995b
commit 970f510cd1
Signed by: koalp
GPG Key ID: 35B21047DEB09A81
12 changed files with 375 additions and 88 deletions

209
Cargo.lock generated
View File

@ -74,13 +74,22 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
[[package]]
name = "aho-corasick"
version = "0.7.15"
version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5"
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [
"memchr",
]
[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi",
]
[[package]]
name = "anyhow"
version = "1.0.40"
@ -213,6 +222,18 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
dependencies = [
"libc",
"num-integer",
"num-traits",
"winapi",
]
[[package]]
name = "cipher"
version = "0.2.5"
@ -285,10 +306,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
[[package]]
name = "cpuid-bool"
version = "0.1.2"
name = "cpufeatures"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634"
checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4"
dependencies = [
"libc",
]
[[package]]
name = "cpuid-bool"
@ -307,6 +331,7 @@ dependencies = [
"env_logger",
"log",
"tokio",
"tracing-subscriber",
]
[[package]]
@ -821,9 +846,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.3.2"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc018e188373e2777d0ef2467ebff62a08e66c3f5857b23c8fbec3018210dc00"
checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726"
dependencies = [
"bytes",
"fnv",
@ -877,9 +902,9 @@ dependencies = [
[[package]]
name = "html-minifier"
version = "3.0.10"
version = "3.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "108452631307790510cde91282fc706ae70076bd68200add8638773f06d5e122"
checksum = "9876ed2cad2fd2f78bad42688bcac8a87cebce9b2381c5b05efc30b0c8429af0"
dependencies = [
"cow-utils",
"educe",
@ -914,9 +939,9 @@ dependencies = [
[[package]]
name = "http-body"
version = "0.4.1"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737"
checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9"
dependencies = [
"bytes",
"http",
@ -1055,9 +1080,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]]
name = "js-sys"
version = "0.3.50"
version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c"
checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062"
dependencies = [
"wasm-bindgen",
]
@ -1091,9 +1116,9 @@ checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
[[package]]
name = "lock_api"
version = "0.4.3"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176"
checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb"
dependencies = [
"scopeguard",
]
@ -1158,6 +1183,15 @@ dependencies = [
"tendril",
]
[[package]]
name = "matchers"
version = "0.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
dependencies = [
"regex-automata",
]
[[package]]
name = "matches"
version = "0.1.8"
@ -1254,9 +1288,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
[[package]]
name = "memchr"
version = "2.3.4"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
[[package]]
name = "mime"
@ -1585,7 +1619,7 @@ version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd"
dependencies = [
"cpuid-bool 0.2.0",
"cpuid-bool",
"opaque-debug",
"universal-hash",
]
@ -1856,18 +1890,18 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.2.7"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2"
checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc"
dependencies = [
"bitflags",
]
[[package]]
name = "regex"
version = "1.4.6"
version = "1.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759"
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
dependencies = [
"aho-corasick",
"memchr",
@ -1875,10 +1909,20 @@ dependencies = [
]
[[package]]
name = "regex-syntax"
version = "0.6.23"
name = "regex-automata"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
dependencies = [
"byteorder",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
[[package]]
name = "reqwest"
@ -2319,17 +2363,26 @@ checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d"
[[package]]
name = "sha2"
version = "0.9.3"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de"
checksum = "d8f6b75b17576b792bef0db1bcc4b8b8bcdf9506744cf34b974195487af6cff2"
dependencies = [
"block-buffer",
"cfg-if 1.0.0",
"cpuid-bool 0.1.2",
"cpufeatures",
"digest",
"opaque-debug",
]
[[package]]
name = "sharded-slab"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79c719719ee05df97490f80a45acfc99e5a30ce98a1e4fb67aee422745ae14e3"
dependencies = [
"lazy_static",
]
[[package]]
name = "signal-hook-registry"
version = "1.3.0"
@ -2491,9 +2544,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2"
[[package]]
name = "syn"
version = "1.0.71"
version = "1.0.72"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373"
checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82"
dependencies = [
"proc-macro2",
"quote",
@ -2558,6 +2611,15 @@ dependencies = [
"syn",
]
[[package]]
name = "thread_local"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd"
dependencies = [
"once_cell",
]
[[package]]
name = "time"
version = "0.2.26"
@ -2684,9 +2746,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
[[package]]
name = "tracing"
version = "0.1.25"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f"
checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
dependencies = [
"cfg-if 1.0.0",
"pin-project-lite",
@ -2707,9 +2769,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.17"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f"
checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
dependencies = [
"lazy_static",
]
@ -2724,6 +2786,49 @@ dependencies = [
"tracing",
]
[[package]]
name = "tracing-log"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
dependencies = [
"lazy_static",
"log",
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa5553bf0883ba7c9cbe493b085c29926bd41b66afc31ff72cf17ff4fb60dcd5"
dependencies = [
"ansi_term",
"chrono",
"lazy_static",
"matchers",
"regex",
"serde",
"serde_json",
"sharded-slab",
"smallvec 1.6.1",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]]
name = "try-lock"
version = "0.2.3"
@ -2756,9 +2861,9 @@ dependencies = [
[[package]]
name = "unicode-xid"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]]
name = "unindent"
@ -2784,9 +2889,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]]
name = "url"
version = "2.2.1"
version = "2.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b"
checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c"
dependencies = [
"form_urlencoded",
"idna",
@ -2846,9 +2951,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]]
name = "wasm-bindgen"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9"
checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd"
dependencies = [
"cfg-if 1.0.0",
"serde",
@ -2858,9 +2963,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae"
checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900"
dependencies = [
"bumpalo",
"lazy_static",
@ -2873,9 +2978,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.23"
version = "0.4.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea"
checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1"
dependencies = [
"cfg-if 1.0.0",
"js-sys",
@ -2885,9 +2990,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f"
checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@ -2895,9 +3000,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c"
checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97"
dependencies = [
"proc-macro2",
"quote",
@ -2908,15 +3013,15 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489"
checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f"
[[package]]
name = "web-sys"
version = "0.3.50"
version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be"
checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582"
dependencies = [
"js-sys",
"wasm-bindgen",

View File

@ -23,3 +23,4 @@ dotenv = "0.15.0"
env_logger = "0.8.3"
log = "0.4.14"
tokio = { version = "1.5.0", features = ["full"] }
tracing-subscriber = "0.2.18"

View File

@ -1,4 +1,5 @@
Tools to retrieve articles from multiple newspaper you subscribed to.
Tools to retrieve articles from multiple newspapers you subscribed to, all from
the same place.
**This is a prototype, it isn't stable at all and you may not want to use it if
you expect it to just work !**

View File

@ -8,19 +8,19 @@ publish = false
[dependencies]
anyhow = "1.0.40"
async-trait = "0.1.48"
async-trait = "0.1.50"
thiserror = "1.0.24"
url = "2.2.1"
hyper = { version = "0.14.5", features = ["full"] }
url = "2.2.2"
hyper = { version = "0.14.7", features = ["full"] }
hyper-rustls = "0.22.1"
cookie = "0.15.0"
lol_html = "0.3.0"
indoc = "1.0.3"
html-minifier = "3.0.9"
html-minifier = "3.0.11"
bytes = "1.0.1"
base64 = "0.13.0"
futures = "0.3.14"
derive_builder = "0.10.0"
derive_builder = "0.10.2"
nipper = "0.1.9"
log = "0.4.14"
env_logger = "0.8.3"

View File

@ -2,12 +2,12 @@ use std::boxed::Box;
use std::convert::TryInto;
use std::env;
use anyhow::anyhow;
use log::info;
use url::{Host, Url};
use crate::newspaper::Newspaper;
use crate::newspapers::mediapart::{self, Mediapart};
use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
/// Enumerate all errors that can be encountered when using ArticleLocation
#[derive(thiserror::Error, Debug)]
@ -33,6 +33,7 @@ type Newspapers = Vec<Box<dyn Newspaper>>;
pub type Result<T, E = Error> = core::result::Result<T, E>;
fn default_newpapers() -> Result<Newspapers> {
// TODO: the same thing is written too many times: how to DRY?
let config_key = "MEDIAPART_COOKIE".to_string();
let mpruiid = env::var(&config_key)
.map_err(|_| Error::Misconfiguration(config_key))?
@ -42,7 +43,29 @@ fn default_newpapers() -> Result<Newspapers> {
.login(mediapart::Login::MPRUUID(mpruiid))
.build()?;
Ok(vec![Box::new(mediapart)])
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
let lmd_a_m = env::var(&lmd_a_m)
.map_err(|_| Error::Misconfiguration(lmd_a_m))?
.into();
let phpsessid = env::var(&phpsessid)
.map_err(|_| Error::Misconfiguration(phpsessid))?
.into();
let spip_session = env::var(&spip_session)
.map_err(|_| Error::Misconfiguration(spip_session))?
.into();
let monde_diplo = MondeDiplo::builder()
.login(monde_diplomatique::Login::Cookies {
lmd_a_m,
phpsessid,
spip_session,
})
.build()?;
Ok(vec![Box::new(mediapart), Box::new(monde_diplo)])
}
#[derive(Default)]

View File

@ -1,4 +1,4 @@
use anyhow::{anyhow, Result};
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
@ -80,7 +80,10 @@ impl Newspaper for Mediapart {
let downloader = Downloader { cookies };
let body = downloader.download(&url).await?;
let html = String::from_utf8(body.to_vec())?;
let html = match body {
Some(body) => String::from_utf8(body.to_vec())?,
None => bail!("404 not found"),
};
// TODO: Move to const
let element_to_remove = [

View File

@ -1 +1,2 @@
pub mod mediapart;
pub mod monde_diplomatique;

View File

@ -0,0 +1,135 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Ways of authenticating to Le Monde diplomatique.
pub enum Login {
/// Username and password pair. `Builder::login` does not support this
/// variant yet and panics when it receives it.
Username(String, String),
/// Session cookies; `Builder::login` stores them under the cookie names
/// `lmd_a_m`, `PHPSESSID` and `spip_session` respectively.
Cookies {
lmd_a_m: String,
phpsessid: String,
spip_session: String,
},
}
/// Newspaper implementation for Le Monde diplomatique.
#[derive(Debug, Clone, Default)]
pub struct MondeDiplo {
// `(name, value)` pairs; each pair is turned into a cookie when an article
// is retrieved.
login_cookies: Vec<(String, String)>,
}
/// Wraps anything convertible into a `String` into a `Host::Domain`.
fn str_to_host<S>(host: S) -> Host
where
    S: Into<String>,
{
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Builder for `MondeDiplo`.
#[derive(Debug, Clone, Default)]
pub struct Builder {
// Login cookies as `(name, value)` pairs; `None` by default, set to `Some`
// by `login` when cookies are provided.
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
    /// Records the credentials used to authenticate against the newspaper.
    ///
    /// With [`Login::Cookies`], the three values are stored under the cookie
    /// names `lmd_a_m`, `PHPSESSID` and `spip_session`.
    ///
    /// # Panics
    ///
    /// Panics when given [`Login::Username`]: logging in with a username and
    /// a password is not implemented yet.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookies = match login {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and password not implemented")
            }
            Login::Cookies {
                lmd_a_m,
                phpsessid,
                spip_session,
            } => Some(vec![
                ("lmd_a_m".into(), lmd_a_m),
                ("PHPSESSID".into(), phpsessid),
                ("spip_session".into(), spip_session),
            ]),
        };
        self
    }

    /// Creates a [`MondeDiplo`] from the recorded login cookies.
    ///
    /// The builder is left untouched (the cookies are cloned), so it can be
    /// reused.
    ///
    /// # Errors
    ///
    /// Returns an error when no login cookies have been recorded, i.e. when
    /// `login` has not been called.
    pub fn build(&self) -> Result<MondeDiplo> {
        self.login_cookies
            .as_ref()
            .map(|login_cookies| MondeDiplo {
                login_cookies: login_cookies.clone(),
            })
            .ok_or_else(|| anyhow!("You have to log in to access this newspaper"))
    }
}
#[async_trait]
impl Newspaper for MondeDiplo {
    /// Describes the newspaper: the hosts it is served from and its names.
    fn metadata(&self) -> Metadata {
        let hosts = vec![
            str_to_host("monde-diplomatique.fr"),
            str_to_host("www.monde-diplomatique.fr"),
        ];

        Metadata::builder()
            .hosts(hosts)
            .lower_case_name("monde-diplomatique")
            .name("Le Monde Diplomatique")
            .build()
            .unwrap_or_default()
    }

    /// Downloads the article at `url` with the login cookies and returns it
    /// as a single html page, stripped of the elements listed below.
    ///
    /// Fails when the download fails, when the downloader returns no body,
    /// or when the body is not valid UTF-8.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        // One cookie per stored `(name, value)` pair.
        let mut cookies = Vec::with_capacity(self.login_cookies.len());
        for (name, value) in &self.login_cookies {
            cookies.push(Cookie::build(name, value).finish());
        }

        // TODO: replace by builder
        let downloader = Downloader { cookies };

        let body = downloader.download(url).await?;
        let html = if let Some(bytes) = body {
            String::from_utf8(bytes.to_vec())?
        } else {
            bail!("404 not found")
        };

        // TODO: Move to const
        let element_to_remove = [
            // navigation elements
            "#tout-en-haut.preentete",
            "#entete.connecte",
            "#navigation",
            "#pied",
            ".bloc-connexion",
            // unused features
            "#ecouter",
            // Social buttons
            ".actions-article",
            "#partage",
            // misc
            "noscript",
        ];

        Ok(tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await)
    }

    /// Creates a newspaper with no login cookies.
    fn new() -> Self {
        Self::default()
    }

    /// Always reports complete access for now.
    async fn has_complete_access(&self) -> bool {
        // TODO: check if we are logged using the cookie
        true
    }
}
impl MondeDiplo {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -4,7 +4,7 @@ use anyhow::Result;
use async_trait::async_trait;
use bytes::Bytes;
use cookie::Cookie;
use hyper::{header, Body, Client, Method, Request};
use hyper::{header, Body, Client, Method, Request, StatusCode};
use thiserror::Error;
use url::Url;
@ -22,7 +22,9 @@ pub trait Download {
type Error: StdError;
/// Downloads a file from an url and returns the result as bytes
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>;
///
/// If the file is not found, returns None
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>;
}
/// Store several cookies
@ -36,7 +38,8 @@ pub struct Downloader<'c> {
impl<'c> Download for Downloader<'c> {
type Error = DownloadError;
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> {
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
log::info!("downloading url {:?}", file_link);
let https = hyper_rustls::HttpsConnector::with_native_roots();
let client: Client<_, hyper::Body> = Client::builder().build(https);
@ -44,14 +47,25 @@ impl<'c> Download for Downloader<'c> {
.method(Method::GET)
.uri(file_link.as_str());
for cookie in &self.cookies {
req = req.header(header::COOKIE, cookie.to_string());
}
req = req.header(
header::COOKIE,
self.cookies
.iter()
.map(Cookie::to_string)
.collect::<Vec<_>>()
.join(";"),
);
log::info!("headers : {:?}", req.headers_ref());
let req = req.body(Body::empty())?;
let resp = client.request(req).await?;
let body = hyper::body::to_bytes(resp).await?;
let body = match resp.status() {
StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
StatusCode::NOT_FOUND => None,
// TODO: enhance this by handling more error codes
_ => None,
};
Ok(body)
}
}

View File

@ -77,7 +77,7 @@ where
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| {
if let Some(inner_css) = inner_css {
if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
@ -120,10 +120,12 @@ where
imgs.iter()
.zip(downloaded_images.iter())
.for_each(|(mut img, data)| {
if let Some((url, data)) = data {
if let Some((url, Some(data))) = data {
let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else {
img.remove()
}
});
// ---- Remove unwanted html elements -----

View File

@ -8,3 +8,16 @@ The newspapers are configured using environment variables
MEDIAPART_COOKIE
: sets the `MPRUUID` cookie, used to log in
# Le Monde Diplomatique
All cookies are mandatory to log in
MONDE_DIPLO_LMD_A_M
: sets the `lmd_a_m` cookie
MONDE_DIPLO_PHPSESSID
: sets the `PHPSESSID` cookie
MONDE_DIPLO_SPIP_SESSION
: sets the `spip_session` cookie

View File

@ -2,38 +2,27 @@ use std::convert::TryInto;
use std::env;
use anyhow::Result;
use crieur_retrieve::{
newspaper::Newspaper,
newspapers::mediapart::{self, Mediapart},
ArticleLocation, Url,
};
use crieur_retrieve::{ArticleLocation, Url};
use dotenv::dotenv;
use log::info;
#[tokio::main]
async fn main() -> Result<()> {
dotenv().ok();
env_logger::init();
tracing_subscriber::fmt()
.with_writer(std::io::stderr)
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let url = match env::args().nth(1) {
Some(url) => Url::parse(&url)?,
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
};
// TODO: remove this in favor of default newspapers
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid))
.build()?;
info!("Trying to download article from {}", url);
// TODO: shorten this, maybe an helper function ?
let article_location = ArticleLocation::builder()
.url(url)?
.newspaper(mediapart)
.build()?;
let article_location = ArticleLocation::builder().url(url)?.build()?;
let article_str = article_location.retrieve_html().await?;