From 5d0872b4d97eb1bd440eed810245e1847f36a095 Mon Sep 17 00:00:00 2001 From: koalp Date: Thu, 13 May 2021 20:29:36 +0200 Subject: [PATCH] feat : add retrieve from courrier international Retrieval of articles from courrier international have been added --- Cargo.lock | 24 +--- crieur-chatbot/src/handlers/html.rs | 1 - crieur-retrieve/Cargo.toml | 3 +- crieur-retrieve/src/article_location.rs | 22 +++- .../src/newspapers/courrier_international.rs | 111 ++++++++++++++++++ crieur-retrieve/src/newspapers/mod.rs | 1 + .../src/newspapers/monde_diplomatique.rs | 4 - crieur-retrieve/src/tools/download.rs | 5 +- .../src/tools/self_contained_html.rs | 16 +-- .../reference/newspaper_configuration.md | 8 ++ justfile | 28 +++-- 11 files changed, 176 insertions(+), 47 deletions(-) create mode 100644 crieur-retrieve/src/newspapers/courrier_international.rs diff --git a/Cargo.lock b/Cargo.lock index 992c8ee..52af574 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -362,7 +362,6 @@ dependencies = [ "hyper", "hyper-rustls", "indoc", - "itertools", "log", "lol_html", "nipper", @@ -594,12 +593,6 @@ dependencies = [ "syn", ] -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - [[package]] name = "encoding_rs" version = "0.8.28" @@ -902,9 +895,9 @@ dependencies = [ [[package]] name = "html-minifier" -version = "3.0.11" +version = "3.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9876ed2cad2fd2f78bad42688bcac8a87cebce9b2381c5b05efc30b0c8429af0" +checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af" dependencies = [ "cow-utils", "educe", @@ -1063,15 +1056,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" -[[package]] -name = "itertools" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" -dependencies = [ - "either", -] - [[package]] name = "itoa" version = "0.4.7" @@ -1300,9 +1284,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "minifier" -version = "0.0.39" +version = "0.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6cdf618de5c9c98d4a7b2e0d1f1e44f82a19196cfd94040bb203621c25d28d98" +checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9" dependencies = [ "macro-utils", ] diff --git a/crieur-chatbot/src/handlers/html.rs b/crieur-chatbot/src/handlers/html.rs index f8a73fd..0b514df 100644 --- a/crieur-chatbot/src/handlers/html.rs +++ b/crieur-chatbot/src/handlers/html.rs @@ -113,7 +113,6 @@ impl EventHandler for Html { } else { return; }; - info!("sending file"); match msg_body.split(' ').collect::>().as_slice() { ["!html", url, ..] => send_article(*url, room).await, diff --git a/crieur-retrieve/Cargo.toml b/crieur-retrieve/Cargo.toml index 089aea1..6b6578b 100644 --- a/crieur-retrieve/Cargo.toml +++ b/crieur-retrieve/Cargo.toml @@ -16,7 +16,7 @@ hyper-rustls = "0.22.1" cookie = "0.15.0" lol_html = "0.3.0" indoc = "1.0.3" -html-minifier = "3.0.11" +html-minifier = "3.0.12" bytes = "1.0.1" base64 = "0.13.0" futures = "0.3.14" @@ -24,7 +24,6 @@ derive_builder = "0.10.2" nipper = "0.1.9" log = "0.4.14" env_logger = "0.8.3" -itertools = "0.10.0" [dev-dependencies] tokio = "1.5.0" diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs index a0476b3..ebe97b1 100644 --- a/crieur-retrieve/src/article_location.rs +++ b/crieur-retrieve/src/article_location.rs @@ -6,6 +6,7 @@ use log::info; use url::{Host, Url}; use crate::newspaper::Newspaper; +use crate::newspapers::courrier_international::{self, CourrierInternational}; use crate::newspapers::mediapart::{self, Mediapart}; use crate::newspapers::monde_diplomatique::{self, MondeDiplo}; @@ -65,7 +66,25 @@ fn default_newpapers() -> Result { }) .build()?; - Ok(vec![Box::new(mediapart), Box::new(monde_diplo)]) + let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string(); + let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string(); + + let lmd_a_m = env::var(&lmd_a_m) + .map_err(|_| Error::Misconfiguration(lmd_a_m))? + .into(); + let ssess = env::var(&ssess) + .map_err(|_| Error::Misconfiguration(ssess))? + .into(); + + let courrier_international = CourrierInternational::builder() + .login(courrier_international::Login::Cookies { lmd_a_m, ssess }) + .build()?; + + Ok(vec![ + Box::new(mediapart), + Box::new(monde_diplo), + Box::new(courrier_international), + ]) } #[derive(Default)] @@ -149,7 +168,6 @@ impl ArticleLocation { } pub async fn retrieve_html(&self) -> Result { - info!("It will download from {}", self.url); // TODO: modify when retrieve_html returns a specific Error type Ok(self.newspaper.retrieve_html(&self.url).await?) } diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs new file mode 100644 index 0000000..911b9e9 --- /dev/null +++ b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -0,0 +1,111 @@ +use anyhow::{anyhow, bail, Result}; +use async_trait::async_trait; +use cookie::Cookie; +use url::Host; + +use crate::newspaper::{Metadata, Newspaper}; +use crate::tools; +use crate::Url; +use crate::{Download, Downloader}; + +pub enum Login { + Username(String, String), + Cookies { lmd_a_m: String, ssess: String }, +} + +#[derive(Debug, Clone, Default)] +pub struct CourrierInternational { + login_cookies: Vec<(String, String)>, +} + +fn str_to_host>(host: S) -> Host { + Host::Domain(host.into()) +} + +#[derive(Debug, Clone, Default)] +pub struct Builder { + login_cookies: Option>, +} + +impl Builder { + pub fn login(&mut self, login: Login) -> &mut Self { + self.login_cookies = match login { + Login::Username(_username, _password) => { + unimplemented!("login using username and passwond not implemented") + } + Login::Cookies { lmd_a_m, ssess } => Some(vec![ + ("lmd_a_m".into(), lmd_a_m), + ("SSESS862c7003d721c672d39f161b1456b890".into(), ssess), + ]), + }; + self + } + + pub fn build(&self) -> Result { + match &self.login_cookies { + Some(login_cookies) => Ok(CourrierInternational { + login_cookies: login_cookies.clone(), + }), + None => Err(anyhow!("You have to log in to access this newspaper")), + } + } +} + +#[async_trait] +impl Newspaper for CourrierInternational { + fn metadata(&self) -> Metadata { + Metadata::builder() + .hosts(vec![ + str_to_host("courrierinternational.com"), + str_to_host("www.courrierinternational.com"), + ]) + .lower_case_name("courrier-international") + .name("Courrier international") + .build() + .unwrap_or_default() + } + + async fn retrieve_html(&self, url: &Url) -> Result { + let cookies = self + .login_cookies + .iter() + .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish()) + .collect::>(); + + // TODO: replace by builder + let downloader = Downloader { cookies }; + + let body = downloader.download(&url).await?; + let html = match body { + Some(body) => String::from_utf8(body.to_vec())?, + None => bail!("404 not found"), + }; + + // TODO: Move to const + let element_to_remove = [ + // navigation elements + "#entete.connecte", + ]; + + let single_page_html = + tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + Ok(single_page_html) + } + + fn new() -> Self { + Self { + ..Default::default() + } + } + + async fn has_complete_access(&self) -> bool { + // TODO: check if we are logged using the cookie + true + } +} + +impl CourrierInternational { + pub fn builder() -> Builder { + Builder::default() + } +} diff --git a/crieur-retrieve/src/newspapers/mod.rs b/crieur-retrieve/src/newspapers/mod.rs index 1cc9356..d07c868 100644 --- a/crieur-retrieve/src/newspapers/mod.rs +++ b/crieur-retrieve/src/newspapers/mod.rs @@ -1,2 +1,3 @@ +pub mod courrier_international; pub mod mediapart; pub mod monde_diplomatique; diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs index abf5187..041737f 100644 --- a/crieur-retrieve/src/newspapers/monde_diplomatique.rs +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -80,10 +80,6 @@ impl Newspaper for MondeDiplo { .iter() .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish()) .collect::>(); - //let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1) - // .secure(true) - // .finish(); - //let cookies = vec![cookie]; // TODO: replace by builder let downloader = Downloader { cookies }; diff --git a/crieur-retrieve/src/tools/download.rs b/crieur-retrieve/src/tools/download.rs index fef1df4..ff5096b 100644 --- a/crieur-retrieve/src/tools/download.rs +++ b/crieur-retrieve/src/tools/download.rs @@ -39,7 +39,7 @@ impl<'c> Download for Downloader<'c> { type Error = DownloadError; async fn download(&self, file_link: &Url) -> Result, Self::Error> { - log::info!("downloading url {:?}", file_link); + log::debug!("downloading url {:?}", file_link); let https = hyper_rustls::HttpsConnector::with_native_roots(); let client: Client<_, hyper::Body> = Client::builder().build(https); @@ -55,11 +55,12 @@ impl<'c> Download for Downloader<'c> { .collect::>() .join(";"), ); - log::info!("headers : {:?}", req.headers_ref()); + log::debug!("headers : {:?}", req.headers_ref()); let req = req.body(Body::empty())?; let resp = client.request(req).await?; + log::debug!("Response status : {:?}", resp.status()); let body = match resp.status() { StatusCode::OK => Some(hyper::body::to_bytes(resp).await?), StatusCode::NOT_FOUND => None, diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 44c3e11..bc25211 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -170,8 +170,8 @@ mod tests { #[async_trait] impl Download for DummyDownloader { type Error = errors::Error; - async fn download(&self, _file_link: &Url) -> errors::Result { - Ok(Bytes::from("")) + async fn download(&self, _file_link: &Url) -> errors::Result> { + Ok(Some(Bytes::from(""))) } } @@ -248,12 +248,14 @@ mod tests { #[async_trait] impl Download for CssDownloader { type Error = errors::Error; - async fn download(&self, _file_link: &Url) -> errors::Result { - Ok(indoc! {" + async fn download(&self, _file_link: &Url) -> errors::Result> { + Ok(Some( + indoc! {" section#warning { color: red; }"} - .into()) + .into(), + )) } } @@ -300,12 +302,12 @@ mod tests { #[async_trait] impl Download for PngDownloader { type Error = errors::Error; - async fn download(&self, _file_link: &Url) -> errors::Result { + async fn download(&self, _file_link: &Url) -> errors::Result> { let image_path = Path::new("test_data/home.png"); let mut image_file = File::open(&image_path).unwrap(); let mut image_buf: Vec = vec![]; image_file.read_to_end(&mut image_buf).unwrap(); - Ok(image_buf.into()) + Ok(Some(image_buf.into())) } } diff --git a/documentation/reference/newspaper_configuration.md b/documentation/reference/newspaper_configuration.md index 0fe3c9a..fa7ab2c 100644 --- a/documentation/reference/newspaper_configuration.md +++ b/documentation/reference/newspaper_configuration.md @@ -21,3 +21,11 @@ MONDE_DIPLO_PHPSESSID MONDE_DIPLO_SPIP_SESSION : sets the `spip_session` cookie + +# Courrier international + +COURRIER_INTERNATIONAL_LMD_A_M +: sets the `lmd_a_m` cookie + +COURRIER_INTERNATIONAL_SSESS +: sets the `ssess` cookie diff --git a/justfile b/justfile index a28b9d5..3b60ba7 100644 --- a/justfile +++ b/justfile @@ -1,19 +1,29 @@ @build: - cargo build + cargo build + +@build-container: + podman build --file ./containers/chatbot.containerfile -t crieur-chatbot . @clean: - cargo clean + cargo clean @run: - cargo run + cargo run + +@test: + cargo test --all + +@clippy: + cargo clippy + +@fmt: + cargo fmt + +@simulate-ci: fmt clippy test -@container: - podman build --file ./containers/chatbot.containerfile -t crieur-chatbot . @audit: - cargo audit + cargo audit @crev: - cargo crev verify - -@verify: audit crev + cargo crev verify