From 6e091a32fc111a7c9f46886c4399915f4830d0a1 Mon Sep 17 00:00:00 2001 From: koalp Date: Mon, 17 May 2021 20:18:32 +0200 Subject: [PATCH] chore: use a config struct for self_contained_html Previously, self_contained_html was a function taking all parameters as arguments. As new optional parameters are being added, the function had too many arguments and each usage of the function would have to be modified each time an argument was added. Therefore, it has been moved to a configuration structure with a `run` function taking only one argument, the html string. --- crieur-chatbot/src/chatbot.rs | 10 +- crieur-chatbot/src/handlers/html.rs | 5 +- crieur-retrieve/src/article_location.rs | 28 +- .../src/newspapers/courrier_international.rs | 12 +- crieur-retrieve/src/newspapers/mediapart.rs | 16 +- .../src/newspapers/monde_diplomatique.rs | 12 +- crieur-retrieve/src/tools/mod.rs | 3 +- .../src/tools/self_contained_html.rs | 331 +++++++++++------- documentation/design/scope.md | 3 +- 9 files changed, 239 insertions(+), 181 deletions(-) diff --git a/crieur-chatbot/src/chatbot.rs b/crieur-chatbot/src/chatbot.rs index 137b9ef..85018b2 100644 --- a/crieur-chatbot/src/chatbot.rs +++ b/crieur-chatbot/src/chatbot.rs @@ -2,15 +2,7 @@ use std::convert::TryInto; use anyhow::Result; -use matrix_sdk::{ - self, async_trait, - events::{ - room::message::{MessageEventContent, MessageType, TextMessageEventContent}, - AnyMessageEventContent, SyncMessageEvent, - }, - room::Room, - Client, ClientConfig, EventHandler, SyncSettings, -}; +use matrix_sdk::{self, Client, SyncSettings}; use crate::Html; diff --git a/crieur-chatbot/src/handlers/html.rs b/crieur-chatbot/src/handlers/html.rs index 0b514df..3d5bf61 100644 --- a/crieur-chatbot/src/handlers/html.rs +++ b/crieur-chatbot/src/handlers/html.rs @@ -1,7 +1,6 @@ use std::convert::TryInto; -use std::env; -use log::{error, info}; +use log::error; use matrix_sdk::{ self, async_trait, events::{ @@ -9,7 +8,7 @@ use matrix_sdk::{ 
AnyMessageEventContent, SyncMessageEvent, }, room::Room, - Client, ClientConfig, EventHandler, SyncSettings, + EventHandler, }; use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url}; diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs index ebe97b1..2062c0b 100644 --- a/crieur-retrieve/src/article_location.rs +++ b/crieur-retrieve/src/article_location.rs @@ -2,7 +2,6 @@ use std::boxed::Box; use std::convert::TryInto; use std::env; -use log::info; use url::{Host, Url}; use crate::newspaper::Newspaper; @@ -36,27 +35,20 @@ pub type Result = core::result::Result; fn default_newpapers() -> Result { // TODO: same thing is written too much times : how to DRY ? let config_key = "MEDIAPART_COOKIE".to_string(); - let mpruiid = env::var(&config_key) - .map_err(|_| Error::Misconfiguration(config_key))? - .into(); + let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?; let mediapart = Mediapart::builder() - .login(mediapart::Login::MPRUUID(mpruiid)) + .login(mediapart::Login::Mpruuid(mpruiid)) .build()?; let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string(); let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string(); let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string(); - let lmd_a_m = env::var(&lmd_a_m) - .map_err(|_| Error::Misconfiguration(lmd_a_m))? - .into(); - let phpsessid = env::var(&phpsessid) - .map_err(|_| Error::Misconfiguration(phpsessid))? - .into(); - let spip_session = env::var(&spip_session) - .map_err(|_| Error::Misconfiguration(spip_session))? 
- .into(); + let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?; + let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?; + let spip_session = + env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?; let monde_diplo = MondeDiplo::builder() .login(monde_diplomatique::Login::Cookies { @@ -69,12 +61,8 @@ fn default_newpapers() -> Result { let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string(); let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string(); - let lmd_a_m = env::var(&lmd_a_m) - .map_err(|_| Error::Misconfiguration(lmd_a_m))? - .into(); - let ssess = env::var(&ssess) - .map_err(|_| Error::Misconfiguration(ssess))? - .into(); + let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?; + let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?; let courrier_international = CourrierInternational::builder() .login(courrier_international::Login::Cookies { lmd_a_m, ssess }) diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs index 911b9e9..187e5db 100644 --- a/crieur-retrieve/src/newspapers/courrier_international.rs +++ b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -82,13 +82,19 @@ impl Newspaper for CourrierInternational { }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = [ // navigation elements "#entete.connecte", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove: &elements_to_remove, + ..Default::default() + } + .run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs index 24a3933..2586e08 100644 --- 
a/crieur-retrieve/src/newspapers/mediapart.rs +++ b/crieur-retrieve/src/newspapers/mediapart.rs @@ -10,7 +10,7 @@ use crate::{Download, Downloader}; pub enum Login { Username(String, String), - MPRUUID(String), + Mpruuid(String), } #[derive(Debug, Clone, Default)] @@ -33,7 +33,7 @@ impl Builder { Login::Username(_username, _password) => { unimplemented!("login using username and passwond not implemented") } - Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)), + Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)), }; self } @@ -86,7 +86,7 @@ impl Newspaper for Mediapart { }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = [ // header ".fb-root", ".skipLinks", @@ -104,8 +104,14 @@ impl Newspaper for Mediapart { "aside.cc-modal", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove: &elements_to_remove, + ..Default::default() + } + .run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs index 041737f..8348dae 100644 --- a/crieur-retrieve/src/newspapers/monde_diplomatique.rs +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo { }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = [ // navigation elements "#tout-en-haut.preentete", "#entete.connecte", @@ -107,8 +107,14 @@ impl Newspaper for MondeDiplo { "noscript", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove: &elements_to_remove, + ..Default::default() + } + 
.run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/tools/mod.rs b/crieur-retrieve/src/tools/mod.rs index 59381b1..80f159e 100644 --- a/crieur-retrieve/src/tools/mod.rs +++ b/crieur-retrieve/src/tools/mod.rs @@ -1,5 +1,4 @@ mod download; -mod self_contained_html; +pub mod self_contained_html; pub use download::{Download, DownloadError, Downloader}; -pub use self_contained_html::self_contained_html; diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index bc25211..2a5ddac 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -8,140 +8,177 @@ use url::Url; use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES}; use crate::Download; -/// Makes an html page self-contained -/// -/// The `downloader` must implement `Download` and is used to download ressources that are -/// needed to make this page self-contained such as stylesheets or images. -/// -/// The function also removes all scripts on the page -pub async fn self_contained_html( - html: impl AsRef, - downloader: &D, - base_url: &Url, - elements_to_remove: &[impl AsRef], -) -> String +/// Stores configuration for the self_contained_html function +// TODO: write a builder +pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str> +where + E: std::error::Error, + D: Download + Send, + S1: AsRef, + S2: AsRef, +{ + /// the downloader that will be used to retrieve ressources on the page + pub downloader: Option<&'t D>, + /// Base url for downloading ressources, it probably the + pub base_url: Option<&'t Url>, + pub elements_to_remove: &'t [S1], + pub styles_to_add: &'t [S2], +} + +impl<'t, E, D> Default for Config<'t, E, D> where E: std::error::Error, D: Download + Send, { - // TODO: split/refactor this function : - // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? 
- // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure - // - ¿ should be function of a trait ? or only of the configuration struct ? - let (style_urls, html) = { - let document = Document::from(html.as_ref()); - - // ---- Remove scripts ---- - // - document.select("script").remove(); - - for event in EVENT_HANDLERS { - document - .select(format!("[{}]", event).as_str()) - .remove_attr(event); + fn default() -> Self { + Self { + downloader: None, + base_url: None, + elements_to_remove: &[], + styles_to_add: &[], } + } +} - for rel in LINK_REL_EXTERNAL_RESOURCES { - document - .select(format!("link[rel=\"{}\"]", rel).as_str()) - .remove(); - } +impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2> +where + E: std::error::Error, + D: Download + Send, + S1: AsRef, + S2: AsRef, +{ + /// Makes an html page self-contained + /// + /// The `downloader` must implement `Download` and is used to download ressources that are + /// needed to make this page self-contained such as stylesheets or images. + /// + /// The function also removes all scripts on the page + pub async fn run(&self, html: impl AsRef) -> String { + //TODO: don't panic + let base_url = self.base_url.expect("Base url not defined"); + let downloader = self.downloader.expect("Downloader not defined"); + // TODO: split/refactor this function : + // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? 
+ // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure + let (style_urls, html) = { + let document = Document::from(html.as_ref()); - // ---- Replace stylesheets ---- + // ---- Remove scripts ---- + // + document.select("script").remove(); + + for event in EVENT_HANDLERS { + document + .select(format!("[{}]", event).as_str()) + .remove_attr(event); + } + + for rel in LINK_REL_EXTERNAL_RESOURCES { + document + .select(format!("link[rel=\"{}\"]", rel).as_str()) + .remove(); + } + + // ---- Replace stylesheets ---- + // + let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); + let styles_url = stylesheets + .iter() + .map(|stylesheet| { + if let Some(src) = stylesheet.attr("href") { + //TODO: does it work with absolute urls ? + base_url.join(src.as_ref()).ok() + } else { + None + } + }) + .collect::>(); + (styles_url, String::from(document.html())) + }; + + let style_urls = style_urls.into_iter().map(|style_url| { + OptionFuture::from( + style_url.map(|s| async move { downloader.download(&s).await.unwrap() }), + ) + }); + let downloaded_styles = futures::future::join_all(style_urls).await; + + let html = { + let document = Document::from(&html); + let styles = document.select("link[href][rel=\"stylesheet\"]"); + + styles + .iter() + .zip(downloaded_styles.iter()) + .for_each(|(mut stylesheet, inner_css)| { + if let Some(Some(inner_css)) = inner_css { + let css = String::from_utf8(inner_css.to_vec()).unwrap(); + let css = format!("", css); + stylesheet.replace_with_html(css); + } else { + stylesheet.remove(); + } + }); + String::from(document.html()) + }; + + // ---- Replace imgs ---- // - let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); - let styles_url = stylesheets - .iter() - .map(|stylesheet| { - if let Some(src) = stylesheet.attr("href") { - //TODO: does it work with absolute urls ? 
- base_url.join(src.as_ref()).ok() - } else { - None - } - }) - .collect::>(); - (styles_url, String::from(document.html())) - }; + let image_urls = { + let document = Document::from(&html); + let imgs = document.select("img:not([src^=\"data:\"])"); - let style_urls = style_urls.into_iter().map(|style_url| { - OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() })) - }); - let downloaded_styles = futures::future::join_all(style_urls).await; + imgs.iter() + .map(|image| { + if let Some(src) = image.attr("src") { + base_url.join(src.as_ref()).ok() + } else { + None + } + }) + .collect::>() + }; - let html = { - let document = Document::from(&html); - let styles = document.select("link[href][rel=\"stylesheet\"]"); + let downloaded_images = image_urls.into_iter().map(|image_url| { + OptionFuture::from(image_url.map(|url| async move { + let data = downloader.download(&url).await.unwrap(); + (url, data) + })) + }); + let downloaded_images = futures::future::join_all(downloaded_images).await; - styles - .iter() - .zip(downloaded_styles.iter()) - .for_each(|(mut stylesheet, inner_css)| { - if let Some(Some(inner_css)) = inner_css { - let css = String::from_utf8(inner_css.to_vec()).unwrap(); - let css = format!("", css); - stylesheet.replace_with_html(css); - } else { - stylesheet.remove(); - } - }); - String::from(document.html()) - }; + let html = { + let document = Document::from(&html); + let imgs = document.select("img:not([src^=\"data:\"])"); - // ---- Replace imgs ---- - // - let image_urls = { - let document = Document::from(&html); - let imgs = document.select("img:not([src^=\"data:\"])"); + imgs.iter() + .zip(downloaded_images.iter()) + .for_each(|(mut img, data)| { + if let Some((url, Some(data))) = data { + let data = base64::encode(data); + //TODO: use an extension hashmap + let extension = + Path::new(url.path()).extension().unwrap().to_str().unwrap(); + img.set_attr("src", &format!("data:image/{};base64,{}", extension, 
data)); + } else { + img.remove() + } + }); + // ---- Remove unwanted html elements ----- + // + for element in self.elements_to_remove { + document.select(element.as_ref()).remove(); + } + String::from(document.html()) + }; - imgs.iter() - .map(|image| { - if let Some(src) = image.attr("src") { - base_url.join(src.as_ref()).ok() - } else { - None - } - }) - .collect::>() - }; - - let downloaded_images = image_urls.into_iter().map(|image_url| { - OptionFuture::from(image_url.map(|url| async move { - let data = downloader.download(&url).await.unwrap(); - (url, data) - })) - }); - let downloaded_images = futures::future::join_all(downloaded_images).await; - - let html = { - let document = Document::from(&html); - let imgs = document.select("img:not([src^=\"data:\"])"); - - imgs.iter() - .zip(downloaded_images.iter()) - .for_each(|(mut img, data)| { - if let Some((url, Some(data))) = data { - let data = base64::encode(data); - let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); - img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); - } else { - img.remove() - } - }); - // ---- Remove unwanted html elements ----- + // ---- output ---- // - for element in elements_to_remove { - document.select(element.as_ref()).remove(); - } - String::from(document.html()) - }; + let mut minifier = HTMLMinifier::new(); + minifier.digest(html.as_str()).unwrap(); - // ---- output ---- - // - let mut minifier = HTMLMinifier::new(); - minifier.digest(html.as_str()).unwrap(); - - String::from_utf8(minifier.get_html().into()).unwrap() + String::from_utf8(minifier.get_html().into()).unwrap() + } } #[cfg(test)] @@ -180,9 +217,14 @@ mod tests { let html = ""; let base_url = Url::parse("http://example.com")?; let downloader = DummyDownloader {}; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + 
..Default::default() + } + .run(html) + .await, "" ); Ok(()) @@ -206,10 +248,13 @@ mod tests { }; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; for s in EVENT_HANDLERS { assert_eq!( - self_contained_html(html(s), &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + }.run(html(s)).await, "\n\n\n\n" ); } @@ -234,10 +279,15 @@ mod tests { }; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; for s in LINK_REL_EXTERNAL_RESOURCES { assert_eq!( - self_contained_html(html(s), &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html(s)) + .await, "\n\n\n" ); } @@ -290,9 +340,14 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, minified ); Ok(()) @@ -337,9 +392,14 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, minified ); Ok(()) @@ -372,12 +432,13 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; assert_eq!( - self_contained_html( - html, - &downloader, - &base_url, - &["header", ".placeholder", "article > span.huge"] - ) + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + elements_to_remove: &["header", ".placeholder", "article > span.huge"], + 
..Default::default() + } + .run(html) .await, minified ); diff --git a/documentation/design/scope.md b/documentation/design/scope.md index 6417bf7..cca4047 100644 --- a/documentation/design/scope.md +++ b/documentation/design/scope.md @@ -37,7 +37,8 @@ frame "backend" { newspaper -> retrieval_tools: uses to implement - article_location --> article_repr :uses + article_location --> article_repr: uses + retrieval_tools -up-> article_repr: uses auto_retrieve --> rss: watches auto_retrieve --> article_location