diff --git a/crieur-chatbot/src/chatbot.rs b/crieur-chatbot/src/chatbot.rs index 137b9ef..85018b2 100644 --- a/crieur-chatbot/src/chatbot.rs +++ b/crieur-chatbot/src/chatbot.rs @@ -2,15 +2,7 @@ use std::convert::TryInto; use anyhow::Result; -use matrix_sdk::{ - self, async_trait, - events::{ - room::message::{MessageEventContent, MessageType, TextMessageEventContent}, - AnyMessageEventContent, SyncMessageEvent, - }, - room::Room, - Client, ClientConfig, EventHandler, SyncSettings, -}; +use matrix_sdk::{self, Client, SyncSettings}; use crate::Html; diff --git a/crieur-chatbot/src/handlers/html.rs b/crieur-chatbot/src/handlers/html.rs index 0b514df..3d5bf61 100644 --- a/crieur-chatbot/src/handlers/html.rs +++ b/crieur-chatbot/src/handlers/html.rs @@ -1,7 +1,6 @@ use std::convert::TryInto; -use std::env; -use log::{error, info}; +use log::error; use matrix_sdk::{ self, async_trait, events::{ @@ -9,7 +8,7 @@ use matrix_sdk::{ AnyMessageEventContent, SyncMessageEvent, }, room::Room, - Client, ClientConfig, EventHandler, SyncSettings, + EventHandler, }; use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url}; diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs index ebe97b1..2062c0b 100644 --- a/crieur-retrieve/src/article_location.rs +++ b/crieur-retrieve/src/article_location.rs @@ -2,7 +2,6 @@ use std::boxed::Box; use std::convert::TryInto; use std::env; -use log::info; use url::{Host, Url}; use crate::newspaper::Newspaper; @@ -36,27 +35,20 @@ pub type Result = core::result::Result; fn default_newpapers() -> Result { // TODO: same thing is written too much times : how to DRY ? let config_key = "MEDIAPART_COOKIE".to_string(); - let mpruiid = env::var(&config_key) - .map_err(|_| Error::Misconfiguration(config_key))? 
- .into(); + let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?; let mediapart = Mediapart::builder() - .login(mediapart::Login::MPRUUID(mpruiid)) + .login(mediapart::Login::Mpruuid(mpruiid)) .build()?; let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string(); let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string(); let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string(); - let lmd_a_m = env::var(&lmd_a_m) - .map_err(|_| Error::Misconfiguration(lmd_a_m))? - .into(); - let phpsessid = env::var(&phpsessid) - .map_err(|_| Error::Misconfiguration(phpsessid))? - .into(); - let spip_session = env::var(&spip_session) - .map_err(|_| Error::Misconfiguration(spip_session))? - .into(); + let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?; + let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?; + let spip_session = + env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?; let monde_diplo = MondeDiplo::builder() .login(monde_diplomatique::Login::Cookies { @@ -69,12 +61,8 @@ fn default_newpapers() -> Result { let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string(); let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string(); - let lmd_a_m = env::var(&lmd_a_m) - .map_err(|_| Error::Misconfiguration(lmd_a_m))? - .into(); - let ssess = env::var(&ssess) - .map_err(|_| Error::Misconfiguration(ssess))? 
- .into(); + let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?; + let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?; let courrier_international = CourrierInternational::builder() .login(courrier_international::Login::Cookies { lmd_a_m, ssess }) diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs index 911b9e9..187e5db 100644 --- a/crieur-retrieve/src/newspapers/courrier_international.rs +++ b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -82,13 +82,19 @@ impl Newspaper for CourrierInternational { }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = [ // navigation elements "#entete.connecte", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove: &elements_to_remove, + ..Default::default() + } + .run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs index 24a3933..2586e08 100644 --- a/crieur-retrieve/src/newspapers/mediapart.rs +++ b/crieur-retrieve/src/newspapers/mediapart.rs @@ -10,7 +10,7 @@ use crate::{Download, Downloader}; pub enum Login { Username(String, String), - MPRUUID(String), + Mpruuid(String), } #[derive(Debug, Clone, Default)] @@ -33,7 +33,7 @@ impl Builder { Login::Username(_username, _password) => { unimplemented!("login using username and passwond not implemented") } - Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)), + Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)), }; self } @@ -86,7 +86,7 @@ impl Newspaper for Mediapart { }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = [ // header ".fb-root", ".skipLinks", @@ 
-104,8 +104,14 @@ impl Newspaper for Mediapart { "aside.cc-modal", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove: &elements_to_remove, + ..Default::default() + } + .run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs index 041737f..8348dae 100644 --- a/crieur-retrieve/src/newspapers/monde_diplomatique.rs +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo { }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = [ // navigation elements "#tout-en-haut.preentete", "#entete.connecte", @@ -107,8 +107,14 @@ impl Newspaper for MondeDiplo { "noscript", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove: &elements_to_remove, + ..Default::default() + } + .run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/tools/mod.rs b/crieur-retrieve/src/tools/mod.rs index 59381b1..80f159e 100644 --- a/crieur-retrieve/src/tools/mod.rs +++ b/crieur-retrieve/src/tools/mod.rs @@ -1,5 +1,4 @@ mod download; -mod self_contained_html; +pub mod self_contained_html; pub use download::{Download, DownloadError, Downloader}; -pub use self_contained_html::self_contained_html; diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index bc25211..2a5ddac 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -8,140 +8,177 @@ use url::Url; use 
crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES}; use crate::Download; -/// Makes an html page self-contained -/// -/// The `downloader` must implement `Download` and is used to download ressources that are -/// needed to make this page self-contained such as stylesheets or images. -/// -/// The function also removes all scripts on the page -pub async fn self_contained_html( - html: impl AsRef, - downloader: &D, - base_url: &Url, - elements_to_remove: &[impl AsRef], -) -> String +/// Stores configuration for the self_contained_html function +// TODO: write a builder +pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str> +where + E: std::error::Error, + D: Download + Send, + S1: AsRef, + S2: AsRef, +{ + /// the downloader that will be used to retrieve resources on the page + pub downloader: Option<&'t D>, + /// Base url for downloading resources; it is probably the url of the page itself + pub base_url: Option<&'t Url>, + pub elements_to_remove: &'t [S1], + pub styles_to_add: &'t [S2], +} + +impl<'t, E, D> Default for Config<'t, E, D> where E: std::error::Error, D: Download + Send, { - // TODO: split/refactor this function : - // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? - // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure - // - ¿ should be function of a trait ? or only of the configuration struct ? 
- let (style_urls, html) = { - let document = Document::from(html.as_ref()); - - // ---- Remove scripts ---- - // - document.select("script").remove(); - - for event in EVENT_HANDLERS { - document - .select(format!("[{}]", event).as_str()) - .remove_attr(event); + fn default() -> Self { + Self { + downloader: None, + base_url: None, + elements_to_remove: &[], + styles_to_add: &[], } + } +} - for rel in LINK_REL_EXTERNAL_RESOURCES { - document - .select(format!("link[rel=\"{}\"]", rel).as_str()) - .remove(); - } +impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2> +where + E: std::error::Error, + D: Download + Send, + S1: AsRef, + S2: AsRef, +{ + /// Makes an html page self-contained + /// + /// The `downloader` must implement `Download` and is used to download ressources that are + /// needed to make this page self-contained such as stylesheets or images. + /// + /// The function also removes all scripts on the page + pub async fn run(&self, html: impl AsRef) -> String { + //TODO: don't panic + let base_url = self.base_url.expect("Base url not defined"); + let downloader = self.downloader.expect("Downloader not defined"); + // TODO: split/refactor this function : + // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? 
+ // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure + let (style_urls, html) = { + let document = Document::from(html.as_ref()); - // ---- Replace stylesheets ---- + // ---- Remove scripts ---- + // + document.select("script").remove(); + + for event in EVENT_HANDLERS { + document + .select(format!("[{}]", event).as_str()) + .remove_attr(event); + } + + for rel in LINK_REL_EXTERNAL_RESOURCES { + document + .select(format!("link[rel=\"{}\"]", rel).as_str()) + .remove(); + } + + // ---- Replace stylesheets ---- + // + let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); + let styles_url = stylesheets + .iter() + .map(|stylesheet| { + if let Some(src) = stylesheet.attr("href") { + //TODO: does it work with absolute urls ? + base_url.join(src.as_ref()).ok() + } else { + None + } + }) + .collect::>(); + (styles_url, String::from(document.html())) + }; + + let style_urls = style_urls.into_iter().map(|style_url| { + OptionFuture::from( + style_url.map(|s| async move { downloader.download(&s).await.unwrap() }), + ) + }); + let downloaded_styles = futures::future::join_all(style_urls).await; + + let html = { + let document = Document::from(&html); + let styles = document.select("link[href][rel=\"stylesheet\"]"); + + styles + .iter() + .zip(downloaded_styles.iter()) + .for_each(|(mut stylesheet, inner_css)| { + if let Some(Some(inner_css)) = inner_css { + let css = String::from_utf8(inner_css.to_vec()).unwrap(); + let css = format!("", css); + stylesheet.replace_with_html(css); + } else { + stylesheet.remove(); + } + }); + String::from(document.html()) + }; + + // ---- Replace imgs ---- // - let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); - let styles_url = stylesheets - .iter() - .map(|stylesheet| { - if let Some(src) = stylesheet.attr("href") { - //TODO: does it work with absolute urls ? 
- base_url.join(src.as_ref()).ok() - } else { - None - } - }) - .collect::>(); - (styles_url, String::from(document.html())) - }; + let image_urls = { + let document = Document::from(&html); + let imgs = document.select("img:not([src^=\"data:\"])"); - let style_urls = style_urls.into_iter().map(|style_url| { - OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() })) - }); - let downloaded_styles = futures::future::join_all(style_urls).await; + imgs.iter() + .map(|image| { + if let Some(src) = image.attr("src") { + base_url.join(src.as_ref()).ok() + } else { + None + } + }) + .collect::>() + }; - let html = { - let document = Document::from(&html); - let styles = document.select("link[href][rel=\"stylesheet\"]"); + let downloaded_images = image_urls.into_iter().map(|image_url| { + OptionFuture::from(image_url.map(|url| async move { + let data = downloader.download(&url).await.unwrap(); + (url, data) + })) + }); + let downloaded_images = futures::future::join_all(downloaded_images).await; - styles - .iter() - .zip(downloaded_styles.iter()) - .for_each(|(mut stylesheet, inner_css)| { - if let Some(Some(inner_css)) = inner_css { - let css = String::from_utf8(inner_css.to_vec()).unwrap(); - let css = format!("", css); - stylesheet.replace_with_html(css); - } else { - stylesheet.remove(); - } - }); - String::from(document.html()) - }; + let html = { + let document = Document::from(&html); + let imgs = document.select("img:not([src^=\"data:\"])"); - // ---- Replace imgs ---- - // - let image_urls = { - let document = Document::from(&html); - let imgs = document.select("img:not([src^=\"data:\"])"); + imgs.iter() + .zip(downloaded_images.iter()) + .for_each(|(mut img, data)| { + if let Some((url, Some(data))) = data { + let data = base64::encode(data); + //TODO: use an extension hashmap + let extension = + Path::new(url.path()).extension().unwrap().to_str().unwrap(); + img.set_attr("src", &format!("data:image/{};base64,{}", extension, 
data)); + } else { + img.remove() + } + }); + // ---- Remove unwanted html elements ----- + // + for element in self.elements_to_remove { + document.select(element.as_ref()).remove(); + } + String::from(document.html()) + }; - imgs.iter() - .map(|image| { - if let Some(src) = image.attr("src") { - base_url.join(src.as_ref()).ok() - } else { - None - } - }) - .collect::>() - }; - - let downloaded_images = image_urls.into_iter().map(|image_url| { - OptionFuture::from(image_url.map(|url| async move { - let data = downloader.download(&url).await.unwrap(); - (url, data) - })) - }); - let downloaded_images = futures::future::join_all(downloaded_images).await; - - let html = { - let document = Document::from(&html); - let imgs = document.select("img:not([src^=\"data:\"])"); - - imgs.iter() - .zip(downloaded_images.iter()) - .for_each(|(mut img, data)| { - if let Some((url, Some(data))) = data { - let data = base64::encode(data); - let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); - img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); - } else { - img.remove() - } - }); - // ---- Remove unwanted html elements ----- + // ---- output ---- // - for element in elements_to_remove { - document.select(element.as_ref()).remove(); - } - String::from(document.html()) - }; + let mut minifier = HTMLMinifier::new(); + minifier.digest(html.as_str()).unwrap(); - // ---- output ---- - // - let mut minifier = HTMLMinifier::new(); - minifier.digest(html.as_str()).unwrap(); - - String::from_utf8(minifier.get_html().into()).unwrap() + String::from_utf8(minifier.get_html().into()).unwrap() + } } #[cfg(test)] @@ -180,9 +217,14 @@ mod tests { let html = ""; let base_url = Url::parse("http://example.com")?; let downloader = DummyDownloader {}; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + 
..Default::default() + } + .run(html) + .await, "" ); Ok(()) @@ -206,10 +248,13 @@ mod tests { }; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; for s in EVENT_HANDLERS { assert_eq!( - self_contained_html(html(s), &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + }.run(html(s)).await, "\n\n\n\n" ); } @@ -234,10 +279,15 @@ mod tests { }; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; for s in LINK_REL_EXTERNAL_RESOURCES { assert_eq!( - self_contained_html(html(s), &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html(s)) + .await, "\n\n\n" ); } @@ -290,9 +340,14 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, minified ); Ok(()) @@ -337,9 +392,14 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, minified ); Ok(()) @@ -372,12 +432,13 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; assert_eq!( - self_contained_html( - html, - &downloader, - &base_url, - &["header", ".placeholder", "article > span.huge"] - ) + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + elements_to_remove: &["header", ".placeholder", "article > span.huge"], + 
..Default::default() + } + .run(html) .await, minified ); diff --git a/documentation/design/scope.md b/documentation/design/scope.md index 6417bf7..cca4047 100644 --- a/documentation/design/scope.md +++ b/documentation/design/scope.md @@ -37,7 +37,8 @@ frame "backend" { newspaper -> retrieval_tools: uses to implement - article_location --> article_repr :uses + article_location --> article_repr: uses + retrieval_tools -up-> article_repr: uses auto_retrieve --> rss: watches auto_retrieve --> article_location