chore: use a config struct for self_contained_html

Previously, self_contained_html was a function taking all parameters as
arguments.
As new optional parameters are being added, the function had too many
arguments, and each usage of the function would have to be modified each
time an argument is added.

Therefore, it has been moved to a configuration structure with a `run`
function taking only one argument, the html string.
This commit is contained in:
koalp 2021-05-17 20:18:32 +02:00
parent 5d0872b4d9
commit 6e091a32fc
Signed by: koalp
GPG Key ID: 35B21047DEB09A81
9 changed files with 239 additions and 181 deletions

View File

@ -2,15 +2,7 @@
use std::convert::TryInto; use std::convert::TryInto;
use anyhow::Result; use anyhow::Result;
use matrix_sdk::{ use matrix_sdk::{self, Client, SyncSettings};
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
};
use crate::Html; use crate::Html;

View File

@ -1,7 +1,6 @@
use std::convert::TryInto; use std::convert::TryInto;
use std::env;
use log::{error, info}; use log::error;
use matrix_sdk::{ use matrix_sdk::{
self, async_trait, self, async_trait,
events::{ events::{
@ -9,7 +8,7 @@ use matrix_sdk::{
AnyMessageEventContent, SyncMessageEvent, AnyMessageEventContent, SyncMessageEvent,
}, },
room::Room, room::Room,
Client, ClientConfig, EventHandler, SyncSettings, EventHandler,
}; };
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url}; use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};

View File

@ -2,7 +2,6 @@ use std::boxed::Box;
use std::convert::TryInto; use std::convert::TryInto;
use std::env; use std::env;
use log::info;
use url::{Host, Url}; use url::{Host, Url};
use crate::newspaper::Newspaper; use crate::newspaper::Newspaper;
@ -36,27 +35,20 @@ pub type Result<T, E = Error> = core::result::Result<T, E>;
fn default_newpapers() -> Result<Newspapers> { fn default_newpapers() -> Result<Newspapers> {
// TODO: same thing is written too much times : how to DRY ? // TODO: same thing is written too much times : how to DRY ?
let config_key = "MEDIAPART_COOKIE".to_string(); let config_key = "MEDIAPART_COOKIE".to_string();
let mpruiid = env::var(&config_key) let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
.map_err(|_| Error::Misconfiguration(config_key))?
.into();
let mediapart = Mediapart::builder() let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid)) .login(mediapart::Login::Mpruuid(mpruiid))
.build()?; .build()?;
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string(); let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string(); let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string(); let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
let lmd_a_m = env::var(&lmd_a_m) let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
.map_err(|_| Error::Misconfiguration(lmd_a_m))? let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
.into(); let spip_session =
let phpsessid = env::var(&phpsessid) env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
.map_err(|_| Error::Misconfiguration(phpsessid))?
.into();
let spip_session = env::var(&spip_session)
.map_err(|_| Error::Misconfiguration(spip_session))?
.into();
let monde_diplo = MondeDiplo::builder() let monde_diplo = MondeDiplo::builder()
.login(monde_diplomatique::Login::Cookies { .login(monde_diplomatique::Login::Cookies {
@ -69,12 +61,8 @@ fn default_newpapers() -> Result<Newspapers> {
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string(); let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string(); let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
let lmd_a_m = env::var(&lmd_a_m) let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
.map_err(|_| Error::Misconfiguration(lmd_a_m))? let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
.into();
let ssess = env::var(&ssess)
.map_err(|_| Error::Misconfiguration(ssess))?
.into();
let courrier_international = CourrierInternational::builder() let courrier_international = CourrierInternational::builder()
.login(courrier_international::Login::Cookies { lmd_a_m, ssess }) .login(courrier_international::Login::Cookies { lmd_a_m, ssess })

View File

@ -82,13 +82,19 @@ impl Newspaper for CourrierInternational {
}; };
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = [
// navigation elements // navigation elements
"#entete.connecte", "#entete.connecte",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -10,7 +10,7 @@ use crate::{Download, Downloader};
pub enum Login { pub enum Login {
Username(String, String), Username(String, String),
MPRUUID(String), Mpruuid(String),
} }
#[derive(Debug, Clone, Default)] #[derive(Debug, Clone, Default)]
@ -33,7 +33,7 @@ impl Builder {
Login::Username(_username, _password) => { Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented") unimplemented!("login using username and passwond not implemented")
} }
Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)), Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
}; };
self self
} }
@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
}; };
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = [
// header // header
".fb-root", ".fb-root",
".skipLinks", ".skipLinks",
@ -104,8 +104,14 @@ impl Newspaper for Mediapart {
"aside.cc-modal", "aside.cc-modal",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
}; };
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = [
// navigation elements // navigation elements
"#tout-en-haut.preentete", "#tout-en-haut.preentete",
"#entete.connecte", "#entete.connecte",
@ -107,8 +107,14 @@ impl Newspaper for MondeDiplo {
"noscript", "noscript",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -1,5 +1,4 @@
mod download; mod download;
mod self_contained_html; pub mod self_contained_html;
pub use download::{Download, DownloadError, Downloader}; pub use download::{Download, DownloadError, Downloader};
pub use self_contained_html::self_contained_html;

View File

@ -8,26 +8,58 @@ use url::Url;
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES}; use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download; use crate::Download;
/// Stores the configuration used by [`Config::run`] to make an html page self-contained.
///
/// Every field is public so that a value can be built with struct-update syntax on top of
/// `Default::default()`, setting only the fields that are needed.
///
/// `S1` and `S2` are the string-like element types of `elements_to_remove` and
/// `styles_to_add`; both default to `&'static str`.
// TODO: write a builder
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// The downloader that will be used to retrieve resources on the page.
///
/// `run` currently panics when this is `None`.
pub downloader: Option<&'t D>,
/// Base url for downloading resources.
///
/// `run` currently panics when this is `None`.
// NOTE(review): callers pass the url of the article being retrieved; presumably it is used
// to resolve relative resource links — confirm against the body of `run`.
pub base_url: Option<&'t Url>,
/// Selectors of the elements that `run` removes from the document.
pub elements_to_remove: &'t [S1],
/// Styles to add to the page.
// NOTE(review): no use of this field is visible in the parts of `run` shown here — confirm
// whether it is already applied or reserved for a later change.
pub styles_to_add: &'t [S2],
}
/// Default configuration, only provided for the default string parameters
/// (`S1 = S2 = &'static str`).
impl<'t, E, D> Default for Config<'t, E, D>
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
/// Returns a configuration with no downloader, no base url, nothing to remove and no
/// styles to add.
///
/// `downloader` and `base_url` must be set before calling `run`, which currently panics
/// when either of them is `None`.
fn default() -> Self {
Self {
downloader: None,
base_url: None,
elements_to_remove: &[],
styles_to_add: &[],
}
}
}
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// Makes an html page self-contained /// Makes an html page self-contained
/// ///
/// The `downloader` must implement `Download` and is used to download ressources that are /// The `downloader` must implement `Download` and is used to download ressources that are
/// needed to make this page self-contained such as stylesheets or images. /// needed to make this page self-contained such as stylesheets or images.
/// ///
/// The function also removes all scripts on the page /// The function also removes all scripts on the page
pub async fn self_contained_html<E, D>( pub async fn run(&self, html: impl AsRef<str>) -> String {
html: impl AsRef<str>, //TODO: don't panic
downloader: &D, let base_url = self.base_url.expect("Base url not defined");
base_url: &Url, let downloader = self.downloader.expect("Downloader not defined");
elements_to_remove: &[impl AsRef<str>],
) -> String
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
// TODO: split/refactor this function : // TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
// - ¿ should be function of a trait ? or only of the configuration struct ?
let (style_urls, html) = { let (style_urls, html) = {
let document = Document::from(html.as_ref()); let document = Document::from(html.as_ref());
@ -65,7 +97,9 @@ where
}; };
let style_urls = style_urls.into_iter().map(|style_url| { let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() })) OptionFuture::from(
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
)
}); });
let downloaded_styles = futures::future::join_all(style_urls).await; let downloaded_styles = futures::future::join_all(style_urls).await;
@ -122,7 +156,9 @@ where
.for_each(|(mut img, data)| { .for_each(|(mut img, data)| {
if let Some((url, Some(data))) = data { if let Some((url, Some(data))) = data {
let data = base64::encode(data); let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); //TODO: use an extension hashmap
let extension =
Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else { } else {
img.remove() img.remove()
@ -130,7 +166,7 @@ where
}); });
// ---- Remove unwanted html elements ----- // ---- Remove unwanted html elements -----
// //
for element in elements_to_remove { for element in self.elements_to_remove {
document.select(element.as_ref()).remove(); document.select(element.as_ref()).remove();
} }
String::from(document.html()) String::from(document.html())
@ -143,6 +179,7 @@ where
String::from_utf8(minifier.get_html().into()).unwrap() String::from_utf8(minifier.get_html().into()).unwrap()
} }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
@ -180,9 +217,14 @@ mod tests {
let html = "<html><head><script>let id = id => id</script></head><body></body></html>"; let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {}; let downloader = DummyDownloader {};
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
"<html><head></head><body></body></html>" "<html><head></head><body></body></html>"
); );
Ok(()) Ok(())
@ -206,10 +248,13 @@ mod tests {
}; };
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in EVENT_HANDLERS { for s in EVENT_HANDLERS {
assert_eq!( assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}.run(html(s)).await,
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>" "<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
); );
} }
@ -234,10 +279,15 @@ mod tests {
}; };
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in LINK_REL_EXTERNAL_RESOURCES { for s in LINK_REL_EXTERNAL_RESOURCES {
assert_eq!( assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html(s))
.await,
"<html><head>\n</head>\n<body>\n</body></html>" "<html><head>\n</head>\n<body>\n</body></html>"
); );
} }
@ -290,9 +340,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified minified
); );
Ok(()) Ok(())
@ -337,9 +392,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified minified
); );
Ok(()) Ok(())
@ -372,12 +432,13 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!( assert_eq!(
self_contained_html( Config {
html, downloader: Some(&downloader),
&downloader, base_url: Some(&base_url),
&base_url, elements_to_remove: &["header", ".placeholder", "article > span.huge"],
&["header", ".placeholder", "article > span.huge"] ..Default::default()
) }
.run(html)
.await, .await,
minified minified
); );

View File

@ -38,6 +38,7 @@ frame "backend" {
newspaper -> retrieval_tools: uses to implement newspaper -> retrieval_tools: uses to implement
article_location --> article_repr: uses article_location --> article_repr: uses
retrieval_tools -up-> article_repr: uses
auto_retrieve --> rss: watches auto_retrieve --> rss: watches
auto_retrieve --> article_location auto_retrieve --> article_location