chore: use a config struct for self_contained_html

Previously, self_contained_html was a function taking all parameters as
arguments.
As new optional parameters are being added, the function had too many
arguments and each usage of the function would have to be modified each
time an argument was added.

Therefore, it has been moved to a configuration structure with a `run`
function taking only one argument, the html string.
This commit is contained in:
koalp 2021-05-17 20:18:32 +02:00
parent 5d0872b4d9
commit 6e091a32fc
Signed by: koalp
GPG Key ID: 35B21047DEB09A81
9 changed files with 239 additions and 181 deletions

View File

@ -2,15 +2,7 @@
use std::convert::TryInto;
use anyhow::Result;
use matrix_sdk::{
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
};
use matrix_sdk::{self, Client, SyncSettings};
use crate::Html;

View File

@ -1,7 +1,6 @@
use std::convert::TryInto;
use std::env;
use log::{error, info};
use log::error;
use matrix_sdk::{
self, async_trait,
events::{
@ -9,7 +8,7 @@ use matrix_sdk::{
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
EventHandler,
};
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};

View File

@ -2,7 +2,6 @@ use std::boxed::Box;
use std::convert::TryInto;
use std::env;
use log::info;
use url::{Host, Url};
use crate::newspaper::Newspaper;
@ -36,27 +35,20 @@ pub type Result<T, E = Error> = core::result::Result<T, E>;
fn default_newpapers() -> Result<Newspapers> {
// TODO: same thing is written too much times : how to DRY ?
let config_key = "MEDIAPART_COOKIE".to_string();
let mpruiid = env::var(&config_key)
.map_err(|_| Error::Misconfiguration(config_key))?
.into();
let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid))
.login(mediapart::Login::Mpruuid(mpruiid))
.build()?;
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
let lmd_a_m = env::var(&lmd_a_m)
.map_err(|_| Error::Misconfiguration(lmd_a_m))?
.into();
let phpsessid = env::var(&phpsessid)
.map_err(|_| Error::Misconfiguration(phpsessid))?
.into();
let spip_session = env::var(&spip_session)
.map_err(|_| Error::Misconfiguration(spip_session))?
.into();
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
let spip_session =
env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
let monde_diplo = MondeDiplo::builder()
.login(monde_diplomatique::Login::Cookies {
@ -69,12 +61,8 @@ fn default_newpapers() -> Result<Newspapers> {
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
let lmd_a_m = env::var(&lmd_a_m)
.map_err(|_| Error::Misconfiguration(lmd_a_m))?
.into();
let ssess = env::var(&ssess)
.map_err(|_| Error::Misconfiguration(ssess))?
.into();
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
let courrier_international = CourrierInternational::builder()
.login(courrier_international::Login::Cookies { lmd_a_m, ssess })

View File

@ -82,13 +82,19 @@ impl Newspaper for CourrierInternational {
};
// TODO: Move to const
let element_to_remove = [
let elements_to_remove = [
// navigation elements
"#entete.connecte",
];
let single_page_html =
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html)
}

View File

@ -10,7 +10,7 @@ use crate::{Download, Downloader};
pub enum Login {
Username(String, String),
MPRUUID(String),
Mpruuid(String),
}
#[derive(Debug, Clone, Default)]
@ -33,7 +33,7 @@ impl Builder {
Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented")
}
Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
};
self
}
@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
};
// TODO: Move to const
let element_to_remove = [
let elements_to_remove = [
// header
".fb-root",
".skipLinks",
@ -104,8 +104,14 @@ impl Newspaper for Mediapart {
"aside.cc-modal",
];
let single_page_html =
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html)
}

View File

@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
};
// TODO: Move to const
let element_to_remove = [
let elements_to_remove = [
// navigation elements
"#tout-en-haut.preentete",
"#entete.connecte",
@ -107,8 +107,14 @@ impl Newspaper for MondeDiplo {
"noscript",
];
let single_page_html =
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html)
}

View File

@ -1,5 +1,4 @@
mod download;
mod self_contained_html;
pub mod self_contained_html;
pub use download::{Download, DownloadError, Downloader};
pub use self_contained_html::self_contained_html;

View File

@ -8,140 +8,177 @@ use url::Url;
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download;
/// Makes an html page self-contained
///
/// The `downloader` must implement `Download` and is used to download ressources that are
/// needed to make this page self-contained such as stylesheets or images.
///
/// The function also removes all scripts on the page
pub async fn self_contained_html<E, D>(
html: impl AsRef<str>,
downloader: &D,
base_url: &Url,
elements_to_remove: &[impl AsRef<str>],
) -> String
/// Stores configuration for the self_contained_html function
// TODO: write a builder
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// the downloader that will be used to retrieve ressources on the page
pub downloader: Option<&'t D>,
/// Base url for downloading ressources, it probably the
pub base_url: Option<&'t Url>,
pub elements_to_remove: &'t [S1],
pub styles_to_add: &'t [S2],
}
impl<'t, E, D> Default for Config<'t, E, D>
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
// - ¿ should be function of a trait ? or only of the configuration struct ?
let (style_urls, html) = {
let document = Document::from(html.as_ref());
// ---- Remove scripts ----
//
document.select("script").remove();
for event in EVENT_HANDLERS {
document
.select(format!("[{}]", event).as_str())
.remove_attr(event);
fn default() -> Self {
Self {
downloader: None,
base_url: None,
elements_to_remove: &[],
styles_to_add: &[],
}
}
}
for rel in LINK_REL_EXTERNAL_RESOURCES {
document
.select(format!("link[rel=\"{}\"]", rel).as_str())
.remove();
}
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// Makes an html page self-contained
///
/// The `downloader` must implement `Download` and is used to download ressources that are
/// needed to make this page self-contained such as stylesheets or images.
///
/// The function also removes all scripts on the page
pub async fn run(&self, html: impl AsRef<str>) -> String {
//TODO: don't panic
let base_url = self.base_url.expect("Base url not defined");
let downloader = self.downloader.expect("Downloader not defined");
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
let (style_urls, html) = {
let document = Document::from(html.as_ref());
// ---- Replace stylesheets ----
// ---- Remove scripts ----
//
document.select("script").remove();
for event in EVENT_HANDLERS {
document
.select(format!("[{}]", event).as_str())
.remove_attr(event);
}
for rel in LINK_REL_EXTERNAL_RESOURCES {
document
.select(format!("link[rel=\"{}\"]", rel).as_str())
.remove();
}
// ---- Replace stylesheets ----
//
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|stylesheet| {
if let Some(src) = stylesheet.attr("href") {
//TODO: does it work with absolute urls ?
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>();
(styles_url, String::from(document.html()))
};
let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
)
});
let downloaded_styles = futures::future::join_all(style_urls).await;
let html = {
let document = Document::from(&html);
let styles = document.select("link[href][rel=\"stylesheet\"]");
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| {
if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
} else {
stylesheet.remove();
}
});
String::from(document.html())
};
// ---- Replace imgs ----
//
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|stylesheet| {
if let Some(src) = stylesheet.attr("href") {
//TODO: does it work with absolute urls ?
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>();
(styles_url, String::from(document.html()))
};
let image_urls = {
let document = Document::from(&html);
let imgs = document.select("img:not([src^=\"data:\"])");
let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
});
let downloaded_styles = futures::future::join_all(style_urls).await;
imgs.iter()
.map(|image| {
if let Some(src) = image.attr("src") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>()
};
let html = {
let document = Document::from(&html);
let styles = document.select("link[href][rel=\"stylesheet\"]");
let downloaded_images = image_urls.into_iter().map(|image_url| {
OptionFuture::from(image_url.map(|url| async move {
let data = downloader.download(&url).await.unwrap();
(url, data)
}))
});
let downloaded_images = futures::future::join_all(downloaded_images).await;
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| {
if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
} else {
stylesheet.remove();
}
});
String::from(document.html())
};
let html = {
let document = Document::from(&html);
let imgs = document.select("img:not([src^=\"data:\"])");
// ---- Replace imgs ----
//
let image_urls = {
let document = Document::from(&html);
let imgs = document.select("img:not([src^=\"data:\"])");
imgs.iter()
.zip(downloaded_images.iter())
.for_each(|(mut img, data)| {
if let Some((url, Some(data))) = data {
let data = base64::encode(data);
//TODO: use an extension hashmap
let extension =
Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else {
img.remove()
}
});
// ---- Remove unwanted html elements -----
//
for element in self.elements_to_remove {
document.select(element.as_ref()).remove();
}
String::from(document.html())
};
imgs.iter()
.map(|image| {
if let Some(src) = image.attr("src") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>()
};
let downloaded_images = image_urls.into_iter().map(|image_url| {
OptionFuture::from(image_url.map(|url| async move {
let data = downloader.download(&url).await.unwrap();
(url, data)
}))
});
let downloaded_images = futures::future::join_all(downloaded_images).await;
let html = {
let document = Document::from(&html);
let imgs = document.select("img:not([src^=\"data:\"])");
imgs.iter()
.zip(downloaded_images.iter())
.for_each(|(mut img, data)| {
if let Some((url, Some(data))) = data {
let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else {
img.remove()
}
});
// ---- Remove unwanted html elements -----
// ---- output ----
//
for element in elements_to_remove {
document.select(element.as_ref()).remove();
}
String::from(document.html())
};
let mut minifier = HTMLMinifier::new();
minifier.digest(html.as_str()).unwrap();
// ---- output ----
//
let mut minifier = HTMLMinifier::new();
minifier.digest(html.as_str()).unwrap();
String::from_utf8(minifier.get_html().into()).unwrap()
String::from_utf8(minifier.get_html().into()).unwrap()
}
}
#[cfg(test)]
@ -180,9 +217,14 @@ mod tests {
let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {};
let to_remove: &[&str] = &[];
assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
"<html><head></head><body></body></html>"
);
Ok(())
@ -206,10 +248,13 @@ mod tests {
};
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in EVENT_HANDLERS {
assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}.run(html(s)).await,
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
);
}
@ -234,10 +279,15 @@ mod tests {
};
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in LINK_REL_EXTERNAL_RESOURCES {
assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html(s))
.await,
"<html><head>\n</head>\n<body>\n</body></html>"
);
}
@ -290,9 +340,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
@ -337,9 +392,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
@ -372,12 +432,13 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!(
self_contained_html(
html,
&downloader,
&base_url,
&["header", ".placeholder", "article > span.huge"]
)
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
elements_to_remove: &["header", ".placeholder", "article > span.huge"],
..Default::default()
}
.run(html)
.await,
minified
);

View File

@ -37,7 +37,8 @@ frame "backend" {
newspaper -> retrieval_tools: uses to implement
article_location --> article_repr :uses
article_location --> article_repr: uses
retrieval_tools -up-> article_repr: uses
auto_retrieve --> rss: watches
auto_retrieve --> article_location