chore: use a config struct for self_contained_html
Previously, `self_contained_html` was a function taking all of its parameters as arguments. As new optional parameters are being added, the function had too many arguments, and every call site would have to be modified each time an argument was added. Therefore, it has been moved to a configuration structure with a `run` function taking only one argument, the html string.
This commit is contained in:
parent
5d0872b4d9
commit
6e091a32fc
@ -2,15 +2,7 @@
|
|||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use matrix_sdk::{
|
use matrix_sdk::{self, Client, SyncSettings};
|
||||||
self, async_trait,
|
|
||||||
events::{
|
|
||||||
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
|
|
||||||
AnyMessageEventContent, SyncMessageEvent,
|
|
||||||
},
|
|
||||||
room::Room,
|
|
||||||
Client, ClientConfig, EventHandler, SyncSettings,
|
|
||||||
};
|
|
||||||
|
|
||||||
use crate::Html;
|
use crate::Html;
|
||||||
|
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::env;
|
|
||||||
|
|
||||||
use log::{error, info};
|
use log::error;
|
||||||
use matrix_sdk::{
|
use matrix_sdk::{
|
||||||
self, async_trait,
|
self, async_trait,
|
||||||
events::{
|
events::{
|
||||||
@ -9,7 +8,7 @@ use matrix_sdk::{
|
|||||||
AnyMessageEventContent, SyncMessageEvent,
|
AnyMessageEventContent, SyncMessageEvent,
|
||||||
},
|
},
|
||||||
room::Room,
|
room::Room,
|
||||||
Client, ClientConfig, EventHandler, SyncSettings,
|
EventHandler,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};
|
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};
|
||||||
|
@ -2,7 +2,6 @@ use std::boxed::Box;
|
|||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::env;
|
use std::env;
|
||||||
|
|
||||||
use log::info;
|
|
||||||
use url::{Host, Url};
|
use url::{Host, Url};
|
||||||
|
|
||||||
use crate::newspaper::Newspaper;
|
use crate::newspaper::Newspaper;
|
||||||
@ -36,27 +35,20 @@ pub type Result<T, E = Error> = core::result::Result<T, E>;
|
|||||||
fn default_newpapers() -> Result<Newspapers> {
|
fn default_newpapers() -> Result<Newspapers> {
|
||||||
// TODO: same thing is written too much times : how to DRY ?
|
// TODO: same thing is written too much times : how to DRY ?
|
||||||
let config_key = "MEDIAPART_COOKIE".to_string();
|
let config_key = "MEDIAPART_COOKIE".to_string();
|
||||||
let mpruiid = env::var(&config_key)
|
let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
|
||||||
.map_err(|_| Error::Misconfiguration(config_key))?
|
|
||||||
.into();
|
|
||||||
|
|
||||||
let mediapart = Mediapart::builder()
|
let mediapart = Mediapart::builder()
|
||||||
.login(mediapart::Login::MPRUUID(mpruiid))
|
.login(mediapart::Login::Mpruuid(mpruiid))
|
||||||
.build()?;
|
.build()?;
|
||||||
|
|
||||||
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
|
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
|
||||||
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
|
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
|
||||||
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
|
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
|
||||||
|
|
||||||
let lmd_a_m = env::var(&lmd_a_m)
|
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
|
||||||
.map_err(|_| Error::Misconfiguration(lmd_a_m))?
|
let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
|
||||||
.into();
|
let spip_session =
|
||||||
let phpsessid = env::var(&phpsessid)
|
env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
|
||||||
.map_err(|_| Error::Misconfiguration(phpsessid))?
|
|
||||||
.into();
|
|
||||||
let spip_session = env::var(&spip_session)
|
|
||||||
.map_err(|_| Error::Misconfiguration(spip_session))?
|
|
||||||
.into();
|
|
||||||
|
|
||||||
let monde_diplo = MondeDiplo::builder()
|
let monde_diplo = MondeDiplo::builder()
|
||||||
.login(monde_diplomatique::Login::Cookies {
|
.login(monde_diplomatique::Login::Cookies {
|
||||||
@ -69,12 +61,8 @@ fn default_newpapers() -> Result<Newspapers> {
|
|||||||
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
|
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
|
||||||
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
|
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
|
||||||
|
|
||||||
let lmd_a_m = env::var(&lmd_a_m)
|
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
|
||||||
.map_err(|_| Error::Misconfiguration(lmd_a_m))?
|
let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
|
||||||
.into();
|
|
||||||
let ssess = env::var(&ssess)
|
|
||||||
.map_err(|_| Error::Misconfiguration(ssess))?
|
|
||||||
.into();
|
|
||||||
|
|
||||||
let courrier_international = CourrierInternational::builder()
|
let courrier_international = CourrierInternational::builder()
|
||||||
.login(courrier_international::Login::Cookies { lmd_a_m, ssess })
|
.login(courrier_international::Login::Cookies { lmd_a_m, ssess })
|
||||||
|
@ -82,13 +82,19 @@ impl Newspaper for CourrierInternational {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// TODO: Move to const
|
// TODO: Move to const
|
||||||
let element_to_remove = [
|
let elements_to_remove = [
|
||||||
// navigation elements
|
// navigation elements
|
||||||
"#entete.connecte",
|
"#entete.connecte",
|
||||||
];
|
];
|
||||||
|
|
||||||
let single_page_html =
|
let single_page_html = tools::self_contained_html::Config {
|
||||||
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
|
downloader: Some(&downloader),
|
||||||
|
base_url: Some(&url),
|
||||||
|
elements_to_remove: &elements_to_remove,
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
.run(&html)
|
||||||
|
.await;
|
||||||
Ok(single_page_html)
|
Ok(single_page_html)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@ use crate::{Download, Downloader};
|
|||||||
|
|
||||||
pub enum Login {
|
pub enum Login {
|
||||||
Username(String, String),
|
Username(String, String),
|
||||||
MPRUUID(String),
|
Mpruuid(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Default)]
|
#[derive(Debug, Clone, Default)]
|
||||||
@ -33,7 +33,7 @@ impl Builder {
|
|||||||
Login::Username(_username, _password) => {
|
Login::Username(_username, _password) => {
|
||||||
unimplemented!("login using username and passwond not implemented")
|
unimplemented!("login using username and passwond not implemented")
|
||||||
}
|
}
|
||||||
Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
|
Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
|
||||||
};
|
};
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// TODO: Move to const
|
// TODO: Move to const
|
||||||
let element_to_remove = [
|
let elements_to_remove = [
|
||||||
// header
|
// header
|
||||||
".fb-root",
|
".fb-root",
|
||||||
".skipLinks",
|
".skipLinks",
|
||||||
@ -104,8 +104,14 @@ impl Newspaper for Mediapart {
|
|||||||
"aside.cc-modal",
|
"aside.cc-modal",
|
||||||
];
|
];
|
||||||
|
|
||||||
let single_page_html =
|
let single_page_html = tools::self_contained_html::Config {
|
||||||
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
|
downloader: Some(&downloader),
|
||||||
|
base_url: Some(&url),
|
||||||
|
elements_to_remove: &elements_to_remove,
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
.run(&html)
|
||||||
|
.await;
|
||||||
Ok(single_page_html)
|
Ok(single_page_html)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// TODO: Move to const
|
// TODO: Move to const
|
||||||
let element_to_remove = [
|
let elements_to_remove = [
|
||||||
// navigation elements
|
// navigation elements
|
||||||
"#tout-en-haut.preentete",
|
"#tout-en-haut.preentete",
|
||||||
"#entete.connecte",
|
"#entete.connecte",
|
||||||
@ -107,8 +107,14 @@ impl Newspaper for MondeDiplo {
|
|||||||
"noscript",
|
"noscript",
|
||||||
];
|
];
|
||||||
|
|
||||||
let single_page_html =
|
let single_page_html = tools::self_contained_html::Config {
|
||||||
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
|
downloader: Some(&downloader),
|
||||||
|
base_url: Some(&url),
|
||||||
|
elements_to_remove: &elements_to_remove,
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
.run(&html)
|
||||||
|
.await;
|
||||||
Ok(single_page_html)
|
Ok(single_page_html)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
mod download;
|
mod download;
|
||||||
mod self_contained_html;
|
pub mod self_contained_html;
|
||||||
|
|
||||||
pub use download::{Download, DownloadError, Downloader};
|
pub use download::{Download, DownloadError, Downloader};
|
||||||
pub use self_contained_html::self_contained_html;
|
|
||||||
|
@ -8,26 +8,58 @@ use url::Url;
|
|||||||
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
|
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
|
||||||
use crate::Download;
|
use crate::Download;
|
||||||
|
|
||||||
|
/// Stores configuration for the self_contained_html function
|
||||||
|
// TODO: write a builder
|
||||||
|
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
|
||||||
|
where
|
||||||
|
E: std::error::Error,
|
||||||
|
D: Download<Error = E> + Send,
|
||||||
|
S1: AsRef<str>,
|
||||||
|
S2: AsRef<str>,
|
||||||
|
{
|
||||||
|
/// the downloader that will be used to retrieve ressources on the page
|
||||||
|
pub downloader: Option<&'t D>,
|
||||||
|
/// Base url for downloading ressources, it probably the
|
||||||
|
pub base_url: Option<&'t Url>,
|
||||||
|
pub elements_to_remove: &'t [S1],
|
||||||
|
pub styles_to_add: &'t [S2],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, E, D> Default for Config<'t, E, D>
|
||||||
|
where
|
||||||
|
E: std::error::Error,
|
||||||
|
D: Download<Error = E> + Send,
|
||||||
|
{
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
downloader: None,
|
||||||
|
base_url: None,
|
||||||
|
elements_to_remove: &[],
|
||||||
|
styles_to_add: &[],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
|
||||||
|
where
|
||||||
|
E: std::error::Error,
|
||||||
|
D: Download<Error = E> + Send,
|
||||||
|
S1: AsRef<str>,
|
||||||
|
S2: AsRef<str>,
|
||||||
|
{
|
||||||
/// Makes an html page self-contained
|
/// Makes an html page self-contained
|
||||||
///
|
///
|
||||||
/// The `downloader` must implement `Download` and is used to download ressources that are
|
/// The `downloader` must implement `Download` and is used to download ressources that are
|
||||||
/// needed to make this page self-contained such as stylesheets or images.
|
/// needed to make this page self-contained such as stylesheets or images.
|
||||||
///
|
///
|
||||||
/// The function also removes all scripts on the page
|
/// The function also removes all scripts on the page
|
||||||
pub async fn self_contained_html<E, D>(
|
pub async fn run(&self, html: impl AsRef<str>) -> String {
|
||||||
html: impl AsRef<str>,
|
//TODO: don't panic
|
||||||
downloader: &D,
|
let base_url = self.base_url.expect("Base url not defined");
|
||||||
base_url: &Url,
|
let downloader = self.downloader.expect("Downloader not defined");
|
||||||
elements_to_remove: &[impl AsRef<str>],
|
|
||||||
) -> String
|
|
||||||
where
|
|
||||||
E: std::error::Error,
|
|
||||||
D: Download<Error = E> + Send,
|
|
||||||
{
|
|
||||||
// TODO: split/refactor this function :
|
// TODO: split/refactor this function :
|
||||||
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
|
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
|
||||||
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
|
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
|
||||||
// - ¿ should be function of a trait ? or only of the configuration struct ?
|
|
||||||
let (style_urls, html) = {
|
let (style_urls, html) = {
|
||||||
let document = Document::from(html.as_ref());
|
let document = Document::from(html.as_ref());
|
||||||
|
|
||||||
@ -65,7 +97,9 @@ where
|
|||||||
};
|
};
|
||||||
|
|
||||||
let style_urls = style_urls.into_iter().map(|style_url| {
|
let style_urls = style_urls.into_iter().map(|style_url| {
|
||||||
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
|
OptionFuture::from(
|
||||||
|
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
|
||||||
|
)
|
||||||
});
|
});
|
||||||
let downloaded_styles = futures::future::join_all(style_urls).await;
|
let downloaded_styles = futures::future::join_all(style_urls).await;
|
||||||
|
|
||||||
@ -122,7 +156,9 @@ where
|
|||||||
.for_each(|(mut img, data)| {
|
.for_each(|(mut img, data)| {
|
||||||
if let Some((url, Some(data))) = data {
|
if let Some((url, Some(data))) = data {
|
||||||
let data = base64::encode(data);
|
let data = base64::encode(data);
|
||||||
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
|
//TODO: use an extension hashmap
|
||||||
|
let extension =
|
||||||
|
Path::new(url.path()).extension().unwrap().to_str().unwrap();
|
||||||
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
|
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
|
||||||
} else {
|
} else {
|
||||||
img.remove()
|
img.remove()
|
||||||
@ -130,7 +166,7 @@ where
|
|||||||
});
|
});
|
||||||
// ---- Remove unwanted html elements -----
|
// ---- Remove unwanted html elements -----
|
||||||
//
|
//
|
||||||
for element in elements_to_remove {
|
for element in self.elements_to_remove {
|
||||||
document.select(element.as_ref()).remove();
|
document.select(element.as_ref()).remove();
|
||||||
}
|
}
|
||||||
String::from(document.html())
|
String::from(document.html())
|
||||||
@ -143,6 +179,7 @@ where
|
|||||||
|
|
||||||
String::from_utf8(minifier.get_html().into()).unwrap()
|
String::from_utf8(minifier.get_html().into()).unwrap()
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
@ -180,9 +217,14 @@ mod tests {
|
|||||||
let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
|
let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
|
||||||
let base_url = Url::parse("http://example.com")?;
|
let base_url = Url::parse("http://example.com")?;
|
||||||
let downloader = DummyDownloader {};
|
let downloader = DummyDownloader {};
|
||||||
let to_remove: &[&str] = &[];
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
self_contained_html(html, &downloader, &base_url, to_remove).await,
|
Config {
|
||||||
|
downloader: Some(&downloader),
|
||||||
|
base_url: Some(&base_url),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
.run(html)
|
||||||
|
.await,
|
||||||
"<html><head></head><body></body></html>"
|
"<html><head></head><body></body></html>"
|
||||||
);
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -206,10 +248,13 @@ mod tests {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let base_url = Url::parse("http://example.com")?;
|
let base_url = Url::parse("http://example.com")?;
|
||||||
let to_remove: &[&str] = &[];
|
|
||||||
for s in EVENT_HANDLERS {
|
for s in EVENT_HANDLERS {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
|
Config {
|
||||||
|
downloader: Some(&downloader),
|
||||||
|
base_url: Some(&base_url),
|
||||||
|
..Default::default()
|
||||||
|
}.run(html(s)).await,
|
||||||
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
|
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -234,10 +279,15 @@ mod tests {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let base_url = Url::parse("http://example.com")?;
|
let base_url = Url::parse("http://example.com")?;
|
||||||
let to_remove: &[&str] = &[];
|
|
||||||
for s in LINK_REL_EXTERNAL_RESOURCES {
|
for s in LINK_REL_EXTERNAL_RESOURCES {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
|
Config {
|
||||||
|
downloader: Some(&downloader),
|
||||||
|
base_url: Some(&base_url),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
.run(html(s))
|
||||||
|
.await,
|
||||||
"<html><head>\n</head>\n<body>\n</body></html>"
|
"<html><head>\n</head>\n<body>\n</body></html>"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -290,9 +340,14 @@ mod tests {
|
|||||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||||
|
|
||||||
let base_url = Url::parse("http://example.com")?;
|
let base_url = Url::parse("http://example.com")?;
|
||||||
let to_remove: &[&str] = &[];
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
self_contained_html(html, &downloader, &base_url, to_remove).await,
|
Config {
|
||||||
|
downloader: Some(&downloader),
|
||||||
|
base_url: Some(&base_url),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
.run(html)
|
||||||
|
.await,
|
||||||
minified
|
minified
|
||||||
);
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -337,9 +392,14 @@ mod tests {
|
|||||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||||
|
|
||||||
let base_url = Url::parse("http://example.com")?;
|
let base_url = Url::parse("http://example.com")?;
|
||||||
let to_remove: &[&str] = &[];
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
self_contained_html(html, &downloader, &base_url, to_remove).await,
|
Config {
|
||||||
|
downloader: Some(&downloader),
|
||||||
|
base_url: Some(&base_url),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
.run(html)
|
||||||
|
.await,
|
||||||
minified
|
minified
|
||||||
);
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -372,12 +432,13 @@ mod tests {
|
|||||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
self_contained_html(
|
Config {
|
||||||
html,
|
downloader: Some(&downloader),
|
||||||
&downloader,
|
base_url: Some(&base_url),
|
||||||
&base_url,
|
elements_to_remove: &["header", ".placeholder", "article > span.huge"],
|
||||||
&["header", ".placeholder", "article > span.huge"]
|
..Default::default()
|
||||||
)
|
}
|
||||||
|
.run(html)
|
||||||
.await,
|
.await,
|
||||||
minified
|
minified
|
||||||
);
|
);
|
||||||
|
@ -38,6 +38,7 @@ frame "backend" {
|
|||||||
newspaper -> retrieval_tools: uses to implement
|
newspaper -> retrieval_tools: uses to implement
|
||||||
|
|
||||||
article_location --> article_repr: uses
|
article_location --> article_repr: uses
|
||||||
|
retrieval_tools -up-> article_repr: uses
|
||||||
|
|
||||||
auto_retrieve --> rss: watches
|
auto_retrieve --> rss: watches
|
||||||
auto_retrieve --> article_location
|
auto_retrieve --> article_location
|
||||||
|
Loading…
Reference in New Issue
Block a user