add several newspapers #36

Merged
koalp merged 6 commits from feature/additional_newspapers into development 2021-05-22 04:50:43 +02:00
9 changed files with 239 additions and 181 deletions
Showing only changes of commit 6e091a32fc - Show all commits

View File

@ -2,15 +2,7 @@
use std::convert::TryInto; use std::convert::TryInto;
use anyhow::Result; use anyhow::Result;
use matrix_sdk::{ use matrix_sdk::{self, Client, SyncSettings};
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
};
use crate::Html; use crate::Html;

View File

@ -1,7 +1,6 @@
use std::convert::TryInto; use std::convert::TryInto;
use std::env;
use log::{error, info}; use log::error;
use matrix_sdk::{ use matrix_sdk::{
self, async_trait, self, async_trait,
events::{ events::{
@ -9,7 +8,7 @@ use matrix_sdk::{
AnyMessageEventContent, SyncMessageEvent, AnyMessageEventContent, SyncMessageEvent,
}, },
room::Room, room::Room,
Client, ClientConfig, EventHandler, SyncSettings, EventHandler,
}; };
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url}; use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};

View File

@ -2,7 +2,6 @@ use std::boxed::Box;
use std::convert::TryInto; use std::convert::TryInto;
use std::env; use std::env;
use log::info;
use url::{Host, Url}; use url::{Host, Url};
use crate::newspaper::Newspaper; use crate::newspaper::Newspaper;
@ -36,27 +35,20 @@ pub type Result<T, E = Error> = core::result::Result<T, E>;
fn default_newpapers() -> Result<Newspapers> { fn default_newpapers() -> Result<Newspapers> {
// TODO: same thing is written too many times : how to DRY ? // TODO: same thing is written too many times : how to DRY ?
let config_key = "MEDIAPART_COOKIE".to_string(); let config_key = "MEDIAPART_COOKIE".to_string();
let mpruiid = env::var(&config_key) let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
.map_err(|_| Error::Misconfiguration(config_key))?
.into();
let mediapart = Mediapart::builder() let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid)) .login(mediapart::Login::Mpruuid(mpruiid))
.build()?; .build()?;
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string(); let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string(); let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string(); let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
let lmd_a_m = env::var(&lmd_a_m) let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
.map_err(|_| Error::Misconfiguration(lmd_a_m))? let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
.into(); let spip_session =
let phpsessid = env::var(&phpsessid) env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
.map_err(|_| Error::Misconfiguration(phpsessid))?
.into();
let spip_session = env::var(&spip_session)
.map_err(|_| Error::Misconfiguration(spip_session))?
.into();
let monde_diplo = MondeDiplo::builder() let monde_diplo = MondeDiplo::builder()
.login(monde_diplomatique::Login::Cookies { .login(monde_diplomatique::Login::Cookies {
@ -69,12 +61,8 @@ fn default_newpapers() -> Result<Newspapers> {
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string(); let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string(); let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
let lmd_a_m = env::var(&lmd_a_m) let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
.map_err(|_| Error::Misconfiguration(lmd_a_m))? let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
.into();
let ssess = env::var(&ssess)
.map_err(|_| Error::Misconfiguration(ssess))?
.into();
let courrier_international = CourrierInternational::builder() let courrier_international = CourrierInternational::builder()
.login(courrier_international::Login::Cookies { lmd_a_m, ssess }) .login(courrier_international::Login::Cookies { lmd_a_m, ssess })

View File

@ -82,13 +82,19 @@ impl Newspaper for CourrierInternational {
}; };
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = [
// navigation elements // navigation elements
"#entete.connecte", "#entete.connecte",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -10,7 +10,7 @@ use crate::{Download, Downloader};
pub enum Login { pub enum Login {
Username(String, String), Username(String, String),
MPRUUID(String), Mpruuid(String),
} }
#[derive(Debug, Clone, Default)] #[derive(Debug, Clone, Default)]
@ -33,7 +33,7 @@ impl Builder {
Login::Username(_username, _password) => { Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented") unimplemented!("login using username and passwond not implemented")
} }
Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)), Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
}; };
self self
} }
@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
}; };
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = [
// header // header
".fb-root", ".fb-root",
".skipLinks", ".skipLinks",
@ -104,8 +104,14 @@ impl Newspaper for Mediapart {
"aside.cc-modal", "aside.cc-modal",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
}; };
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = [
// navigation elements // navigation elements
"#tout-en-haut.preentete", "#tout-en-haut.preentete",
"#entete.connecte", "#entete.connecte",
@ -107,8 +107,14 @@ impl Newspaper for MondeDiplo {
"noscript", "noscript",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -1,5 +1,4 @@
mod download; mod download;
mod self_contained_html; pub mod self_contained_html;
pub use download::{Download, DownloadError, Downloader}; pub use download::{Download, DownloadError, Downloader};
pub use self_contained_html::self_contained_html;

View File

@ -8,140 +8,177 @@ use url::Url;
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES}; use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download; use crate::Download;
/// Makes an html page self-contained /// Stores configuration for the self_contained_html function
/// // TODO: write a builder
/// The `downloader` must implement `Download` and is used to download resources that are pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
/// needed to make this page self-contained such as stylesheets or images. where
/// E: std::error::Error,
/// The function also removes all scripts on the page D: Download<Error = E> + Send,
pub async fn self_contained_html<E, D>( S1: AsRef<str>,
html: impl AsRef<str>, S2: AsRef<str>,
downloader: &D, {
base_url: &Url, /// the downloader that will be used to retrieve resources on the page
elements_to_remove: &[impl AsRef<str>], pub downloader: Option<&'t D>,
) -> String /// Base url for downloading resources, it probably the
pub base_url: Option<&'t Url>,
pub elements_to_remove: &'t [S1],
pub styles_to_add: &'t [S2],
}
impl<'t, E, D> Default for Config<'t, E, D>
where where
E: std::error::Error, E: std::error::Error,
D: Download<Error = E> + Send, D: Download<Error = E> + Send,
{ {
// TODO: split/refactor this function : fn default() -> Self {
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? Self {
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure downloader: None,
// - ¿ should be function of a trait ? or only of the configuration struct ? base_url: None,
let (style_urls, html) = { elements_to_remove: &[],
let document = Document::from(html.as_ref()); styles_to_add: &[],
// ---- Remove scripts ----
//
document.select("script").remove();
for event in EVENT_HANDLERS {
document
.select(format!("[{}]", event).as_str())
.remove_attr(event);
} }
}
}
for rel in LINK_REL_EXTERNAL_RESOURCES { impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
document where
.select(format!("link[rel=\"{}\"]", rel).as_str()) E: std::error::Error,
.remove(); D: Download<Error = E> + Send,
} S1: AsRef<str>,
S2: AsRef<str>,
{
/// Makes an html page self-contained
///
/// The `downloader` must implement `Download` and is used to download resources that are
/// needed to make this page self-contained such as stylesheets or images.
///
/// The function also removes all scripts on the page
pub async fn run(&self, html: impl AsRef<str>) -> String {
//TODO: don't panic
let base_url = self.base_url.expect("Base url not defined");
let downloader = self.downloader.expect("Downloader not defined");
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
let (style_urls, html) = {
let document = Document::from(html.as_ref());
// ---- Replace stylesheets ---- // ---- Remove scripts ----
//
document.select("script").remove();
for event in EVENT_HANDLERS {
document
.select(format!("[{}]", event).as_str())
.remove_attr(event);
}
for rel in LINK_REL_EXTERNAL_RESOURCES {
document
.select(format!("link[rel=\"{}\"]", rel).as_str())
.remove();
}
// ---- Replace stylesheets ----
//
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|stylesheet| {
if let Some(src) = stylesheet.attr("href") {
//TODO: does it work with absolute urls ?
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>();
(styles_url, String::from(document.html()))
};
let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
)
});
let downloaded_styles = futures::future::join_all(style_urls).await;
let html = {
let document = Document::from(&html);
let styles = document.select("link[href][rel=\"stylesheet\"]");
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| {
if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
} else {
stylesheet.remove();
}
});
String::from(document.html())
};
// ---- Replace imgs ----
// //
let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); let image_urls = {
let styles_url = stylesheets let document = Document::from(&html);
.iter() let imgs = document.select("img:not([src^=\"data:\"])");
.map(|stylesheet| {
if let Some(src) = stylesheet.attr("href") {
//TODO: does it work with absolute urls ?
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>();
(styles_url, String::from(document.html()))
};
let style_urls = style_urls.into_iter().map(|style_url| { imgs.iter()
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() })) .map(|image| {
}); if let Some(src) = image.attr("src") {
let downloaded_styles = futures::future::join_all(style_urls).await; base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>()
};
let html = { let downloaded_images = image_urls.into_iter().map(|image_url| {
let document = Document::from(&html); OptionFuture::from(image_url.map(|url| async move {
let styles = document.select("link[href][rel=\"stylesheet\"]"); let data = downloader.download(&url).await.unwrap();
(url, data)
}))
});
let downloaded_images = futures::future::join_all(downloaded_images).await;
styles let html = {
.iter() let document = Document::from(&html);
.zip(downloaded_styles.iter()) let imgs = document.select("img:not([src^=\"data:\"])");
.for_each(|(mut stylesheet, inner_css)| {
if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
} else {
stylesheet.remove();
}
});
String::from(document.html())
};
// ---- Replace imgs ---- imgs.iter()
// .zip(downloaded_images.iter())
let image_urls = { .for_each(|(mut img, data)| {
let document = Document::from(&html); if let Some((url, Some(data))) = data {
let imgs = document.select("img:not([src^=\"data:\"])"); let data = base64::encode(data);
//TODO: use an extension hashmap
let extension =
Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else {
img.remove()
}
});
// ---- Remove unwanted html elements -----
//
for element in self.elements_to_remove {
document.select(element.as_ref()).remove();
}
String::from(document.html())
};
imgs.iter() // ---- output ----
.map(|image| {
if let Some(src) = image.attr("src") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>()
};
let downloaded_images = image_urls.into_iter().map(|image_url| {
OptionFuture::from(image_url.map(|url| async move {
let data = downloader.download(&url).await.unwrap();
(url, data)
}))
});
let downloaded_images = futures::future::join_all(downloaded_images).await;
let html = {
let document = Document::from(&html);
let imgs = document.select("img:not([src^=\"data:\"])");
imgs.iter()
.zip(downloaded_images.iter())
.for_each(|(mut img, data)| {
if let Some((url, Some(data))) = data {
let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else {
img.remove()
}
});
// ---- Remove unwanted html elements -----
// //
for element in elements_to_remove { let mut minifier = HTMLMinifier::new();
document.select(element.as_ref()).remove(); minifier.digest(html.as_str()).unwrap();
}
String::from(document.html())
};
// ---- output ---- String::from_utf8(minifier.get_html().into()).unwrap()
// }
let mut minifier = HTMLMinifier::new();
minifier.digest(html.as_str()).unwrap();
String::from_utf8(minifier.get_html().into()).unwrap()
} }
#[cfg(test)] #[cfg(test)]
@ -180,9 +217,14 @@ mod tests {
let html = "<html><head><script>let id = id => id</script></head><body></body></html>"; let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {}; let downloader = DummyDownloader {};
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
"<html><head></head><body></body></html>" "<html><head></head><body></body></html>"
); );
Ok(()) Ok(())
@ -206,10 +248,13 @@ mod tests {
}; };
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in EVENT_HANDLERS { for s in EVENT_HANDLERS {
assert_eq!( assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}.run(html(s)).await,
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>" "<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
); );
} }
@ -234,10 +279,15 @@ mod tests {
}; };
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in LINK_REL_EXTERNAL_RESOURCES { for s in LINK_REL_EXTERNAL_RESOURCES {
assert_eq!( assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html(s))
.await,
"<html><head>\n</head>\n<body>\n</body></html>" "<html><head>\n</head>\n<body>\n</body></html>"
); );
} }
@ -290,9 +340,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified minified
); );
Ok(()) Ok(())
@ -337,9 +392,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified minified
); );
Ok(()) Ok(())
@ -372,12 +432,13 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!( assert_eq!(
self_contained_html( Config {
html, downloader: Some(&downloader),
&downloader, base_url: Some(&base_url),
&base_url, elements_to_remove: &["header", ".placeholder", "article > span.huge"],
&["header", ".placeholder", "article > span.huge"] ..Default::default()
) }
.run(html)
.await, .await,
minified minified
); );

View File

@ -37,7 +37,8 @@ frame "backend" {
newspaper -> retrieval_tools: uses to implement newspaper -> retrieval_tools: uses to implement
article_location --> article_repr :uses article_location --> article_repr: uses
retrieval_tools -up-> article_repr: uses
auto_retrieve --> rss: watches auto_retrieve --> rss: watches
auto_retrieve --> article_location auto_retrieve --> article_location