add several newspapers #36

Merged
koalp merged 6 commits from feature/additional_newspapers into development 2021-05-22 04:50:43 +02:00
9 changed files with 239 additions and 181 deletions
Showing only changes of commit 6e091a32fc - Show all commits

View File

@ -2,15 +2,7 @@
use std::convert::TryInto; use std::convert::TryInto;
use anyhow::Result; use anyhow::Result;
use matrix_sdk::{ use matrix_sdk::{self, Client, SyncSettings};
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
};
use crate::Html; use crate::Html;

View File

@ -1,7 +1,6 @@
use std::convert::TryInto; use std::convert::TryInto;
use std::env;
use log::{error, info}; use log::error;
use matrix_sdk::{ use matrix_sdk::{
self, async_trait, self, async_trait,
events::{ events::{
@ -9,7 +8,7 @@ use matrix_sdk::{
AnyMessageEventContent, SyncMessageEvent, AnyMessageEventContent, SyncMessageEvent,
}, },
room::Room, room::Room,
Client, ClientConfig, EventHandler, SyncSettings, EventHandler,
}; };
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url}; use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};

View File

@ -2,7 +2,6 @@ use std::boxed::Box;
use std::convert::TryInto; use std::convert::TryInto;
use std::env; use std::env;
use log::info;
use url::{Host, Url}; use url::{Host, Url};
use crate::newspaper::Newspaper; use crate::newspaper::Newspaper;
@ -36,27 +35,20 @@ pub type Result<T, E = Error> = core::result::Result<T, E>;
fn default_newpapers() -> Result<Newspapers> { fn default_newpapers() -> Result<Newspapers> {
// TODO: same thing is written too many times : how to DRY ? // TODO: same thing is written too many times : how to DRY ?
let config_key = "MEDIAPART_COOKIE".to_string(); let config_key = "MEDIAPART_COOKIE".to_string();
let mpruiid = env::var(&config_key) let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
.map_err(|_| Error::Misconfiguration(config_key))?
.into();
let mediapart = Mediapart::builder() let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid)) .login(mediapart::Login::Mpruuid(mpruiid))
.build()?; .build()?;
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string(); let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string(); let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string(); let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
let lmd_a_m = env::var(&lmd_a_m) let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
.map_err(|_| Error::Misconfiguration(lmd_a_m))? let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
.into(); let spip_session =
let phpsessid = env::var(&phpsessid) env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
.map_err(|_| Error::Misconfiguration(phpsessid))?
.into();
let spip_session = env::var(&spip_session)
.map_err(|_| Error::Misconfiguration(spip_session))?
.into();
let monde_diplo = MondeDiplo::builder() let monde_diplo = MondeDiplo::builder()
.login(monde_diplomatique::Login::Cookies { .login(monde_diplomatique::Login::Cookies {
@ -69,12 +61,8 @@ fn default_newpapers() -> Result<Newspapers> {
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string(); let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string(); let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
let lmd_a_m = env::var(&lmd_a_m) let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
.map_err(|_| Error::Misconfiguration(lmd_a_m))? let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
.into();
let ssess = env::var(&ssess)
.map_err(|_| Error::Misconfiguration(ssess))?
.into();
let courrier_international = CourrierInternational::builder() let courrier_international = CourrierInternational::builder()
.login(courrier_international::Login::Cookies { lmd_a_m, ssess }) .login(courrier_international::Login::Cookies { lmd_a_m, ssess })

View File

@ -82,13 +82,19 @@ impl Newspaper for CourrierInternational {
}; };
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = [
// navigation elements // navigation elements
"#entete.connecte", "#entete.connecte",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -10,7 +10,7 @@ use crate::{Download, Downloader};
pub enum Login { pub enum Login {
Username(String, String), Username(String, String),
MPRUUID(String), Mpruuid(String),
} }
#[derive(Debug, Clone, Default)] #[derive(Debug, Clone, Default)]
@ -33,7 +33,7 @@ impl Builder {
Login::Username(_username, _password) => { Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented") unimplemented!("login using username and passwond not implemented")
} }
Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)), Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
}; };
self self
} }
@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
}; };
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = [
// header // header
".fb-root", ".fb-root",
".skipLinks", ".skipLinks",
@ -104,8 +104,14 @@ impl Newspaper for Mediapart {
"aside.cc-modal", "aside.cc-modal",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
}; };
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = [
// navigation elements // navigation elements
"#tout-en-haut.preentete", "#tout-en-haut.preentete",
"#entete.connecte", "#entete.connecte",
@ -107,8 +107,14 @@ impl Newspaper for MondeDiplo {
"noscript", "noscript",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -1,5 +1,4 @@
mod download; mod download;
mod self_contained_html; pub mod self_contained_html;
pub use download::{Download, DownloadError, Downloader}; pub use download::{Download, DownloadError, Downloader};
pub use self_contained_html::self_contained_html;

View File

@ -8,26 +8,58 @@ use url::Url;
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES}; use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download; use crate::Download;
/// Stores the configuration used to make an HTML page self-contained.
///
/// The work itself is performed by the `run` method of this struct. Fields
/// that are not relevant to a caller can be filled with
/// `..Default::default()`.
///
/// `S1` and `S2` are the string-like element types of `elements_to_remove`
/// and `styles_to_add`; both default to `&'static str`.
// TODO: write a builder
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// The downloader that will be used to retrieve resources on the page.
///
/// `run` currently panics ("Downloader not defined") when this is `None`.
pub downloader: Option<&'t D>,
/// Base URL for downloading resources.
///
/// `run` currently panics ("Base url not defined") when this is `None`.
// NOTE(review): the original sentence was cut off ("it probably the");
// presumably this is the URL of the page being processed — confirm.
pub base_url: Option<&'t Url>,
/// Selectors of the elements to strip from the page; `run` passes each one
/// to `document.select(..)` and removes every match.
pub elements_to_remove: &'t [S1],
/// NOTE(review): not read in the visible part of `run`; presumably extra
/// styles to inject into the page — confirm before relying on it.
pub styles_to_add: &'t [S2],
}
impl<'t, E, D> Default for Config<'t, E, D>
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
fn default() -> Self {
Self {
downloader: None,
base_url: None,
elements_to_remove: &[],
styles_to_add: &[],
}
}
}
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// Makes an html page self-contained /// Makes an html page self-contained
/// ///
/// The `downloader` must implement `Download` and is used to download resources that are /// The `downloader` must implement `Download` and is used to download resources that are
/// needed to make this page self-contained such as stylesheets or images. /// needed to make this page self-contained such as stylesheets or images.
/// ///
/// The function also removes all scripts on the page /// The function also removes all scripts on the page
pub async fn self_contained_html<E, D>( pub async fn run(&self, html: impl AsRef<str>) -> String {
html: impl AsRef<str>, //TODO: don't panic
downloader: &D, let base_url = self.base_url.expect("Base url not defined");
base_url: &Url, let downloader = self.downloader.expect("Downloader not defined");
elements_to_remove: &[impl AsRef<str>],
) -> String
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
// TODO: split/refactor this function : // TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
// - ¿ should be function of a trait ? or only of the configuration struct ?
let (style_urls, html) = { let (style_urls, html) = {
let document = Document::from(html.as_ref()); let document = Document::from(html.as_ref());
@ -65,7 +97,9 @@ where
}; };
let style_urls = style_urls.into_iter().map(|style_url| { let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() })) OptionFuture::from(
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
)
}); });
let downloaded_styles = futures::future::join_all(style_urls).await; let downloaded_styles = futures::future::join_all(style_urls).await;
@ -122,7 +156,9 @@ where
.for_each(|(mut img, data)| { .for_each(|(mut img, data)| {
if let Some((url, Some(data))) = data { if let Some((url, Some(data))) = data {
let data = base64::encode(data); let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); //TODO: use an extension hashmap
let extension =
Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else { } else {
img.remove() img.remove()
@ -130,7 +166,7 @@ where
}); });
// ---- Remove unwanted html elements ----- // ---- Remove unwanted html elements -----
// //
for element in elements_to_remove { for element in self.elements_to_remove {
document.select(element.as_ref()).remove(); document.select(element.as_ref()).remove();
} }
String::from(document.html()) String::from(document.html())
@ -143,6 +179,7 @@ where
String::from_utf8(minifier.get_html().into()).unwrap() String::from_utf8(minifier.get_html().into()).unwrap()
} }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
@ -180,9 +217,14 @@ mod tests {
let html = "<html><head><script>let id = id => id</script></head><body></body></html>"; let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {}; let downloader = DummyDownloader {};
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
"<html><head></head><body></body></html>" "<html><head></head><body></body></html>"
); );
Ok(()) Ok(())
@ -206,10 +248,13 @@ mod tests {
}; };
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in EVENT_HANDLERS { for s in EVENT_HANDLERS {
assert_eq!( assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}.run(html(s)).await,
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>" "<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
); );
} }
@ -234,10 +279,15 @@ mod tests {
}; };
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in LINK_REL_EXTERNAL_RESOURCES { for s in LINK_REL_EXTERNAL_RESOURCES {
assert_eq!( assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html(s))
.await,
"<html><head>\n</head>\n<body>\n</body></html>" "<html><head>\n</head>\n<body>\n</body></html>"
); );
} }
@ -290,9 +340,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified minified
); );
Ok(()) Ok(())
@ -337,9 +392,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified minified
); );
Ok(()) Ok(())
@ -372,12 +432,13 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!( assert_eq!(
self_contained_html( Config {
html, downloader: Some(&downloader),
&downloader, base_url: Some(&base_url),
&base_url, elements_to_remove: &["header", ".placeholder", "article > span.huge"],
&["header", ".placeholder", "article > span.huge"] ..Default::default()
) }
.run(html)
.await, .await,
minified minified
); );

View File

@ -38,6 +38,7 @@ frame "backend" {
newspaper -> retrieval_tools: uses to implement newspaper -> retrieval_tools: uses to implement
article_location --> article_repr: uses article_location --> article_repr: uses
retrieval_tools -up-> article_repr: uses
auto_retrieve --> rss: watches auto_retrieve --> rss: watches
auto_retrieve --> article_location auto_retrieve --> article_location