add several newspapers #36

Merged
koalp merged 6 commits from feature/additional_newspapers into development 2021-05-22 04:50:43 +02:00
9 changed files with 239 additions and 181 deletions
Showing only changes of commit 6e091a32fc

View File

@@ -2,15 +2,7 @@
 use std::convert::TryInto;
 
 use anyhow::Result;
-use matrix_sdk::{
-    self, async_trait,
-    events::{
-        room::message::{MessageEventContent, MessageType, TextMessageEventContent},
-        AnyMessageEventContent, SyncMessageEvent,
-    },
-    room::Room,
-    Client, ClientConfig, EventHandler, SyncSettings,
-};
+use matrix_sdk::{self, Client, SyncSettings};
 
 use crate::Html;
 

View File

@ -1,7 +1,6 @@
use std::convert::TryInto;
use std::env;
use log::{error, info};
use log::error;
use matrix_sdk::{
self, async_trait,
events::{
@ -9,7 +8,7 @@ use matrix_sdk::{
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
EventHandler,
};
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};

View File

@@ -2,7 +2,6 @@ use std::boxed::Box;
 use std::convert::TryInto;
 use std::env;
 
-use log::info;
 use url::{Host, Url};
 
 use crate::newspaper::Newspaper;
@@ -36,27 +35,20 @@ pub type Result<T, E = Error> = core::result::Result<T, E>;
 
 fn default_newpapers() -> Result<Newspapers> {
     // TODO: same thing is written too much times : how to DRY ?
     let config_key = "MEDIAPART_COOKIE".to_string();
-    let mpruiid = env::var(&config_key)
-        .map_err(|_| Error::Misconfiguration(config_key))?
-        .into();
+    let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
 
     let mediapart = Mediapart::builder()
-        .login(mediapart::Login::MPRUUID(mpruiid))
+        .login(mediapart::Login::Mpruuid(mpruiid))
         .build()?;
 
     let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
     let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
     let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
-    let lmd_a_m = env::var(&lmd_a_m)
-        .map_err(|_| Error::Misconfiguration(lmd_a_m))?
-        .into();
-    let phpsessid = env::var(&phpsessid)
-        .map_err(|_| Error::Misconfiguration(phpsessid))?
-        .into();
-    let spip_session = env::var(&spip_session)
-        .map_err(|_| Error::Misconfiguration(spip_session))?
-        .into();
+    let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
+    let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
+    let spip_session =
+        env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
 
     let monde_diplo = MondeDiplo::builder()
         .login(monde_diplomatique::Login::Cookies {
@@ -69,12 +61,8 @@ fn default_newpapers() -> Result<Newspapers> {
 
     let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
     let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
-    let lmd_a_m = env::var(&lmd_a_m)
-        .map_err(|_| Error::Misconfiguration(lmd_a_m))?
-        .into();
-    let ssess = env::var(&ssess)
-        .map_err(|_| Error::Misconfiguration(ssess))?
-        .into();
+    let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
+    let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
 
     let courrier_international = CourrierInternational::builder()
         .login(courrier_international::Login::Cookies { lmd_a_m, ssess })
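
The TODO at the top of default_newpapers asks how to DRY these repeated env-var reads. A minimal sketch of one option, assuming the Error::Misconfiguration(String) variant and the Result alias defined in this file (the env_cookie helper is hypothetical, not part of this PR):

    // Hypothetical helper: read one cookie value from the environment,
    // mapping a missing variable to Error::Misconfiguration.
    fn env_cookie(name: &str) -> Result<String> {
        env::var(name).map_err(|_| Error::Misconfiguration(name.to_string()))
    }

    // Each block above would then collapse to lines such as:
    // let mpruiid = env_cookie("MEDIAPART_COOKIE")?;
    // let lmd_a_m = env_cookie("MONDE_DIPLO_LMD_A_M")?;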

View File

@@ -82,13 +82,19 @@ impl Newspaper for CourrierInternational {
         };
 
         // TODO: Move to const
-        let element_to_remove = [
+        let elements_to_remove = [
             // navigation elements
             "#entete.connecte",
         ];
 
-        let single_page_html =
-            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
+        let single_page_html = tools::self_contained_html::Config {
+            downloader: Some(&downloader),
+            base_url: Some(&url),
+            elements_to_remove: &elements_to_remove,
+            ..Default::default()
+        }
+        .run(&html)
+        .await;
 
         Ok(single_page_html)
     }
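
The new call sites rely on Rust's struct-update syntax: ..Default::default() fills every field that is not named explicitly (here styles_to_add) from the Default impl. A self-contained sketch of the pattern with hypothetical names:

    #[derive(Default)]
    struct Options<'a> {
        verbose: bool,
        tags: &'a [&'a str],
    }

    fn main() {
        // Only `verbose` is set; `tags` falls back to Default (an empty slice).
        let opts = Options { verbose: true, ..Default::default() };
        assert!(opts.verbose && opts.tags.is_empty());
    }

Config itself implements Default by hand (see the tools diff below) because a derived impl would also put Default bounds on the E and D type parameters.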

View File

@@ -10,7 +10,7 @@ use crate::{Download, Downloader};
 
 pub enum Login {
     Username(String, String),
-    MPRUUID(String),
+    Mpruuid(String),
 }
 
 #[derive(Debug, Clone, Default)]
@@ -33,7 +33,7 @@ impl Builder {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and passwond not implemented")
            }
-            Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
+            Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
        };
        self
    }
@@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
         };
 
         // TODO: Move to const
-        let element_to_remove = [
+        let elements_to_remove = [
             // header
             ".fb-root",
             ".skipLinks",
@@ -104,8 +104,14 @@ impl Newspaper for Mediapart {
             "aside.cc-modal",
         ];
 
-        let single_page_html =
-            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
+        let single_page_html = tools::self_contained_html::Config {
+            downloader: Some(&downloader),
+            base_url: Some(&url),
+            elements_to_remove: &elements_to_remove,
+            ..Default::default()
+        }
+        .run(&html)
+        .await;
 
         Ok(single_page_html)
     }

View File

@@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
         };
 
         // TODO: Move to const
-        let element_to_remove = [
+        let elements_to_remove = [
             // navigation elements
             "#tout-en-haut.preentete",
             "#entete.connecte",
@@ -107,8 +107,14 @@ impl Newspaper for MondeDiplo {
             "noscript",
         ];
 
-        let single_page_html =
-            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
+        let single_page_html = tools::self_contained_html::Config {
+            downloader: Some(&downloader),
+            base_url: Some(&url),
+            elements_to_remove: &elements_to_remove,
+            ..Default::default()
+        }
+        .run(&html)
+        .await;
 
         Ok(single_page_html)
     }

View File

@@ -1,5 +1,4 @@
 mod download;
-mod self_contained_html;
+pub mod self_contained_html;
 
 pub use download::{Download, DownloadError, Downloader};
-pub use self_contained_html::self_contained_html;
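
With the free-function re-export gone and the module now public, callers migrate from the old function call to the Config struct, as the newspaper files above already show. Roughly (to_remove stands in for each caller's selector list):

    // Before this PR (free function re-exported from tools):
    // let page = tools::self_contained_html(&html, &downloader, &url, &to_remove).await;

    // After this PR (Config lives in the now-public module):
    let page = tools::self_contained_html::Config {
        downloader: Some(&downloader),
        base_url: Some(&url),
        elements_to_remove: &to_remove,
        ..Default::default()
    }
    .run(&html)
    .await;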

View File

@@ -8,140 +8,177 @@ use url::Url;
 use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
 use crate::Download;
 
-/// Makes an html page self-contained
-///
-/// The `downloader` must implement `Download` and is used to download ressources that are
-/// needed to make this page self-contained such as stylesheets or images.
-///
-/// The function also removes all scripts on the page
-pub async fn self_contained_html<E, D>(
-    html: impl AsRef<str>,
-    downloader: &D,
-    base_url: &Url,
-    elements_to_remove: &[impl AsRef<str>],
-) -> String
-where
-    E: std::error::Error,
-    D: Download<Error = E> + Send,
-{
-    // TODO: split/refactor this function :
-    // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
-    // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
-    // - ¿ should be function of a trait ? or only of the configuration struct ?
-    let (style_urls, html) = {
-        let document = Document::from(html.as_ref());
-
-        // ---- Remove scripts ----
-        //
-        document.select("script").remove();
-
-        for event in EVENT_HANDLERS {
-            document
-                .select(format!("[{}]", event).as_str())
-                .remove_attr(event);
-        }
-
-        for rel in LINK_REL_EXTERNAL_RESOURCES {
-            document
-                .select(format!("link[rel=\"{}\"]", rel).as_str())
-                .remove();
-        }
-
-        // ---- Replace stylesheets ----
-        //
-        let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
-        let styles_url = stylesheets
-            .iter()
-            .map(|stylesheet| {
-                if let Some(src) = stylesheet.attr("href") {
-                    //TODO: does it work with absolute urls ?
-                    base_url.join(src.as_ref()).ok()
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>();
-        (styles_url, String::from(document.html()))
-    };
-
-    let style_urls = style_urls.into_iter().map(|style_url| {
-        OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
-    });
-    let downloaded_styles = futures::future::join_all(style_urls).await;
-
-    let html = {
-        let document = Document::from(&html);
-        let styles = document.select("link[href][rel=\"stylesheet\"]");
-
-        styles
-            .iter()
-            .zip(downloaded_styles.iter())
-            .for_each(|(mut stylesheet, inner_css)| {
-                if let Some(Some(inner_css)) = inner_css {
-                    let css = String::from_utf8(inner_css.to_vec()).unwrap();
-                    let css = format!("<style>{}</style>", css);
-                    stylesheet.replace_with_html(css);
-                } else {
-                    stylesheet.remove();
-                }
-            });
-        String::from(document.html())
-    };
-
-    // ---- Replace imgs ----
-    //
-    let image_urls = {
-        let document = Document::from(&html);
-        let imgs = document.select("img:not([src^=\"data:\"])");
-
-        imgs.iter()
-            .map(|image| {
-                if let Some(src) = image.attr("src") {
-                    base_url.join(src.as_ref()).ok()
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>()
-    };
-
-    let downloaded_images = image_urls.into_iter().map(|image_url| {
-        OptionFuture::from(image_url.map(|url| async move {
-            let data = downloader.download(&url).await.unwrap();
-            (url, data)
-        }))
-    });
-    let downloaded_images = futures::future::join_all(downloaded_images).await;
-
-    let html = {
-        let document = Document::from(&html);
-        let imgs = document.select("img:not([src^=\"data:\"])");
-
-        imgs.iter()
-            .zip(downloaded_images.iter())
-            .for_each(|(mut img, data)| {
-                if let Some((url, Some(data))) = data {
-                    let data = base64::encode(data);
-                    let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
-                    img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
-                } else {
-                    img.remove()
-                }
-            });
-
-        // ---- Remove unwanted html elements -----
-        //
-        for element in elements_to_remove {
-            document.select(element.as_ref()).remove();
-        }
-        String::from(document.html())
-    };
-
-    // ---- output ----
-    //
-    let mut minifier = HTMLMinifier::new();
-    minifier.digest(html.as_str()).unwrap();
-
-    String::from_utf8(minifier.get_html().into()).unwrap()
-}
+/// Stores configuration for the self_contained_html function
+// TODO: write a builder
+pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
+where
+    E: std::error::Error,
+    D: Download<Error = E> + Send,
+    S1: AsRef<str>,
+    S2: AsRef<str>,
+{
+    /// the downloader that will be used to retrieve ressources on the page
+    pub downloader: Option<&'t D>,
+    /// Base url for downloading ressources, it probably the
+    pub base_url: Option<&'t Url>,
+    pub elements_to_remove: &'t [S1],
+    pub styles_to_add: &'t [S2],
+}
+
+impl<'t, E, D> Default for Config<'t, E, D>
+where
+    E: std::error::Error,
+    D: Download<Error = E> + Send,
+{
+    fn default() -> Self {
+        Self {
+            downloader: None,
+            base_url: None,
+            elements_to_remove: &[],
+            styles_to_add: &[],
+        }
+    }
+}
+
+impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
+where
+    E: std::error::Error,
+    D: Download<Error = E> + Send,
+    S1: AsRef<str>,
+    S2: AsRef<str>,
+{
+    /// Makes an html page self-contained
+    ///
+    /// The `downloader` must implement `Download` and is used to download ressources that are
+    /// needed to make this page self-contained such as stylesheets or images.
+    ///
+    /// The function also removes all scripts on the page
+    pub async fn run(&self, html: impl AsRef<str>) -> String {
+        //TODO: don't panic
+        let base_url = self.base_url.expect("Base url not defined");
+        let downloader = self.downloader.expect("Downloader not defined");
+        // TODO: split/refactor this function :
+        // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+        // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
+        let (style_urls, html) = {
+            let document = Document::from(html.as_ref());
+
+            // ---- Remove scripts ----
+            //
+            document.select("script").remove();
+
+            for event in EVENT_HANDLERS {
+                document
+                    .select(format!("[{}]", event).as_str())
+                    .remove_attr(event);
+            }
+
+            for rel in LINK_REL_EXTERNAL_RESOURCES {
+                document
+                    .select(format!("link[rel=\"{}\"]", rel).as_str())
+                    .remove();
+            }
+
+            // ---- Replace stylesheets ----
+            //
+            let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
+            let styles_url = stylesheets
+                .iter()
+                .map(|stylesheet| {
+                    if let Some(src) = stylesheet.attr("href") {
+                        //TODO: does it work with absolute urls ?
+                        base_url.join(src.as_ref()).ok()
+                    } else {
+                        None
+                    }
+                })
+                .collect::<Vec<_>>();
+            (styles_url, String::from(document.html()))
+        };
+
+        let style_urls = style_urls.into_iter().map(|style_url| {
+            OptionFuture::from(
+                style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
+            )
+        });
+        let downloaded_styles = futures::future::join_all(style_urls).await;
+
+        let html = {
+            let document = Document::from(&html);
+            let styles = document.select("link[href][rel=\"stylesheet\"]");
+
+            styles
+                .iter()
+                .zip(downloaded_styles.iter())
+                .for_each(|(mut stylesheet, inner_css)| {
+                    if let Some(Some(inner_css)) = inner_css {
+                        let css = String::from_utf8(inner_css.to_vec()).unwrap();
+                        let css = format!("<style>{}</style>", css);
+                        stylesheet.replace_with_html(css);
+                    } else {
+                        stylesheet.remove();
+                    }
+                });
+            String::from(document.html())
+        };
+
+        // ---- Replace imgs ----
+        //
+        let image_urls = {
+            let document = Document::from(&html);
+            let imgs = document.select("img:not([src^=\"data:\"])");
+
+            imgs.iter()
+                .map(|image| {
+                    if let Some(src) = image.attr("src") {
+                        base_url.join(src.as_ref()).ok()
+                    } else {
+                        None
+                    }
+                })
+                .collect::<Vec<_>>()
+        };
+
+        let downloaded_images = image_urls.into_iter().map(|image_url| {
+            OptionFuture::from(image_url.map(|url| async move {
+                let data = downloader.download(&url).await.unwrap();
+                (url, data)
+            }))
+        });
+        let downloaded_images = futures::future::join_all(downloaded_images).await;
+
+        let html = {
+            let document = Document::from(&html);
+            let imgs = document.select("img:not([src^=\"data:\"])");
+
+            imgs.iter()
+                .zip(downloaded_images.iter())
+                .for_each(|(mut img, data)| {
+                    if let Some((url, Some(data))) = data {
+                        let data = base64::encode(data);
+                        //TODO: use an extension hashmap
+                        let extension =
+                            Path::new(url.path()).extension().unwrap().to_str().unwrap();
+                        img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
+                    } else {
+                        img.remove()
+                    }
+                });
+
+            // ---- Remove unwanted html elements -----
+            //
+            for element in self.elements_to_remove {
+                document.select(element.as_ref()).remove();
+            }
+            String::from(document.html())
+        };
+
+        // ---- output ----
+        //
+        let mut minifier = HTMLMinifier::new();
+        minifier.digest(html.as_str()).unwrap();
+
+        String::from_utf8(minifier.get_html().into()).unwrap()
+    }
+}
 
 #[cfg(test)]
@@ -180,9 +217,14 @@ mod tests {
         let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
         let base_url = Url::parse("http://example.com")?;
         let downloader = DummyDownloader {};
-        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url, to_remove).await,
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                ..Default::default()
+            }
+            .run(html)
+            .await,
             "<html><head></head><body></body></html>"
         );
         Ok(())
@@ -206,10 +248,13 @@
         };
         let base_url = Url::parse("http://example.com")?;
-        let to_remove: &[&str] = &[];
 
         for s in EVENT_HANDLERS {
             assert_eq!(
-                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
+                Config {
+                    downloader: Some(&downloader),
+                    base_url: Some(&base_url),
+                    ..Default::default()
+                }.run(html(s)).await,
                 "<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
             );
         }
@@ -234,10 +279,15 @@
         };
         let base_url = Url::parse("http://example.com")?;
-        let to_remove: &[&str] = &[];
 
         for s in LINK_REL_EXTERNAL_RESOURCES {
             assert_eq!(
-                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
+                Config {
+                    downloader: Some(&downloader),
+                    base_url: Some(&base_url),
+                    ..Default::default()
+                }
+                .run(html(s))
+                .await,
                 "<html><head>\n</head>\n<body>\n</body></html>"
             );
         }
@@ -290,9 +340,14 @@
         let minified = String::from_utf8(minifier.get_html().into())?;
         let base_url = Url::parse("http://example.com")?;
-        let to_remove: &[&str] = &[];
 
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url, to_remove).await,
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                ..Default::default()
+            }
+            .run(html)
+            .await,
             minified
         );
         Ok(())
@@ -337,9 +392,14 @@
         let minified = String::from_utf8(minifier.get_html().into())?;
         let base_url = Url::parse("http://example.com")?;
-        let to_remove: &[&str] = &[];
 
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url, to_remove).await,
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                ..Default::default()
+            }
+            .run(html)
+            .await,
             minified
        );
        Ok(())
@@ -372,12 +432,13 @@
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         assert_eq!(
-            self_contained_html(
-                html,
-                &downloader,
-                &base_url,
-                &["header", ".placeholder", "article > span.huge"]
-            )
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                elements_to_remove: &["header", ".placeholder", "article > span.huge"],
+                ..Default::default()
+            }
+            .run(html)
             .await,
             minified
         );
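
The tests build Config around a DummyDownloader. The Download trait itself is not part of this diff; below is a rough sketch of such a no-op downloader, with the signature inferred from the downloader.download(&url).await.unwrap() call sites and the Option<Vec<u8>> handling above, so treat it as an assumption rather than the crate's real definition:

    use async_trait::async_trait;

    struct DummyDownloader;

    #[async_trait]
    impl Download for DummyDownloader {
        type Error = std::convert::Infallible;

        // Inferred shape: Ok(None) means "resource unavailable", so run()
        // drops the stylesheet or image instead of inlining it.
        async fn download(&self, _url: &Url) -> Result<Option<Vec<u8>>, Self::Error> {
            Ok(None)
        }
    }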

View File

@@ -37,7 +37,8 @@ frame "backend" {
 newspaper -> retrieval_tools: uses to implement
 
-article_location --> article_repr :uses
+article_location --> article_repr: uses
+retrieval_tools -up-> article_repr: uses
 
 auto_retrieve --> rss: watches
 auto_retrieve --> article_location
 