add several newspapers #36

Merged
koalp merged 6 commits from feature/additional_newspapers into development 2021-05-22 04:50:43 +02:00
7 changed files with 126 additions and 39 deletions
Showing only changes of commit e34edf0b21 - Show all commits

50
Cargo.lock generated
View File

@ -662,9 +662,9 @@ dependencies = [
[[package]]
name = "futures"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
dependencies = [
"futures-channel",
"futures-core",
@ -677,9 +677,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
dependencies = [
"futures-core",
"futures-sink",
@ -687,15 +687,15 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"
[[package]]
name = "futures-executor"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
dependencies = [
"futures-core",
"futures-task",
@ -704,9 +704,9 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"
[[package]]
name = "futures-locks"
@ -719,10 +719,11 @@ dependencies = [
[[package]]
name = "futures-macro"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
dependencies = [
"autocfg 1.0.1",
"proc-macro-hack",
"proc-macro2",
"quote",
@ -731,15 +732,15 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"
[[package]]
name = "futures-task"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"
[[package]]
name = "futures-timer"
@ -753,10 +754,11 @@ dependencies = [
[[package]]
name = "futures-util"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
dependencies = [
"autocfg 1.0.1",
"futures-channel",
"futures-core",
"futures-io",
@ -895,9 +897,9 @@ dependencies = [
[[package]]
name = "html-minifier"
version = "3.0.12"
version = "3.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af"
checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf"
dependencies = [
"cow-utils",
"educe",
@ -1284,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
[[package]]
name = "minifier"
version = "0.0.40"
version = "0.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9"
checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08"
dependencies = [
"macro-utils",
]
@ -2659,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.5.0"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
dependencies = [
"autocfg 1.0.1",
"bytes",

View File

@ -22,5 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
dotenv = "0.15.0"
env_logger = "0.8.3"
log = "0.4.14"
tokio = { version = "1.5.0", features = ["full"] }
tokio = { version = "1.6.0", features = ["full"] }
tracing-subscriber = "0.2.18"

View File

@ -16,14 +16,14 @@ hyper-rustls = "0.22.1"
cookie = "0.15.0"
lol_html = "0.3.0"
indoc = "1.0.3"
html-minifier = "3.0.12"
html-minifier = "3.0.13"
bytes = "1.0.1"
base64 = "0.13.0"
futures = "0.3.14"
futures = "0.3.15"
derive_builder = "0.10.2"
nipper = "0.1.9"
log = "0.4.14"
env_logger = "0.8.3"
[dev-dependencies]
tokio = "1.5.0"
tokio = "1.6.0"

View File

@ -1,6 +1,7 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use indoc::indoc;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
@ -81,9 +82,42 @@ impl Newspaper for CourrierInternational {
None => bail!("404 not found"),
};
let elements_to_remove = &[
// navigation elements
"header.site-header",
"footer.site-footer",
// Social buttons
"#toolbox-share",
".toolbox-share",
".toolbox-print",
".toolbox-respond",
".toolbox-zen",
".toolbox-newsletter",
".toolbox-offer",
".box-article-offer-friend-abo",
// unused services
".article-aside",
".article-secondary",
".article-subject-readmore",
// misc
".element-invisible",
".gptcontainer",
];
// FIXME: it doesn't work because the aside is in the article body
//
let toolbox_style = indoc! {"
aside.article-toolbox {
position: sticky;
top: 1em;
}
"};
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove,
styles_to_add: &[toolbox_style],
..Default::default()
}
.run(&html)

View File

@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
};
// TODO: Move to const
let elements_to_remove = [
let elements_to_remove = &[
// header
".fb-root",
".skipLinks",
@ -107,7 +107,7 @@ impl Newspaper for Mediapart {
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
elements_to_remove,
..Default::default()
}
.run(&html)

View File

@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
};
// TODO: Move to const
let elements_to_remove = [
let elements_to_remove = &[
// navigation elements
"#tout-en-haut.preentete",
"#entete.connecte",
@ -110,7 +110,7 @@ impl Newspaper for MondeDiplo {
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove: &elements_to_remove,
elements_to_remove,
..Default::default()
}
.run(&html)

View File

@ -59,6 +59,8 @@ where
let downloader = self.downloader.expect("Downloader not defined");
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - put each modification (ex: style in the `foreach`) in functions, maybe using
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
// - `elements_to_remove`, `base_url` and `downloader` should be in a configuration structure
let (style_urls, html) = {
let document = Document::from(html.as_ref());
@ -84,9 +86,8 @@ where
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|stylesheet| {
if let Some(src) = stylesheet.attr("href") {
//TODO: does it work with absolute urls ?
.map(|style_link| {
if let Some(src) = style_link.attr("href") {
base_url.join(src.as_ref()).ok()
} else {
None
@ -110,13 +111,19 @@ where
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| {
.for_each(|(mut style_link, inner_css)| {
if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
let media_query = style_link.attr("media");
let css = match media_query {
Some(media_query) => {
format!("<style media=\"{}\">{}</style>", media_query, css)
}
None => format!("<style>{}</style>", css),
};
style_link.replace_with_html(css);
} else {
stylesheet.remove();
style_link.remove();
}
});
String::from(document.html())
@ -192,6 +199,7 @@ where
#[cfg(test)]
mod tests {
// TODO: reduce boilerplate, DRY
use super::*;
@ -362,6 +370,49 @@ mod tests {
Ok(())
}
/// Checks that the `media` attribute of a `<link rel="stylesheet">` is carried
/// over to the inline `<style>` element that replaces the link.
#[tokio::test]
async fn download_css_with_media_query() -> Result<()> {
// NOTE(review): `CssDownloader` is defined elsewhere in this test module;
// presumably it serves the `section#warning` rule expected below — confirm.
let downloader = CssDownloader {};
// Input page: a single stylesheet link restricted to the `print` media.
let html = indoc! {"
<html>
<head>
<link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
</head>
<body>
</body>
</html>
"};
// Expected page: the link is replaced by a `<style>` element that keeps
// `media="print"` and contains the downloaded CSS.
let wanted_html = indoc! {"
<html><head>
<style media=\"print\">
section#warning {
color: red;
}
</style>
</head>
<body>
</body></html>
"};
// The expected HTML is minified first: the assertion below compares the
// output of `run` against this minified form.
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
// Base URL against which the relative `main.css` href is resolved.
let base_url = Url::parse("http://example.com")?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
}
struct PngDownloader;
#[async_trait]
impl Download for PngDownloader {