add several newspapers #36
50
Cargo.lock
generated
50
Cargo.lock
generated
@ -662,9 +662,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
|
||||
checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
@ -677,9 +677,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
|
||||
checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
@ -687,15 +687,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
|
||||
checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
|
||||
checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
@ -704,9 +704,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
|
||||
checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-locks"
|
||||
@ -719,10 +719,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
|
||||
checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
|
||||
dependencies = [
|
||||
"autocfg 1.0.1",
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -731,15 +732,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
|
||||
checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
|
||||
checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"
|
||||
|
||||
[[package]]
|
||||
name = "futures-timer"
|
||||
@ -753,10 +754,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
|
||||
checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
|
||||
dependencies = [
|
||||
"autocfg 1.0.1",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
@ -895,9 +897,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "html-minifier"
|
||||
version = "3.0.12"
|
||||
version = "3.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af"
|
||||
checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf"
|
||||
dependencies = [
|
||||
"cow-utils",
|
||||
"educe",
|
||||
@ -1284,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||
|
||||
[[package]]
|
||||
name = "minifier"
|
||||
version = "0.0.40"
|
||||
version = "0.0.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9"
|
||||
checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08"
|
||||
dependencies = [
|
||||
"macro-utils",
|
||||
]
|
||||
@ -2659,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.5.0"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
|
||||
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
|
||||
dependencies = [
|
||||
"autocfg 1.0.1",
|
||||
"bytes",
|
||||
|
@ -22,5 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
|
||||
dotenv = "0.15.0"
|
||||
env_logger = "0.8.3"
|
||||
log = "0.4.14"
|
||||
tokio = { version = "1.5.0", features = ["full"] }
|
||||
tokio = { version = "1.6.0", features = ["full"] }
|
||||
tracing-subscriber = "0.2.18"
|
||||
|
@ -16,14 +16,14 @@ hyper-rustls = "0.22.1"
|
||||
cookie = "0.15.0"
|
||||
lol_html = "0.3.0"
|
||||
indoc = "1.0.3"
|
||||
html-minifier = "3.0.12"
|
||||
html-minifier = "3.0.13"
|
||||
bytes = "1.0.1"
|
||||
base64 = "0.13.0"
|
||||
futures = "0.3.14"
|
||||
futures = "0.3.15"
|
||||
derive_builder = "0.10.2"
|
||||
nipper = "0.1.9"
|
||||
log = "0.4.14"
|
||||
env_logger = "0.8.3"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = "1.5.0"
|
||||
tokio = "1.6.0"
|
||||
|
@ -1,6 +1,7 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use async_trait::async_trait;
|
||||
use cookie::Cookie;
|
||||
use indoc::indoc;
|
||||
use url::Host;
|
||||
|
||||
use crate::newspaper::{Metadata, Newspaper};
|
||||
@ -81,9 +82,42 @@ impl Newspaper for CourrierInternational {
|
||||
None => bail!("404 not found"),
|
||||
};
|
||||
|
||||
let elements_to_remove = &[
|
||||
// navigation elements
|
||||
"header.site-header",
|
||||
"footer.site-footer",
|
||||
// Social buttons
|
||||
"#toolbox-share",
|
||||
".toolbox-share",
|
||||
".toolbox-print",
|
||||
".toolbox-respond",
|
||||
".toolbox-zen",
|
||||
".toolbox-newsletter",
|
||||
".toolbox-offer",
|
||||
".box-article-offer-friend-abo",
|
||||
// unused services
|
||||
".article-aside",
|
||||
".article-secondary",
|
||||
".article-subject-readmore",
|
||||
// misc
|
||||
".element-invisible",
|
||||
".gptcontainer",
|
||||
];
|
||||
|
||||
// FIXME: it doesn't work because the aside is in the article body
|
||||
//
|
||||
let toolbox_style = indoc! {"
|
||||
aside.article-toolbox {
|
||||
position: sticky;
|
||||
top: 1em;
|
||||
}
|
||||
"};
|
||||
|
||||
let single_page_html = tools::self_contained_html::Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&url),
|
||||
elements_to_remove,
|
||||
styles_to_add: &[toolbox_style],
|
||||
..Default::default()
|
||||
}
|
||||
.run(&html)
|
||||
|
@ -86,7 +86,7 @@ impl Newspaper for Mediapart {
|
||||
};
|
||||
|
||||
// TODO: Move to const
|
||||
let elements_to_remove = [
|
||||
let elements_to_remove = &[
|
||||
// header
|
||||
".fb-root",
|
||||
".skipLinks",
|
||||
@ -107,7 +107,7 @@ impl Newspaper for Mediapart {
|
||||
let single_page_html = tools::self_contained_html::Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&url),
|
||||
elements_to_remove: &elements_to_remove,
|
||||
elements_to_remove,
|
||||
..Default::default()
|
||||
}
|
||||
.run(&html)
|
||||
|
@ -91,7 +91,7 @@ impl Newspaper for MondeDiplo {
|
||||
};
|
||||
|
||||
// TODO: Move to const
|
||||
let elements_to_remove = [
|
||||
let elements_to_remove = &[
|
||||
// navigation elements
|
||||
"#tout-en-haut.preentete",
|
||||
"#entete.connecte",
|
||||
@ -110,7 +110,7 @@ impl Newspaper for MondeDiplo {
|
||||
let single_page_html = tools::self_contained_html::Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&url),
|
||||
elements_to_remove: &elements_to_remove,
|
||||
elements_to_remove,
|
||||
..Default::default()
|
||||
}
|
||||
.run(&html)
|
||||
|
@ -59,6 +59,8 @@ where
|
||||
let downloader = self.downloader.expect("Downloader not defined");
|
||||
// TODO: split/refactor this function :
|
||||
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
|
||||
// - put each modification (ex: style in the `foreach`) in functions, maybe using
|
||||
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
|
||||
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
|
||||
let (style_urls, html) = {
|
||||
let document = Document::from(html.as_ref());
|
||||
@ -84,9 +86,8 @@ where
|
||||
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
|
||||
let styles_url = stylesheets
|
||||
.iter()
|
||||
.map(|stylesheet| {
|
||||
if let Some(src) = stylesheet.attr("href") {
|
||||
//TODO: does it work with absolute urls ?
|
||||
.map(|style_link| {
|
||||
if let Some(src) = style_link.attr("href") {
|
||||
base_url.join(src.as_ref()).ok()
|
||||
} else {
|
||||
None
|
||||
@ -110,13 +111,19 @@ where
|
||||
styles
|
||||
.iter()
|
||||
.zip(downloaded_styles.iter())
|
||||
.for_each(|(mut stylesheet, inner_css)| {
|
||||
.for_each(|(mut style_link, inner_css)| {
|
||||
if let Some(Some(inner_css)) = inner_css {
|
||||
let css = String::from_utf8(inner_css.to_vec()).unwrap();
|
||||
let css = format!("<style>{}</style>", css);
|
||||
stylesheet.replace_with_html(css);
|
||||
let media_query = style_link.attr("media");
|
||||
let css = match media_query {
|
||||
Some(media_query) => {
|
||||
format!("<style media=\"{}\">{}</style>", media_query, css)
|
||||
}
|
||||
None => format!("<style>{}</style>", css),
|
||||
};
|
||||
style_link.replace_with_html(css);
|
||||
} else {
|
||||
stylesheet.remove();
|
||||
style_link.remove();
|
||||
}
|
||||
});
|
||||
String::from(document.html())
|
||||
@ -192,6 +199,7 @@ where
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
// TODO: reduce boilerplate, DRY
|
||||
|
||||
use super::*;
|
||||
|
||||
@ -362,6 +370,49 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_css_with_media_query() -> Result<()> {
|
||||
let downloader = CssDownloader {};
|
||||
|
||||
let html = indoc! {"
|
||||
<html>
|
||||
<head>
|
||||
<link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
||||
"};
|
||||
|
||||
let wanted_html = indoc! {"
|
||||
<html><head>
|
||||
<style media=\"print\">
|
||||
section#warning {
|
||||
color: red;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
</body></html>
|
||||
"};
|
||||
let mut minifier = HTMLMinifier::new();
|
||||
minifier.digest(wanted_html)?;
|
||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
assert_eq!(
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
..Default::default()
|
||||
}
|
||||
.run(html)
|
||||
.await,
|
||||
minified
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct PngDownloader;
|
||||
#[async_trait]
|
||||
impl Download for PngDownloader {
|
||||
|
Loading…
Reference in New Issue
Block a user