From 756b1592b7e2b365522c9d02d44fd77d7679f8a8 Mon Sep 17 00:00:00 2001
From: koalp
Date: Sat, 24 Apr 2021 03:44:54 +0200
Subject: [PATCH] feat: allow removing elements from html pages

A feature to remove elements of html pages based on css selectors has
been added. The removal of link elements that load external resources
has also been added.
---
 .drone.yml                                    |  21 +++
 crieur-retrieve/src/article_location.rs       |  17 ++-
 crieur-retrieve/src/consts.rs                 |  13 +-
 crieur-retrieve/src/errors.rs                 |   3 -
 crieur-retrieve/src/newspaper.rs              |   9 +-
 crieur-retrieve/src/newspapers/mediapart.rs   |  30 +++-
 .../src/tools/self_contained_html.rs          | 131 +++++++++++++++---
 .../guides/add_a_newspaper_source.md          |  18 +++
 examples/cli_downloader.rs                    |   7 +-
 src/main.rs                                   |   4 +-
 10 files changed, 215 insertions(+), 38 deletions(-)
 create mode 100644 .drone.yml

diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 0000000..0a95e84
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,21 @@
+---
+kind: pipeline
+name: global
+
+steps:
+- name: lint
+  image: rust
+  pull: true
+  failure: ignore
+  commands:
+  - rustup component add rustfmt
+  - rustup component add clippy
+  - cargo clippy
+  - cargo fmt -- --check
+- name: test
+  image: rust
+  pull: true
+  failure: ignore
+  commands:
+  - cargo test --all
+  - cargo build
diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs
index b98318b..657b2c3 100644
--- a/crieur-retrieve/src/article_location.rs
+++ b/crieur-retrieve/src/article_location.rs
@@ -1,10 +1,9 @@
-use std::convert::TryInto;
-use std::ops::Deref;
 use std::boxed::Box;
+use std::convert::TryInto;
 
 use anyhow::{anyhow, Result};
-use url::{Host, Url};
 use log::info;
+use url::{Host, Url};
 
 use crate::newspaper::Newspaper;
 
@@ -27,7 +26,7 @@ impl<'a> ArticleLocationBuilder<'a> {
     ///
     /// An error is returned if the argument could not be converted into an url
     // TODO: move this to a defined error, remove anyhow !
-    pub fn url<'e, U, E>(mut self, url: U) -> Result<Self>
+    pub fn url<U, E>(mut self, url: U) -> Result<Self>
     where
         U: TryInto<Url, Error = E> + Send,
         E: std::error::Error + Sync + Send + 'static,
@@ -80,14 +79,18 @@ impl<'a> ArticleLocationBuilder<'a> {
         let host = url.host_str().ok_or(anyhow!("Given url has no host"))?;
         let host = Host::parse(host)?;
         let newspaper = self
-            .newspapers.as_ref()
+            .newspapers
+            .as_ref()
             .ok_or(anyhow!(
                 "A list of Newspaper must be set. It can be set with the newspapers() function"
             ))?
-            .into_iter()
+            .iter()
             .find(|c| c.metadata().hosts.contains(&host))
             .ok_or(anyhow!("Newspaper couldn't be found"))?;
-        Ok(ArticleLocation { newspaper: newspaper.clone(), url })
+        Ok(ArticleLocation {
+            newspaper: newspaper.clone(),
+            url,
+        })
     }
 }
 
diff --git a/crieur-retrieve/src/consts.rs b/crieur-retrieve/src/consts.rs
index 89c89fe..cc7da7a 100644
--- a/crieur-retrieve/src/consts.rs
+++ b/crieur-retrieve/src/consts.rs
@@ -1,4 +1,4 @@
-pub const EVENT_HANDLERS: &'static [&'static str] = &[
+pub const EVENT_HANDLERS: &[&str] = &[
     // From https://www.w3.org/TR/html52/webappapis.html#event-handlers-on-elements-document-objects-and-window-objects
     "onabort",
     "onauxclick",
@@ -81,3 +81,14 @@ pub const EVENT_HANDLERS: &'static [&'static str] = &[
     "onpaste",
     "onreadystatechange",
 ];
+
+pub const LINK_REL_EXTERNAL_RESOURCES: &[&str] = &[
+    // source: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel
+    "dns-prefetch",
+    "modulepreload",
+    "pingback",
+    "preconnect",
+    "prefetch",
+    "preload",
+    "prerender",
+];
diff --git a/crieur-retrieve/src/errors.rs b/crieur-retrieve/src/errors.rs
index 677a501..0d5fe97 100644
--- a/crieur-retrieve/src/errors.rs
+++ b/crieur-retrieve/src/errors.rs
@@ -1,6 +1,3 @@
-use anyhow;
-use thiserror;
-
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
     #[error(transparent)]
diff --git a/crieur-retrieve/src/newspaper.rs b/crieur-retrieve/src/newspaper.rs
index 1da4007..cfadba0 100644
--- a/crieur-retrieve/src/newspaper.rs
+++ b/crieur-retrieve/src/newspaper.rs
@@ -41,7 +41,12 @@ pub trait Newspaper {
     /// Returns true if the Newspaper has complete access to the articles
     ///
     /// Usually, it may tell you whether you are logged in when the newspaper has a paywall
-    async fn has_complete_access(&self) -> bool;
+    async fn has_complete_access(&self) -> bool
+    where
+        Self: Sized,
+    {
+        true
+    }
 
     /// Returns a newspaper structure
     async fn new() -> Self
@@ -52,5 +57,5 @@ pub trait Newspaper {
     /// The article **must** be self-contained
     async fn retrieve_html(&self, url: &Url) -> Result<String>;
 
-    // fn login(login: Login)
+    // fn login(login: Login);
 }
diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs
index 8d817e9..8070e7f 100644
--- a/crieur-retrieve/src/newspapers/mediapart.rs
+++ b/crieur-retrieve/src/newspapers/mediapart.rs
@@ -33,6 +33,14 @@ impl Newspaper for Mediapart {
     }
 
     async fn retrieve_html(&self, url: &Url) -> Result<String> {
+        let initial_query = url.query();
+        let query = match initial_query {
+            Some(q) => format!("{}&onglet=full", q),
+            None => "onglet=full".into(),
+        };
+        let mut url = url.clone();
+        url.set_query(Some(&query));
+
         // TODO: add "?onglet=full" to the url if not already present
         let cookies = if let Some((name, value)) = &self.login_cookie {
             let cookie = Cookie::build(name, value).secure(true).finish();
@@ -47,8 +55,28 @@ impl Newspaper for Mediapart {
         let body = downloader.download(&url).await?;
         let html = String::from_utf8(body.to_vec())?;
 
+        // TODO: Move to const
+        let element_to_remove = [
+            // header
+            ".fb-root",
+            ".skipLinks",
+            ".js-flash-message",
+            ".header-sticky.sticky-links",
+            "nav.main-menu",
+            // menus inside and social media buttons
+            "ul.sub-menu-journal",
+            ".tools-social",
+            ".simple-list.universe-journal",
+            ".simple-list.universe-club",
+            // Footer
+            "footer",
+            // Misc
+            "aside.cc-modal",
+        ];
+
         // TODO: correction of usage of relative urls, and replace "" by the url
-        let single_page_html = tools::self_contained_html(&html, &downloader, &url).await;
+        let single_page_html =
+            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
 
         Ok(single_page_html)
     }
 
diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs
index 10ade69..7283a0e 100644
--- a/crieur-retrieve/src/tools/self_contained_html.rs
+++ b/crieur-retrieve/src/tools/self_contained_html.rs
@@ -1,21 +1,11 @@
-use log::debug;
-use std::fs::File;
-use std::io::prelude::*;
 use std::path::Path;
 
-use anyhow::{anyhow, Result};
-use async_trait::async_trait;
-use base64;
-use bytes::Bytes;
-use futures::future::{JoinAll, OptionFuture};
+use futures::future::OptionFuture;
 use html_minifier::HTMLMinifier;
-use indoc::{formatdoc, indoc};
-use itertools::izip;
 use nipper::Document;
 use url::Url;
 
-use crate::consts::EVENT_HANDLERS;
-use crate::errors;
+use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
 use crate::Download;
 
 /// Makes an html page self-contained
@@ -24,14 +14,20 @@ use crate::Download;
 /// needed to make this page self-contained such as stylesheets or images.
 ///
 /// The function also removes all scripts on the page
-pub async fn self_contained_html<E, D, S>(html: S, downloader: &D, base_url: &Url) -> String
+pub async fn self_contained_html<E, D>(
+    html: impl AsRef<str>,
+    downloader: &D,
+    base_url: &Url,
+    elements_to_remove: &[impl AsRef<str>],
+) -> String
 where
     E: std::error::Error,
     D: Download<Error = E> + Send,
-    S: AsRef<str>,
 {
-    // TODO: split/refactor this function
-    // ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+    // TODO: split/refactor this function:
+    // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+    // - `elements_to_remove`, `base_url` and `downloader` should be in a configuration structure
+    // - ¿ should this be a function of a trait ? or only of the configuration struct ?
 
    let (style_urls, html) = {
        let document = Document::from(html.as_ref());
 
@@ -45,6 +41,12 @@ where
                .remove_attr(event);
        }
 
+        for rel in LINK_REL_EXTERNAL_RESOURCES {
+            document
+                .select(format!("link[rel=\"{}\"]", rel).as_str())
+                .remove();
+        }
+
        // ---- Replace stylesheets ---- //
        let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
@@ -124,9 +126,16 @@ where
                img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
            }
        });
+        // ---- Remove unwanted html elements ---- //
+        for element in elements_to_remove {
+            document.select(element.as_ref()).remove();
+        }
 
        String::from(document.html())
    };
 
+    // ---- output ---- //
    let mut minifier = HTMLMinifier::new();
    minifier.digest(html.as_str()).unwrap();
 
@@ -135,8 +144,19 @@
 
 #[cfg(test)]
 mod tests {
+    use super::*;
+
+    use std::fs::File;
+    use std::io::prelude::*;
+
+    use anyhow::Result;
+    use async_trait::async_trait;
+    use bytes::Bytes;
+    use indoc::{formatdoc, indoc};
+
+    use crate::errors;
+
     fn init() {
         let _ = env_logger::builder().is_test(true).try_init();
     }
@@ -158,8 +178,9 @@ mod tests {
         let html = "";
         let base_url = Url::parse("http://example.com")?;
         let downloader = DummyDownloader {};
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url).await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             ""
         );
         Ok(())
     }
@@ -183,15 +204,44 @@ mod tests {
         };
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         for s in EVENT_HANDLERS {
             assert_eq!(
-                self_contained_html(html(s), &downloader, &base_url).await,
+                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
                 "\n\n\n\n"
             );
         }
         Ok(())
     }
 
+    #[tokio::test]
+    async fn remove_link_with_external_resource() -> Result<()> {
+        init();
+        let downloader = DummyDownloader {};
+        let html = |onevent| {
+            formatdoc! {"
+                
+                
+                
+                
+                
+                ",
+                onevent
+            }
+        };
+
+        let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
+        for s in LINK_REL_EXTERNAL_RESOURCES {
+            assert_eq!(
+                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
+                "\n\n\n"
+            );
+        }
+        Ok(())
+    }
+
     struct CssDownloader;
 
     #[async_trait]
     impl Download for CssDownloader {
@@ -236,8 +286,9 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url).await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             minified
         );
         Ok(())
     }
@@ -282,8 +333,48 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url).await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             minified
         );
         Ok(())
     }
+
+    #[tokio::test]
+    async fn remove_css_selectors() -> Result<()> {
+        let html = indoc! {"
+            <html>
+                <head>
+                </head>
+                <body>
+                    <header>The header</header>
+                    <article>The article<span class=\"huge\">social media button</span></article>
+                    <span class=\"placeholder\">a placeholder</span>
+                </body>
+            </html>
+        "};
+
+        let wanted_html = indoc! {"
+            <html>
+                <head>
+                </head>
+                <body>
+                    <article>The article</article>
+                </body>
+            </html>
+        "};
+
+        let base_url = Url::parse("http://example.com")?;
+        let downloader = DummyDownloader {};
+
+        let mut minifier = HTMLMinifier::new();
+        minifier.digest(wanted_html)?;
+        let minified = String::from_utf8(minifier.get_html().into())?;
+
+        assert_eq!(
+            self_contained_html(
+                html,
+                &downloader,
+                &base_url,
+                &["header", ".placeholder", "article > span.huge"]
+            )
+            .await,
+            minified
+        );
+        Ok(())
+    }
 }
diff --git a/documentation/guides/add_a_newspaper_source.md b/documentation/guides/add_a_newspaper_source.md
index 8b13789..39f0685 100644
--- a/documentation/guides/add_a_newspaper_source.md
+++ b/documentation/guides/add_a_newspaper_source.md
@@ -1 +1,19 @@
+---
+title: Add a newspaper source
+---
+
+How to add a newspaper source?
+
+You must implement the `Newspaper` trait for your structure.
+
+# 1. Write the `metadata` function
+
+It returns information about the newspaper.
+
+# 2. Write the `has_complete_access` function
+
+Usually, this indicates whether the user is logged in.
+You are encouraged to test it against the newspaper webpage by making an http call.
+
+You can use the **TODO** helper function that checks whether a specific css
+selector is present in the page located at the given url.
diff --git a/examples/cli_downloader.rs b/examples/cli_downloader.rs
index d651762..fffe05d 100644
--- a/examples/cli_downloader.rs
+++ b/examples/cli_downloader.rs
@@ -2,7 +2,7 @@ use std::convert::TryInto;
 use std::env;
 
 use anyhow::Result;
-use crieur_retrieve::{ArticleLocation, Mediapart, newspaper::Newspaper, Url};
+use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
 use dotenv::dotenv;
 use log::info;
 
@@ -26,7 +26,10 @@ async fn main() -> Result<()> {
     info!("Trying to download article from {}", url);
 
     // TODO: shorten this, maybe a helper function?
-    let article_location = ArticleLocation::builder().url(url)?.newspaper(&mediapart).build()?;
+    let article_location = ArticleLocation::builder()
+        .url(url)?
+        .newspaper(&mediapart)
+        .build()?;
 
     let article_str = article_location.retrieve_html().await?;
 
diff --git a/src/main.rs b/src/main.rs
index 9bab165..03095a9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,5 @@
 use anyhow::Result;
-use crieur_retrieve::{Mediapart, newspaper::Newspaper, Url};
+use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
 use dotenv::dotenv;
 use std::env;
 
@@ -12,7 +12,7 @@ async fn main() -> Result<()> {
 
     // ;
 
-    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
+    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
     let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
     println!("{}", mediapart.retrieve_html(&url).await?);
     Ok(())
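
---

Reviewer note: below is a minimal sketch of what a new source written against
this patch might look like, combining the `add_a_newspaper_source.md` guide
with the new `elements_to_remove` parameter. The `ExampleNewspaper` type, its
host string, the selector list, the `Metadata { hosts }` literal, and the
`tools::Downloader::default()` constructor are illustrative assumptions; the
only parts taken from this patch are `metadata().hosts`, the `Newspaper`
trait methods, and the
`tools::self_contained_html(html, downloader, url, elements_to_remove)` call.

```rust
use anyhow::Result;
use async_trait::async_trait;
use url::{Host, Url};

use crate::newspaper::{Metadata, Newspaper}; // Metadata's module path is assumed
use crate::tools;
use crate::Download;

/// Hypothetical newspaper source, for illustration only.
struct ExampleNewspaper;

#[async_trait]
impl Newspaper for ExampleNewspaper {
    // Assumed signature: the diff only shows that `metadata().hosts`
    // contains `url::Host` values.
    fn metadata(&self) -> Metadata {
        Metadata {
            hosts: vec![Host::parse("www.example.com").expect("valid host")],
        }
    }

    async fn new() -> Self {
        ExampleNewspaper
    }

    // `has_complete_access` is left to the default implementation added by
    // this patch, which returns true.

    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        // Downloader construction is elided in the Mediapart hunk above;
        // `tools::Downloader::default()` is a placeholder, not a confirmed API.
        let downloader = tools::Downloader::default();
        let body = downloader.download(url).await?;
        let html = String::from_utf8(body.to_vec())?;

        // css selectors for page furniture to strip (hypothetical values),
        // passed through the new `elements_to_remove` parameter
        let elements_to_remove = ["header", "footer", ".social-buttons"];

        Ok(tools::self_contained_html(&html, &downloader, url, &elements_to_remove).await)
    }
}
```

With the `has_complete_access` default added in this patch, a paywall-free
source only needs to provide `metadata`, `new`, and `retrieve_html`.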