From 756b1592b7e2b365522c9d02d44fd77d7679f8a8 Mon Sep 17 00:00:00 2001
From: koalp
Date: Sat, 24 Apr 2021 03:44:54 +0200
Subject: [PATCH] feat: allow removing elements from html pages

A feature to remove elements of html pages based on css selectors has
been added. The removal of link elements that load external resources
has also been added.
---
 .drone.yml                                    |  21 +++
 crieur-retrieve/src/article_location.rs       |  17 ++-
 crieur-retrieve/src/consts.rs                 |  13 +-
 crieur-retrieve/src/errors.rs                 |   3 -
 crieur-retrieve/src/newspaper.rs              |   9 +-
 crieur-retrieve/src/newspapers/mediapart.rs   |  30 +++-
 .../src/tools/self_contained_html.rs          | 131 +++++++++++++++---
 .../guides/add_a_newspaper_source.md          |  18 +++
 examples/cli_downloader.rs                    |   7 +-
 src/main.rs                                   |   4 +-
 10 files changed, 215 insertions(+), 38 deletions(-)
 create mode 100644 .drone.yml

diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 0000000..0a95e84
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,21 @@
+---
+kind: pipeline
+name: global
+
+steps:
+- name: lint
+  image: rust
+  pull: true
+  failure: ignore
+  commands:
+  - rustup component add rustfmt
+  - rustup component add clippy
+  - cargo clippy
+  - cargo fmt -- --check
+- name: test
+  image: rust
+  pull: true
+  failure: ignore
+  commands:
+  - cargo test --all
+  - cargo build
diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs
index b98318b..657b2c3 100644
--- a/crieur-retrieve/src/article_location.rs
+++ b/crieur-retrieve/src/article_location.rs
@@ -1,10 +1,9 @@
-use std::convert::TryInto;
-use std::ops::Deref;
 use std::boxed::Box;
+use std::convert::TryInto;
 
 use anyhow::{anyhow, Result};
-use url::{Host, Url};
 use log::info;
+use url::{Host, Url};
 
 use crate::newspaper::Newspaper;
 
@@ -27,7 +26,7 @@ impl<'a> ArticleLocationBuilder<'a> {
     ///
     /// An error is returned if the argument could not be converted into an url
     // TODO: move this to a defined error, remove anyhow !
-    pub fn url<'e, U, E>(mut self, url: U) -> Result<Self>
+    pub fn url<U, E>(mut self, url: U) -> Result<Self>
     where
         U: TryInto<Url, Error = E> + Send,
         E: std::error::Error + Sync + Send + 'static,
@@ -80,14 +79,18 @@ impl<'a> ArticleLocationBuilder<'a> {
         let host = url.host_str().ok_or(anyhow!("Given url has no host"))?;
         let host = Host::parse(host)?;
         let newspaper = self
-            .newspapers.as_ref()
+            .newspapers
+            .as_ref()
             .ok_or(anyhow!(
                 "A list of Newspaper must be set. It can be set with the newspapers() function"
             ))?
-            .into_iter()
+            .iter()
             .find(|c| c.metadata().hosts.contains(&host))
             .ok_or(anyhow!("Newspaper couldn't be found"))?;
-        Ok(ArticleLocation { newspaper: newspaper.clone(), url })
+        Ok(ArticleLocation {
+            newspaper: newspaper.clone(),
+            url,
+        })
     }
 }
 
diff --git a/crieur-retrieve/src/consts.rs b/crieur-retrieve/src/consts.rs
index 89c89fe..cc7da7a 100644
--- a/crieur-retrieve/src/consts.rs
+++ b/crieur-retrieve/src/consts.rs
@@ -1,4 +1,4 @@
-pub const EVENT_HANDLERS: &'static [&'static str] = &[
+pub const EVENT_HANDLERS: &[&str] = &[
     // From https://www.w3.org/TR/html52/webappapis.html#event-handlers-on-elements-document-objects-and-window-objects
     "onabort",
     "onauxclick",
@@ -81,3 +81,14 @@ pub const EVENT_HANDLERS: &'static [&'static str] = &[
     "onpaste",
     "onreadystatechange",
 ];
+
+pub const LINK_REL_EXTERNAL_RESOURCES: &[&str] = &[
+    // source: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel
+    "dns-prefetch",
+    "modulepreload",
+    "pingback",
+    "preconnect",
+    "prefetch",
+    "preload",
+    "prerender",
+];
diff --git a/crieur-retrieve/src/errors.rs b/crieur-retrieve/src/errors.rs
index 677a501..0d5fe97 100644
--- a/crieur-retrieve/src/errors.rs
+++ b/crieur-retrieve/src/errors.rs
@@ -1,6 +1,3 @@
-use anyhow;
-use thiserror;
-
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
     #[error(transparent)]
diff --git a/crieur-retrieve/src/newspaper.rs b/crieur-retrieve/src/newspaper.rs
index 1da4007..cfadba0 100644
--- a/crieur-retrieve/src/newspaper.rs
+++ b/crieur-retrieve/src/newspaper.rs
@@ -41,7 +41,12 @@ pub trait Newspaper {
     /// Returns true if the Newspaper has complete access to the articles
     ///
     /// Usually, it may tell you whether you are logged in when the newspaper has a paywall
-    async fn has_complete_access(&self) -> bool;
+    async fn has_complete_access(&self) -> bool
+    where
+        Self: Sized,
+    {
+        true
+    }
 
     /// Returns a newspaper structure
     async fn new() -> Self
@@ -52,5 +57,5 @@ pub trait Newspaper {
     /// The article **must** be self-contained
     async fn retrieve_html(&self, url: &Url) -> Result<String>;
 
-    // fn login(login: Login)
+    // fn login(login: Login);
 }
diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs
index 8d817e9..8070e7f 100644
--- a/crieur-retrieve/src/newspapers/mediapart.rs
+++ b/crieur-retrieve/src/newspapers/mediapart.rs
@@ -33,6 +33,14 @@ impl Newspaper for Mediapart {
     }
 
     async fn retrieve_html(&self, url: &Url) -> Result<String> {
+        let initial_query = url.query();
+        let query = match initial_query {
+            Some(q) => format!("{}&onglet=full", q),
+            None => "onglet=full".into(),
+        };
+        let mut url = url.clone();
+        url.set_query(Some(&query));
+
         // TODO: add "?onglet=full" to the url if not already present
         let cookies = if let Some((name, value)) = &self.login_cookie {
             let cookie = Cookie::build(name, value).secure(true).finish();
@@ -47,8 +55,28 @@ impl Newspaper for Mediapart {
         let body = downloader.download(&url).await?;
         let html = String::from_utf8(body.to_vec())?;
 
+        // TODO: Move to const
+        let element_to_remove = [
+            // header
+            ".fb-root",
+            ".skipLinks",
+            ".js-flash-message",
+            ".header-sticky.sticky-links",
+            "nav.main-menu",
+            // menus inside and social media buttons
+            "ul.sub-menu-journal",
+            ".tools-social",
+            ".simple-list.universe-journal",
+            ".simple-list.universe-club",
+            // Footer
+            "footer",
+            // Misc
+            "aside.cc-modal",
+        ];
+
         // TODO: correction of usage of relative urls, and replace "" by the url
-        let single_page_html = tools::self_contained_html(&html, &downloader, &url).await;
+        let single_page_html =
+            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
 
         Ok(single_page_html)
     }
 
diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs
index 10ade69..7283a0e 100644
--- a/crieur-retrieve/src/tools/self_contained_html.rs
+++ b/crieur-retrieve/src/tools/self_contained_html.rs
@@ -1,21 +1,11 @@
-use log::debug;
-use std::fs::File;
-use std::io::prelude::*;
 use std::path::Path;
 
-use anyhow::{anyhow, Result};
-use async_trait::async_trait;
-use base64;
-use bytes::Bytes;
-use futures::future::{JoinAll, OptionFuture};
+use futures::future::OptionFuture;
 use html_minifier::HTMLMinifier;
-use indoc::{formatdoc, indoc};
-use itertools::izip;
 use nipper::Document;
 use url::Url;
 
-use crate::consts::EVENT_HANDLERS;
-use crate::errors;
+use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
 use crate::Download;
 
 /// Makes an html page self-contained
@@ -24,14 +14,20 @@ use crate::Download;
 /// needed to make this page self-contained such as stylesheets or images.
 ///
 /// The function also removes all scripts on the page
-pub async fn self_contained_html<E, D, S>(html: S, downloader: &D, base_url: &Url) -> String
+pub async fn self_contained_html<E, D>(
+    html: impl AsRef<str>,
+    downloader: &D,
+    base_url: &Url,
+    elements_to_remove: &[impl AsRef<str>],
+) -> String
 where
     E: std::error::Error,
     D: Download<Error = E> + Send,
-    S: AsRef<str>,
 {
-    // TODO: split/refactor this function
-    // ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+    // TODO: split/refactor this function:
+    // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+    // - `elements_to_remove`, `base_url` and `downloader` should be in a configuration structure
+    // - ¿ should this be a function of a trait ? or only of the configuration struct ?
 
    let (style_urls, html) = {
        let document = Document::from(html.as_ref());
 
@@ -45,6 +41,12 @@ where
                .remove_attr(event);
        }
 
+        for rel in LINK_REL_EXTERNAL_RESOURCES {
+            document
+                .select(format!("link[rel=\"{}\"]", rel).as_str())
+                .remove();
+        }
+
        // ---- Replace stylesheets ---- //
        let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
@@ -124,9 +126,16 @@ where
                img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
            }
        });
+        // ---- Remove unwanted html elements ---- //
+        for element in elements_to_remove {
+            document.select(element.as_ref()).remove();
+        }
 
        String::from(document.html())
    };
 
+    // ---- output ---- //
    let mut minifier = HTMLMinifier::new();
    minifier.digest(html.as_str()).unwrap();
 
@@ -135,8 +144,19 @@
 
 #[cfg(test)]
 mod tests {
+    use super::*;
+
+    use std::fs::File;
+    use std::io::prelude::*;
+
+    use anyhow::Result;
+    use async_trait::async_trait;
+    use bytes::Bytes;
+    use indoc::{formatdoc, indoc};
+
+    use crate::errors;
+
     fn init() {
         let _ = env_logger::builder().is_test(true).try_init();
     }
@@ -158,8 +178,9 @@ mod tests {
         let html = "";
         let base_url = Url::parse("http://example.com")?;
         let downloader = DummyDownloader {};
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url).await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             ""
         );
         Ok(())
     }
@@ -183,15 +204,44 @@ mod tests {
         };
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         for s in EVENT_HANDLERS {
             assert_eq!(
-                self_contained_html(html(s), &downloader, &base_url).await,
+                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
                 "\n\n\n\n"
             );
         }
         Ok(())
     }
 
+    #[tokio::test]
+    async fn remove_link_with_external_resource() -> Result<()> {
+        init();
+        let downloader = DummyDownloader {};
+        let html = |onevent| {
+            formatdoc! {"
+                
+                
+                
+                
+                
+                ",
+                onevent
+            }
+        };
+
+        let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
+        for s in LINK_REL_EXTERNAL_RESOURCES {
+            assert_eq!(
+                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
+                "\n\n\n"
+            );
+        }
+        Ok(())
+    }
+
     struct CssDownloader;
 
     #[async_trait]
     impl Download for CssDownloader {
@@ -236,8 +286,9 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url).await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             minified
         );
         Ok(())
     }
@@ -282,8 +333,48 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url).await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             minified
         );
         Ok(())
     }
+
+    #[tokio::test]
+    async fn remove_css_selectors() -> Result<()> {
+        let html = indoc! {"
+            <html>
+                <head>
+                </head>
+                <body>
+                    <header>The header</header>
+                    <article>The article<span class=\"huge\">social media button</span></article>
+                    <span class=\"placeholder\">a placeholder</span>
+                </body>
+            </html>
+        "};
+
+        let wanted_html = indoc! {"
+            <html>
+                <head>
+                </head>
+                <body>
+                    <article>The article</article>
+                </body>
+            </html>
+        "};
+
+        let base_url = Url::parse("http://example.com")?;
+        let downloader = DummyDownloader {};
+
+        let mut minifier = HTMLMinifier::new();
+        minifier.digest(wanted_html)?;
+        let minified = String::from_utf8(minifier.get_html().into())?;
+
+        assert_eq!(
+            self_contained_html(
+                html,
+                &downloader,
+                &base_url,
+                &["header", ".placeholder", "article > span.huge"]
+            )
+            .await,
+            minified
+        );
+        Ok(())
+    }
 }
diff --git a/documentation/guides/add_a_newspaper_source.md b/documentation/guides/add_a_newspaper_source.md
index 8b13789..39f0685 100644
--- a/documentation/guides/add_a_newspaper_source.md
+++ b/documentation/guides/add_a_newspaper_source.md
@@ -1 +1,19 @@
+---
+title: Add a newspaper source
+---
+
+How to add a newspaper source?
+
+You must implement the `Newspaper` trait for your structure.
+
+# 1. Write the `metadata` function
+
+It returns information about the newspaper.
+
+# 2. Write the `has_complete_access` function
+
+Usually, this indicates whether the user is logged in.
+You are encouraged to test it against the newspaper webpage by making an http call.
+
+You can use the **TODO** helper function that checks whether a specific css
+selector is present in the page located at the given url.
diff --git a/examples/cli_downloader.rs b/examples/cli_downloader.rs
index d651762..fffe05d 100644
--- a/examples/cli_downloader.rs
+++ b/examples/cli_downloader.rs
@@ -2,7 +2,7 @@ use std::convert::TryInto;
 use std::env;
 
 use anyhow::Result;
-use crieur_retrieve::{ArticleLocation, Mediapart, newspaper::Newspaper, Url};
+use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
 use dotenv::dotenv;
 use log::info;
 
@@ -26,7 +26,10 @@ async fn main() -> Result<()> {
     info!("Trying to download article from {}", url);
 
     // TODO: shorten this, maybe a helper function?
-    let article_location = ArticleLocation::builder().url(url)?.newspaper(&mediapart).build()?;
+    let article_location = ArticleLocation::builder()
+        .url(url)?
+        .newspaper(&mediapart)
+        .build()?;
 
     let article_str = article_location.retrieve_html().await?;
 
diff --git a/src/main.rs b/src/main.rs
index 9bab165..03095a9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,5 @@
 use anyhow::Result;
-use crieur_retrieve::{Mediapart, newspaper::Newspaper, Url};
+use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
 use dotenv::dotenv;
 use std::env;
 
@@ -12,7 +12,7 @@ async fn main() -> Result<()> {
 
     // ;
 
-    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
+    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
     let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
     println!("{}", mediapart.retrieve_html(&url).await?);
     Ok(())
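
---

Reviewer note: below is a minimal sketch of what a new source written against
this patch might look like, combining the `add_a_newspaper_source.md` guide
with the new `elements_to_remove` parameter. The `ExampleNewspaper` type, its
host string, the selector list, the `Metadata { hosts }` literal, and the
`tools::Downloader::default()` constructor are illustrative assumptions; the
only parts taken from this patch are `metadata().hosts`, the `Newspaper`
trait methods, and the
`tools::self_contained_html(html, downloader, url, elements_to_remove)` call.

```rust
use anyhow::Result;
use async_trait::async_trait;
use url::{Host, Url};

use crate::newspaper::{Metadata, Newspaper}; // Metadata's module path is assumed
use crate::tools;
use crate::Download;

/// Hypothetical newspaper source, for illustration only.
struct ExampleNewspaper;

#[async_trait]
impl Newspaper for ExampleNewspaper {
    // Assumed signature: the diff only shows that `metadata().hosts`
    // contains `url::Host` values.
    fn metadata(&self) -> Metadata {
        Metadata {
            hosts: vec![Host::parse("www.example.com").expect("valid host")],
        }
    }

    async fn new() -> Self {
        ExampleNewspaper
    }

    // `has_complete_access` is left to the default implementation added by
    // this patch, which returns true.

    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        // Downloader construction is elided in the Mediapart hunk above;
        // `tools::Downloader::default()` is a placeholder, not a confirmed API.
        let downloader = tools::Downloader::default();
        let body = downloader.download(url).await?;
        let html = String::from_utf8(body.to_vec())?;

        // css selectors for page furniture to strip (hypothetical values),
        // passed through the new `elements_to_remove` parameter
        let elements_to_remove = ["header", "footer", ".social-buttons"];

        Ok(tools::self_contained_html(&html, &downloader, url, &elements_to_remove).await)
    }
}
```

With the `has_complete_access` default added in this patch, a paywall-free
source only needs to provide `metadata`, `new`, and `retrieve_html`.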