feat: allow removing elements of html pages

A feature to remove elements of html pages based on css selectors has
been added.

Removal of link elements that load external js has also been added.
koalp committed 2021-04-24 03:44:54 +02:00
parent c4ab210c4d
commit 756b1592b7
Signed by: koalp
GPG Key ID: 35B21047DEB09A81
10 changed files with 215 additions and 38 deletions
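In short, `self_contained_html` now takes a list of css selectors and removes every matching element from the page, on top of removing scripts, inline event handlers and link elements whose rel attribute loads an external resource. The sketch below shows the new call; it assumes `tools` and `Download` are reachable from the crate root as in the diff, and the selectors are illustrative:

use crieur_retrieve::{tools, Download};
use url::Url;

// Sketch only: D is any downloader implementing the crate's Download
// trait, like the DummyDownloader used in the tests further down.
async fn strip_page<E, D>(html: &str, downloader: &D) -> anyhow::Result<String>
where
    E: std::error::Error,
    D: Download<Error = E> + Send,
{
    let base_url = Url::parse("http://example.com")?;
    // Every element matching one of these css selectors is removed.
    let to_remove = ["header", ".placeholder"];
    Ok(tools::self_contained_html(html, downloader, &base_url, &to_remove).await)
}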

.drone.yml (new file, 21 additions)
View File

@ -0,0 +1,21 @@
---
kind: pipeline
name: global
steps:
- name: lint
image: rust
pull: true
failure: ignore
commands:
- rustup component add rustfmt
- rustup component add clippy
- cargo clippy
- cargo fmt -- --check
- name: test
image: rust
pull: true
failure: ignore
commands:
- cargo test --all
- cargo build

View File

@ -1,10 +1,9 @@
+use std::convert::TryInto;
-use std::ops::Deref;
-use std::boxed::Box;
-use std::convert::TryInto;
use anyhow::{anyhow, Result};
-use url::{Host, Url};
use log::info;
+use url::{Host, Url};
use crate::newspaper::Newspaper;
@ -27,7 +26,7 @@ impl<'a> ArticleLocationBuilder<'a> {
///
/// An error is returned if the given value could not be converted into an url
// TODO: move this to a defined error, remove anyhow !
-pub fn url<'e, U, E>(mut self, url: U) -> Result<Self>
+pub fn url<U, E>(mut self, url: U) -> Result<Self>
where
U: TryInto<Url, Error = E> + Send,
E: std::error::Error + Sync + Send + 'static,
@ -80,14 +79,18 @@ impl<'a> ArticleLocationBuilder<'a> {
let host = url.host_str().ok_or(anyhow!("Given url has no host"))?;
let host = Host::parse(host)?;
let newspaper = self
-.newspapers.as_ref()
+.newspapers
+.as_ref()
.ok_or(anyhow!(
    "A list of Newspaper must be set. It can be set with the newspapers() function"
))?
-.into_iter()
+.iter()
.find(|c| c.metadata().hosts.contains(&host))
.ok_or(anyhow!("Newspaper couldn't be found"))?;
-Ok(ArticleLocation { newspaper: newspaper.clone(), url })
+Ok(ArticleLocation {
+    newspaper: newspaper.clone(),
+    url,
+})
}
}

View File

@ -1,4 +1,4 @@
-pub const EVENT_HANDLERS: &'static [&'static str] = &[
+pub const EVENT_HANDLERS: &[&str] = &[
// From https://www.w3.org/TR/html52/webappapis.html#event-handlers-on-elements-document-objects-and-window-objects
"onabort",
"onauxclick",
@ -81,3 +81,14 @@ pub const EVENT_HANDLERS: &'static [&'static str] = &[
"onpaste",
"onreadystatechange",
];
pub const LINK_REL_EXTERNAL_RESOURCES: &[&str] = &[
// source: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel
"dns-prefetch",
"modulepreload",
"pingback",
"preconnect",
"prefetch",
"preload",
"prerender",
];

View File

@ -1,6 +1,3 @@
-use anyhow;
-use thiserror;
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error(transparent)]

View File

@ -41,7 +41,12 @@ pub trait Newspaper {
/// Returns true if the Newspaper has complete access to the articles
///
/// Usually, it tells you whether you are logged in when the newspaper has a paywall
-async fn has_complete_access(&self) -> bool;
+async fn has_complete_access(&self) -> bool
+where
+    Self: Sized,
+{
+    true
+}
/// Returns a newspaper structure
async fn new() -> Self
@ -52,5 +57,5 @@ pub trait Newspaper {
/// The article **must** be self-contained
async fn retrieve_html(&self, url: &Url) -> Result<String>;
-// fn login(login: Login)
+// fn login(login: Login);
}

View File

@ -33,6 +33,14 @@ impl Newspaper for Mediapart {
}
async fn retrieve_html(&self, url: &Url) -> Result<String> {
let initial_query = url.query();
let query = match initial_query {
Some(q) => format!("{}&onglet=full", q),
None => "onglet=full".into(),
};
let mut url = url.clone();
url.set_query(Some(&query));
// TODO: add "?onglet=full" to the url if not already present
let cookies = if let Some((name, value)) = &self.login_cookie {
let cookie = Cookie::build(name, value).secure(true).finish();
@ -47,8 +55,28 @@ impl Newspaper for Mediapart {
let body = downloader.download(&url).await?;
let html = String::from_utf8(body.to_vec())?;
+// TODO: Move to const
+let element_to_remove = [
+    // header
+    ".fb-root",
+    ".skipLinks",
+    ".js-flash-message",
+    ".header-sticky.sticky-links",
+    "nav.main-menu",
+    // menus inside and social media buttons
+    "ul.sub-menu-journal",
+    ".tools-social",
+    ".simple-list.universe-journal",
+    ".simple-list.universe-club",
+    // Footer
+    "footer",
+    // Misc
+    "aside.cc-modal",
+];
// TODO: correct the handling of relative urls, and replace "" with the url
-let single_page_html = tools::self_contained_html(&html, &downloader, &url).await;
+let single_page_html =
+    tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
Ok(single_page_html)
}
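Taken on its own, the query rewrite added above appends Mediapart's "onglet=full" parameter, which requests the article as a single full page, while keeping any query string that was already there. A standalone sketch of that logic (the `with_full_view` helper name is hypothetical, not part of the crate):

use url::Url;

// Mirror of the logic above: append "onglet=full" to the query
// string, preserving any existing parameters.
fn with_full_view(url: &Url) -> Url {
    let query = match url.query() {
        Some(q) => format!("{}&onglet=full", q),
        None => "onglet=full".into(),
    };
    let mut url = url.clone();
    url.set_query(Some(&query));
    url
}

For example, "https://example.org/article?page=2" becomes "https://example.org/article?page=2&onglet=full", and a url without a query simply gains "?onglet=full".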

View File

@ -1,21 +1,11 @@
use log::debug;
-use std::fs::File;
-use std::io::prelude::*;
-use std::path::Path;
use anyhow::{anyhow, Result};
-use async_trait::async_trait;
-use base64;
-use bytes::Bytes;
-use futures::future::{JoinAll, OptionFuture};
+use futures::future::OptionFuture;
use html_minifier::HTMLMinifier;
-use indoc::{formatdoc, indoc};
-use itertools::izip;
use nipper::Document;
use url::Url;
-use crate::consts::EVENT_HANDLERS;
-use crate::errors;
+use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download;
/// Makes an html page self-contained
@ -24,14 +14,20 @@ use crate::Download;
/// needed to make this page self-contained such as stylesheets or images.
///
/// The function also removes all scripts on the page
-pub async fn self_contained_html<E, D, S>(html: S, downloader: &D, base_url: &Url) -> String
+pub async fn self_contained_html<E, D>(
+    html: impl AsRef<str>,
+    downloader: &D,
+    base_url: &Url,
+    elements_to_remove: &[impl AsRef<str>],
+) -> String
where
E: std::error::Error,
D: Download<Error = E> + Send,
-S: AsRef<str>,
{
-// TODO: split/refactor this function
-// ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+// TODO: split/refactor this function :
+// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
+// - ¿ should be function of a trait ? or only of the configuration struct ?
let (style_urls, html) = {
let document = Document::from(html.as_ref());
@ -45,6 +41,12 @@ where
.remove_attr(event);
}
for rel in LINK_REL_EXTERNAL_RESOURCES {
document
.select(format!("link[rel=\"{}\"]", rel).as_str())
.remove();
}
// ---- Replace stylesheets ----
//
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
@ -124,9 +126,16 @@ where
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
}
});
// ---- Remove unwanted html elements -----
//
for element in elements_to_remove {
document.select(element.as_ref()).remove();
}
String::from(document.html())
};
// ---- output ----
//
let mut minifier = HTMLMinifier::new();
minifier.digest(html.as_str()).unwrap();
@ -135,8 +144,19 @@ where
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::prelude::*;
use anyhow::Result;
use async_trait::async_trait;
use bytes::Bytes;
use indoc::{formatdoc, indoc};
use crate::errors;
fn init() {
let _ = env_logger::builder().is_test(true).try_init();
}
@ -158,8 +178,9 @@ mod tests {
let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {};
+let to_remove: &[&str] = &[];
assert_eq!(
-self_contained_html(html, &downloader, &base_url).await,
+self_contained_html(html, &downloader, &base_url, to_remove).await,
"<html><head></head><body></body></html>"
);
Ok(())
@ -183,15 +204,44 @@ mod tests {
};
let base_url = Url::parse("http://example.com")?;
+let to_remove: &[&str] = &[];
for s in EVENT_HANDLERS {
assert_eq!(
-self_contained_html(html(s), &downloader, &base_url).await,
+self_contained_html(html(s), &downloader, &base_url, to_remove).await,
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
);
}
Ok(())
}
+#[tokio::test]
+async fn remove_link_with_external_ressource() -> Result<()> {
+    init();
+    let downloader = DummyDownloader {};
+    let html = |onevent| {
+        formatdoc! {"
+            <html>
+                <head>
+                    <link rel=\"{}\" href=\"https://example.org/script.js\">
+                </head>
+                <body>
+                </body>
+            </html>",
+            onevent
+        }
+    };
+    let base_url = Url::parse("http://example.com")?;
+    let to_remove: &[&str] = &[];
+    for s in LINK_REL_EXTERNAL_RESOURCES {
+        assert_eq!(
+            self_contained_html(html(s), &downloader, &base_url, to_remove).await,
+            "<html><head>\n</head>\n<body>\n</body></html>"
+        );
+    }
+    Ok(())
+}
struct CssDownloader;
#[async_trait]
impl Download for CssDownloader {
@ -236,8 +286,9 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
+let to_remove: &[&str] = &[];
assert_eq!(
-self_contained_html(html, &downloader, &base_url).await,
+self_contained_html(html, &downloader, &base_url, to_remove).await,
minified
);
Ok(())
@ -282,8 +333,48 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
+let to_remove: &[&str] = &[];
assert_eq!(
-self_contained_html(html, &downloader, &base_url).await,
+self_contained_html(html, &downloader, &base_url, to_remove).await,
minified
);
Ok(())
}
+#[tokio::test]
+async fn remove_css_selectors() -> Result<()> {
+    let html = indoc! {"
+        <html>
+            <head></head>
+            <body>
+                <header>The header</header>
+                <article>The article<span class=\"huge\">social media button</span></article>
+                <div class=\"placeholder\">a placeholder</div>
+            </body>
+        </html>
+    "};
+    let wanted_html = indoc! {"
+        <html><head></head>
+        <body>
+            <article>The article</article>
+        </body></html>
+    "};
+    let base_url = Url::parse("http://example.com")?;
+    let downloader = DummyDownloader {};
+    let mut minifier = HTMLMinifier::new();
+    minifier.digest(wanted_html)?;
+    let minified = String::from_utf8(minifier.get_html().into())?;
+    assert_eq!(
+        self_contained_html(
+            html,
+            &downloader,
+            &base_url,
+            &["header", ".placeholder", "article > span.huge"]
+        )
+        .await,
+        minified
+    );
+    Ok(())
+}

View File

@ -1 +1,19 @@
---
title: Add a newspaper source
---
How to add a newspaper source?
You must implement the `Newspaper` trait for your structure.
# 1. Write the `metadata` function
It returns information about the newspaper.
# 2. Write the `has_complete_access` function
Usually, this indicates whether the user is logged in.
You are encouraged to test this on the newspaper webpage by making an http call.
You can use the **TODO** helper function, which checks whether a specific css
selector is present in the page located at the given url.
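For example, a minimal sketch of such a check, written inside an `#[async_trait]` implementation of `Newspaper` and using `nipper`, which the crate already depends on. The url, the ".logged-in" selector and the use of `retrieve_html` as the download step are all illustrative, since the real helper function is still **TODO**:

use nipper::Document;
use url::Url;

// Sketch only: fetch a page and report whether an element that is
// shown exclusively to logged-in users is present in it.
async fn has_complete_access(&self) -> bool
where
    Self: Sized,
{
    let url = match Url::parse("https://newspaper.example/") {
        Ok(url) => url,
        Err(_) => return false,
    };
    match self.retrieve_html(&url).await {
        Ok(html) => Document::from(html.as_str()).select(".logged-in").exists(),
        Err(_) => false,
    }
}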

View File

@ -2,7 +2,7 @@ use std::convert::TryInto;
use std::env;
use anyhow::Result;
-use crieur_retrieve::{ArticleLocation, Mediapart, newspaper::Newspaper, Url};
+use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
use dotenv::dotenv;
use log::info;
@ -26,7 +26,10 @@ async fn main() -> Result<()> {
info!("Trying to download article from {}", url);
// TODO: shorten this, maybe an helper function ?
-let article_location = ArticleLocation::builder().url(url)?.newspaper(&mediapart).build()?;
+let article_location = ArticleLocation::builder()
+    .url(url)?
+    .newspaper(&mediapart)
+    .build()?;
let article_str = article_location.retrieve_html().await?;

View File

@ -1,5 +1,5 @@
use anyhow::Result;
-use crieur_retrieve::{Mediapart, newspaper::Newspaper, Url};
+use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
use dotenv::dotenv;
use std::env;
@ -12,7 +12,7 @@ async fn main() -> Result<()> {
//
;
-mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
+mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
println!("{}", mediapart.retrieve_html(&url).await?);
Ok(())