feat: allow removing elements of HTML pages

Adds the ability to remove elements of HTML pages based on CSS selectors. Also adds removal of `<link>` elements that load external resources such as JavaScript.
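
In short, callers now pass a list of CSS selectors to `self_contained_html`, and every element matching one of them is dropped from the page. A minimal sketch of the new call shape (the surrounding variables are illustrative, not part of the commit):

```rust
// Illustrative fragment: `self_contained_html` is the function changed in this
// commit; `html`, `downloader` and `base_url` are assumed to be in scope.
let cleaned: String = self_contained_html(
    html,                        // anything implementing AsRef<str>
    &downloader,                 // anything implementing Download
    &base_url,                   // &url::Url, used when fetching resources
    &["header", ".placeholder"], // elements matching these selectors are removed
)
.await;
```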

This commit is contained in: parent c4ab210c4d · commit 756b1592b7

.drone.yml (new file)
@@ -0,0 +1,21 @@
+---
+kind: pipeline
+name: global
+
+steps:
+- name : lint
+  image: rust
+  pull: true
+  errignore: true
+  commands:
+  - rustup component add rustfmt
+  - rustup component add clippy
+  - cargo clippy
+  - cargo fmt -- --check
+- name : test
+  image: rust
+  pull: true
+  errignore: true
+  commands:
+  - cargo test --all
+  - cargo build

@@ -1,10 +1,9 @@
-use std::convert::TryInto;
-use std::ops::Deref;
 use std::boxed::Box;
+use std::convert::TryInto;
 
 use anyhow::{anyhow, Result};
-use url::{Host, Url};
 use log::info;
+use url::{Host, Url};
 
 use crate::newspaper::Newspaper;

@@ -27,7 +26,7 @@ impl<'a> ArticleLocationBuilder<'a> {
     ///
     /// An error is returned if the could not be converted into an url
     // TODO: move this to a defined error, remove anyhow !
-    pub fn url<'e, U, E>(mut self, url: U) -> Result<Self>
+    pub fn url<U, E>(mut self, url: U) -> Result<Self>
     where
         U: TryInto<Url, Error = E> + Send,
         E: std::error::Error + Sync + Send + 'static,

@@ -80,14 +79,18 @@ impl<'a> ArticleLocationBuilder<'a> {
         let host = url.host_str().ok_or(anyhow!("Given url has no host"))?;
         let host = Host::parse(host)?;
         let newspaper = self
-            .newspapers.as_ref()
+            .newspapers
+            .as_ref()
             .ok_or(anyhow!(
                 "A list of NewsPaper must be set. It can be set with newspapers() function"
             ))?
-            .into_iter()
+            .iter()
             .find(|c| c.metadata().hosts.contains(&host))
             .ok_or(anyhow!("Newspaper couldn't be found"))?;
-        Ok(ArticleLocation { newspaper: newspaper.clone(), url })
+        Ok(ArticleLocation {
+            newspaper: newspaper.clone(),
+            url,
+        })
     }
 }
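
For context, this builder is exercised by the example binary near the end of this diff; a sketch of the flow, assuming an async context and a placeholder URL:

```rust
// Sketch based on the example binary below; `Mediapart` implements the
// `Newspaper` trait, and the URL here is a placeholder, not from the commit.
let mediapart = Mediapart::new().await;
let article_location = ArticleLocation::builder()
    .url("https://example.com/article")? // any value with TryInto<Url> works
    .newspaper(&mediapart)
    .build()?;
let html = article_location.retrieve_html().await?;
```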

@@ -1,4 +1,4 @@
-pub const EVENT_HANDLERS: &'static [&'static str] = &[
+pub const EVENT_HANDLERS: &[&str] = &[
     // From https://www.w3.org/TR/html52/webappapis.html#event-handlers-on-elements-document-objects-and-window-objects
     "onabort",
     "onauxclick",

@@ -81,3 +81,14 @@ pub const EVENT_HANDLERS: &[&str] = &[
     "onpaste",
     "onreadystatechange",
 ];
+
+pub const LINK_REL_EXTERNAL_RESOURCES: &[&str] = &[
+    // source: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel
+    "dns-prefetch",
+    "modulepreload",
+    "pingback",
+    "preconnect",
+    "prefetch",
+    "preload",
+    "prerender",
+];
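
The consts hunk above only declares the data; the loop that consumes it appears later in `self_contained_html`. A self-contained sketch of the same matching logic, using the `nipper` crate and a made-up HTML snippet (only a subset of the rel values is inlined here):

```rust
use nipper::Document;

fn main() {
    // One <link> that preloads an external script.
    let document = Document::from(
        "<html><head><link rel=\"preload\" href=\"https://example.org/script.js\"></head><body></body></html>",
    );
    // Same pattern as the loop added to tools::self_contained_html below.
    for rel in &["dns-prefetch", "preload", "prerender"] {
        document
            .select(format!("link[rel=\"{}\"]", rel).as_str())
            .remove();
    }
    assert!(!String::from(document.html()).contains("preload"));
}
```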

@@ -1,6 +1,3 @@
-use anyhow;
-use thiserror;
-
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
     #[error(transparent)]

@@ -41,7 +41,12 @@ pub trait Newspaper {
     /// Returns true if the Newspaper has complete access to the articles
     ///
     /// Usually, it will may tell you if you are logged in when newspaper have a paywall
-    async fn has_complete_access(&self) -> bool;
+    async fn has_complete_access(&self) -> bool
+    where
+        Self: Sized,
+    {
+        true
+    }
 
     /// Returns a newspaper structure
     async fn new() -> Self

@@ -52,5 +57,5 @@ pub trait Newspaper {
     /// The article **must** be self-contained
     async fn retrieve_html(&self, url: &Url) -> Result<String>;
 
-    // fn login(login: Login)
+    // fn login(login: Login);
 }
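
Since the trait method now ships a default body returning `true`, sources without a paywall can omit it entirely. A hedged sketch of an override for a paywalled source; the `PaywalledPaper` type and its `login_cookie` field are invented for illustration:

```rust
// Sketch only: this impl is not part of the commit.
#[async_trait]
impl Newspaper for PaywalledPaper {
    async fn has_complete_access(&self) -> bool {
        // Complete access usually means a login cookie has been set.
        self.login_cookie.is_some()
    }
    // metadata(), new() and retrieve_html() omitted here
}
```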

@@ -33,6 +33,14 @@ impl Newspaper for Mediapart {
     }
 
     async fn retrieve_html(&self, url: &Url) -> Result<String> {
+        let initial_query = url.query();
+        let query = match initial_query {
+            Some(q) => format!("{}&onglet=full", q),
+            None => "onglet=full".into(),
+        };
+        let mut url = url.clone();
+        url.set_query(Some(&query));
+
         // TODO: add "?onglet=full" to the url if not
         let cookies = if let Some((name, value)) = &self.login_cookie {
             let cookie = Cookie::build(name, value).secure(true).finish();
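
The query rewrite at the top of `retrieve_html` relies only on the `url` crate; a runnable sketch of the same logic with an invented article URL:

```rust
use url::Url;

fn main() -> Result<(), url::ParseError> {
    // Append Mediapart's "onglet=full" parameter while preserving any
    // query string that is already present.
    let url = Url::parse("https://www.mediapart.fr/journal/article?page=2")?;
    let query = match url.query() {
        Some(q) => format!("{}&onglet=full", q),
        None => "onglet=full".into(),
    };
    let mut url = url.clone();
    url.set_query(Some(&query));
    assert_eq!(url.query(), Some("page=2&onglet=full"));
    Ok(())
}
```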

@@ -47,8 +55,28 @@ impl Newspaper for Mediapart {
         let body = downloader.download(&url).await?;
         let html = String::from_utf8(body.to_vec())?;
 
+        // TODO: Move to const
+        let element_to_remove = [
+            // header
+            ".fb-root",
+            ".skipLinks",
+            ".js-flash-message",
+            ".header-sticky.sticky-links",
+            "nav.main-menu",
+            // menus inside and social media buttons
+            "ul.sub-menu-journal",
+            ".tools-social",
+            ".simple-list.universe-journal",
+            ".simple-list.universe-club",
+            // Footer
+            "footer",
+            // Misc
+            "aside.cc-modal",
+        ];
+
         // TODO: correction of usage of relative urls, and replace "" by the url
-        let single_page_html = tools::self_contained_html(&html, &downloader, &url).await;
+        let single_page_html =
+            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
         Ok(single_page_html)
     }
 

@@ -1,21 +1,11 @@
-use log::debug;
-use std::fs::File;
-use std::io::prelude::*;
 use std::path::Path;
 
-use anyhow::{anyhow, Result};
-use async_trait::async_trait;
-use base64;
-use bytes::Bytes;
-use futures::future::{JoinAll, OptionFuture};
+use futures::future::OptionFuture;
 use html_minifier::HTMLMinifier;
-use indoc::{formatdoc, indoc};
-use itertools::izip;
 use nipper::Document;
 use url::Url;
 
-use crate::consts::EVENT_HANDLERS;
-use crate::errors;
+use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
 use crate::Download;
 
 /// Makes an html page self-contained

@@ -24,14 +14,20 @@ use crate::Download;
 /// needed to make this page self-contained such as stylesheets or images.
 ///
 /// The function also removes all scripts on the page
-pub async fn self_contained_html<E, D, S>(html: S, downloader: &D, base_url: &Url) -> String
+pub async fn self_contained_html<E, D>(
+    html: impl AsRef<str>,
+    downloader: &D,
+    base_url: &Url,
+    elements_to_remove: &[impl AsRef<str>],
+) -> String
 where
     E: std::error::Error,
     D: Download<Error = E> + Send,
-    S: AsRef<str>,
 {
-    // TODO: split/refactor this function
-    // ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+    // TODO: split/refactor this function :
+    // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+    // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
+    // - ¿ should be function of a trait ? or only of the configuration struct ?
     let (style_urls, html) = {
         let document = Document::from(html.as_ref());
 
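
The new `elements_to_remove: &[impl AsRef<str>]` parameter accepts any slice of string-like values, but an empty slice needs an explicit element type, which is why the tests below write `let to_remove: &[&str] = &[];`. A standalone, runnable sketch of that behaviour (the function here is a stand-in, not the crate's API):

```rust
fn takes_selectors(elements_to_remove: &[impl AsRef<str>]) {
    for element in elements_to_remove {
        println!("would remove: {}", element.as_ref());
    }
}

fn main() {
    takes_selectors(&["header", ".placeholder"]); // &[&str]
    takes_selectors(&[String::from("footer")]);   // &[String] also works
    let empty: &[&str] = &[];                     // empty slices need an annotation
    takes_selectors(empty);
}
```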

@@ -45,6 +41,12 @@ where
                 .remove_attr(event);
         }
 
+        for rel in LINK_REL_EXTERNAL_RESOURCES {
+            document
+                .select(format!("link[rel=\"{}\"]", rel).as_str())
+                .remove();
+        }
+
         // ---- Replace stylesheets ----
         //
         let stylesheets = document.select("link[href][rel=\"stylesheet\"]");

@@ -124,9 +126,16 @@ where
                 img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
             }
         });
+        // ---- Remove unwanted html elements -----
+        //
+        for element in elements_to_remove {
+            document.select(element.as_ref()).remove();
+        }
         String::from(document.html())
     };
 
+    // ---- output ----
+    //
     let mut minifier = HTMLMinifier::new();
     minifier.digest(html.as_str()).unwrap();
 

@@ -135,8 +144,19 @@ where
 
 #[cfg(test)]
 mod tests {
+
     use super::*;
 
+    use std::fs::File;
+    use std::io::prelude::*;
+
+    use anyhow::Result;
+    use async_trait::async_trait;
+    use bytes::Bytes;
+    use indoc::{formatdoc, indoc};
+
+    use crate::errors;
+
     fn init() {
         let _ = env_logger::builder().is_test(true).try_init();
     }

@@ -158,8 +178,9 @@ mod tests {
         let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
         let base_url = Url::parse("http://example.com")?;
         let downloader = DummyDownloader {};
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url).await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             "<html><head></head><body></body></html>"
         );
         Ok(())

@@ -183,15 +204,44 @@ mod tests {
         };
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         for s in EVENT_HANDLERS {
             assert_eq!(
-                self_contained_html(html(s), &downloader, &base_url).await,
+                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
                 "<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
             );
         }
         Ok(())
     }
 
+    #[tokio::test]
+    async fn remove_link_with_external_ressource() -> Result<()> {
+        init();
+        let downloader = DummyDownloader {};
+        let html = |onevent| {
+            formatdoc! {"
+                <html>
+                    <head>
+                        <link rel=\"{}\" href=\"https://example.org/script.js\">
+                    </head>
+                    <body>
+                    </body>
+                </html>",
+                onevent
+            }
+        };
+
+        let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
+        for s in LINK_REL_EXTERNAL_RESOURCES {
+            assert_eq!(
+                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
+                "<html><head>\n</head>\n<body>\n</body></html>"
+            );
+        }
+        Ok(())
+    }
+
     struct CssDownloader;
     #[async_trait]
     impl Download for CssDownloader {

@@ -236,8 +286,9 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url).await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             minified
         );
         Ok(())

@@ -282,8 +333,48 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url).await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
+            minified
+        );
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn remove_css_selectors() -> Result<()> {
+        let html = indoc! {"
+            <html>
+                <head></head>
+                <body>
+                    <header>The header</header>
+                    <article>The article<span class=\"huge\">social media button</span></article>
+                    <div class=\"placeholder\">a placeholder></div>
+                </body>
+            </html>
+        "};
+
+        let wanted_html = indoc! {"
+            <html><head></head>
+            <body>
+                <article>The article</article>
+            </body></html>
+        "};
+        let base_url = Url::parse("http://example.com")?;
+        let downloader = DummyDownloader {};
+
+        let mut minifier = HTMLMinifier::new();
+        minifier.digest(wanted_html)?;
+        let minified = String::from_utf8(minifier.get_html().into())?;
+
+        assert_eq!(
+            self_contained_html(
+                html,
+                &downloader,
+                &base_url,
+                &["header", ".placeholder", "article > span.huge"]
+            )
+            .await,
             minified
         );
         Ok(())

@@ -1 +1,19 @@
+---
+title: Add a newspaper source
+---
+
+How to add a newspaper source?
+
+You must implement the `Newspaper` trait for your structure.
+
+# 1. Write the `metadata` function
+
+It returns information about the newspaper.
+
+# 2. Write the `has_complete_access` function
+
+Usually, this indicates whether the user is logged in.
+You are encouraged to test this on the newspaper webpage by making an HTTP call.
+
+You can use the **TODO** helper function, which checks whether a specific css
+selector is present in the page located at the given url.
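
A skeleton of what this guide asks for might look as follows. The method names come from the `Newspaper` trait in this diff; the `Metadata` return type, its role in host matching, and the struct itself are assumptions:

```rust
use anyhow::Result;
use async_trait::async_trait;
use url::Url;

struct ExamplePaper;

#[async_trait]
impl Newspaper for ExamplePaper {
    // 1. Describe the newspaper; ArticleLocation matches article URLs
    //    against the hosts listed in this metadata (assumed signature).
    fn metadata(&self) -> Metadata {
        todo!()
    }

    // 2. Usually: is the user logged in? Omitting this keeps the
    //    default, which returns true.
    async fn has_complete_access(&self) -> bool {
        true
    }

    async fn new() -> Self {
        ExamplePaper
    }

    // The returned article must be self-contained.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        todo!()
    }
}
```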

@@ -2,7 +2,7 @@ use std::convert::TryInto;
 use std::env;
 
 use anyhow::Result;
-use crieur_retrieve::{ArticleLocation, Mediapart, newspaper::Newspaper, Url};
+use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
 use dotenv::dotenv;
 use log::info;
 
|
|||||||
info!("Trying to download article from {}", url);
|
info!("Trying to download article from {}", url);
|
||||||
|
|
||||||
// TODO: shorten this, maybe an helper function ?
|
// TODO: shorten this, maybe an helper function ?
|
||||||
let article_location = ArticleLocation::builder().url(url)?.newspaper(&mediapart).build()?;
|
let article_location = ArticleLocation::builder()
|
||||||
|
.url(url)?
|
||||||
|
.newspaper(&mediapart)
|
||||||
|
.build()?;
|
||||||
|
|
||||||
let article_str = article_location.retrieve_html().await?;
|
let article_str = article_location.retrieve_html().await?;
|
||||||
|
|
||||||
|

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use crieur_retrieve::{Mediapart, newspaper::Newspaper, Url};
+use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
 use dotenv::dotenv;
 use std::env;
 
|
|||||||
//
|
//
|
||||||
;
|
;
|
||||||
|
|
||||||
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
|
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
|
||||||
let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
|
let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
|
||||||
println!("{}", mediapart.retrieve_html(&url).await?);
|
println!("{}", mediapart.retrieve_html(&url).await?);
|
||||||
Ok(())
|
Ok(())
|
||||||
|