feat: add retrieval application and one newspaper
A first example as well as some documentation have been added. The first example builds an ArticleLocation and downloads the article as an html String. The documentation explains how it has been designed, what the goal of the application is, and its intended architecture.
This commit is contained in:
parent
7fec40e9e4
commit
c4ab210c4d
1
.gitattributes
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.png filter=lfs diff=lfs merge=lfs -text
|
22
.gitea/issue_template/bug_report.md
Normal file
@ -0,0 +1,22 @@
|
||||
---
|
||||
name: "Bug report"
|
||||
about: "This template is for reporting a bug"
|
||||
title: ""
|
||||
labels:
|
||||
- "type::bug"
|
||||
- "status::review_needed"
|
||||
|
||||
---
|
||||
|
||||
**Description**
|
||||
*write a concise bug description*
|
||||
|
||||
**Steps to reproduce**
|
||||
1.
|
||||
2.
|
||||
|
||||
**Expected behavior**
|
||||
*describe what you expected to happen*
|
||||
|
||||
**Configuration**
|
||||
*paste the result of `stage --version`*
|
19
.gitea/issue_template/design_discussion.md
Normal file
@ -0,0 +1,19 @@
|
||||
---
|
||||
name: "Design discussion"
|
||||
about: "For discussion about the design of features in the application, when there are several possibilities for implementation"
|
||||
title: ""
|
||||
labels:
|
||||
- "type::discussion"
|
||||
- "status::review_needed"
|
||||
|
||||
---
|
||||
|
||||
*briefly describe the problem*
|
||||
|
||||
## Requirements
|
||||
|
||||
*list the requirements that the feature has*
|
||||
|
||||
## Propositions
|
||||
|
||||
*explain the different implementations that you would propose for the feature*
|
15
.gitea/issue_template/feature_request.md
Normal file
@ -0,0 +1,15 @@
|
||||
---
|
||||
name: "Feature request"
|
||||
about: "This template is for requesting a new feature"
|
||||
title: ""
|
||||
labels:
|
||||
- "type::feature"
|
||||
- "status::review_needed"
|
||||
|
||||
---
|
||||
|
||||
*(if applicable) describe what problem or frustration you have currently*
|
||||
|
||||
*describe what you would like to be able to do, or what solution you would like (you can propose several)*
|
||||
|
||||
*(optional) additional context, comments or implementation propositions*
|
15
.gitea/issue_template/question.md
Normal file
@ -0,0 +1,15 @@
|
||||
---
|
||||
name: "Ask a question"
|
||||
about: "If you have a question about the usage of the libraries or the tool"
|
||||
title: ""
|
||||
labels:
|
||||
- "type::question"
|
||||
- "status::review_needed"
|
||||
|
||||
---
|
||||
|
||||
*ask your question*
|
||||
|
||||
*describe what you have read so far to try to answer this question*
|
||||
|
||||
*(optional) do you think something is missing in the documentation?*
|
11
.gitea/issue_template/refactor.md
Normal file
@ -0,0 +1,11 @@
|
||||
---
|
||||
name: "Refactor"
|
||||
about: "For refactoring propositions"
|
||||
title: ""
|
||||
labels:
|
||||
- "type::refactor"
|
||||
- "status::review_needed"
|
||||
|
||||
---
|
||||
|
||||
*explain what you want to refactor and why*
|
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
/target
|
||||
.env
|
2
.rustfmt.toml
Normal file
@ -0,0 +1,2 @@
|
||||
format_strings = true
|
||||
wrap_comments = true
|
2153
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
23
Cargo.toml
Normal file
@ -0,0 +1,23 @@
|
||||
[workspace]
|
||||
|
||||
members = [
|
||||
"crieur-retrieve",
|
||||
]
|
||||
|
||||
|
||||
[package]
|
||||
name = "crieur"
|
||||
version = "0.1.0"
|
||||
authors = ["koalp <koalp@alpaga.dev>"]
|
||||
edition = "2018"
|
||||
publish = false
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.40"
|
||||
crieur-retrieve = {version = "0.1", path="crieur-retrieve"}
|
||||
dotenv = "0.15.0"
|
||||
env_logger = "0.8.3"
|
||||
log = "0.4.14"
|
||||
tokio = { version = "1.5.0", features = ["full"] }
|
17
README.md
Normal file
@ -0,0 +1,17 @@
|
||||
Tools to retrieve articles from multiple newspapers you are subscribed to.
|
||||
|
||||
**This is a prototype: it isn't stable at all, and you may not want to use it if you expect it to just work!**
|
||||
|
||||
# How to use it
|
||||
|
||||
First, retrieve login cookies for the websites and put them in a `.env` file.
|
||||
|
||||
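A minimal `.env` sketch (the `MEDIAPART_COOKIE` variable and the `MPRUUID` cookie name are the ones used by the example in this commit; the value is a placeholder):

```
MEDIAPART_COOKIE=<value of your Mediapart MPRUUID session cookie>
```

Then run the example: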
```
|
||||
cargo run --example=cli_downloader
|
||||
```
|
||||
|
||||
# Documentation
|
||||
|
||||
- 1. [Design](documentation/design/index.md)
|
||||
- a. [Scope of the project](documentation/design/scope.md)
|
||||
- b. [Retrieve](documentation/design/retrieve.md)
|
1
crieur-retrieve/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
Cargo.lock
|
30
crieur-retrieve/Cargo.toml
Normal file
@ -0,0 +1,30 @@
|
||||
[package]
|
||||
name = "crieur-retrieve"
|
||||
description = "Retrive articles from newspapers websites"
|
||||
authors = ["koalp <koalp@aplaga.dev>"]
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.40"
|
||||
async-trait = "0.1.48"
|
||||
thiserror = "1.0.24"
|
||||
url = "2.2.1"
|
||||
hyper = { version = "0.14.5", features = ["full"] }
|
||||
hyper-rustls = "0.22.1"
|
||||
cookie = "0.15.0"
|
||||
lol_html = "0.3.0"
|
||||
indoc = "1.0.3"
|
||||
html-minifier = "3.0.9"
|
||||
bytes = "1.0.1"
|
||||
base64 = "0.13.0"
|
||||
futures = "0.3.14"
|
||||
derive_builder = "0.10.0"
|
||||
nipper = "0.1.9"
|
||||
log = "0.4.14"
|
||||
env_logger = "0.8.3"
|
||||
itertools = "0.10.0"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = "1.5.0"
|
108
crieur-retrieve/src/article_location.rs
Normal file
@ -0,0 +1,108 @@
|
||||
use std::convert::TryInto;
|
||||
use std::ops::Deref;
|
||||
use std::boxed::Box;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use url::{Host, Url};
|
||||
use log::info;
|
||||
|
||||
use crate::newspaper::Newspaper;
|
||||
|
||||
type Newspapers<'a> = Vec<Box<&'a dyn Newspaper>>;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct ArticleLocationBuilder<'a> {
|
||||
url: Option<Url>,
|
||||
newspapers: Option<Newspapers<'a>>,
|
||||
}
|
||||
|
||||
impl<'a> ArticleLocationBuilder<'a> {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Adds an url corresponding to article location
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// An error is returned if the given value could not be converted into an url
|
||||
// TODO: move this to a defined error, remove anyhow !
|
||||
pub fn url<'e, U, E>(mut self, url: U) -> Result<Self>
|
||||
where
|
||||
U: TryInto<Url, Error = E> + Send,
|
||||
E: std::error::Error + Sync + Send + 'static,
|
||||
{
|
||||
let url = url.try_into()?;
|
||||
self.url = Some(url);
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Adds a newspaper to the list
|
||||
pub fn newspaper<T>(&mut self, newspaper: &'a T) -> &mut Self
|
||||
where
|
||||
T: 'a + Newspaper,
|
||||
{
|
||||
match &mut self.newspapers {
|
||||
Some(newspapers) => newspapers.push(Box::new(newspaper)),
|
||||
None => self.newspapers = Some(vec![Box::new(newspaper)]),
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// Adds several newspapers to the list of accepted newspapers
|
||||
//fn newspapers(&mut self, newspapers: Newspapers) -> Result<&mut Self> {
|
||||
// let newspapers = match &self.newspapers {
|
||||
// Some(current_newspapers) => newspapers
|
||||
// .iter()
|
||||
// .chain(current_newspapers.iter())
|
||||
// .map(|s| *(s.clone()))
|
||||
// .collect::<Newspapers>(),
|
||||
// None => newspapers.into_iter().collect::<Vec<_>>(),
|
||||
// };
|
||||
// self.newspapers = Some(newspapers);
|
||||
// Ok(self)
|
||||
//}
|
||||
|
||||
/// Builds the ArticleLocation by looking up which newspaper matches the given url
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// The following errors can be returned
|
||||
///
|
||||
/// - no newspaper is given
|
||||
/// - the url is not set
|
||||
/// - the given url has no host
|
||||
// TODO: move this to a defined error, remove anyhow !
|
||||
pub fn build(&self) -> Result<ArticleLocation<'a>> {
|
||||
let url = Clone::clone(self.url.as_ref().ok_or(anyhow!(
|
||||
"No url set. You can set it with the url() function"
|
||||
))?);
|
||||
let host = url.host_str().ok_or(anyhow!("Given url has no host"))?;
|
||||
let host = Host::parse(host)?;
|
||||
let newspaper = self
|
||||
.newspapers.as_ref()
|
||||
.ok_or(anyhow!(
|
||||
"A list of NewsPaper must be set. It can be set with newspapers() function"
|
||||
))?
|
||||
.into_iter()
|
||||
.find(|c| c.metadata().hosts.contains(&host))
|
||||
.ok_or(anyhow!("Newspaper couldn't be found"))?;
|
||||
Ok(ArticleLocation { newspaper: newspaper.clone(), url })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ArticleLocation<'a> {
|
||||
newspaper: Box<&'a dyn Newspaper>,
|
||||
pub url: Url,
|
||||
}
|
||||
|
||||
impl<'a> ArticleLocation<'a> {
|
||||
pub fn builder() -> ArticleLocationBuilder<'a> {
|
||||
ArticleLocationBuilder::new()
|
||||
}
|
||||
|
||||
pub async fn retrieve_html(&self) -> Result<String> {
|
||||
info!("It will download from {}", self.url);
|
||||
self.newspaper.retrieve_html(&self.url).await
|
||||
}
|
||||
}
|
83
crieur-retrieve/src/consts.rs
Normal file
@ -0,0 +1,83 @@
|
||||
pub const EVENT_HANDLERS: &'static [&'static str] = &[
|
||||
// From https://www.w3.org/TR/html52/webappapis.html#event-handlers-on-elements-document-objects-and-window-objects
|
||||
"onabort",
|
||||
"onauxclick",
|
||||
"oncancel",
|
||||
"oncanplay",
|
||||
"oncanplaythrough",
|
||||
"onchange",
|
||||
"onclick",
|
||||
"onclose",
|
||||
"oncuechange",
|
||||
"ondblclick",
|
||||
"ondrag",
|
||||
"ondragend",
|
||||
"ondragenter",
|
||||
"ondragexit",
|
||||
"ondragleave",
|
||||
"ondragover",
|
||||
"ondragstart",
|
||||
"ondrop",
|
||||
"ondurationchange",
|
||||
"onemptied",
|
||||
"onended",
|
||||
"oninput",
|
||||
"oninvalid",
|
||||
"onkeydown",
|
||||
"onkeypress",
|
||||
"onkeyup",
|
||||
"onloadeddata",
|
||||
"onloadedmetadata",
|
||||
"onloadend",
|
||||
"onloadstart",
|
||||
"onmousedown",
|
||||
"onmouseenter",
|
||||
"onmouseleave",
|
||||
"onmousemove",
|
||||
"onmouseout",
|
||||
"onmouseover",
|
||||
"onmouseup",
|
||||
"onwheel",
|
||||
"onpause",
|
||||
"onplay",
|
||||
"onplaying",
|
||||
"onprogress",
|
||||
"onratechange",
|
||||
"onreset",
|
||||
"onseeked",
|
||||
"onseeking",
|
||||
"onselect",
|
||||
"onshow",
|
||||
"onstalled",
|
||||
"onsubmit",
|
||||
"onsuspend",
|
||||
"ontimeupdate",
|
||||
"ontoggle",
|
||||
"onvolumechange",
|
||||
"onwaiting",
|
||||
"onblur",
|
||||
"onerror",
|
||||
"onfocus",
|
||||
"onload",
|
||||
"onresize",
|
||||
"onscroll",
|
||||
"onafterprint",
|
||||
"onbeforeprint",
|
||||
"onbeforeunload",
|
||||
"onhashchange",
|
||||
"onlanguagechange",
|
||||
"onmessage",
|
||||
"onoffline",
|
||||
"ononline",
|
||||
"onpagehide",
|
||||
"onpageshow",
|
||||
"onrejectionhandled",
|
||||
"onpopstate",
|
||||
"onstorage",
|
||||
"onunhandledrejection",
|
||||
"onunload",
|
||||
"oncut",
|
||||
"oncopy",
|
||||
"onpaste",
|
||||
"onreadystatechange",
|
||||
];
|
10
crieur-retrieve/src/errors.rs
Normal file
@ -0,0 +1,10 @@
|
||||
use anyhow;
|
||||
use thiserror;
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub type Result<T> = core::result::Result<T, Error>;
|
17
crieur-retrieve/src/lib.rs
Normal file
@ -0,0 +1,17 @@
|
||||
pub use url::Url;
|
||||
|
||||
pub mod errors;
|
||||
|
||||
mod tools;
|
||||
pub use tools::{Download, Downloader};
|
||||
|
||||
pub mod newspaper;
|
||||
|
||||
// TODO: move to another crate
|
||||
mod newspapers;
|
||||
pub use newspapers::Mediapart;
|
||||
|
||||
mod article_location;
|
||||
pub use article_location::ArticleLocation;
|
||||
|
||||
mod consts;
|
56
crieur-retrieve/src/newspaper.rs
Normal file
@ -0,0 +1,56 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use derive_builder::Builder;
|
||||
use url::Host;
|
||||
pub use url::Url;
|
||||
|
||||
enum Login {
|
||||
Username(String, String),
|
||||
Cookie(String),
|
||||
}
|
||||
|
||||
/// Contains metadata about a newspaper
|
||||
// TODO: provide builder
|
||||
#[derive(Debug, PartialEq, Default, Builder)]
|
||||
#[builder(default)]
|
||||
pub struct Metadata {
|
||||
/// The hosts that correspond to this newspaper
|
||||
#[builder(setter(into))]
|
||||
pub hosts: Vec<Host>,
|
||||
/// The name of the newspaper, in lower case, without spaces
|
||||
///
|
||||
/// As it should be unique and contain no spaces, it may be used for configuration purposes
|
||||
#[builder(setter(into))]
|
||||
pub lower_case_name: String,
|
||||
/// The full name of the newspaper
|
||||
#[builder(setter(into))]
|
||||
pub name: String,
|
||||
}
|
||||
|
||||
impl Metadata {
|
||||
pub fn builder() -> MetadataBuilder {
|
||||
MetadataBuilder::default()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait Newspaper {
|
||||
/// Returns the newspaper's metadata, such as the hosts that correspond to it
|
||||
fn metadata(&self) -> Metadata;
|
||||
|
||||
/// Returns true if the Newspaper has complete access to the articles
|
||||
///
|
||||
/// Usually, it tells you whether you are logged in when the newspaper has a paywall
|
||||
async fn has_complete_access(&self) -> bool;
|
||||
|
||||
/// Returns a newspaper structure
|
||||
async fn new() -> Self
|
||||
where
|
||||
Self: Sized;
|
||||
|
||||
/// Retrieves an article in html format
|
||||
/// The article **must** be self-contained
|
||||
async fn retrieve_html(&self, url: &Url) -> Result<String>;
|
||||
|
||||
// fn login(login: Login)
|
||||
}
|
65
crieur-retrieve/src/newspapers/mediapart.rs
Normal file
@ -0,0 +1,65 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cookie::Cookie;
|
||||
use url::Host;
|
||||
|
||||
use crate::newspaper::{Metadata, Newspaper};
|
||||
use crate::tools;
|
||||
use crate::Url;
|
||||
use crate::{Download, Downloader};
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct Mediapart {
|
||||
// TODO: remove this pub !!
|
||||
pub login_cookie: Option<(String, String)>,
|
||||
}
|
||||
|
||||
fn str_to_host<S: Into<String>>(host: S) -> Host {
|
||||
Host::Domain(host.into())
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Newspaper for Mediapart {
|
||||
fn metadata(&self) -> Metadata {
|
||||
Metadata::builder()
|
||||
.hosts(vec![
|
||||
str_to_host("mediapart.fr"),
|
||||
str_to_host("www.mediapart.fr"),
|
||||
])
|
||||
.lower_case_name("mediapart")
|
||||
.name("Médiapart")
|
||||
.build()
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
async fn retrieve_html(&self, url: &Url) -> Result<String> {
|
||||
// TODO: add "?onglet=full" to the url if not
|
||||
let cookies = if let Some((name, value)) = &self.login_cookie {
|
||||
let cookie = Cookie::build(name, value).secure(true).finish();
|
||||
vec![cookie]
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
// TODO: replace by builder
|
||||
let downloader = Downloader { cookies };
|
||||
|
||||
let body = downloader.download(&url).await?;
|
||||
let html = String::from_utf8(body.to_vec())?;
|
||||
|
||||
// TODO: correction of usage of relative urls, and replace "" by the url
|
||||
let single_page_html = tools::self_contained_html(&html, &downloader, &url).await;
|
||||
Ok(single_page_html)
|
||||
}
|
||||
|
||||
async fn new() -> Self {
|
||||
Self {
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
async fn has_complete_access(&self) -> bool {
|
||||
// TODO: check if we are logged using the cookie
|
||||
true
|
||||
}
|
||||
}
|
3
crieur-retrieve/src/newspapers/mod.rs
Normal file
@ -0,0 +1,3 @@
|
||||
mod mediapart;
|
||||
|
||||
pub use mediapart::Mediapart;
|
57
crieur-retrieve/src/tools/download.rs
Normal file
@ -0,0 +1,57 @@
|
||||
use std::error::Error as StdError;
|
||||
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use cookie::Cookie;
|
||||
use hyper::{header, Body, Client, Method, Request};
|
||||
use thiserror::Error;
|
||||
use url::Url;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum DownloadError {
|
||||
#[error("Http error")]
|
||||
HttpError(#[from] hyper::http::Error),
|
||||
#[error("Hyper error")]
|
||||
HyperError(#[from] hyper::Error),
|
||||
}
|
||||
|
||||
/// Downloads documents
|
||||
#[async_trait]
|
||||
pub trait Download {
|
||||
type Error: StdError;
|
||||
|
||||
/// Downloads a file from an url and returns the result as bytes
|
||||
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>;
|
||||
}
|
||||
|
||||
/// Stores several cookies and sends them with each request
|
||||
// TODO: add builder or new() function
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Downloader<'c> {
|
||||
pub cookies: Vec<Cookie<'c>>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<'c> Download for Downloader<'c> {
|
||||
type Error = DownloadError;
|
||||
|
||||
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> {
|
||||
let https = hyper_rustls::HttpsConnector::with_native_roots();
|
||||
let client: Client<_, hyper::Body> = Client::builder().build(https);
|
||||
|
||||
let mut req = Request::builder()
|
||||
.method(Method::GET)
|
||||
.uri(file_link.as_str());
|
||||
|
||||
for cookie in &self.cookies {
|
||||
req = req.header(header::COOKIE, cookie.to_string());
|
||||
}
|
||||
|
||||
let req = req.body(Body::empty())?;
|
||||
|
||||
let resp = client.request(req).await?;
|
||||
let body = hyper::body::to_bytes(resp).await?;
|
||||
Ok(body)
|
||||
}
|
||||
}
|
5
crieur-retrieve/src/tools/mod.rs
Normal file
@ -0,0 +1,5 @@
|
||||
mod download;
|
||||
mod self_contained_html;
|
||||
|
||||
pub use download::{Download, DownloadError, Downloader};
|
||||
pub use self_contained_html::self_contained_html;
|
291
crieur-retrieve/src/tools/self_contained_html.rs
Normal file
@ -0,0 +1,291 @@
|
||||
use log::debug;
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use async_trait::async_trait;
|
||||
use base64;
|
||||
use bytes::Bytes;
|
||||
use futures::future::{JoinAll, OptionFuture};
|
||||
use html_minifier::HTMLMinifier;
|
||||
use indoc::{formatdoc, indoc};
|
||||
use itertools::izip;
|
||||
use nipper::Document;
|
||||
use url::Url;
|
||||
|
||||
use crate::consts::EVENT_HANDLERS;
|
||||
use crate::errors;
|
||||
use crate::Download;
|
||||
|
||||
/// Makes an html page self-contained
|
||||
///
|
||||
/// The `downloader` must implement `Download` and is used to download resources that are
|
||||
/// needed to make this page self-contained such as stylesheets or images.
|
||||
///
|
||||
/// The function also removes all scripts on the page
|
||||
pub async fn self_contained_html<E, D, S>(html: S, downloader: &D, base_url: &Url) -> String
|
||||
where
|
||||
E: std::error::Error,
|
||||
D: Download<Error = E> + Send,
|
||||
S: AsRef<str>,
|
||||
{
|
||||
// TODO: split/refactor this function
|
||||
// ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
|
||||
let (style_urls, html) = {
|
||||
let document = Document::from(html.as_ref());
|
||||
|
||||
// ---- Remove scripts ----
|
||||
//
|
||||
document.select("script").remove();
|
||||
|
||||
for event in EVENT_HANDLERS {
|
||||
document
|
||||
.select(format!("[{}]", event).as_str())
|
||||
.remove_attr(event);
|
||||
}
|
||||
|
||||
// ---- Replace stylesheets ----
|
||||
//
|
||||
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
|
||||
let styles_url = stylesheets
|
||||
.iter()
|
||||
.map(|stylesheet| {
|
||||
if let Some(src) = stylesheet.attr("href") {
|
||||
//TODO: does it work with absolute urls ?
|
||||
base_url.join(src.as_ref()).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
(styles_url, String::from(document.html()))
|
||||
};
|
||||
|
||||
let style_urls = style_urls.into_iter().map(|style_url| {
|
||||
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
|
||||
});
|
||||
let downloaded_styles = futures::future::join_all(style_urls).await;
|
||||
|
||||
let html = {
|
||||
let document = Document::from(&html);
|
||||
let styles = document.select("link[href][rel=\"stylesheet\"]");
|
||||
|
||||
styles
|
||||
.iter()
|
||||
.zip(downloaded_styles.iter())
|
||||
.for_each(|(mut stylesheet, inner_css)| {
|
||||
if let Some(inner_css) = inner_css {
|
||||
let css = String::from_utf8(inner_css.to_vec()).unwrap();
|
||||
let css = format!("<style>{}</style>", css);
|
||||
stylesheet.replace_with_html(css);
|
||||
} else {
|
||||
stylesheet.remove();
|
||||
}
|
||||
});
|
||||
String::from(document.html())
|
||||
};
|
||||
|
||||
// ---- Replace imgs ----
|
||||
//
|
||||
let image_urls = {
|
||||
let document = Document::from(&html);
|
||||
let imgs = document.select("img");
|
||||
|
||||
imgs.iter()
|
||||
.map(|image| {
|
||||
if let Some(src) = image.attr("src") {
|
||||
base_url.join(src.as_ref()).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
let downloaded_images = image_urls.into_iter().map(|image_url| {
|
||||
OptionFuture::from(image_url.map(|url| async move {
|
||||
let data = downloader.download(&url).await.unwrap();
|
||||
(url, data)
|
||||
}))
|
||||
});
|
||||
let downloaded_images = futures::future::join_all(downloaded_images).await;
|
||||
|
||||
let html = {
|
||||
let document = Document::from(&html);
|
||||
let imgs = document.select("img");
|
||||
|
||||
imgs.iter()
|
||||
.zip(downloaded_images.iter())
|
||||
.for_each(|(mut img, data)| {
|
||||
if let Some((url, data)) = data {
|
||||
let data = base64::encode(data);
|
||||
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
|
||||
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
|
||||
}
|
||||
});
|
||||
String::from(document.html())
|
||||
};
|
||||
|
||||
let mut minifier = HTMLMinifier::new();
|
||||
minifier.digest(html.as_str()).unwrap();
|
||||
|
||||
String::from_utf8(minifier.get_html().into()).unwrap()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn init() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
}
|
||||
|
||||
// TODO: the Dummy, Css and Png Downloaders don't really test the async scenario as
|
||||
// they don't use futures : they don't call await.
|
||||
// They should be testing the async scenario
|
||||
struct DummyDownloader;
|
||||
#[async_trait]
|
||||
impl Download for DummyDownloader {
|
||||
type Error = errors::Error;
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
|
||||
Ok(Bytes::from(""))
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn remove_scripts() -> Result<()> {
|
||||
let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
let downloader = DummyDownloader {};
|
||||
assert_eq!(
|
||||
self_contained_html(html, &downloader, &base_url).await,
|
||||
"<html><head></head><body></body></html>"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn remove_onevent_handlers() -> Result<()> {
|
||||
init();
|
||||
let downloader = DummyDownloader {};
|
||||
let html = |onevent| {
|
||||
formatdoc! {"
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<button class=\"activate\" {}=\"let id = id => id\">button</button>
|
||||
</body>
|
||||
</html>",
|
||||
onevent
|
||||
}
|
||||
};
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
for s in EVENT_HANDLERS {
|
||||
assert_eq!(
|
||||
self_contained_html(html(s), &downloader, &base_url).await,
|
||||
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct CssDownloader;
|
||||
#[async_trait]
|
||||
impl Download for CssDownloader {
|
||||
type Error = errors::Error;
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
|
||||
Ok(indoc! {"
|
||||
section#warning {
|
||||
color: red;
|
||||
}"}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_css() -> Result<()> {
|
||||
let downloader = CssDownloader {};
|
||||
|
||||
let html = indoc! {"
|
||||
<html>
|
||||
<head>
|
||||
<link rel=\"stylesheet\" href=\"main.css\">
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
||||
"};
|
||||
|
||||
// FIXME: find why minify doesn't minify
|
||||
let wanted_html = indoc! {"
|
||||
<html><head>
|
||||
<style>
|
||||
section#warning {
|
||||
color: red;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
</body></html>
|
||||
"};
|
||||
let mut minifier = HTMLMinifier::new();
|
||||
minifier.digest(wanted_html)?;
|
||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
assert_eq!(
|
||||
self_contained_html(html, &downloader, &base_url).await,
|
||||
minified
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct PngDownloader;
|
||||
#[async_trait]
|
||||
impl Download for PngDownloader {
|
||||
type Error = errors::Error;
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
|
||||
let image_path = Path::new("test_data/home.png");
|
||||
let mut image_file = File::open(&image_path).unwrap();
|
||||
let mut image_buf: Vec<u8> = vec![];
|
||||
image_file.read_to_end(&mut image_buf).unwrap();
|
||||
Ok(image_buf.into())
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_image_png() -> Result<()> {
|
||||
let downloader = PngDownloader {};
|
||||
let html = indoc! {"
|
||||
<html>
|
||||
<head></head>
|
||||
<body>
|
||||
<img src=\"home.png\" alt=\"an home\" />
|
||||
</body>
|
||||
</html>
|
||||
"};
|
||||
|
||||
let wanted_html = indoc! {"
|
||||
<html><head></head>
|
||||
<body>
|
||||
<img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAABm\
|
||||
JLR0QA/wD/AP+gvaeTAAAAh0lEQVQ4jc2RQQqAIBBFn12ioNNE544gWhStc+FNWtVmBDFHsjZ9+DBffeOgo\
|
||||
KsBVmCRukgNYIFTbGVtlDzk4DqCwyZhTqoFXAJO+RN8a1ADewF8CvPqZm8nLNsL2HutgEN70Qc6TBDUr1Fk\
|
||||
AKrMgU4OGaDPdlEmMFFO7ucmeKR/NZgLuMkXFxHZVhLI8sXeAAAAAElFTkSuQmCC\" alt=\"an home\">
|
||||
</body></html>
|
||||
"};
|
||||
let mut minifier = HTMLMinifier::new();
|
||||
minifier.digest(wanted_html)?;
|
||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
assert_eq!(
|
||||
self_contained_html(html, &downloader, &base_url).await,
|
||||
minified
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
BIN
crieur-retrieve/test_data/home.png
(Stored with Git LFS)
Normal file
Binary file not shown.
76
documentation/design/retrieve.md
Normal file
@ -0,0 +1,76 @@
|
||||
---
|
||||
title: crieur-retrieve design
|
||||
---
|
||||
|
||||
# Self-contained html
|
||||
|
||||
Exporting the article as a self-contained html page may be the easiest and most reliable option, as it keeps the
|
||||
original UI of the newspaper and does not require exporting to a different format.
|
||||
|
||||
Providing reusable methods to create a self-contained html page will make it easier to write
|
||||
`Newspaper`s. Those methods would be part of a `crieur-retrieve-tool` library.
|
||||
|
||||
The `self_contained_html` function has been created for this purpose.
|
||||
|
||||
```rust
|
||||
pub fn self_contained_html<S: AsRef<str>>()
|
||||
html: S,
|
||||
downloader: &dyn Fn(Url) -> Option<Bytes>,
|
||||
) -> String
|
||||
```
|
||||
|
||||
## Script removal
|
||||
|
||||
Nothing should be executed by the exported html page.
|
||||
|
||||
Script elements are contained in `<script>` tags, and scripts can also be attached through event handler attributes (e.g. `onclick`,
|
||||
`onmousedown`).
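Both are stripped by `self_contained_html`; simplified from the implementation added in this commit:

```rust
use nipper::Document;

use crate::consts::EVENT_HANDLERS;

// Remove <script> elements, then drop every on* event handler attribute.
fn strip_scripts(html: &str) -> String {
    let document = Document::from(html);
    document.select("script").remove();
    for event in EVENT_HANDLERS {
        document
            .select(format!("[{}]", event).as_str())
            .remove_attr(event);
    }
    String::from(document.html())
}
```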
|
||||
|
||||
## CSS
|
||||
|
||||
CSS should be retrieved and included in the web page.
|
||||
|
||||
To make the web pages minimal, it would be nice to remove all unused CSS, but that may be difficult technically.
|
||||
|
||||
## Images
|
||||
|
||||
All images should be included in the html page. This can be done by encoding them as base64 data URIs.
|
||||
A drawback is that this takes more space.
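In this commit the conversion is done by base64-encoding the downloaded bytes into a `data:` URI, roughly like this (the fallback to `png` is an assumption of this sketch; the implementation unwraps the extension):

```rust
use std::path::Path;

use url::Url;

// Build a data URI for a downloaded image, guessing the media type from the
// file extension found in the url path.
fn to_data_uri(url: &Url, data: &[u8]) -> String {
    let encoded = base64::encode(data);
    let extension = Path::new(url.path())
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("png");
    format!("data:image/{};base64,{}", extension, encoded)
}
```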
|
||||
|
||||
## (optional) Custom filters
|
||||
|
||||
Allowing `Newspaper` creators to write custom html filters would let them adapt the retrieved page to each newspaper.
|
||||
|
||||
The different filters that creators may want to write are:
|
||||
|
||||
- `delete`: delete parts of the page that are useless, based on css selectors (navbars, account, comments)
|
||||
- `link rewrite`: rewrite links so they are absolute. This can be useful if you want to keep external links, to other articles, to the comment section, to the main page of the newspaper, etc.
|
||||
- other filters: asking users what filters they want to write could be useful to know if features are lacking
|
||||
|
||||
The `delete` filter seems the most useful and is easy to implement, as you can just provide a list of css selectors.
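A minimal sketch of such a `delete` filter with `nipper` (hypothetical, not implemented in this commit):

```rust
use nipper::Document;

// Remove every element matching one of the given css selectors.
fn delete_filter(html: &str, selectors: &[&str]) -> String {
    let document = Document::from(html);
    for selector in selectors {
        document.select(selector).remove();
    }
    String::from(document.html())
}
```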
|
||||
|
||||
The others still need to be designed.
|
||||
|
||||
## Minify
|
||||
|
||||
The html and css are minified to take as little space as possible.
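In this commit the minification is done with `html-minifier`, roughly like this (error handling simplified to `anyhow`):

```rust
use html_minifier::HTMLMinifier;

// Minify the final self-contained page, as done at the end of
// `self_contained_html`.
fn minify(html: &str) -> anyhow::Result<String> {
    let mut minifier = HTMLMinifier::new();
    minifier.digest(html)?;
    Ok(String::from_utf8(minifier.get_html().into())?)
}
```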
|
||||
|
||||
**unimplemented** Image sizes could be reduced if they are too big. A format such as webp could
|
||||
also be used.
|
||||
|
||||
## Inspiration
|
||||
|
||||
- [monolith](https://github.com/y2z/monolith), a CLI tool for saving complete web pages as a single HTML file
|
||||
- not really a library (yet ?)
|
||||
- lacks custom selector for removal of unwanted parts
|
||||
- not async
|
||||
|
||||
## Libraries
|
||||
|
||||
[lol-html](https://github.com/cloudflare/lol-html) is a great library, designed to be fast because it streams through the html rather than parsing, storing and modifying it. Unfortunately, it isn't compatible with async downloads, as the library relies on setting up handlers (functions) that are run during processing, and those functions can't be async.
|
||||
|
||||
Therefore, a library that seems to be less used, [nipper](https://github.com/importcjj/nipper), has been chosen. The `Document` type of this library is not `Send`, so it can't be shared between two different `Future`s. To circumvent this issue, the `Document` is recreated after each `await`. The overhead of doing so has not been measured yet.
|
||||
|
||||
# Downloader
|
||||
|
||||
A `downloader` tool helps to write `Newspaper` implementations. The `Download` trait allows the user to provide their own downloader; it also helps with unit testing, as a dummy downloader can be created.
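For example, the unit tests of `self_contained_html` in this commit define a trivial downloader along these lines:

```rust
use async_trait::async_trait;
use bytes::Bytes;
use url::Url;

use crate::errors;
use crate::Download;

struct DummyDownloader;

#[async_trait]
impl Download for DummyDownloader {
    type Error = errors::Error;

    // Always return an empty body so the tests never touch the network.
    async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
        Ok(Bytes::from(""))
    }
}
```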
|
48
documentation/design/scope.md
Normal file
@ -0,0 +1,48 @@
|
||||
This project mainly aims at providing a unified interface for several newspapers. Side
|
||||
objectives are to provide a web API and different clients, like a web UI or chatbots.
|
||||
|
||||
Several big components are planned for this project
|
||||
|
||||
```dot
|
||||
digraph G {
|
||||
rankdir=TB
|
||||
node [shape=rectangle, style=filled, color="#779988"]
|
||||
|
||||
subgraph cluster_frontend {
|
||||
color = transparent
|
||||
webui
|
||||
chatbot
|
||||
}
|
||||
|
||||
|
||||
webui -> api [color = red]
|
||||
chatbot -> api [color = red]
|
||||
|
||||
subgraph cluster_backend {
|
||||
label = "Backend\ncrieur binary"
|
||||
labelloc = b
|
||||
style=filled
|
||||
|
||||
retrieve_tools [label="retrieve-tools"]
|
||||
retrieve_adapters [label="retrieve-adapters"]
|
||||
retrieve [label="retrieve-interface"]
|
||||
auto_retrieve [label="automatic-retrieve"]
|
||||
article_repr [label="article-representation\nRepresentation for articles"]
|
||||
api
|
||||
cache [label="Cache database"]
|
||||
rss [label="Atom/RSS adapters"]
|
||||
|
||||
retrieve_tools -> retrieve_adapters
|
||||
retrieve_adapters -> retrieve
|
||||
retrieve_tools -> retrieve
|
||||
rss -> auto_retrieve
|
||||
article_repr -> retrieve_adapters
|
||||
|
||||
retrieve -> api
|
||||
auto_retrieve -> api
|
||||
cache -> api
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
```
|
1
documentation/guides/add_a_newspaper_source.md
Normal file
@ -0,0 +1 @@
|
||||
|
36
examples/cli_downloader.rs
Normal file
@ -0,0 +1,36 @@
|
||||
use std::convert::TryInto;
|
||||
use std::env;
|
||||
|
||||
use anyhow::Result;
|
||||
use crieur_retrieve::{ArticleLocation, Mediapart, newspaper::Newspaper, Url};
|
||||
use dotenv::dotenv;
|
||||
use log::info;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
dotenv().ok();
|
||||
env_logger::init();
|
||||
|
||||
let url = match env::args().nth(1) {
|
||||
Some(url) => Url::parse(&url)?,
|
||||
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
|
||||
};
|
||||
|
||||
// TODO: remove this in favor of default newspapers
|
||||
let mut mediapart = Mediapart::new().await
|
||||
//.login(USERNAME, PASSWORD)
|
||||
//
|
||||
;
|
||||
|
||||
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
|
||||
info!("Trying to download article from {}", url);
|
||||
|
||||
// TODO: shorten this, maybe an helper function ?
|
||||
let article_location = ArticleLocation::builder().url(url)?.newspaper(&mediapart).build()?;
|
||||
|
||||
let article_str = article_location.retrieve_html().await?;
|
||||
|
||||
println!("{}", article_str);
|
||||
|
||||
Ok(())
|
||||
}
|
16
justfile
Normal file
@ -0,0 +1,16 @@
|
||||
@build:
|
||||
cargo build
|
||||
|
||||
@clean:
|
||||
cargo clean
|
||||
|
||||
@run:
|
||||
cargo run
|
||||
|
||||
@audit:
|
||||
cargo audit
|
||||
|
||||
@crev:
|
||||
cargo crev verify
|
||||
|
||||
@verify: audit crev
|
19
src/main.rs
Normal file
@ -0,0 +1,19 @@
|
||||
use anyhow::Result;
|
||||
use crieur_retrieve::{Mediapart, newspaper::Newspaper, Url};
|
||||
use dotenv::dotenv;
|
||||
use std::env;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
dotenv().ok();
|
||||
|
||||
let mut mediapart = Mediapart::new().await
|
||||
//.login(USERNAME, PASSWORD)
|
||||
//
|
||||
;
|
||||
|
||||
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
|
||||
let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
|
||||
println!("{}", mediapart.retrieve_html(&url).await?);
|
||||
Ok(())
|
||||
}
|