Merge branch 'feature/mediapart_poc' into development

This commit is contained in:
koalp 2021-04-24 03:48:39 +02:00
commit 9aa2b5f07b
Signed by: koalp
GPG Key ID: 35B21047DEB09A81
31 changed files with 3382 additions and 0 deletions

21
.drone.yml Normal file
View File

@ -0,0 +1,21 @@
---
kind: pipeline
name: global
steps:
- name: lint
image: rust
pull: true
failure: ignore
commands:
- rustup component add rustfmt
- rustup component add clippy
- cargo clippy
- cargo fmt -- --check
- name: test
image: rust
pull: true
failure: ignore
commands:
- cargo test --all
- cargo build

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
*.png filter=lfs diff=lfs merge=lfs -text

View File

@ -0,0 +1,22 @@
---
name: "Bug report"
about: "This template is for reporting a bug"
title: ""
labels:
- "type::bug"
- "status::review_needed"
---
**Description**
*write a concise bug description*
**Steps to reproduce**
1.
2.
**Expected behavior**
*describe what you expected to happen*
**Configuration**
*paste the result of `stage --version`*

View File

@ -0,0 +1,19 @@
---
name: "Design discussion"
about: "For discussion about the design of features in the application, when there are several possibilities for implementation"
title: ""
labels:
- "type::discussion"
- "status::review_needed"
---
*briefly describe the problem*
## Requirements
*list the requirements that the feature has*
## Propositions
*explain the different implementations that you would propose for the feature*

View File

@ -0,0 +1,15 @@
---
name: "Feature request"
about: "This template is for requesting a new feature"
title: ""
labels:
- "type::feature"
- "status::review_needed"
---
*(if applicable) describe what problem or frustration you have currently*
*describe what you would like to be able to do, or what solution you would like (you can propose several)*
*(optional) additional context, comments or implementation propositions*

View File

@ -0,0 +1,15 @@
---
name: "Ask a question"
about: "If you have a question about the usage of the libraries or the tool"
title: ""
labels:
- "type::question"
- "status::review_needed"
---
*ask your question*
*describe what you have read so far to try to answer this question*
*(optional) do you think this points to a gap in the documentation?*

View File

@ -0,0 +1,11 @@
---
name: "Refactor"
about: "For refactoring propositions"
title: ""
labels:
- "type::refactor"
- "status::review_needed"
---
*explain why and what you want to refactor*

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/target
.env

2
.rustfmt.toml Normal file
View File

@ -0,0 +1,2 @@
format_strings = true
wrap_comments = true

2153
Cargo.lock generated Normal file

File diff suppressed because it is too large

23
Cargo.toml Normal file
View File

@ -0,0 +1,23 @@
[workspace]
members = [
"crieur-retrieve",
]
[package]
name = "crieur"
version = "0.1.0"
authors = ["koalp <koalp@alpaga.dev>"]
edition = "2018"
publish = false
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.40"
crieur-retrieve = {version = "0.1", path="crieur-retrieve"}
dotenv = "0.15.0"
env_logger = "0.8.3"
log = "0.4.14"
tokio = { version = "1.5.0", features = ["full"] }

17
README.md Normal file
View File

@ -0,0 +1,17 @@
Tools to retrieve articles from multiple newspapers you are subscribed to.
**This is a prototype. It isn't stable at all and you may not want to use it if you expect it to just work!**
# How to use it
First, retrieve login cookies for the websites and put them in a `.env` file, then run:
```
cargo run --example=retrive_html_articles
```
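Which variables are needed depends on the newspaper; for the bundled Mediapart example, which reads a `MEDIAPART_COOKIE` variable holding the value of the `MPRUUID` cookie, the `.env` file could look like this (the value shown is a placeholder):
```
MEDIAPART_COOKIE=paste-the-value-of-your-MPRUUID-cookie-here
```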
# Documentation
- 1. [Design](documentation/design/index.md)
- a. [Scope of the project](documentation/design/scope.md)
- b. [Retrieve](documentation/design/retrieve.md)

1
crieur-retrieve/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
Cargo.lock

View File

@ -0,0 +1,30 @@
[package]
name = "crieur-retrieve"
description = "Retrive articles from newspapers websites"
authors = ["koalp <koalp@aplaga.dev>"]
version = "0.1.0"
edition = "2018"
publish = false
[dependencies]
anyhow = "1.0.40"
async-trait = "0.1.48"
thiserror = "1.0.24"
url = "2.2.1"
hyper = { version = "0.14.5", features = ["full"] }
hyper-rustls = "0.22.1"
cookie = "0.15.0"
lol_html = "0.3.0"
indoc = "1.0.3"
html-minifier = "3.0.9"
bytes = "1.0.1"
base64 = "0.13.0"
futures = "0.3.14"
derive_builder = "0.10.0"
nipper = "0.1.9"
log = "0.4.14"
env_logger = "0.8.3"
itertools = "0.10.0"
[dev-dependencies]
tokio = "1.5.0"

View File

@ -0,0 +1,111 @@
use std::boxed::Box;
use std::convert::TryInto;
use anyhow::{anyhow, Result};
use log::info;
use url::{Host, Url};
use crate::newspaper::Newspaper;
type Newspapers<'a> = Vec<Box<&'a dyn Newspaper>>;
#[derive(Default)]
pub struct ArticleLocationBuilder<'a> {
url: Option<Url>,
newspapers: Option<Newspapers<'a>>,
}
impl<'a> ArticleLocationBuilder<'a> {
pub fn new() -> Self {
Self::default()
}
/// Adds an url corresponding to the article location
///
/// # Errors
///
/// An error is returned if the given value could not be converted into an url
// TODO: move this to a defined error, remove anyhow !
pub fn url<U, E>(mut self, url: U) -> Result<Self>
where
U: TryInto<Url, Error = E> + Send,
E: std::error::Error + Sync + Send + 'static,
{
let url = url.try_into()?;
self.url = Some(url);
Ok(self)
}
/// Adds a newspaper to the list
pub fn newspaper<T>(&mut self, newspaper: &'a T) -> &mut Self
where
T: 'a + Newspaper,
{
match &mut self.newspapers {
Some(newspapers) => newspapers.push(Box::new(newspaper)),
None => self.newspapers = Some(vec![Box::new(newspaper)]),
}
self
}
/// Adds several newspapers to the list of accepted newspapers
//fn newspapers(&mut self, newspapers: Newspapers) -> Result<&mut Self> {
// let newspapers = match &self.newspapers {
// Some(current_newspapers) => newspapers
// .iter()
// .chain(current_newspapers.iter())
// .map(|s| *(s.clone()))
// .collect::<Newspapers>(),
// None => newspapers.into_iter().collect::<Vec<_>>(),
// };
// self.newspapers = Some(newspapers);
// Ok(self)
//}
/// Builds the ArticleLocation by looking up which newspaper matches the url's host
///
/// # Errors
///
/// The following errors can be returned
///
/// - no newspaper is given
/// - the url is not set
/// - the given url has no host
// TODO: move this to a defined error, remove anyhow !
pub fn build(&self) -> Result<ArticleLocation<'a>> {
let url = Clone::clone(self.url.as_ref().ok_or(anyhow!(
"No url set. You can set it with the url() function"
))?);
let host = url.host_str().ok_or(anyhow!("Given url has no host"))?;
let host = Host::parse(host)?;
let newspaper = self
.newspapers
.as_ref()
.ok_or(anyhow!(
"A list of NewsPaper must be set. It can be set with newspapers() function"
))?
.iter()
.find(|c| c.metadata().hosts.contains(&host))
.ok_or(anyhow!("Newspaper couldn't be found"))?;
Ok(ArticleLocation {
newspaper: newspaper.clone(),
url,
})
}
}
pub struct ArticleLocation<'a> {
newspaper: Box<&'a dyn Newspaper>,
pub url: Url,
}
impl<'a> ArticleLocation<'a> {
pub fn builder() -> ArticleLocationBuilder<'a> {
ArticleLocationBuilder::new()
}
pub async fn retrieve_html(&self) -> Result<String> {
info!("It will download from {}", self.url);
self.newspaper.retrieve_html(&self.url).await
}
}

View File

@ -0,0 +1,94 @@
pub const EVENT_HANDLERS: &[&str] = &[
// From https://www.w3.org/TR/html52/webappapis.html#event-handlers-on-elements-document-objects-and-window-objects
"onabort",
"onauxclick",
"oncancel",
"oncanplay",
"oncanplaythrough",
"onchange",
"onclick",
"onclose",
"oncuechange",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragexit",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onemptied",
"onended",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
"onloadeddata",
"onloadedmetadata",
"onloadend",
"onloadstart",
"onmousedown",
"onmouseenter",
"onmouseleave",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onwheel",
"onpause",
"onplay",
"onplaying",
"onprogress",
"onratechange",
"onreset",
"onseeked",
"onseeking",
"onselect",
"onshow",
"onstalled",
"onsubmit",
"onsuspend",
"ontimeupdate",
"ontoggle",
"onvolumechange",
"onwaiting",
"onblur",
"onerror",
"onfocus",
"onload",
"onresize",
"onscroll",
"onafterprint",
"onbeforeprint",
"onbeforeunload",
"onhashchange",
"onlanguagechange",
"onmessage",
"onoffline",
"ononline",
"onpagehide",
"onpageshow",
"onrejectionhandled",
"onpopstate",
"onstorage",
"onunhandledrejection",
"onunload",
"oncut",
"oncopy",
"onpaste",
"onreadystatechange",
];
pub const LINK_REL_EXTERNAL_RESOURCES: &[&str] = &[
// source: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel
"dns-prefetch",
"modulepreload",
"pingback",
"preconnect",
"prefetch",
"preload",
"prerender",
];

View File

@ -0,0 +1,7 @@
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error(transparent)]
Other(#[from] anyhow::Error),
}
pub type Result<T> = core::result::Result<T, Error>;

View File

@ -0,0 +1,17 @@
pub use url::Url;
pub mod errors;
mod tools;
pub use tools::{Download, Downloader};
pub mod newspaper;
// TODO: move to another crate
mod newspapers;
pub use newspapers::Mediapart;
mod article_location;
pub use article_location::ArticleLocation;
mod consts;

View File

@ -0,0 +1,61 @@
use anyhow::Result;
use async_trait::async_trait;
use derive_builder::Builder;
use url::Host;
pub use url::Url;
enum Login {
Username(String, String),
Cookie(String),
}
/// Contains metadata about a newspaper
// TODO: provide builder
#[derive(Debug, PartialEq, Default, Builder)]
#[builder(default)]
pub struct Metadata {
/// The hosts that can correspond to this newspaper
#[builder(setter(into))]
pub hosts: Vec<Host>,
/// The name of the newspaper, in lower case, without spaces
///
/// As it should be unique and contain no spaces, it may be used for configuration purposes
#[builder(setter(into))]
pub lower_case_name: String,
/// The full name of the newspaper
#[builder(setter(into))]
pub name: String,
}
impl Metadata {
pub fn builder() -> MetadataBuilder {
MetadataBuilder::default()
}
}
#[async_trait]
pub trait Newspaper {
/// Returns metadata about the newspaper, including the hosts it corresponds to
fn metadata(&self) -> Metadata;
/// Returns true if the Newspaper has complete access to the articles
///
/// Usually, it will tell you whether you are logged in when the newspaper has a paywall
async fn has_complete_access(&self) -> bool
where
Self: Sized,
{
true
}
/// Returns a newspaper structure
async fn new() -> Self
where
Self: Sized;
/// Retrieves an article in html format
/// The article **must** be self-contained
async fn retrieve_html(&self, url: &Url) -> Result<String>;
// fn login(login: Login);
}

View File

@ -0,0 +1,93 @@
use anyhow::Result;
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
#[derive(Debug, Clone, Default)]
pub struct Mediapart {
// TODO: remove this pub !!
pub login_cookie: Option<(String, String)>,
}
fn str_to_host<S: Into<String>>(host: S) -> Host {
Host::Domain(host.into())
}
#[async_trait]
impl Newspaper for Mediapart {
fn metadata(&self) -> Metadata {
Metadata::builder()
.hosts(vec![
str_to_host("mediapart.fr"),
str_to_host("www.mediapart.fr"),
])
.lower_case_name("mediapart")
.name("Médiapart")
.build()
.unwrap_or_default()
}
async fn retrieve_html(&self, url: &Url) -> Result<String> {
let initial_query = url.query();
let query = match initial_query {
Some(q) => format!("{}&onglet=full", q),
None => "onglet=full".into(),
};
let mut url = url.clone();
url.set_query(Some(&query));
// TODO: add "?onglet=full" to the url if not
let cookies = if let Some((name, value)) = &self.login_cookie {
let cookie = Cookie::build(name, value).secure(true).finish();
vec![cookie]
} else {
vec![]
};
// TODO: replace by builder
let downloader = Downloader { cookies };
let body = downloader.download(&url).await?;
let html = String::from_utf8(body.to_vec())?;
// TODO: Move to const
let element_to_remove = [
// header
".fb-root",
".skipLinks",
".js-flash-message",
".header-sticky.sticky-links",
"nav.main-menu",
// menus inside and social media buttons
"ul.sub-menu-journal",
".tools-social",
".simple-list.universe-journal",
".simple-list.universe-club",
// Footer
"footer",
// Misc
"aside.cc-modal",
];
// TODO: correction of usage of relative urls, and replace "" by the url
let single_page_html =
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
Ok(single_page_html)
}
async fn new() -> Self {
Self {
..Default::default()
}
}
async fn has_complete_access(&self) -> bool {
// TODO: check if we are logged using the cookie
true
}
}

View File

@ -0,0 +1,3 @@
mod mediapart;
pub use mediapart::Mediapart;

View File

@ -0,0 +1,57 @@
use std::error::Error as StdError;
use anyhow::Result;
use async_trait::async_trait;
use bytes::Bytes;
use cookie::Cookie;
use hyper::{header, Body, Client, Method, Request};
use thiserror::Error;
use url::Url;
#[derive(Error, Debug)]
pub enum DownloadError {
#[error("Http error")]
HttpError(#[from] hyper::http::Error),
#[error("Hyper error")]
HyperError(#[from] hyper::Error),
}
/// Downloads documents
#[async_trait]
pub trait Download {
type Error: StdError;
/// Downloads a file from an url and returns the result as bytes
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>;
}
/// Stores several cookies
// TODO: add builder or new() function
#[derive(Debug, Clone)]
pub struct Downloader<'c> {
pub cookies: Vec<Cookie<'c>>,
}
#[async_trait]
impl<'c> Download for Downloader<'c> {
type Error = DownloadError;
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> {
let https = hyper_rustls::HttpsConnector::with_native_roots();
let client: Client<_, hyper::Body> = Client::builder().build(https);
let mut req = Request::builder()
.method(Method::GET)
.uri(file_link.as_str());
for cookie in &self.cookies {
req = req.header(header::COOKIE, cookie.to_string());
}
let req = req.body(Body::empty())?;
let resp = client.request(req).await?;
let body = hyper::body::to_bytes(resp).await?;
Ok(body)
}
}

View File

@ -0,0 +1,5 @@
mod download;
mod self_contained_html;
pub use download::{Download, DownloadError, Downloader};
pub use self_contained_html::self_contained_html;

View File

@ -0,0 +1,382 @@
use std::path::Path;
use futures::future::OptionFuture;
use html_minifier::HTMLMinifier;
use nipper::Document;
use url::Url;
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download;
/// Makes an html page self-contained
///
/// The `downloader` must implement `Download` and is used to download resources that are
/// needed to make this page self-contained such as stylesheets or images.
///
/// The function also removes all scripts on the page
pub async fn self_contained_html<E, D>(
html: impl AsRef<str>,
downloader: &D,
base_url: &Url,
elements_to_remove: &[impl AsRef<str>],
) -> String
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
// - ¿ should be function of a trait ? or only of the configuration struct ?
let (style_urls, html) = {
let document = Document::from(html.as_ref());
// ---- Remove scripts ----
//
document.select("script").remove();
for event in EVENT_HANDLERS {
document
.select(format!("[{}]", event).as_str())
.remove_attr(event);
}
for rel in LINK_REL_EXTERNAL_RESOURCES {
document
.select(format!("link[rel=\"{}\"]", rel).as_str())
.remove();
}
// ---- Replace stylesheets ----
//
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|stylesheet| {
if let Some(src) = stylesheet.attr("href") {
//TODO: does it work with absolute urls ?
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>();
(styles_url, String::from(document.html()))
};
let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
});
let downloaded_styles = futures::future::join_all(style_urls).await;
let html = {
let document = Document::from(&html);
let styles = document.select("link[href][rel=\"stylesheet\"]");
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| {
if let Some(inner_css) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
} else {
stylesheet.remove();
}
});
String::from(document.html())
};
// ---- Replace imgs ----
//
let image_urls = {
let document = Document::from(&html);
let imgs = document.select("img");
imgs.iter()
.map(|image| {
if let Some(src) = image.attr("src") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>()
};
let downloaded_images = image_urls.into_iter().map(|image_url| {
OptionFuture::from(image_url.map(|url| async move {
let data = downloader.download(&url).await.unwrap();
(url, data)
}))
});
let downloaded_images = futures::future::join_all(downloaded_images).await;
let html = {
let document = Document::from(&html);
let imgs = document.select("img");
imgs.iter()
.zip(downloaded_images.iter())
.for_each(|(mut img, data)| {
if let Some((url, data)) = data {
let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
}
});
// ---- Remove unwanted html elements -----
//
for element in elements_to_remove {
document.select(element.as_ref()).remove();
}
String::from(document.html())
};
// ---- output ----
//
let mut minifier = HTMLMinifier::new();
minifier.digest(html.as_str()).unwrap();
String::from_utf8(minifier.get_html().into()).unwrap()
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::prelude::*;
use anyhow::Result;
use async_trait::async_trait;
use bytes::Bytes;
use indoc::{formatdoc, indoc};
use crate::errors;
fn init() {
let _ = env_logger::builder().is_test(true).try_init();
}
// TODO: the Dummy, Css and Png Downloaders don't really test the async scenario as
// they don't use futures: they never call await.
// They should be testing the async scenario
struct DummyDownloader;
#[async_trait]
impl Download for DummyDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
Ok(Bytes::from(""))
}
}
#[tokio::test]
async fn remove_scripts() -> Result<()> {
let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {};
let to_remove: &[&str] = &[];
assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await,
"<html><head></head><body></body></html>"
);
Ok(())
}
#[tokio::test]
async fn remove_onevent_handlers() -> Result<()> {
init();
let downloader = DummyDownloader {};
let html = |onevent| {
formatdoc! {"
<html>
<head>
</head>
<body>
<button class=\"activate\" {}=\"let id = id => id\">button</button>
</body>
</html>",
onevent
}
};
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in EVENT_HANDLERS {
assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
);
}
Ok(())
}
#[tokio::test]
async fn remove_link_with_external_ressource() -> Result<()> {
init();
let downloader = DummyDownloader {};
let html = |onevent| {
formatdoc! {"
<html>
<head>
<link rel=\"{}\" href=\"https://example.org/script.js\">
</head>
<body>
</body>
</html>",
onevent
}
};
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in LINK_REL_EXTERNAL_RESOURCES {
assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
"<html><head>\n</head>\n<body>\n</body></html>"
);
}
Ok(())
}
struct CssDownloader;
#[async_trait]
impl Download for CssDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
Ok(indoc! {"
section#warning {
color: red;
}"}
.into())
}
}
#[tokio::test]
async fn download_css() -> Result<()> {
let downloader = CssDownloader {};
let html = indoc! {"
<html>
<head>
<link rel=\"stylesheet\" href=\"main.css\">
</head>
<body>
</body>
</html>
"};
// FIXME: find why minify doesn't minify
let wanted_html = indoc! {"
<html><head>
<style>
section#warning {
color: red;
}
</style>
</head>
<body>
</body></html>
"};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await,
minified
);
Ok(())
}
struct PngDownloader;
#[async_trait]
impl Download for PngDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
let image_path = Path::new("test_data/home.png");
let mut image_file = File::open(&image_path).unwrap();
let mut image_buf: Vec<u8> = vec![];
image_file.read_to_end(&mut image_buf).unwrap();
Ok(image_buf.into())
}
}
#[tokio::test]
async fn download_image_png() -> Result<()> {
let downloader = PngDownloader {};
let html = indoc! {"
<html>
<head></head>
<body>
<img src=\"home.png\" alt=\"an home\" />
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head></head>
<body>
<img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAABm\
JLR0QA/wD/AP+gvaeTAAAAh0lEQVQ4jc2RQQqAIBBFn12ioNNE544gWhStc+FNWtVmBDFHsjZ9+DBffeOgo\
KsBVmCRukgNYIFTbGVtlDzk4DqCwyZhTqoFXAJO+RN8a1ADewF8CvPqZm8nLNsL2HutgEN70Qc6TBDUr1Fk\
AKrMgU4OGaDPdlEmMFFO7ucmeKR/NZgLuMkXFxHZVhLI8sXeAAAAAElFTkSuQmCC\" alt=\"an home\">
</body></html>
"};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await,
minified
);
Ok(())
}
#[tokio::test]
async fn remove_css_selectors() -> Result<()> {
let html = indoc! {"
<html>
<head></head>
<body>
<header>The header</header>
<article>The article<span class=\"huge\">social media button</span></article>
<div class=\"placeholder\">a placeholder></div>
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head></head>
<body>
<article>The article</article>
</body></html>
"};
let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!(
self_contained_html(
html,
&downloader,
&base_url,
&["header", ".placeholder", "article > span.huge"]
)
.await,
minified
);
Ok(())
}
}

BIN
crieur-retrieve/test_data/home.png (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,76 @@
---
title: crieur-retrieve design
---
# Self-contained html
Exporting the article as a self-contained html page may be the easiest and most reliable approach, as it keeps the
original ui of the newspaper and does not require exporting to a different format.
Creating reusable methods to build a self-contained html page will make it easier to write
`Newspaper`s. Those methods would be part of a `crieur-retrieve-tool` library.
The `self_contained_html` function has been created to do this.
```rust
pub async fn self_contained_html(
    html: impl AsRef<str>,
    downloader: &impl Download,
    base_url: &Url,
    elements_to_remove: &[impl AsRef<str>],
) -> String
```
## Script removal
Nothing should be executed by the exported html page.
Script elements are contained in `<script>` tags, and scripts can also be attached through event handler attributes (e.g. `onclick`,
`onmousedown`), so both are removed.
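A minimal sketch of that step, reusing the `nipper` calls from the implementation (the `EVENT_HANDLERS` list lives in `consts.rs`; `strip_scripts` is only an illustrative name):
```rust
use nipper::Document;

/// Strip `<script>` elements and inline event handler attributes from an html string.
fn strip_scripts(html: &str, event_handlers: &[&str]) -> String {
    let document = Document::from(html);
    // Drop every <script> element.
    document.select("script").remove();
    // Drop attributes such as onclick="..." wherever they appear.
    for event in event_handlers {
        document
            .select(format!("[{}]", event).as_str())
            .remove_attr(event);
    }
    String::from(document.html())
}
```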
## CSS
CSS should be retrieved and included in the web page.
To make the web pages minimal, it would be nice to remove all unused CSS, but that may be difficult technically.
## Images
All images should be included in the html page. It can be done by encoding them as base64 data uris.
A drawback is that it takes more space.
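A sketch of the encoding step, mirroring what `self_contained_html` does with the downloaded bytes (`to_data_uri` is an illustrative helper, not part of the crate):
```rust
use std::path::Path;

use url::Url;

/// Build a data uri from downloaded image bytes; the extension is guessed from the url path.
fn to_data_uri(url: &Url, data: &[u8]) -> Option<String> {
    let extension = Path::new(url.path()).extension()?.to_str()?;
    Some(format!(
        "data:image/{};base64,{}",
        extension,
        base64::encode(data)
    ))
}
```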
## (options) Custom filters
Allowing `Newspaper` creators to write custom html filters would let them adapt the extracted page to each site.
The different filters that creators may want to write are:
- `delete`: delete parts of the page that are useless, based on css selectors (navbars, account, comments)
- `link rewrite`: rewrite links so they are absolute. It can be useful if you want to keep external links to other articles, to the comment sections, to the main page of the newspaper, etc.
- other filters: asking users what filters they want to write could be useful to know if features are lacking
The `delete` filter seems the most useful and is easy to implement, as you can just provide a list of css selectors, as sketched below.
The others still need to be designed.
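A sketch of how a `delete` filter can already be expressed inside the crate, with selectors borrowed from the Mediapart adapter (`strip_boilerplate` is only an illustrative name):
```rust
use url::Url;

use crate::tools;
use crate::Download;

/// Drop navigation, social buttons and the footer while making the page self-contained.
async fn strip_boilerplate<E, D>(html: &str, downloader: &D, url: &Url) -> String
where
    E: std::error::Error,
    D: Download<Error = E> + Send,
{
    // Selectors taken from the Mediapart adapter.
    let elements_to_remove = ["nav.main-menu", ".tools-social", "footer"];
    tools::self_contained_html(html, downloader, url, &elements_to_remove).await
}
```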
## Minify
The html and css are minified to take as little space as possible.
**unimplemented** Image sizes could be reduced if they are too big. A format such as webp could
also be used.
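The minification step at the end of `self_contained_html` boils down to the following (a sketch; error handling elided):
```rust
use html_minifier::HTMLMinifier;

/// Minify the final html before returning it.
fn minify(html: &str) -> String {
    let mut minifier = HTMLMinifier::new();
    minifier.digest(html).unwrap();
    String::from_utf8(minifier.get_html().into()).unwrap()
}
```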
## Inspiration
- [monolith](https://github.com/y2z/monolith), a CLI tool for saving complete web pages as a single HTML file
- not really a library (yet ?)
- lacks custom selector for removal of unwanted parts
- not async
## Libraries
[lol-html](https://github.com/cloudflare/lol-html) is a great library and is designed to be fast, as it streams through the document rather than parsing, storing and modifying it. Unfortunately, it isn't compatible with async downloads, as the library relies on setting up handlers (functions) that will be run during the processing, and those functions can't be async.
Therefore, a library that seems to be less used, [nipper](https://github.com/importcjj/nipper), has been chosen. The `Document` type of this library is not `Send`, so it can't be held across `await` points. To circumvent this issue, the `Document` is recreated after each `await`, as sketched below. The overhead of doing so has not been measured yet.
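A condensed sketch of that pattern for a single stylesheet, using only calls that already appear in `self_contained_html` (`replace_first_stylesheet` is an illustrative name; error handling elided):
```rust
use nipper::Document;
use url::Url;

use crate::Download;

/// Inline the first stylesheet of a page without holding a `Document` across an `await`.
async fn replace_first_stylesheet<E, D>(html: &str, downloader: &D, base_url: &Url) -> String
where
    E: std::error::Error,
    D: Download<Error = E> + Send,
{
    // 1. Collect the stylesheet url, then drop the Document before awaiting.
    let (style_url, html) = {
        let document = Document::from(html);
        let url = document
            .select("link[href][rel=\"stylesheet\"]")
            .iter()
            .next()
            .and_then(|link| link.attr("href"))
            .and_then(|href| base_url.join(href.as_ref()).ok());
        (url, String::from(document.html()))
    };
    // 2. Download while no non-Send value is alive.
    let css = match style_url {
        Some(url) => downloader.download(&url).await.ok(),
        None => None,
    };
    // 3. Rebuild the Document from the serialized html and patch it.
    let document = Document::from(&html);
    if let Some(css) = css {
        if let Some(mut link) = document.select("link[href][rel=\"stylesheet\"]").iter().next() {
            let css = String::from_utf8(css.to_vec()).unwrap_or_default();
            link.replace_with_html(format!("<style>{}</style>", css));
        }
    }
    String::from(document.html())
}
```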
# Downloader
A `downloader` tool helps to write `Newspaper` implementations. The `Download` trait allows the user to provide their own downloader; it also helps with unit testing, as a dummy downloader can be created.
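For instance, the unit tests use a trivial implementation along these lines (`errors::Error` is the crate's error type):
```rust
use async_trait::async_trait;
use bytes::Bytes;
use url::Url;

use crate::errors;
use crate::Download;

/// A downloader returning an empty body, used as a stand-in in unit tests.
struct DummyDownloader;

#[async_trait]
impl Download for DummyDownloader {
    type Error = errors::Error;

    async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
        Ok(Bytes::from(""))
    }
}
```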

View File

@ -0,0 +1,48 @@
This project mainly aims at providing a unified interface to several newspapers. Side
objectives are to provide a web API and different clients such as a web UI or chatbots.
Several big components are planned for this project:
```dot
digraph G {
rankdir=TB
node [shape=rectangle, style=filled, color="#779988"]
subgraph cluster_frontend {
color = transparent
webui
chatbot
}
webui -> api [color = red]
chatbot -> api [color = red]
subgraph cluster_backend {
label = "Backend\ncrieur binary"
labelloc = b
style=filled
retrieve_tools [label="retrieve-tools"]
retrieve_adapters [label="retrieve-adapters"]
retrieve [label="retrieve-interface"]
auto_retrieve [label="automatic-retrieve"]
article_repr [label="article-representation\nRepresentation for articles"]
api
cache [label="Cache database"]
rss [label="Atom/RSS adapters"]
retrieve_tools -> retrieve_adapters
retrieve_adapters -> retrieve
retrieve_tools -> retrieve
rss -> auto_retrieve
article_repr -> retrieve_adapters
retrieve -> api
auto_retrieve -> api
cache -> api
}
}
```

View File

@ -0,0 +1,19 @@
---
title: Add a newspaper source
---
How do you add a newspaper source?
You must implement the `Newspaper` trait for your structure.
# 1. Write the `metadata` function
It returns information about the newspaper.
# 2. Write the `has_complete_access` function
Usually, it indicates whether the user is logged in.
You are encouraged to test it against the newspaper webpage by making an http call.
You can use the **TODO** helper function that will check whether a specific css
selector is present in the page located at the given url.
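A skeleton implementation inside `crieur-retrieve`, modelled on the `Mediapart` adapter (the `Example` type, the `example.org` host and the empty cookie list are placeholders):
```rust
use anyhow::Result;
use async_trait::async_trait;
use url::{Host, Url};

use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::{Download, Downloader};

#[derive(Debug, Clone, Default)]
pub struct Example;

#[async_trait]
impl Newspaper for Example {
    // 1. metadata: hosts and names identifying the newspaper
    fn metadata(&self) -> Metadata {
        Metadata::builder()
            .hosts(vec![Host::Domain("example.org".to_string())])
            .lower_case_name("example")
            .name("Example")
            .build()
            .unwrap_or_default()
    }

    // 2. has_complete_access: usually checks whether the user is logged in
    async fn has_complete_access(&self) -> bool {
        true
    }

    async fn new() -> Self {
        Self::default()
    }

    // retrieve_html: download the page and make it self-contained
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        let downloader = Downloader { cookies: vec![] };
        let body = downloader.download(url).await?;
        let html = String::from_utf8(body.to_vec())?;
        let elements_to_remove: &[&str] = &[];
        Ok(tools::self_contained_html(&html, &downloader, url, elements_to_remove).await)
    }
}
```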

View File

@ -0,0 +1,39 @@
use std::convert::TryInto;
use std::env;
use anyhow::Result;
use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
use dotenv::dotenv;
use log::info;
#[tokio::main]
async fn main() -> Result<()> {
dotenv().ok();
env_logger::init();
let url = match env::args().nth(1) {
Some(url) => Url::parse(&url)?,
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
};
// TODO: remove this in favor of default newspapers
let mut mediapart = Mediapart::new().await
//.login(USERNAME, PASSWORD)
//
;
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
info!("Trying to download article from {}", url);
// TODO: shorten this, maybe with a helper function?
let article_location = ArticleLocation::builder()
.url(url)?
.newspaper(&mediapart)
.build()?;
let article_str = article_location.retrieve_html().await?;
println!("{}", article_str);
Ok(())
}

16
justfile Normal file
View File

@ -0,0 +1,16 @@
@build:
cargo build
@clean:
cargo clean
@run:
cargo run
@audit:
cargo audit
@crev:
cargo crev verify
@verify: audit crev

19
src/main.rs Normal file
View File

@ -0,0 +1,19 @@
use anyhow::Result;
use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
use dotenv::dotenv;
use std::env;
#[tokio::main]
async fn main() -> Result<()> {
dotenv().ok();
let mut mediapart = Mediapart::new().await
//.login(USERNAME, PASSWORD)
//
;
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
println!("{}", mediapart.retrieve_html(&url).await?);
Ok(())
}