crieur/crieur-retrieve/src/newspapers/mediapart.rs
koalp 865b949b5f
All checks were successful
continuous-integration/drone/pr Build is passing
feat: add builder for mediapart, document chatbot
A builder for mediapart has been added. No generic builder has been
created as there is no use case yet.

Some documentation has been added, the roadmap and scope have been
clarified, and the chatbot has been lightly documented.
2021-04-29 02:11:32 +02:00

126 lines
3.2 KiB
Rust

use anyhow::{anyhow, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Ways of authenticating against Mediapart.
pub enum Login {
    /// Authenticate with a username and a password, in that order.
    Username(String, String),
    /// Authenticate with the value of the `MPRUUID` cookie.
    MPRUUID(String),
}
/// Mediapart newspaper client.
#[derive(Clone, Debug, Default)]
pub struct Mediapart {
    /// `(name, value)` pair of the cookie sent with every download.
    login_cookie: (String, String),
}
/// Wraps anything convertible into a `String` in a `Host::Domain`.
fn str_to_host<S>(host: S) -> Host
where
    S: Into<String>,
{
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Builder for [`Mediapart`].
#[derive(Clone, Debug, Default)]
pub struct Builder {
    /// `(name, value)` pair of the login cookie; `None` until a login is set.
    login_cookie: Option<(String, String)>,
}
impl Builder {
    /// Sets the credentials used to log in to Mediapart.
    ///
    /// With [`Login::MPRUUID`], the given value is stored as the value of the
    /// `MPRUUID` cookie.
    ///
    /// # Panics
    ///
    /// Panics when called with [`Login::Username`]: logging in with a
    /// username and a password is not implemented yet.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookie = match login {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and password not implemented")
            }
            Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
        };
        self
    }

    /// Builds a [`Mediapart`] from the configured login cookie.
    ///
    /// The builder is left untouched: the cookie is cloned into the result.
    ///
    /// # Errors
    ///
    /// Returns an error when no login has been configured.
    pub fn build(&self) -> Result<Mediapart> {
        self.login_cookie
            .clone()
            .map(|login_cookie| Mediapart { login_cookie })
            .ok_or_else(|| anyhow!("You have to log in to access this newspaper"))
    }
}
#[async_trait]
impl Newspaper for Mediapart {
    /// Describes the newspaper: the hosts it is served from and its names.
    ///
    /// Falls back to the default `Metadata` when the metadata builder fails.
    fn metadata(&self) -> Metadata {
        Metadata::builder()
            .hosts(vec![
                str_to_host("mediapart.fr"),
                str_to_host("www.mediapart.fr"),
            ])
            .lower_case_name("mediapart")
            .name("Médiapart")
            .build()
            .unwrap_or_default()
    }

    /// Downloads the article at `url` and returns it as a single HTML string.
    ///
    /// The `onglet=full` parameter is appended to the query of `url`, the
    /// login cookie is sent with the request, and the page is then passed to
    /// `tools::self_contained_html` together with a list of selectors for the
    /// elements to remove (header, menus, social buttons, footer, modal).
    ///
    /// # Errors
    ///
    /// Returns an error when the download fails or when the downloaded body
    /// is not valid UTF-8.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        // An empty query (`https://…/article?`) is treated like a missing
        // one, so that the resulting query never starts with a stray `&`.
        let query = match url.query().filter(|q| !q.is_empty()) {
            Some(existing) => format!("{}&onglet=full", existing),
            None => String::from("onglet=full"),
        };
        let mut url = url.clone();
        url.set_query(Some(&query));

        let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
            .secure(true)
            .finish();
        let cookies = vec![cookie];

        // TODO: replace by builder
        let downloader = Downloader { cookies };

        let body = downloader.download(&url).await?;
        let html = String::from_utf8(body.to_vec())?;

        // TODO: Move to const
        let element_to_remove = [
            // header
            ".fb-root",
            ".skipLinks",
            ".js-flash-message",
            ".header-sticky.sticky-links",
            "nav.main-menu",
            // menus inside and social media buttons
            "ul.sub-menu-journal",
            ".tools-social",
            ".simple-list.universe-journal",
            ".simple-list.universe-club",
            // Footer
            "footer",
            // Misc
            "aside.cc-modal",
        ];

        let single_page_html =
            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
        Ok(single_page_html)
    }

    /// Creates a `Mediapart` with default values, i.e. an empty login cookie.
    fn new() -> Self {
        Self::default()
    }

    /// Tells whether full articles can be accessed.
    ///
    /// Currently always returns `true`; the login cookie is not checked.
    async fn has_complete_access(&self) -> bool {
        // TODO: check if we are logged using the cookie
        true
    }
}
impl Mediapart {
pub fn builder() -> Builder {
Builder::default()
}
}