crieur/crieur-retrieve/src/newspapers/mediapart.rs
koalp a16dbbc790
Some checks failed
continuous-integration/drone/pr Build is running
continuous-integration/drone/push Build is failing
feat: add basic chatbot
A basic chatbot application that downloads articles from one newspaper
has been added.

It can download html pages and is called with !hmtl

ArticleLocation has been refactored to own its internal data.
2021-04-27 04:32:37 +02:00

94 lines
2.5 KiB
Rust

use anyhow::Result;
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Handle on the Mediapart newspaper, used through its `Newspaper` implementation.
#[derive(Clone, Debug, Default)]
pub struct Mediapart {
    /// Optional `(name, value)` pair of the login cookie attached to article
    /// downloads; `None` means requests are sent without any cookie.
    // TODO: remove this pub !!
    pub login_cookie: Option<(String, String)>,
}
/// Wraps a domain name into a [`Host::Domain`].
///
/// The value is stored as given: no parsing or validation is performed here.
fn str_to_host<S>(host: S) -> Host
where
    S: Into<String>,
{
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Query parameter appended to every article URL before downloading it.
/// NOTE(review): presumably selects Mediapart's "full" tab (whole article on
/// one page) — confirm against the site.
const FULL_TAB_PARAM: &str = "onglet=full";

/// Returns the query string to use so that it contains `onglet=full` exactly
/// once more than strictly needed never happens:
///
/// * no query, or an empty one, becomes `onglet=full`;
/// * a query that already carries `onglet=full` is returned unchanged;
/// * any other query gets `&onglet=full` appended.
fn full_tab_query(existing: Option<&str>) -> String {
    match existing {
        // An empty query (`https://…/article?`) must not produce "&onglet=full".
        None | Some("") => FULL_TAB_PARAM.to_owned(),
        // Already requested: do not duplicate the parameter.
        Some(query) if query.split('&').any(|pair| pair == FULL_TAB_PARAM) => query.to_owned(),
        Some(query) => format!("{}&{}", query, FULL_TAB_PARAM),
    }
}

#[async_trait]
impl Newspaper for Mediapart {
    /// Describes this newspaper: the hosts it serves and its names.
    ///
    /// If the builder fails, the default `Metadata` is returned instead of an
    /// error.
    fn metadata(&self) -> Metadata {
        Metadata::builder()
            .hosts(vec![
                str_to_host("mediapart.fr"),
                str_to_host("www.mediapart.fr"),
            ])
            .lower_case_name("mediapart")
            .name("Médiapart")
            .build()
            .unwrap_or_default()
    }

    /// Downloads the article at `url` and returns it as a self-contained HTML
    /// document.
    ///
    /// The `onglet=full` query parameter is added to the URL when it is not
    /// already present, and the login cookie (if any) is sent with the request.
    ///
    /// # Errors
    ///
    /// Fails if the download fails or if the downloaded body is not valid
    /// UTF-8.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        let query = full_tab_query(url.query());
        // Work on a copy so the caller's URL is left untouched.
        let mut url = url.clone();
        url.set_query(Some(&query));

        let cookies = if let Some((name, value)) = &self.login_cookie {
            let cookie = Cookie::build(name, value).secure(true).finish();
            vec![cookie]
        } else {
            vec![]
        };

        // TODO: replace by builder
        let downloader = Downloader { cookies };

        let body = downloader.download(&url).await?;
        let html = String::from_utf8(body.to_vec())?;

        // CSS selectors of the page elements stripped from the result.
        // TODO: Move to const
        let element_to_remove = [
            // header
            ".fb-root",
            ".skipLinks",
            ".js-flash-message",
            ".header-sticky.sticky-links",
            "nav.main-menu",
            // menus inside and social media buttons
            "ul.sub-menu-journal",
            ".tools-social",
            ".simple-list.universe-journal",
            ".simple-list.universe-club",
            // Footer
            "footer",
            // Misc
            "aside.cc-modal",
        ];

        // TODO: correction of usage of relative urls, and replace "" by the url
        let single_page_html =
            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
        Ok(single_page_html)
    }

    /// Creates a `Mediapart` without any login cookie.
    fn new() -> Self {
        Self::default()
    }

    /// Reports whether full articles can be accessed.
    ///
    /// Currently always returns `true`, whether or not a login cookie is set.
    async fn has_complete_access(&self) -> bool {
        // TODO: check if we are logged using the cookie
        true
    }
}