A basic chatbot application that downloads articles from one newspaper has been added. It can download HTML pages and is invoked with !html ArticleLocation. ArticleLocation has been refactored to own its internal data.
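
As a sketch of how that command could call into this file, assuming crate::Url is a re-export of url::Url and that the bot hands the raw message text to a handler (the handler below, its name, and its wiring are illustrative assumptions, not the actual bot code):

// Hypothetical handler for the "!html <ArticleLocation>" command.
// Mediapart, Newspaper::new and retrieve_html come from the file below;
// everything else here is illustrative.
async fn handle_html_command(message: &str) -> anyhow::Result<Option<String>> {
    if let Some(location) = message.strip_prefix("!html ") {
        let url = Url::parse(location.trim())?;
        // A default instance has no login cookie, so this fetches the public version.
        let paper = Mediapart::new(); // `new` comes from the Newspaper trait
        return Ok(Some(paper.retrieve_html(&url).await?));
    }
    Ok(None)
}
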
use anyhow::Result;
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;

use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};

/// Newspaper implementation for mediapart.fr.
#[derive(Debug, Clone, Default)]
pub struct Mediapart {
    // TODO: remove this pub !!
    /// Login cookie (name, value) used to authenticate requests.
    pub login_cookie: Option<(String, String)>,
}

/// Wraps a domain string in url::Host.
fn str_to_host<S: Into<String>>(host: S) -> Host {
    Host::Domain(host.into())
}

#[async_trait]
impl Newspaper for Mediapart {
    fn metadata(&self) -> Metadata {
        Metadata::builder()
            .hosts(vec![
                str_to_host("mediapart.fr"),
                str_to_host("www.mediapart.fr"),
            ])
            .lower_case_name("mediapart")
            .name("Médiapart")
            .build()
            .unwrap_or_default()
    }

    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        // "onglet=full" asks Mediapart for the single-page version of the article.
        // TODO: only append "onglet=full" when the query does not already contain it.
        let initial_query = url.query();
        let query = match initial_query {
            Some(q) => format!("{}&onglet=full", q),
            None => "onglet=full".into(),
        };
        let mut url = url.clone();
        url.set_query(Some(&query));

        // Authenticate with the login cookie, when one is configured.
        let cookies = if let Some((name, value)) = &self.login_cookie {
            let cookie = Cookie::build(name, value).secure(true).finish();
            vec![cookie]
        } else {
            vec![]
        };

        // TODO: replace by builder
        let downloader = Downloader { cookies };

        let body = downloader.download(&url).await?;
        let html = String::from_utf8(body.to_vec())?;

        // TODO: Move to const
        // CSS selectors of boilerplate elements to strip from the page.
        let element_to_remove = [
            // Header
            ".fb-root",
            ".skipLinks",
            ".js-flash-message",
            ".header-sticky.sticky-links",
            "nav.main-menu",
            // In-page menus and social media buttons
            "ul.sub-menu-journal",
            ".tools-social",
            ".simple-list.universe-journal",
            ".simple-list.universe-club",
            // Footer
            "footer",
            // Misc
            "aside.cc-modal",
        ];

        // TODO: fix the handling of relative URLs, and replace "" with the URL
        let single_page_html =
            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
        Ok(single_page_html)
    }

    fn new() -> Self {
        Self::default()
    }

    async fn has_complete_access(&self) -> bool {
        // TODO: check whether we are logged in using the cookie
        true
    }
}
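
For subscriber-only articles, the caller is expected to set login_cookie before downloading. A minimal usage sketch, assuming the (name, value) pair was extracted from an authenticated Mediapart browser session (both strings below are placeholders, and article_url is any crate::Url pointing at an article):

let mut paper = Mediapart::new();
// Placeholder pair; the real cookie name and value must come from a logged-in session.
paper.login_cookie = Some(("cookie-name".into(), "cookie-value".into()));
let html = paper.retrieve_html(&article_url).await?;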