feat : add retrieve from courrier international

Retrieval of articles from Courrier international has been added
This commit is contained in:
koalp 2021-05-13 20:29:36 +02:00
parent cee0af6c3c
commit 5d0872b4d9
Signed by: koalp
GPG Key ID: 35B21047DEB09A81
11 changed files with 176 additions and 47 deletions

24
Cargo.lock generated
View File

@ -362,7 +362,6 @@ dependencies = [
"hyper",
"hyper-rustls",
"indoc",
"itertools",
"log",
"lol_html",
"nipper",
@ -594,12 +593,6 @@ dependencies = [
"syn",
]
[[package]]
name = "either"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "encoding_rs"
version = "0.8.28"
@ -902,9 +895,9 @@ dependencies = [
[[package]]
name = "html-minifier"
version = "3.0.11"
version = "3.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9876ed2cad2fd2f78bad42688bcac8a87cebce9b2381c5b05efc30b0c8429af0"
checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af"
dependencies = [
"cow-utils",
"educe",
@ -1063,15 +1056,6 @@ version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135"
[[package]]
name = "itertools"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "0.4.7"
@ -1300,9 +1284,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
[[package]]
name = "minifier"
version = "0.0.39"
version = "0.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6cdf618de5c9c98d4a7b2e0d1f1e44f82a19196cfd94040bb203621c25d28d98"
checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9"
dependencies = [
"macro-utils",
]

View File

@ -113,7 +113,6 @@ impl EventHandler for Html {
} else {
return;
};
info!("sending file");
match msg_body.split(' ').collect::<Vec<_>>().as_slice() {
["!html", url, ..] => send_article(*url, room).await,

View File

@ -16,7 +16,7 @@ hyper-rustls = "0.22.1"
cookie = "0.15.0"
lol_html = "0.3.0"
indoc = "1.0.3"
html-minifier = "3.0.11"
html-minifier = "3.0.12"
bytes = "1.0.1"
base64 = "0.13.0"
futures = "0.3.14"
@ -24,7 +24,6 @@ derive_builder = "0.10.2"
nipper = "0.1.9"
log = "0.4.14"
env_logger = "0.8.3"
itertools = "0.10.0"
[dev-dependencies]
tokio = "1.5.0"

View File

@ -6,6 +6,7 @@ use log::info;
use url::{Host, Url};
use crate::newspaper::Newspaper;
use crate::newspapers::courrier_international::{self, CourrierInternational};
use crate::newspapers::mediapart::{self, Mediapart};
use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
@ -65,7 +66,25 @@ fn default_newpapers() -> Result<Newspapers> {
})
.build()?;
Ok(vec![Box::new(mediapart), Box::new(monde_diplo)])
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
let lmd_a_m = env::var(&lmd_a_m)
.map_err(|_| Error::Misconfiguration(lmd_a_m))?
.into();
let ssess = env::var(&ssess)
.map_err(|_| Error::Misconfiguration(ssess))?
.into();
let courrier_international = CourrierInternational::builder()
.login(courrier_international::Login::Cookies { lmd_a_m, ssess })
.build()?;
Ok(vec![
Box::new(mediapart),
Box::new(monde_diplo),
Box::new(courrier_international),
])
}
#[derive(Default)]
@ -149,7 +168,6 @@ impl ArticleLocation {
}
pub async fn retrieve_html(&self) -> Result<String> {
info!("It will download from {}", self.url);
// TODO: modify when retrieve_html returns a specific Error type
Ok(self.newspaper.retrieve_html(&self.url).await?)
}

View File

@ -0,0 +1,111 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Credentials accepted by the Courrier international [`Builder`].
pub enum Login {
/// Username and password pair. `Builder::login` does not implement this
/// variant yet and panics when it receives it.
Username(String, String),
/// Values for the two cookies the builder stores: `lmd_a_m` and the
/// `SSESS...` cookie whose full name is hard-coded in `Builder::login`.
Cookies { lmd_a_m: String, ssess: String },
}
/// Newspaper implementation for Courrier international.
///
/// Instances are normally obtained through `CourrierInternational::builder()`.
#[derive(Debug, Clone, Default)]
pub struct CourrierInternational {
/// `(name, value)` pairs converted into `Cookie`s and handed to the
/// `Downloader` each time `retrieve_html` is called.
login_cookies: Vec<(String, String)>,
}
/// Wraps anything convertible into a `String` in a [`Host::Domain`].
fn str_to_host<S>(host: S) -> Host
where
    S: Into<String>,
{
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Builder for [`CourrierInternational`].
///
/// `build` returns an error until `login` has stored the cookies.
#[derive(Debug, Clone, Default)]
pub struct Builder {
/// `None` until `login` is called with `Login::Cookies`; afterwards the
/// `(name, value)` cookie pairs copied into the built newspaper.
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
    /// Records the credentials that will be sent with every download.
    ///
    /// Only [`Login::Cookies`] is supported: its two values are stored as the
    /// `lmd_a_m` cookie and as the session cookie whose name is hard-coded
    /// below. Calling this method again replaces the previously stored cookies.
    ///
    /// # Panics
    ///
    /// Panics when given [`Login::Username`], which is not implemented yet.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookies = match login {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and password not implemented")
            }
            Login::Cookies { lmd_a_m, ssess } => Some(vec![
                ("lmd_a_m".into(), lmd_a_m),
                // The session cookie name is fixed; only its value is configurable.
                ("SSESS862c7003d721c672d39f161b1456b890".into(), ssess),
            ]),
        };
        self
    }

    /// Builds a [`CourrierInternational`] from the cookies recorded by
    /// [`Builder::login`]. The builder itself is left untouched (the cookies
    /// are cloned), so it can be reused.
    ///
    /// # Errors
    ///
    /// Returns an error when `login` has not been called beforehand.
    pub fn build(&self) -> Result<CourrierInternational> {
        self.login_cookies
            .clone()
            .map(|login_cookies| CourrierInternational { login_cookies })
            .ok_or_else(|| anyhow!("You have to log in to access this newspaper"))
    }
}
#[async_trait]
impl Newspaper for CourrierInternational {
    /// Creates an instance that carries no login cookie.
    fn new() -> Self {
        Self::default()
    }

    /// Describes the newspaper: display name, lower-case identifier and the
    /// two hosts it is matched against (with and without the `www.` prefix).
    ///
    /// Falls back to the default `Metadata` if the metadata builder fails.
    fn metadata(&self) -> Metadata {
        let hosts = vec![
            str_to_host("courrierinternational.com"),
            str_to_host("www.courrierinternational.com"),
        ];
        Metadata::builder()
            .hosts(hosts)
            .lower_case_name("courrier-international")
            .name("Courrier international")
            .build()
            .unwrap_or_default()
    }

    /// Currently always reports complete access.
    async fn has_complete_access(&self) -> bool {
        // TODO: check if we are logged using the cookie
        true
    }

    /// Downloads the page at `url` with the stored login cookies and returns
    /// the result of passing it through `tools::self_contained_html`.
    ///
    /// # Errors
    ///
    /// Fails when the download fails, when the downloader yields no body
    /// (reported as "404 not found"), or when the body is not valid UTF-8.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        let mut cookies = Vec::with_capacity(self.login_cookies.len());
        for (name, value) in &self.login_cookies {
            cookies.push(Cookie::build(name, value).finish());
        }

        // TODO: replace by builder
        let downloader = Downloader { cookies };

        let response = downloader.download(&url).await?;
        let page = match response {
            None => bail!("404 not found"),
            Some(bytes) => String::from_utf8(bytes.to_vec())?,
        };

        // TODO: Move to const
        let selectors_to_remove = [
            // navigation elements
            "#entete.connecte",
        ];

        Ok(tools::self_contained_html(&page, &downloader, &url, &selectors_to_remove).await)
    }
}
impl CourrierInternational {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -1,2 +1,3 @@
pub mod courrier_international;
pub mod mediapart;
pub mod monde_diplomatique;

View File

@ -80,10 +80,6 @@ impl Newspaper for MondeDiplo {
.iter()
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
.collect::<Vec<_>>();
//let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
// .secure(true)
// .finish();
//let cookies = vec![cookie];
// TODO: replace by builder
let downloader = Downloader { cookies };

View File

@ -39,7 +39,7 @@ impl<'c> Download for Downloader<'c> {
type Error = DownloadError;
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
log::info!("downloading url {:?}", file_link);
log::debug!("downloading url {:?}", file_link);
let https = hyper_rustls::HttpsConnector::with_native_roots();
let client: Client<_, hyper::Body> = Client::builder().build(https);
@ -55,11 +55,12 @@ impl<'c> Download for Downloader<'c> {
.collect::<Vec<_>>()
.join(";"),
);
log::info!("headers : {:?}", req.headers_ref());
log::debug!("headers : {:?}", req.headers_ref());
let req = req.body(Body::empty())?;
let resp = client.request(req).await?;
log::debug!("Response status : {:?}", resp.status());
let body = match resp.status() {
StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
StatusCode::NOT_FOUND => None,

View File

@ -170,8 +170,8 @@ mod tests {
#[async_trait]
impl Download for DummyDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
Ok(Bytes::from(""))
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
Ok(Some(Bytes::from("")))
}
}
@ -248,12 +248,14 @@ mod tests {
#[async_trait]
impl Download for CssDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
Ok(indoc! {"
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
Ok(Some(
indoc! {"
section#warning {
color: red;
}"}
.into())
.into(),
))
}
}
@ -300,12 +302,12 @@ mod tests {
#[async_trait]
impl Download for PngDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
let image_path = Path::new("test_data/home.png");
let mut image_file = File::open(&image_path).unwrap();
let mut image_buf: Vec<u8> = vec![];
image_file.read_to_end(&mut image_buf).unwrap();
Ok(image_buf.into())
Ok(Some(image_buf.into()))
}
}

View File

@ -21,3 +21,11 @@ MONDE_DIPLO_PHPSESSID
MONDE_DIPLO_SPIP_SESSION
: sets the `spip_session` cookie
# Courrier international
COURRIER_INTERNATIONAL_LMD_A_M
: sets the `lmd_a_m` cookie
COURRIER_INTERNATIONAL_SSESS
: sets the `SSESS862c7003d721c672d39f161b1456b890` session cookie

View File

@ -1,19 +1,29 @@
@build:
cargo build
cargo build
@build-container:
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
@clean:
cargo clean
cargo clean
@run:
cargo run
cargo run
@test:
cargo test --all
@clippy:
cargo clippy
@fmt:
cargo fmt
@simulate-ci: fmt clippy test
@container:
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
@audit:
cargo audit
cargo audit
@crev:
cargo crev verify
@verify: audit crev
cargo crev verify