feat: add retrieval from Courrier international
Retrieval of articles from Courrier international has been added.
This commit is contained in:
parent
cee0af6c3c
commit
5d0872b4d9
24
Cargo.lock
generated
24
Cargo.lock
generated
@ -362,7 +362,6 @@ dependencies = [
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"indoc",
|
||||
"itertools",
|
||||
"log",
|
||||
"lol_html",
|
||||
"nipper",
|
||||
@ -594,12 +593,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.28"
|
||||
@ -902,9 +895,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "html-minifier"
|
||||
version = "3.0.11"
|
||||
version = "3.0.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9876ed2cad2fd2f78bad42688bcac8a87cebce9b2381c5b05efc30b0c8429af0"
|
||||
checksum = "5a3b862e637e82b3134913fdd0aa0b8e79b7486fe88878f6bab0d09daf4996af"
|
||||
dependencies = [
|
||||
"cow-utils",
|
||||
"educe",
|
||||
@ -1063,15 +1056,6 @@ version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.4.7"
|
||||
@ -1300,9 +1284,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||
|
||||
[[package]]
|
||||
name = "minifier"
|
||||
version = "0.0.39"
|
||||
version = "0.0.40"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6cdf618de5c9c98d4a7b2e0d1f1e44f82a19196cfd94040bb203621c25d28d98"
|
||||
checksum = "ed8f02a863a23d5797a6e72ea7102bd0ee38ceba1256b522aeddb70ff14b32c9"
|
||||
dependencies = [
|
||||
"macro-utils",
|
||||
]
|
||||
|
@ -113,7 +113,6 @@ impl EventHandler for Html {
|
||||
} else {
|
||||
return;
|
||||
};
|
||||
info!("sending file");
|
||||
|
||||
match msg_body.split(' ').collect::<Vec<_>>().as_slice() {
|
||||
["!html", url, ..] => send_article(*url, room).await,
|
||||
|
@ -16,7 +16,7 @@ hyper-rustls = "0.22.1"
|
||||
cookie = "0.15.0"
|
||||
lol_html = "0.3.0"
|
||||
indoc = "1.0.3"
|
||||
html-minifier = "3.0.11"
|
||||
html-minifier = "3.0.12"
|
||||
bytes = "1.0.1"
|
||||
base64 = "0.13.0"
|
||||
futures = "0.3.14"
|
||||
@ -24,7 +24,6 @@ derive_builder = "0.10.2"
|
||||
nipper = "0.1.9"
|
||||
log = "0.4.14"
|
||||
env_logger = "0.8.3"
|
||||
itertools = "0.10.0"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = "1.5.0"
|
||||
|
@ -6,6 +6,7 @@ use log::info;
|
||||
use url::{Host, Url};
|
||||
|
||||
use crate::newspaper::Newspaper;
|
||||
use crate::newspapers::courrier_international::{self, CourrierInternational};
|
||||
use crate::newspapers::mediapart::{self, Mediapart};
|
||||
use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
|
||||
|
||||
@ -65,7 +66,25 @@ fn default_newpapers() -> Result<Newspapers> {
|
||||
})
|
||||
.build()?;
|
||||
|
||||
Ok(vec![Box::new(mediapart), Box::new(monde_diplo)])
|
||||
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
|
||||
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
|
||||
|
||||
let lmd_a_m = env::var(&lmd_a_m)
|
||||
.map_err(|_| Error::Misconfiguration(lmd_a_m))?
|
||||
.into();
|
||||
let ssess = env::var(&ssess)
|
||||
.map_err(|_| Error::Misconfiguration(ssess))?
|
||||
.into();
|
||||
|
||||
let courrier_international = CourrierInternational::builder()
|
||||
.login(courrier_international::Login::Cookies { lmd_a_m, ssess })
|
||||
.build()?;
|
||||
|
||||
Ok(vec![
|
||||
Box::new(mediapart),
|
||||
Box::new(monde_diplo),
|
||||
Box::new(courrier_international),
|
||||
])
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@ -149,7 +168,6 @@ impl ArticleLocation {
|
||||
}
|
||||
|
||||
pub async fn retrieve_html(&self) -> Result<String> {
|
||||
info!("It will download from {}", self.url);
|
||||
// TODO: modify when retrieve_html returns a specific Error type
|
||||
Ok(self.newspaper.retrieve_html(&self.url).await?)
|
||||
}
|
||||
|
111
crieur-retrieve/src/newspapers/courrier_international.rs
Normal file
111
crieur-retrieve/src/newspapers/courrier_international.rs
Normal file
@ -0,0 +1,111 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use async_trait::async_trait;
|
||||
use cookie::Cookie;
|
||||
use url::Host;
|
||||
|
||||
use crate::newspaper::{Metadata, Newspaper};
|
||||
use crate::tools;
|
||||
use crate::Url;
|
||||
use crate::{Download, Downloader};
|
||||
|
||||
/// Credentials that can be handed to [`Builder::login`].
pub enum Login {
    /// Username and password pair.
    ///
    /// [`Builder::login`] does not implement this variant yet and panics
    /// when it receives it.
    Username(String, String),
    /// Values of the two cookies sent with every request to the site.
    Cookies {
        /// Value of the `lmd_a_m` cookie.
        lmd_a_m: String,
        /// Value of the cookie sent under the name
        /// `SSESS862c7003d721c672d39f161b1456b890`.
        ssess: String,
    },
}
|
||||
|
||||
/// Retriever for articles hosted on Courrier international.
///
/// Instances are normally created through `CourrierInternational::builder()`.
#[derive(Clone, Debug, Default)]
pub struct CourrierInternational {
    /// `(name, value)` pairs of the cookies attached to every download.
    login_cookies: Vec<(String, String)>,
}
|
||||
|
||||
fn str_to_host<S: Into<String>>(host: S) -> Host {
|
||||
Host::Domain(host.into())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct Builder {
|
||||
login_cookies: Option<Vec<(String, String)>>,
|
||||
}
|
||||
|
||||
impl Builder {
|
||||
pub fn login(&mut self, login: Login) -> &mut Self {
|
||||
self.login_cookies = match login {
|
||||
Login::Username(_username, _password) => {
|
||||
unimplemented!("login using username and passwond not implemented")
|
||||
}
|
||||
Login::Cookies { lmd_a_m, ssess } => Some(vec![
|
||||
("lmd_a_m".into(), lmd_a_m),
|
||||
("SSESS862c7003d721c672d39f161b1456b890".into(), ssess),
|
||||
]),
|
||||
};
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(&self) -> Result<CourrierInternational> {
|
||||
match &self.login_cookies {
|
||||
Some(login_cookies) => Ok(CourrierInternational {
|
||||
login_cookies: login_cookies.clone(),
|
||||
}),
|
||||
None => Err(anyhow!("You have to log in to access this newspaper")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Newspaper for CourrierInternational {
|
||||
fn metadata(&self) -> Metadata {
|
||||
Metadata::builder()
|
||||
.hosts(vec![
|
||||
str_to_host("courrierinternational.com"),
|
||||
str_to_host("www.courrierinternational.com"),
|
||||
])
|
||||
.lower_case_name("courrier-international")
|
||||
.name("Courrier international")
|
||||
.build()
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
async fn retrieve_html(&self, url: &Url) -> Result<String> {
|
||||
let cookies = self
|
||||
.login_cookies
|
||||
.iter()
|
||||
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// TODO: replace by builder
|
||||
let downloader = Downloader { cookies };
|
||||
|
||||
let body = downloader.download(&url).await?;
|
||||
let html = match body {
|
||||
Some(body) => String::from_utf8(body.to_vec())?,
|
||||
None => bail!("404 not found"),
|
||||
};
|
||||
|
||||
// TODO: Move to const
|
||||
let element_to_remove = [
|
||||
// navigation elements
|
||||
"#entete.connecte",
|
||||
];
|
||||
|
||||
let single_page_html =
|
||||
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
|
||||
Ok(single_page_html)
|
||||
}
|
||||
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
async fn has_complete_access(&self) -> bool {
|
||||
// TODO: check if we are logged using the cookie
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl CourrierInternational {
|
||||
pub fn builder() -> Builder {
|
||||
Builder::default()
|
||||
}
|
||||
}
|
@ -1,2 +1,3 @@
|
||||
pub mod courrier_international;
|
||||
pub mod mediapart;
|
||||
pub mod monde_diplomatique;
|
||||
|
@ -80,10 +80,6 @@ impl Newspaper for MondeDiplo {
|
||||
.iter()
|
||||
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
|
||||
.collect::<Vec<_>>();
|
||||
//let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
|
||||
// .secure(true)
|
||||
// .finish();
|
||||
//let cookies = vec![cookie];
|
||||
|
||||
// TODO: replace by builder
|
||||
let downloader = Downloader { cookies };
|
||||
|
@ -39,7 +39,7 @@ impl<'c> Download for Downloader<'c> {
|
||||
type Error = DownloadError;
|
||||
|
||||
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
|
||||
log::info!("downloading url {:?}", file_link);
|
||||
log::debug!("downloading url {:?}", file_link);
|
||||
let https = hyper_rustls::HttpsConnector::with_native_roots();
|
||||
let client: Client<_, hyper::Body> = Client::builder().build(https);
|
||||
|
||||
@ -55,11 +55,12 @@ impl<'c> Download for Downloader<'c> {
|
||||
.collect::<Vec<_>>()
|
||||
.join(";"),
|
||||
);
|
||||
log::info!("headers : {:?}", req.headers_ref());
|
||||
log::debug!("headers : {:?}", req.headers_ref());
|
||||
|
||||
let req = req.body(Body::empty())?;
|
||||
|
||||
let resp = client.request(req).await?;
|
||||
log::debug!("Response status : {:?}", resp.status());
|
||||
let body = match resp.status() {
|
||||
StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
|
||||
StatusCode::NOT_FOUND => None,
|
||||
|
@ -170,8 +170,8 @@ mod tests {
|
||||
#[async_trait]
|
||||
impl Download for DummyDownloader {
|
||||
type Error = errors::Error;
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
|
||||
Ok(Bytes::from(""))
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
|
||||
Ok(Some(Bytes::from("")))
|
||||
}
|
||||
}
|
||||
|
||||
@ -248,12 +248,14 @@ mod tests {
|
||||
#[async_trait]
|
||||
impl Download for CssDownloader {
|
||||
type Error = errors::Error;
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
|
||||
Ok(indoc! {"
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
|
||||
Ok(Some(
|
||||
indoc! {"
|
||||
section#warning {
|
||||
color: red;
|
||||
}"}
|
||||
.into())
|
||||
.into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@ -300,12 +302,12 @@ mod tests {
|
||||
#[async_trait]
|
||||
impl Download for PngDownloader {
|
||||
type Error = errors::Error;
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
|
||||
let image_path = Path::new("test_data/home.png");
|
||||
let mut image_file = File::open(&image_path).unwrap();
|
||||
let mut image_buf: Vec<u8> = vec![];
|
||||
image_file.read_to_end(&mut image_buf).unwrap();
|
||||
Ok(image_buf.into())
|
||||
Ok(Some(image_buf.into()))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -21,3 +21,11 @@ MONDE_DIPLO_PHPSESSID
|
||||
|
||||
MONDE_DIPLO_SPIP_SESSION
|
||||
: sets the `spip_session` cookie
|
||||
|
||||
# Courrier international
|
||||
|
||||
COURRIER_INTERNATIONAL_LMD_A_M
|
||||
: sets the `lmd_a_m` cookie
|
||||
|
||||
COURRIER_INTERNATIONAL_SSESS
|
||||
: sets the `SSESS862c7003d721c672d39f161b1456b890` session cookie
|
||||
|
28
justfile
28
justfile
@ -1,19 +1,29 @@
|
||||
@build:
|
||||
cargo build
|
||||
cargo build
|
||||
|
||||
@build-container:
|
||||
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
|
||||
|
||||
@clean:
|
||||
cargo clean
|
||||
cargo clean
|
||||
|
||||
@run:
|
||||
cargo run
|
||||
cargo run
|
||||
|
||||
@test:
|
||||
cargo test --all
|
||||
|
||||
@clippy:
|
||||
cargo clippy
|
||||
|
||||
@fmt:
|
||||
cargo fmt
|
||||
|
||||
@simulate-ci: fmt clippy test
|
||||
|
||||
@container:
|
||||
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
|
||||
|
||||
@audit:
|
||||
cargo audit
|
||||
cargo audit
|
||||
|
||||
@crev:
|
||||
cargo crev verify
|
||||
|
||||
@verify: audit crev
|
||||
cargo crev verify
|
||||
|
Loading…
Reference in New Issue
Block a user