Compare commits

No commits in common. "development" and "feature/mediapart_poc" have entirely different histories.

36 changed files with 471 additions and 2461 deletions


@@ -8,7 +8,6 @@ steps:
   pull: true
   errignore: true
   commands:
-  - apt-get update && apt-get install -y cmake
   - rustup component add rustfmt
   - rustup component add clippy
   - cargo clippy
@@ -18,6 +17,5 @@ steps:
   pull: true
   errignore: true
   commands:
-  - apt-get update && apt-get install -y cmake
   - cargo test --all
   - cargo build


@@ -1,6 +1,6 @@
 ---
-name: "🐛 Bug report"
-about: "For reporting bugs"
+name: "Bug report"
+about: "This template is for reporting a bug"
 title: ""
 labels:
 - "type::bug"
@@ -17,3 +17,6 @@ labels:
 **Expected behavior**
 *describe what you expected to happen*
+
+**Configuration**
+*paste the result of `stage --version`


@@ -1,6 +1,6 @@
 ---
-name: "🗣 Discussion"
-about: "For discussion about the software, when you want to discuss about several conception possibilities"
+name: "Design discussion"
+about: "For discussion about the design of features in the application, when there are several possibilities for implementation"
 title: ""
 labels:
 - "type::discussion"
@@ -8,8 +8,12 @@ labels:
 ---
-*describe the problem *
+*describe shortly the problem*
+
+## Requirements
+*list requirements that the feature have*
 
 ## Propositions
-*(optionnal) explain the different implementation that you would propose*
+*explain the different implementation that you would propose for the feature*


@@ -1,14 +1,15 @@
 ---
-name: "💡 Feature request"
-about: "For requesting a new feature, with an implementation plan"
+name: "Feature request"
+about: "This template is for requesting a new feature"
 title: ""
 labels:
-- "type::enhancement"
+- "type::feature"
 - "status::review_needed"
 ---
 *(if applicable) describe what problem or frustration you have currently*
-*describe what you would like to be able to do, or what solution you would like*
-*(optional) additional context, comments
+*describe what you would like to be able to do, or what solution you would like (you can propose several)*
+*(optional) additional context, comments or implementation propositions*


@@ -1,5 +1,5 @@
 ---
 name: "Ask a question"
 about: "If you have a question about the usage of the libraries or the tool"
 title: ""
 labels:


@@ -1,5 +1,5 @@
 ---
-name: "🚧 Refactor"
+name: "Refactor"
 about: "For refactoring propositions"
 title: ""
 labels:

Cargo.lock (generated): file diff suppressed because it is too large.


@@ -2,7 +2,6 @@
 members = [
     "crieur-retrieve",
-    "crieur-chatbot",
 ]
@@ -18,9 +17,7 @@ publish = false
 [dependencies]
 anyhow = "1.0.40"
 crieur-retrieve = {version = "0.1", path="crieur-retrieve"}
-crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
 dotenv = "0.15.0"
 env_logger = "0.8.3"
 log = "0.4.14"
-tokio = { version = "1.6.0", features = ["full"] }
-tracing-subscriber = "0.2.18"
+tokio = { version = "1.5.0", features = ["full"] }


@@ -1,41 +1,17 @@
-Tools to retrieve articles from multiple newspaper you subscribed to, all from
-the same place.
+Tools to retrieve articles from multiple newspaper you subscribed to.
 
-**This is a prototype, it isn't stable at all and you may not want to use it if
-you expect it to just work !**
+**This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work !**
 
 # How to use it
 
-First retrieve login cookies for websites and put it in a `.env` such as
-explained in the [newspaper source configuration
-documentation](./documentation/reference/newspaper_configuration.md)
-
-Then you can run [an example](./examples/cli_downloader.rs) using
+First retrieve login cookies for websites and put it in a `.env`
 
 ```
-cargo run --example=cli_downloader
+cargo run --example=retrive_html_articles
 ```
 
-You can also specify the URL using
-
-```
-cargo run --example=cli_downloader -- [your url]
-```
-
-To know how to run the chatbot, please read the [chatbot
-guide](./documentation/guides/run_chatbot.md)
-
 # Documentation
 
-1. Design
-    1. [Scope of the project and roadmap](./documentation/design/scope.md)
-    2. [Retrieve](./documentation/design/retrieve.md)
-    3. [Tooling](./documentation/design/tooling.md)
-2. Guides
-    1. [Add a newspaper a source](./documentation/guides/add_a_newspaper_source.md)
-    2. [Build and run the chatbot](./documentation/guides/run_chatbot.md)
-3. Reference
-    1. [Newspaper source configuration](./documentation/reference/newspaper_configuration.md)
-    2. [Chatbot configuration](./documentation/reference/chatbot_configuration.md)
+- 1. [Design](documentation/design/index.md)
+- a. [Scope of the project](documentation/design/scope.md)
+- b. [Retrieve](documentation/design/retrieve.md)


@@ -1,26 +0,0 @@
FROM docker.io/rust:1.51-alpine as build
WORKDIR /app
RUN apk add \
cmake \
musl-dev \
make \
g++ \
&& rustup target add x86_64-unknown-linux-musl
COPY Cargo.lock Cargo.toml .
COPY crieur-chatbot crieur-chatbot
COPY crieur-retrieve crieur-retrieve
COPY src src
RUN RUSTFLAGS=-Ctarget-feature=-crt-static cargo build --target x86_64-unknown-linux-musl --release --bin=crieur-chatbot
FROM scratch
WORKDIR /
COPY --from=build /usr/lib/libstdc++.so.6 /usr/lib/libstdc++.so.6
COPY --from=build /usr/lib/libgcc_s.so.1 /usr/lib/libgcc_s.so.1
COPY --from=build /lib/ld-musl-x86_64.so.1 /lib/ld-musl-x86_64.so.1
COPY --from=build /app/target/x86_64-unknown-linux-musl/release/crieur-chatbot /crieur-chatbot
CMD ["./crieur-chatbot"]


@@ -1,21 +0,0 @@
[package]
name = "crieur-chatbot"
version = "0.1.0"
authors = ["koalp <koalp@alpaga.dev>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.40"
dotenv = "0.15.0"
crieur-retrieve = {version = "0.1.0", path = "../crieur-retrieve"}
mime = "0.3.16"
log = "0.4.14"
[dependencies.matrix-sdk]
git = "https://github.com/matrix-org/matrix-rust-sdk"
rev = "242d46c9a1bf40fa15b5892c2ee81cb0f4508ff4"
version = "0.2.0"
default-features = false
features = ["encryption", "rustls-tls", "require_auth_for_profile_requests"]


@@ -1,84 +0,0 @@
//! Chatbot
use std::convert::TryInto;
use anyhow::Result;
use matrix_sdk::{self, Client, SyncSettings};
use crate::Html;
#[derive(Debug, Clone, Default)]
pub(crate) struct Builder {
user: String,
password: String,
homeserver: String,
//TODO: rooms
rooms: Vec<String>,
}
impl Builder {
fn new() -> Self {
Default::default()
}
pub(crate) async fn connect(&self) -> Result<Chatbot> {
let client = Client::new(self.homeserver.as_str())?;
client
.login(self.user.as_str(), self.password.as_str(), None, None)
.await?;
assert!(client.logged_in().await);
for room in &self.rooms {
client.join_room_by_id(&room.as_str().try_into()?).await?;
}
Ok(Chatbot { client })
}
pub(crate) fn login(
&mut self,
user: &impl AsRef<str>,
password: &impl AsRef<str>,
) -> &mut Self {
self.user = user.as_ref().into();
self.password = password.as_ref().into();
self
}
pub(crate) fn homeserver(&mut self, homeserver: &impl AsRef<str>) -> &mut Self {
self.homeserver = homeserver.as_ref().into();
self
}
pub(crate) fn room(&mut self, room: impl AsRef<str>) -> &mut Self {
self.rooms.push(room.as_ref().into());
self
}
pub(crate) fn rooms(&mut self, rooms: Vec<String>) -> &mut Self {
for room in rooms {
self.room(room);
}
self
}
}
#[derive(Debug, Clone)]
pub(crate) struct Chatbot {
client: Client,
}
impl Chatbot {
pub(crate) fn builder() -> Builder {
Builder::new()
}
pub(crate) async fn run(&self) -> Result<()> {
self.client.set_event_handler(Box::new(Html::new())).await;
let mut settings = SyncSettings::default();
if let Some(token) = self.client.sync_token().await {
settings = settings.token(token);
}
self.client.sync(settings).await;
Ok(())
}
}
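
For context, the removed `Builder` above was driven from `cli.rs` (next file); a minimal sketch of the same wiring, inside an async function, where the user, password, homeserver and room id are all placeholders:

```rust
// Sketch only: mirrors the usage in cli.rs below; all values are placeholders.
let user = "bot".to_string();
let password = "secret".to_string();
let homeserver = "https://matrix.example.org".to_string();

let chatbot = Chatbot::builder()
    .login(&user, &password)
    .homeserver(&homeserver)
    .room("!roomid:example.org")
    .connect()
    .await?;
chatbot.run().await?;
```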


@@ -1,40 +0,0 @@
use std::env;
use anyhow::{bail, Result};
use dotenv::dotenv;
use crate::Chatbot;
/// Runs the chatbot
pub async fn run() -> Result<()> {
dotenv().ok();
let (user, password, homeserver, rooms) = match (
env::var("CRIEUR_MATRIX_USER"),
env::var("CRIEUR_MATRIX_PASSWORD"),
env::var("CRIEUR_MATRIX_HOMESERVER"),
env::var("CRIEUR_MATRIX_ROOMS"),
) {
(Ok(user), Ok(password), Ok(homeserver), Ok(rooms)) => (
user,
password,
homeserver,
rooms
.split(",")
.map(|s| s.to_string())
.collect::<Vec<String>>(),
),
_ => bail!("Configuration incomplete, please set all required environment variables"),
};
let chatbot = Chatbot::builder()
.login(&user, &password)
.homeserver(&homeserver)
.rooms(rooms)
.connect()
.await?;
chatbot.run().await?;
Ok(())
}


@@ -1,122 +0,0 @@
use std::convert::TryInto;
use log::error;
use matrix_sdk::{
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
EventHandler,
};
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};
pub(crate) struct Html {}
impl Html {
pub fn new() -> Self {
Self {}
}
}
async fn send_article<U, E>(url: U, room: matrix_sdk::room::Joined)
where
U: TryInto<Url, Error = E> + Send,
E: std::error::Error + Sync + Send + 'static,
{
//TODO: replace by async block when async block is stable
async fn article_html<U, E>(url: U) -> Result<String>
where
U: TryInto<Url, Error = E> + Send,
E: std::error::Error + Sync + Send + 'static,
{
let article_str = ArticleLocation::builder()
.url(url)?
.build()?
.retrieve_html()
.await?;
Ok(article_str)
}
let text_message =
|message| AnyMessageEventContent::RoomMessage(MessageEventContent::text_plain(message));
//TODO: replace occurences ok() by async and logging block when async block is stable
let article_html = match article_html(url).await {
Ok(url) => url,
Err(Error::MalformedUrl) => {
room.send(text_message("Error: Given url is malformed"), None)
.await
.ok();
return;
}
Err(Error::UnknownNewspaper) => {
room.send(
text_message("Error: Given url is do not correspond to a known newspaper"),
None,
)
.await
.ok();
return;
}
Err(Error::Misconfiguration(key)) => {
error!(
"Error in configuration : {} key is missing or malformed",
&key
);
room.send(
text_message("Error: configuration error, please contact your admin"),
None,
)
.await
.ok();
return;
}
Err(_) => {
room.send(
text_message("Unknown error =/, can't download the file"),
None,
)
.await
.ok();
return;
}
};
room.send_attachment(
"article.html",
&mime::TEXT_HTML_UTF_8,
&mut article_html.as_bytes(),
None,
)
.await
.ok();
}
#[async_trait]
impl EventHandler for Html {
async fn on_room_message(&self, room: Room, event: &SyncMessageEvent<MessageEventContent>) {
if let Room::Joined(room) = room {
let msg_body = if let SyncMessageEvent {
content:
MessageEventContent {
msgtype: MessageType::Text(TextMessageEventContent { body: msg_body, .. }),
..
},
..
} = event
{
msg_body
} else {
return;
};
match msg_body.split(' ').collect::<Vec<_>>().as_slice() {
["!html", url, ..] => send_article(*url, room).await,
_ => return,
}
}
}
}


@@ -1,2 +0,0 @@
mod html;
pub(crate) use html::Html;


@@ -1,10 +0,0 @@
//! Provides a matrix chatbot to download newspaper articles
mod cli;
pub use cli::run;
mod chatbot;
use chatbot::Chatbot;
mod handlers;
use handlers::Html;


@@ -8,22 +8,23 @@ publish = false
 [dependencies]
 anyhow = "1.0.40"
-async-trait = "0.1.50"
+async-trait = "0.1.48"
 thiserror = "1.0.24"
-url = "2.2.2"
-hyper = { version = "0.14.7", features = ["full"] }
+url = "2.2.1"
+hyper = { version = "0.14.5", features = ["full"] }
 hyper-rustls = "0.22.1"
 cookie = "0.15.0"
 lol_html = "0.3.0"
 indoc = "1.0.3"
-html-minifier = "3.0.13"
+html-minifier = "3.0.9"
 bytes = "1.0.1"
 base64 = "0.13.0"
-futures = "0.3.15"
-derive_builder = "0.10.2"
+futures = "0.3.14"
+derive_builder = "0.10.0"
 nipper = "0.1.9"
 log = "0.4.14"
 env_logger = "0.8.3"
+itertools = "0.10.0"
 
 [dev-dependencies]
-tokio = "1.6.0"
+tokio = "1.5.0"


@@ -1,87 +1,21 @@
 use std::boxed::Box;
 use std::convert::TryInto;
-use std::env;
 
+use anyhow::{anyhow, Result};
+use log::info;
 use url::{Host, Url};
 
 use crate::newspaper::Newspaper;
-use crate::newspapers::courrier_international::{self, CourrierInternational};
-use crate::newspapers::mediapart::{self, Mediapart};
-use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
 
-/// Enumerate all errors that can be encountered when using ArticleLocation
-#[derive(thiserror::Error, Debug)]
-pub enum Error {
-    /// The url was not set. Therefore, the article location can't be deduced
-    #[error("No url set")]
-    NoUrl,
-    /// The given URL isn't an accepted Url
-    #[error("Malformed URL")]
-    MalformedUrl,
-    /// The given url doesn't correspond to a newspaper.
-    #[error("The given url doesn't link to a known newspaper")]
-    UnknownNewspaper,
-    /// Error in configuration : used for missing or malformed configuration
-    #[error("Error in configuration (configuration key {0} malformed or missing)")]
-    Misconfiguration(String),
-    /// Other errors
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-type Newspapers = Vec<Box<dyn Newspaper>>;
-pub type Result<T, E = Error> = core::result::Result<T, E>;
-
-fn default_newpapers() -> Result<Newspapers> {
-    // TODO: same thing is written too much times : how to DRY ?
-    let config_key = "MEDIAPART_COOKIE".to_string();
-    let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
-
-    let mediapart = Mediapart::builder()
-        .login(mediapart::Login::Mpruuid(mpruiid))
-        .build()?;
-
-    let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
-    let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
-    let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
-
-    let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
-    let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
-    let spip_session =
-        env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
-
-    let monde_diplo = MondeDiplo::builder()
-        .login(monde_diplomatique::Login::Cookies {
-            lmd_a_m,
-            phpsessid,
-            spip_session,
-        })
-        .build()?;
-
-    let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
-    let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
-
-    let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
-    let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
-
-    let courrier_international = CourrierInternational::builder()
-        .login(courrier_international::Login::Cookies { lmd_a_m, ssess })
-        .build()?;
-
-    Ok(vec![
-        Box::new(mediapart),
-        Box::new(monde_diplo),
-        Box::new(courrier_international),
-    ])
-}
+type Newspapers<'a> = Vec<Box<&'a dyn Newspaper>>;
 
 #[derive(Default)]
-pub struct Builder {
+pub struct ArticleLocationBuilder<'a> {
     url: Option<Url>,
-    newspapers: Option<Newspapers>,
+    newspapers: Option<Newspapers<'a>>,
 }
 
-impl Builder {
+impl<'a> ArticleLocationBuilder<'a> {
     pub fn new() -> Self {
         Self::default()
     }
@@ -91,20 +25,21 @@ impl Builder {
     /// # Errors
     ///
     /// An error is returned if the could not be converted into an url
+    // TODO: move this to a defined error, remove anyhow !
     pub fn url<U, E>(mut self, url: U) -> Result<Self>
     where
         U: TryInto<Url, Error = E> + Send,
         E: std::error::Error + Sync + Send + 'static,
     {
-        let url = url.try_into().map_err(|_| Error::MalformedUrl)?;
+        let url = url.try_into()?;
         self.url = Some(url);
         Ok(self)
     }
 
     /// Adds a newspaper to the list
-    pub fn newspaper<T>(mut self, newspaper: T) -> Self
+    pub fn newspaper<T>(&mut self, newspaper: &'a T) -> &mut Self
     where
-        T: 'static + Newspaper,
+        T: 'a + Newspaper,
     {
         match &mut self.newspapers {
             Some(newspapers) => newspapers.push(Box::new(newspaper)),
@@ -114,13 +49,18 @@ impl Builder {
     }
 
     /// Adds several newspapers to the list of accepted newspapers
-    pub fn newspapers(mut self, newspapers: Newspapers) -> Self {
-        match &mut self.newspapers {
-            Some(current_newspapers) => current_newspapers.extend(newspapers),
-            None => self.newspapers = Some(newspapers.into_iter().collect::<Vec<_>>()),
-        };
-        self
-    }
+    //fn newspapers(&mut self, newspapers: Newspapers) -> Result<&mut Self> {
+    //    let newspapers = match &self.newspapers {
+    //        Some(current_newspapers) => newspapers
+    //            .iter()
+    //            .chain(current_newspapers.iter())
+    //            .map(|s| *(s.clone()))
+    //            .collect::<Newspapers>(),
+    //        None => newspapers.into_iter().collect::<Vec<_>>(),
+    //    };
+    //    self.newspapers = Some(newspapers);
+    //    Ok(self)
+    //}
 
     /// Builds the ArticleLocation by looking which newspaper
     ///
@@ -131,32 +71,41 @@ impl Builder {
     /// - no newpspaper is given
     /// - the url is not set
     /// - the given url has no host
-    pub fn build(self) -> Result<ArticleLocation> {
-        let url = Clone::clone(self.url.as_ref().ok_or(Error::NoUrl)?);
-        let host = url.host_str().ok_or(Error::MalformedUrl)?;
-        let host = Host::parse(host).map_err(|_| Error::MalformedUrl)?;
+    // TODO: move this to a defined error, remove anyhow !
+    pub fn build(&self) -> Result<ArticleLocation<'a>> {
+        let url = Clone::clone(self.url.as_ref().ok_or(anyhow!(
+            "No url set. You can set it with the url() function"
+        ))?);
+        let host = url.host_str().ok_or(anyhow!("Given url has no host"))?;
+        let host = Host::parse(host)?;
         let newspaper = self
             .newspapers
-            .unwrap_or(default_newpapers()?)
-            .into_iter()
+            .as_ref()
+            .ok_or(anyhow!(
+                "A list of NewsPaper must be set. It can be set with newspapers() function"
+            ))?
+            .iter()
             .find(|c| c.metadata().hosts.contains(&host))
-            .ok_or(Error::UnknownNewspaper)?;
-        Ok(ArticleLocation { newspaper, url })
+            .ok_or(anyhow!("Newspaper couldn't be found"))?;
+        Ok(ArticleLocation {
+            newspaper: newspaper.clone(),
+            url,
+        })
     }
 }
 
-pub struct ArticleLocation {
-    newspaper: Box<dyn Newspaper>,
+pub struct ArticleLocation<'a> {
+    newspaper: Box<&'a dyn Newspaper>,
     pub url: Url,
 }
 
-impl ArticleLocation {
-    pub fn builder() -> Builder {
-        Builder::new()
+impl<'a> ArticleLocation<'a> {
+    pub fn builder() -> ArticleLocationBuilder<'a> {
+        ArticleLocationBuilder::new()
     }
 
     pub async fn retrieve_html(&self) -> Result<String> {
-        // TODO: modify when retrieve_html returns a specific Error type
-        Ok(self.newspaper.retrieve_html(&self.url).await?)
+        info!("It will download from {}", self.url);
+        self.newspaper.retrieve_html(&self.url).await
     }
 }
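
With the development-side API above, the builder falls back to `default_newpapers()`, so fetching an article reduces to a short sketch (assuming the cookie environment variables from the configuration reference are set):

```rust
use crieur_retrieve::{ArticleLocation, Url};

// Sketch of the development-side flow; newspapers default to the ones
// configured through environment variables (MEDIAPART_COOKIE, ...).
async fn fetch(url: Url) -> anyhow::Result<String> {
    let html = ArticleLocation::builder()
        .url(url)? // fails with Error::MalformedUrl on a bad url
        .build()? // falls back to default_newpapers()
        .retrieve_html()
        .await?;
    Ok(html)
}
```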


@@ -8,9 +8,10 @@ pub use tools::{Download, Downloader};
 pub mod newspaper;
 
 // TODO: move to another crate
-pub mod newspapers;
+mod newspapers;
+pub use newspapers::Mediapart;
 
-pub mod article_location;
+mod article_location;
 pub use article_location::ArticleLocation;
 
 mod consts;


@@ -1,10 +1,17 @@
 use anyhow::Result;
 use async_trait::async_trait;
+use derive_builder::Builder;
 use url::Host;
 pub use url::Url;
 
+enum Login {
+    Username(String, String),
+    Cookie(String),
+}
+
 /// Contains metadata about a newspaper
-#[derive(Debug, PartialEq, Default, derive_builder::Builder)]
+// TODO: provide builder
+#[derive(Debug, PartialEq, Default, Builder)]
 #[builder(default)]
 pub struct Metadata {
     /// The hosts that can be corresponds to this newspaper
@@ -21,14 +28,13 @@ pub struct Metadata {
 }
 
 impl Metadata {
-    /// Get metadata builder
     pub fn builder() -> MetadataBuilder {
         MetadataBuilder::default()
     }
 }
 
 #[async_trait]
-pub trait Newspaper: Send + Sync {
+pub trait Newspaper {
     /// Returns a list of hosts that corresponds to the newspapers
     fn metadata(&self) -> Metadata;
 
@@ -43,7 +49,7 @@ pub trait Newspaper: Send + Sync {
     }
 
     /// Returns a newspaper structure
-    fn new() -> Self
+    async fn new() -> Self
     where
         Self: Sized;
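
For reference, a minimal sketch of what implementing the development-side trait looks like for a hypothetical newspaper; the host and the returned html are placeholders, and the real implementations live in the `newspapers/` modules:

```rust
use anyhow::Result;
use async_trait::async_trait;
use url::Host;

use crate::newspaper::{Metadata, Newspaper};
use crate::Url;

// Placeholder newspaper, for illustration only.
#[derive(Debug, Clone, Default)]
struct ExampleNewspaper;

#[async_trait]
impl Newspaper for ExampleNewspaper {
    fn metadata(&self) -> Metadata {
        Metadata::builder()
            .hosts(vec![Host::Domain("example.com".to_string())])
            .lower_case_name("example")
            .name("Example")
            .build()
            .unwrap_or_default()
    }

    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        // A real implementation downloads the page with its login cookies
        // and post-processes it; this placeholder just echoes the url.
        Ok(format!("<html><!-- {} --></html>", url))
    }

    fn new() -> Self {
        Self::default()
    }

    async fn has_complete_access(&self) -> bool {
        true
    }
}
```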


@@ -1,144 +0,0 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use indoc::indoc;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
pub enum Login {
Username(String, String),
Cookies { lmd_a_m: String, ssess: String },
}
#[derive(Debug, Clone, Default)]
pub struct CourrierInternational {
login_cookies: Vec<(String, String)>,
}
fn str_to_host<S: Into<String>>(host: S) -> Host {
Host::Domain(host.into())
}
#[derive(Debug, Clone, Default)]
pub struct Builder {
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
pub fn login(&mut self, login: Login) -> &mut Self {
self.login_cookies = match login {
Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented")
}
Login::Cookies { lmd_a_m, ssess } => Some(vec![
("lmd_a_m".into(), lmd_a_m),
("SSESS862c7003d721c672d39f161b1456b890".into(), ssess),
]),
};
self
}
pub fn build(&self) -> Result<CourrierInternational> {
match &self.login_cookies {
Some(login_cookies) => Ok(CourrierInternational {
login_cookies: login_cookies.clone(),
}),
None => Err(anyhow!("You have to log in to access this newspaper")),
}
}
}
#[async_trait]
impl Newspaper for CourrierInternational {
fn metadata(&self) -> Metadata {
Metadata::builder()
.hosts(vec![
str_to_host("courrierinternational.com"),
str_to_host("www.courrierinternational.com"),
])
.lower_case_name("courrier-international")
.name("Courrier international")
.build()
.unwrap_or_default()
}
async fn retrieve_html(&self, url: &Url) -> Result<String> {
let cookies = self
.login_cookies
.iter()
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
.collect::<Vec<_>>();
// TODO: replace by builder
let downloader = Downloader { cookies };
let body = downloader.download(&url).await?;
let html = match body {
Some(body) => String::from_utf8(body.to_vec())?,
None => bail!("404 not found"),
};
let elements_to_remove = &[
// navigation elements
"header.site-header",
"footer.site-footer",
// Social buttons
"#toolbox-share",
".toolbox-share",
".toolbox-print",
".toolbox-respond",
".toolbox-zen",
".toolbox-newsletter",
".toolbox-offer",
".box-article-offer-friend-abo",
// unused services
".article-aside",
".article-secondary",
".article-subject-readmore",
// misc
".element-invisible",
".gptcontainer",
];
// FIXME: it doesn't work because the aside is in the article body
//
let toolbox_style = indoc! {"
aside.article-toolbox {
position: sticky;
top: 1em;
}
"};
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove,
styles_to_add: &[toolbox_style],
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html)
}
fn new() -> Self {
Self {
..Default::default()
}
}
async fn has_complete_access(&self) -> bool {
// TODO: check if we are logged using the cookie
true
}
}
impl CourrierInternational {
pub fn builder() -> Builder {
Builder::default()
}
}


@@ -1,4 +1,4 @@
-use anyhow::{anyhow, bail, Result};
+use anyhow::Result;
 use async_trait::async_trait;
 use cookie::Cookie;
 use url::Host;
@@ -8,46 +8,16 @@ use crate::tools;
 use crate::Url;
 use crate::{Download, Downloader};
 
-pub enum Login {
-    Username(String, String),
-    Mpruuid(String),
-}
-
 #[derive(Debug, Clone, Default)]
 pub struct Mediapart {
-    login_cookie: (String, String),
+    // TODO: remove this pub !!
+    pub login_cookie: Option<(String, String)>,
 }
 
 fn str_to_host<S: Into<String>>(host: S) -> Host {
     Host::Domain(host.into())
 }
 
-#[derive(Debug, Clone, Default)]
-pub struct Builder {
-    login_cookie: Option<(String, String)>,
-}
-
-impl Builder {
-    pub fn login(&mut self, login: Login) -> &mut Self {
-        self.login_cookie = match login {
-            Login::Username(_username, _password) => {
-                unimplemented!("login using username and passwond not implemented")
-            }
-            Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
-        };
-        self
-    }
-
-    pub fn build(&self) -> Result<Mediapart> {
-        match &self.login_cookie {
-            Some(login_cookie) => Ok(Mediapart {
-                login_cookie: login_cookie.clone(),
-            }),
-            None => Err(anyhow!("You have to log in to access this newspaper")),
-        }
-    }
-}
-
 #[async_trait]
 impl Newspaper for Mediapart {
     fn metadata(&self) -> Metadata {
@@ -71,22 +41,22 @@ impl Newspaper for Mediapart {
         let mut url = url.clone();
         url.set_query(Some(&query));
 
-        let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
-            .secure(true)
-            .finish();
-        let cookies = vec![cookie];
+        // TODO: add "?onglet=full" to the url if not
+        let cookies = if let Some((name, value)) = &self.login_cookie {
+            let cookie = Cookie::build(name, value).secure(true).finish();
+            vec![cookie]
+        } else {
+            vec![]
+        };
 
         // TODO: replace by builder
         let downloader = Downloader { cookies };
         let body = downloader.download(&url).await?;
-        let html = match body {
-            Some(body) => String::from_utf8(body.to_vec())?,
-            None => bail!("404 not found"),
-        };
+        let html = String::from_utf8(body.to_vec())?;
 
         // TODO: Move to const
-        let elements_to_remove = &[
+        let element_to_remove = [
             // header
             ".fb-root",
             ".skipLinks",
@@ -104,18 +74,13 @@ impl Newspaper for Mediapart {
             "aside.cc-modal",
         ];
 
-        let single_page_html = tools::self_contained_html::Config {
-            downloader: Some(&downloader),
-            base_url: Some(&url),
-            elements_to_remove,
-            ..Default::default()
-        }
-        .run(&html)
-        .await;
+        // TODO: correction of usage of relative urls, and replace "" by the url
+        let single_page_html =
+            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
         Ok(single_page_html)
     }
 
-    fn new() -> Self {
+    async fn new() -> Self {
         Self {
             ..Default::default()
         }
@@ -126,9 +91,3 @@ impl Newspaper for Mediapart {
         true
     }
 }
-
-impl Mediapart {
-    pub fn builder() -> Builder {
-        Builder::default()
-    }
-}


@@ -1,3 +1,3 @@
-pub mod courrier_international;
-pub mod mediapart;
-pub mod monde_diplomatique;
+mod mediapart;
+
+pub use mediapart::Mediapart;


@@ -1,137 +0,0 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
pub enum Login {
Username(String, String),
Cookies {
lmd_a_m: String,
phpsessid: String,
spip_session: String,
},
}
#[derive(Debug, Clone, Default)]
pub struct MondeDiplo {
login_cookies: Vec<(String, String)>,
}
fn str_to_host<S: Into<String>>(host: S) -> Host {
Host::Domain(host.into())
}
#[derive(Debug, Clone, Default)]
pub struct Builder {
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
pub fn login(&mut self, login: Login) -> &mut Self {
self.login_cookies = match login {
Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented")
}
Login::Cookies {
lmd_a_m,
phpsessid,
spip_session,
} => Some(vec![
("lmd_a_m".into(), lmd_a_m),
("PHPSESSID".into(), phpsessid),
("spip_session".into(), spip_session),
]),
};
self
}
pub fn build(&self) -> Result<MondeDiplo> {
match &self.login_cookies {
Some(login_cookies) => Ok(MondeDiplo {
login_cookies: login_cookies.clone(),
}),
None => Err(anyhow!("You have to log in to access this newspaper")),
}
}
}
#[async_trait]
impl Newspaper for MondeDiplo {
fn metadata(&self) -> Metadata {
Metadata::builder()
.hosts(vec![
str_to_host("monde-diplomatique.fr"),
str_to_host("www.monde-diplomatique.fr"),
])
.lower_case_name("monde-diplomatique")
.name("Le Monde Diplomatique")
.build()
.unwrap_or_default()
}
async fn retrieve_html(&self, url: &Url) -> Result<String> {
let cookies = self
.login_cookies
.iter()
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
.collect::<Vec<_>>();
// TODO: replace by builder
let downloader = Downloader { cookies };
let body = downloader.download(&url).await?;
let html = match body {
Some(body) => String::from_utf8(body.to_vec())?,
None => bail!("404 not found"),
};
// TODO: Move to const
let elements_to_remove = &[
// navigation elements
"#tout-en-haut.preentete",
"#entete.connecte",
"#navigation",
"#pied",
".bloc-connexion",
// unused features
"#ecouter",
// Social buttons
".actions-article",
"#partage",
// misc
"noscript",
];
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html)
}
fn new() -> Self {
Self {
..Default::default()
}
}
async fn has_complete_access(&self) -> bool {
// TODO: check if we are logged using the cookie
true
}
}
impl MondeDiplo {
pub fn builder() -> Builder {
Builder::default()
}
}


@@ -4,7 +4,7 @@ use anyhow::Result;
 use async_trait::async_trait;
 use bytes::Bytes;
 use cookie::Cookie;
-use hyper::{header, Body, Client, Method, Request, StatusCode};
+use hyper::{header, Body, Client, Method, Request};
 use thiserror::Error;
 use url::Url;
 
@@ -22,9 +22,7 @@ pub trait Download {
     type Error: StdError;
 
     /// Downloads a file from an url and returns the result as bytes
-    ///
-    /// If the file is not found, returns None
-    async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>;
+    async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>;
 }
 
 /// Store several cookies
@@ -38,8 +36,7 @@ pub struct Downloader<'c> {
 impl<'c> Download for Downloader<'c> {
     type Error = DownloadError;
 
-    async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
-        log::debug!("downloading url {:?}", file_link);
+    async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> {
         let https = hyper_rustls::HttpsConnector::with_native_roots();
         let client: Client<_, hyper::Body> = Client::builder().build(https);
 
@@ -47,26 +44,14 @@ impl<'c> Download for Downloader<'c> {
             .method(Method::GET)
             .uri(file_link.as_str());
 
-        req = req.header(
-            header::COOKIE,
-            self.cookies
-                .iter()
-                .map(Cookie::to_string)
-                .collect::<Vec<_>>()
-                .join(";"),
-        );
-        log::debug!("headers : {:?}", req.headers_ref());
+        for cookie in &self.cookies {
+            req = req.header(header::COOKIE, cookie.to_string());
+        }
 
         let req = req.body(Body::empty())?;
         let resp = client.request(req).await?;
-        log::debug!("Response status : {:?}", resp.status());
-        let body = match resp.status() {
-            StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
-            StatusCode::NOT_FOUND => None,
-            // TODO: enhance this by handling more error codes
-            _ => None,
-        };
+        let body = hyper::body::to_bytes(resp).await?;
 
         Ok(body)
     }
 }
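
A short sketch of how the newspaper modules drive this downloader on the development side; the cookie name and url are placeholders, and the development-side `download` returns `Ok(None)` on a 404:

```rust
use cookie::Cookie;
use url::Url;

use crate::{Download, Downloader};

// Sketch only: build a downloader with one session cookie and fetch a page.
async fn fetch_page() -> anyhow::Result<()> {
    let cookie = Cookie::build("SESSION", "placeholder-value").finish();
    let downloader = Downloader { cookies: vec![cookie] };

    let url = Url::parse("https://example.com/article")?;
    if let Some(body) = downloader.download(&url).await? {
        println!("downloaded {} bytes", body.len());
    }
    Ok(())
}
```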


@@ -1,4 +1,5 @@
 mod download;
-pub mod self_contained_html;
+mod self_contained_html;
 
 pub use download::{Download, DownloadError, Downloader};
+pub use self_contained_html::self_contained_html;


@@ -8,198 +8,142 @@ use url::Url;
 use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
 use crate::Download;
 
-/// Stores configuration for the self_contained_html function
-// TODO: write a builder
-pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
+/// Makes an html page self-contained
+///
+/// The `downloader` must implement `Download` and is used to download ressources that are
+/// needed to make this page self-contained such as stylesheets or images.
+///
+/// The function also removes all scripts on the page
+pub async fn self_contained_html<E, D>(
+    html: impl AsRef<str>,
+    downloader: &D,
+    base_url: &Url,
+    elements_to_remove: &[impl AsRef<str>],
+) -> String
 where
     E: std::error::Error,
     D: Download<Error = E> + Send,
-    S1: AsRef<str>,
-    S2: AsRef<str>,
 {
-    /// the downloader that will be used to retrieve ressources on the page
-    pub downloader: Option<&'t D>,
-    /// Base url for downloading ressources, it probably the
-    pub base_url: Option<&'t Url>,
-    pub elements_to_remove: &'t [S1],
-    pub styles_to_add: &'t [S2],
-}
-
-impl<'t, E, D> Default for Config<'t, E, D>
-where
-    E: std::error::Error,
-    D: Download<Error = E> + Send,
-{
-    fn default() -> Self {
-        Self {
-            downloader: None,
-            base_url: None,
-            elements_to_remove: &[],
-            styles_to_add: &[],
-        }
-    }
-}
-
-impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
-where
-    E: std::error::Error,
-    D: Download<Error = E> + Send,
-    S1: AsRef<str>,
-    S2: AsRef<str>,
-{
-    /// Makes an html page self-contained
-    ///
-    /// The `downloader` must implement `Download` and is used to download ressources that are
-    /// needed to make this page self-contained such as stylesheets or images.
-    ///
-    /// The function also removes all scripts on the page
-    pub async fn run(&self, html: impl AsRef<str>) -> String {
-        //TODO: don't panic
-        let base_url = self.base_url.expect("Base url not defined");
-        let downloader = self.downloader.expect("Downloader not defined");
-        // TODO: split/refactor this function :
-        // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
-        // - put each modification (ex: style in the `foreach`) in functions, maybe using
-        //   (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
-        // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
-        let (style_urls, html) = {
-            let document = Document::from(html.as_ref());
-
-            // ---- Remove scripts ----
-            //
-            document.select("script").remove();
-
-            for event in EVENT_HANDLERS {
-                document
-                    .select(format!("[{}]", event).as_str())
-                    .remove_attr(event);
-            }
-
-            for rel in LINK_REL_EXTERNAL_RESOURCES {
-                document
-                    .select(format!("link[rel=\"{}\"]", rel).as_str())
-                    .remove();
-            }
-
-            // ---- Replace stylesheets ----
-            //
-            let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
-            let styles_url = stylesheets
-                .iter()
-                .map(|style_link| {
-                    if let Some(src) = style_link.attr("href") {
-                        base_url.join(src.as_ref()).ok()
-                    } else {
-                        None
-                    }
-                })
-                .collect::<Vec<_>>();
-            (styles_url, String::from(document.html()))
-        };
-
-        let style_urls = style_urls.into_iter().map(|style_url| {
-            OptionFuture::from(
-                style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
-            )
-        });
-        let downloaded_styles = futures::future::join_all(style_urls).await;
-
-        let html = {
-            let document = Document::from(&html);
-            let styles = document.select("link[href][rel=\"stylesheet\"]");
-
-            styles
-                .iter()
-                .zip(downloaded_styles.iter())
-                .for_each(|(mut style_link, inner_css)| {
-                    if let Some(Some(inner_css)) = inner_css {
-                        let css = String::from_utf8(inner_css.to_vec()).unwrap();
-                        let media_query = style_link.attr("media");
-                        let css = match media_query {
-                            Some(media_query) => {
-                                format!("<style media=\"{}\">{}</style>", media_query, css)
-                            }
-                            None => format!("<style>{}</style>", css),
-                        };
-                        style_link.replace_with_html(css);
-                    } else {
-                        style_link.remove();
-                    }
-                });
-            String::from(document.html())
-        };
-
-        // ---- Replace imgs ----
-        //
-        let image_urls = {
-            let document = Document::from(&html);
-            let imgs = document.select("img:not([src^=\"data:\"])");
-
-            imgs.iter()
-                .map(|image| {
-                    if let Some(src) = image.attr("src") {
-                        base_url.join(src.as_ref()).ok()
-                    } else {
-                        None
-                    }
-                })
-                .collect::<Vec<_>>()
-        };
-
-        let downloaded_images = image_urls.into_iter().map(|image_url| {
-            OptionFuture::from(image_url.map(|url| async move {
-                let data = downloader.download(&url).await.unwrap();
-                (url, data)
-            }))
-        });
-        let downloaded_images = futures::future::join_all(downloaded_images).await;
-
-        let html = {
-            let document = Document::from(&html);
-            let imgs = document.select("img:not([src^=\"data:\"])");
-
-            imgs.iter()
-                .zip(downloaded_images.iter())
-                .for_each(|(mut img, data)| {
-                    if let Some((url, Some(data))) = data {
-                        let data = base64::encode(data);
-                        //TODO: use an extension hashmap
-                        let extension =
-                            Path::new(url.path()).extension().unwrap().to_str().unwrap();
-                        img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
-                    } else {
-                        img.remove()
-                    }
-                });
-
-            // ---- Remove unwanted html elements -----
-            //
-            for element in self.elements_to_remove {
-                document.select(element.as_ref()).remove();
-            }
-
-            // ---- Add additional styles ----
-            //
-            for style in self.styles_to_add {
-                document
-                    .select("head")
-                    .append_html(format!("\n<style>{}</style>\n", style.as_ref()));
-            }
-
-            String::from(document.html())
-        };
-
-        // ---- output ----
-        //
-        let mut minifier = HTMLMinifier::new();
-        minifier.digest(html.as_str()).unwrap();
-
-        String::from_utf8(minifier.get_html().into()).unwrap()
-    }
+    // TODO: split/refactor this function :
+    // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
+    // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
+    // - ¿ should be function of a trait ? or only of the configuration struct ?
+    let (style_urls, html) = {
+        let document = Document::from(html.as_ref());
+
+        // ---- Remove scripts ----
+        //
+        document.select("script").remove();
+
+        for event in EVENT_HANDLERS {
+            document
+                .select(format!("[{}]", event).as_str())
+                .remove_attr(event);
+        }
+
+        for rel in LINK_REL_EXTERNAL_RESOURCES {
+            document
+                .select(format!("link[rel=\"{}\"]", rel).as_str())
+                .remove();
+        }
+
+        // ---- Replace stylesheets ----
+        //
+        let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
+        let styles_url = stylesheets
+            .iter()
+            .map(|stylesheet| {
+                if let Some(src) = stylesheet.attr("href") {
+                    //TODO: does it work with absolute urls ?
+                    base_url.join(src.as_ref()).ok()
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>();
+        (styles_url, String::from(document.html()))
+    };
+
+    let style_urls = style_urls.into_iter().map(|style_url| {
+        OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
+    });
+    let downloaded_styles = futures::future::join_all(style_urls).await;
+
+    let html = {
+        let document = Document::from(&html);
+        let styles = document.select("link[href][rel=\"stylesheet\"]");
+
+        styles
+            .iter()
+            .zip(downloaded_styles.iter())
+            .for_each(|(mut stylesheet, inner_css)| {
+                if let Some(inner_css) = inner_css {
+                    let css = String::from_utf8(inner_css.to_vec()).unwrap();
+                    let css = format!("<style>{}</style>", css);
+                    stylesheet.replace_with_html(css);
+                } else {
+                    stylesheet.remove();
+                }
+            });
+        String::from(document.html())
+    };
+
+    // ---- Replace imgs ----
+    //
+    let image_urls = {
+        let document = Document::from(&html);
+        let imgs = document.select("img");
+
+        imgs.iter()
+            .map(|image| {
+                if let Some(src) = image.attr("src") {
+                    base_url.join(src.as_ref()).ok()
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>()
+    };
+
+    let downloaded_images = image_urls.into_iter().map(|image_url| {
+        OptionFuture::from(image_url.map(|url| async move {
+            let data = downloader.download(&url).await.unwrap();
+            (url, data)
+        }))
+    });
+    let downloaded_images = futures::future::join_all(downloaded_images).await;
+
+    let html = {
+        let document = Document::from(&html);
+        let imgs = document.select("img");
+
+        imgs.iter()
+            .zip(downloaded_images.iter())
+            .for_each(|(mut img, data)| {
+                if let Some((url, data)) = data {
+                    let data = base64::encode(data);
+                    let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
+                    img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
+                }
+            });
+
+        // ---- Remove unwanted html elements -----
+        //
+        for element in elements_to_remove {
+            document.select(element.as_ref()).remove();
+        }
+        String::from(document.html())
+    };
+
+    // ---- output ----
+    //
+    let mut minifier = HTMLMinifier::new();
+    minifier.digest(html.as_str()).unwrap();
+
+    String::from_utf8(minifier.get_html().into()).unwrap()
 }
 
 #[cfg(test)]
 mod tests {
-    // TODO: reduce boilerplate, DRY
     use super::*;
@@ -224,8 +168,8 @@ mod tests {
     #[async_trait]
     impl Download for DummyDownloader {
         type Error = errors::Error;
-        async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
-            Ok(Some(Bytes::from("")))
+        async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
+            Ok(Bytes::from(""))
         }
     }
@@ -234,14 +178,9 @@ mod tests {
         let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
         let base_url = Url::parse("http://example.com")?;
         let downloader = DummyDownloader {};
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            Config {
-                downloader: Some(&downloader),
-                base_url: Some(&base_url),
-                ..Default::default()
-            }
-            .run(html)
-            .await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             "<html><head></head><body></body></html>"
         );
         Ok(())
@@ -265,13 +204,10 @@ mod tests {
         };
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         for s in EVENT_HANDLERS {
             assert_eq!(
-                Config {
-                    downloader: Some(&downloader),
-                    base_url: Some(&base_url),
-                    ..Default::default()
-                }.run(html(s)).await,
+                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
                 "<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
             );
         }
@@ -296,15 +232,10 @@ mod tests {
         };
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
        for s in LINK_REL_EXTERNAL_RESOURCES {
             assert_eq!(
-                Config {
-                    downloader: Some(&downloader),
-                    base_url: Some(&base_url),
-                    ..Default::default()
-                }
-                .run(html(s))
-                .await,
+                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
                 "<html><head>\n</head>\n<body>\n</body></html>"
             );
         }
@@ -315,14 +246,12 @@ mod tests {
     #[async_trait]
     impl Download for CssDownloader {
         type Error = errors::Error;
-        async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
-            Ok(Some(
-                indoc! {"
+        async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
+            Ok(indoc! {"
                 section#warning {
                     color: red;
                 }"}
-                .into(),
-            ))
+            .into())
         }
     }
@@ -357,57 +286,9 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            Config {
-                downloader: Some(&downloader),
-                base_url: Some(&base_url),
-                ..Default::default()
-            }
-            .run(html)
-            .await,
-            minified
-        );
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn download_css_with_media_query() -> Result<()> {
-        let downloader = CssDownloader {};
-
-        let html = indoc! {"
-            <html>
-            <head>
-                <link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
-            </head>
-            <body>
-            </body>
-            </html>
-        "};
-
-        let wanted_html = indoc! {"
-            <html><head>
-            <style media=\"print\">
-            section#warning {
-                color: red;
-            }
-            </style>
-            </head>
-            <body>
-            </body></html>
-        "};
-        let mut minifier = HTMLMinifier::new();
-        minifier.digest(wanted_html)?;
-        let minified = String::from_utf8(minifier.get_html().into())?;
-
-        let base_url = Url::parse("http://example.com")?;
-        assert_eq!(
-            Config {
-                downloader: Some(&downloader),
-                base_url: Some(&base_url),
-                ..Default::default()
-            }
-            .run(html)
-            .await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             minified
         );
         Ok(())
@@ -417,12 +298,12 @@ mod tests {
     #[async_trait]
     impl Download for PngDownloader {
         type Error = errors::Error;
-        async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
+        async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
             let image_path = Path::new("test_data/home.png");
             let mut image_file = File::open(&image_path).unwrap();
             let mut image_buf: Vec<u8> = vec![];
             image_file.read_to_end(&mut image_buf).unwrap();
-            Ok(Some(image_buf.into()))
+            Ok(image_buf.into())
         }
     }
@@ -452,14 +333,9 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
         let base_url = Url::parse("http://example.com")?;
+        let to_remove: &[&str] = &[];
         assert_eq!(
-            Config {
-                downloader: Some(&downloader),
-                base_url: Some(&base_url),
-                ..Default::default()
-            }
-            .run(html)
-            .await,
+            self_contained_html(html, &downloader, &base_url, to_remove).await,
             minified
         );
         Ok(())
@@ -492,67 +368,12 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         assert_eq!(
-            Config {
-                downloader: Some(&downloader),
-                base_url: Some(&base_url),
-                elements_to_remove: &["header", ".placeholder", "article > span.huge"],
-                ..Default::default()
-            }
-            .run(html)
-            .await,
-            minified
-        );
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn add_style() -> Result<()> {
-        let html = indoc! {"
-            <html>
-            <head>
-                <meta charset=\"UTF-8\">
-            </head>
-            <body>
-            The body
-            </body>
-            </html>
-        "};
-
-        let wanted_html = indoc! {"
-            <html><head>
-            <meta charset=\"UTF-8\">
-            <style>
-            body {
-                margin: 3em;
-            }
-            </style>
-            </head>
-            <body>
-            The body
-            </body></html>
-        "};
-
-        let style_to_add = indoc! {"
-            body {
-                margin: 3em;
-            }
-        "};
-
-        let base_url = Url::parse("http://example.com")?;
-        let downloader = DummyDownloader {};
-        let mut minifier = HTMLMinifier::new();
-        minifier.digest(wanted_html)?;
-        let minified = String::from_utf8(minifier.get_html().into())?;
-
-        assert_eq!(
-            Config {
-                downloader: Some(&downloader),
-                base_url: Some(&base_url),
-                styles_to_add: &[style_to_add],
-                ..Default::default()
-            }
-            .run(html)
-            .await,
+            self_contained_html(
+                html,
+                &downloader,
+                &base_url,
+                &["header", ".placeholder", "article > span.huge"]
+            )
+            .await,
             minified
         );
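
On the development side the entry point is the `Config` struct rather than the free function; a minimal sketch of a typical call, mirroring how the newspaper modules use it (the selector list is a placeholder):

```rust
// Sketch of the development-side call; ".ads" is a placeholder selector.
let single_page_html = tools::self_contained_html::Config {
    downloader: Some(&downloader),
    base_url: Some(&url),
    elements_to_remove: &["header", ".ads"],
    ..Default::default()
}
.run(&html)
.await;
```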


@@ -1,98 +1,48 @@
----
-title: Scope of the project
----
+This project mainly aims at providing an unified interface for several newspapers. Side
+objectives are to provide web API and different clients like a webUI or chatbots.
 
-This project mainly aims at providing an unified interface for several
-newspapers. Side objectives are to provide web API and different clients like a
-webUI or chatbots.
+Several big components are planned for this project
 
-Several big components are planned for this project (it is an initial draft and
-may change later) :
-
-```plantuml
-@startuml
-
-frame "backend" {
-    [Retrieval tools] as retrieval_tools
-    [Article representation] as article_repr
-    [Automatic retrieval] as auto_retrieve
-    [Atom/RSS adapters] as rss
-    [Cache DB] as cache
-    [Newspaper\n(Mediapart, …)] as newspaper
-    () "Newspaper" as np_i
-    newspaper -up- np_i
-
-    [Article location] as article_location
-
-    [API] as api
-    () "API" as api_i
-    api -up- api_i
-
-    article_location ..> np_i
-
-    api -> article_location
-    api -> rss
-
-    newspaper -> retrieval_tools: uses to implement
-
-    article_location --> article_repr: uses
-    retrieval_tools -up-> article_repr: uses
-    auto_retrieve --> rss: watches
-    auto_retrieve --> article_location
-    auto_retrieve --> cache: stores in
-}
-
-frame "Web ui" {
-    [Web UI] as webui
-    [HTML renderer] as html_rend
-    [Pdf exporter] as pdf_rend
-    [Articles] as articles
-    webui --> html_rend
-    webui --> pdf_rend
-    webui -> articles
-    articles ..> api_i
-}
-
-[Chatbot] as chatbot
-chatbot ..> api_i
-
-actor User
-User ..> webui
-User ..> chatbot
-
-actor "Newspaper programmer" as newspaper_programmer
-newspaper_programmer ..> newspaper: implements
-@enduml
+```dot
+digraph G {
+    rankdir=TB
+    node [shape=rectangle, style=filled, color="#779988"]
+
+    subgraph cluster_frontend {
+        color = transparent
+        webui
+        chatbot
+    }
+
+    webui -> api [color = red]
+    chatbot -> api [color = red]
+
+    subgraph cluster_backend {
+        label = "Backend\ncrieur binary"
+        labelloc = b
+        style=filled
+
+        retrieve_tools [label="retrieve-tools"]
+        retrieve_adapters [label="retrieve-adapters"]
+        retrieve [label="retrieve-interface"]
+        auto_retrieve [label="automatic-retrieve"]
+        article_repr [label="article-representation\nRepresentation for articles"]
+        api
+        cache [label="Cache database"]
+        rss [label="Atom/RSS adapters"]
+
+        retrieve_tools -> retrieve_adapters
+        retrieve_adapters -> retrieve
+        retrieve_tools -> retrieve
+        rss -> auto_retrieve
+        article_repr -> retrieve_adapters
+
+        retrieve -> api
+        auto_retrieve -> api
+        cache -> api
+    }
+}
 ```
 
 A task queue could be added later to space requests.
-
-# Implementation plan
-
-## Phase I
-
-- [x] `Newspaper` interface : use to retrieve from newspaper websites
-- [ ] minimal chatbot (uses libraries directly)
-- [x] `ArticleLocation` : library for using several `Newspaper` and retrieving from
-  a given url.
-
-## Phase II
-
-- [ ] Article Representation : having a (beta) unified representation for downloaded
-  articles
-- [ ] adding this representation to Newpsaper
-
-## Phase III
-
-- [ ] Cache
-- [ ] Atom/rss adapters
-- [ ] automatic retrieve
-
-## Phase IV
-
-- [ ] API
-- [ ] chatbot (uses api)
-
-## Phase V
-
-- [ ] web ui


@@ -1,36 +0,0 @@
---
title: Project tooling
---
# Container image
## Chatbot release
The [chatbot containerfile](../../containers/chatbot.containerfile) intend to
be the smaller possible in order to ease and reduce the storage needed in
registries.
In order to provide a minimal image, the rust-alpine container image is used.
This image uses the `x86_64-unknown-linux-musl` target that provides static
linking with `musl`.
However, the `olm-sys` couldn't be linked statically[^oml-sys-static-error].
The workaround have been to introduce the
`RUSTFLAGS=-Ctarget-feature=-crt-static` environment variable that disables
static linking.
The following lines have been added to copy the needed libraries.
```containerfile
COPY --from=build /usr/lib/libstdc++.so.6 /usr/lib/libstdc++.so.6
COPY --from=build /usr/lib/libgcc_s.so.1 /usr/lib/libgcc_s.so.1
COPY --from=build /lib/ld-musl-x86_64.so.1 /lib/ld-musl-x86_64.so.1
```
## Development
An image aimed at providing a development environment for developers may
be added later.
[^oml-sys-static-error]: with `oml-sys` v1.1.1, in march 2021


@@ -1,27 +0,0 @@
---
title: Build and run the chatbot
---
1. You must first configure matrix login, every variable in [the_reference](../reference/chatbot_configuration.md) is mandatory.
```env
CRIEUR_MATRIX_USER=user
CRIEUR_MATRIX_PASSWORD=password
CRIEUR_MATRIX_HOMESERVER=https://homeserv.er
CRIEUR_MATRIX_ROOMS=roomid1,roomid2,
```
You can put it in a `.env` file.
2. Run the chatbot
**Using `podman` (or another container tool, like `docker`)**
```
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
podman run --env-file .env --rm -i -t crieur-chatbot
```
**Using `cargo` (for development)**
```
cargo run --release --bin crieur-chatbot
```


@@ -1,17 +0,0 @@
---
Title: Chatbot configuration reference
---
The chatbot is configured using environment variables
CRIEUR_MATRIX_USER
: username of the matrix bot account
CRIEUR_MATRIX_PASSWORD
: password of the matrix bot account
CRIEUR_MATRIX_HOMESERVER
: homeserver of the matrix bot account
CRIEUR_MATRIX_ROOMS
: rooms in which to listen to events


@@ -1,31 +0,0 @@
---
title: Newspapers configuration
---
The newspapers are configured using environment variables
# Mediapart
MEDIAPART_COOKIE
: sets the `MPRUUID` cookie, used to log in
# Le Monde Diplomatique
All cookies are mandatory to log in
MONDE_DIPLO_LMD_A_M
: sets the `lmd_a_m` cookie
MONDE_DIPLO_PHPSESSID
: sets the `PHPSESSID` cookie
MONDE_DIPLO_SPIP_SESSION
: sets the `spip_session` cookie
# Courrier international
COURRIER_INTERNATIONAL_LMD_A_M
: sets the `lmd_a_m` cookie
COURRIER_INTERNATIONAL_SSESS
: sets the `ssess` cookie
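
Put together, a `.env` covering all three newspapers would look like this sketch (every variable name is taken from the reference above; the cookie values are placeholders):

```env
MEDIAPART_COOKIE=placeholder
MONDE_DIPLO_LMD_A_M=placeholder
MONDE_DIPLO_PHPSESSID=placeholder
MONDE_DIPLO_SPIP_SESSION=placeholder
COURRIER_INTERNATIONAL_LMD_A_M=placeholder
COURRIER_INTERNATIONAL_SSESS=placeholder
```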


@@ -2,27 +2,34 @@ use std::convert::TryInto;
 use std::env;
 
 use anyhow::Result;
-use crieur_retrieve::{ArticleLocation, Url};
+use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
 use dotenv::dotenv;
 use log::info;
 
 #[tokio::main]
 async fn main() -> Result<()> {
     dotenv().ok();
-    tracing_subscriber::fmt()
-        .with_writer(std::io::stderr)
-        .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
-        .init();
+    env_logger::init();
 
     let url = match env::args().nth(1) {
         Some(url) => Url::parse(&url)?,
         None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
     };
 
+    // TODO: remove this in favor of default newspapers
+    let mut mediapart = Mediapart::new().await
+    //.login(USERNAME, PASSWORD)
+    //
+    ;
+    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
+
     info!("Trying to download article from {}", url);
 
     // TODO: shorten this, maybe an helper function ?
-    let article_location = ArticleLocation::builder().url(url)?.build()?;
+    let article_location = ArticleLocation::builder()
+        .url(url)?
+        .newspaper(&mediapart)
+        .build()?;
 
     let article_str = article_location.retrieve_html().await?;


@@ -1,29 +1,16 @@
 @build:
     cargo build
 
-@build-container:
-    podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
-
 @clean:
     cargo clean
 
 @run:
     cargo run
 
-@test:
-    cargo test --all
-
-@clippy:
-    cargo clippy
-
-@fmt:
-    cargo fmt
-
-@simulate-ci: fmt clippy test
-
 @audit:
     cargo audit
 
 @crev:
     cargo crev verify
-
-@verify: audit crev


@@ -1,11 +0,0 @@
use anyhow::Result;
use crieur_chatbot::run;
use dotenv::dotenv;
#[tokio::main]
async fn main() -> Result<()> {
env_logger::init();
dotenv().ok();
run().await?;
Ok(())
}

src/main.rs (new file)

@@ -0,0 +1,19 @@
use anyhow::Result;
use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
use dotenv::dotenv;
use std::env;
#[tokio::main]
async fn main() -> Result<()> {
dotenv().ok();
let mut mediapart = Mediapart::new().await
//.login(USERNAME, PASSWORD)
//
;
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
println!("{}", mediapart.retrieve_html(&url).await?);
Ok(())
}