Compare commits


No commits in common. "development" and "feature/mediapart_poc" have entirely different histories.

36 changed files with 471 additions and 2461 deletions

View File

@ -8,7 +8,6 @@ steps:
pull: true
errignore: true
commands:
- apt-get update && apt-get install -y cmake
- rustup component add rustfmt
- rustup component add clippy
- cargo clippy
@ -18,6 +17,5 @@ steps:
pull: true
errignore: true
commands:
- apt-get update && apt-get install -y cmake
- cargo test --all
- cargo build

View File

@ -1,6 +1,6 @@
---
name: "🐛 Bug report"
about: "For reporting bugs"
name: "Bug report"
about: "This template is for reporting a bug"
title: ""
labels:
- "type::bug"
@ -17,3 +17,6 @@ labels:
**Expected behavior**
*describe what you expected to happen*
**Configuration**
*paste the result of `stage --version`*

View File

@ -1,6 +1,6 @@
---
name: "🗣 Discussion"
about: "For discussion about the software, when you want to discuss about several conception possibilities"
name: "Design discussion"
about: "For discussion about the design of features in the application, when there are several possibilities for implementation"
title: ""
labels:
- "type::discussion"
@ -8,8 +8,12 @@ labels:
---
*describe the problem*
*briefly describe the problem*
## Requirements
*list the requirements that the feature has*
## Propositions
*(optional) explain the different implementations that you would propose*
*explain the different implementations that you would propose for the feature*

View File

@ -1,14 +1,15 @@
---
name: "💡 Feature request"
about: "For requesting a new feature, with an implementation plan"
name: "Feature request"
about: "This template is for requesting a new feature"
title: ""
labels:
- "type::enhancement"
- "type::feature"
- "status::review_needed"
---
*(if applicable) describe what problem or frustration you have currently*
*describe what you would like to be able to do, or what solution you would like*
*describe what you would like to be able to do, or what solution you would like (you can propose several)*
*(optional) additional context, comments*
*(optional) additional context, comments or implementation propositions*

View File

@ -1,5 +1,5 @@
---
name: "Ask a question"
name: "Ask a question"
about: "If you have a question about the usage of the libraries or the tool"
title: ""
labels:

View File

@ -1,5 +1,5 @@
---
name: "🚧 Refactor"
name: "Refactor"
about: "For refactoring propositions"
title: ""
labels:

Cargo.lock (generated, 1219 lines)

File diff suppressed because it is too large

View File

@ -2,7 +2,6 @@
members = [
"crieur-retrieve",
"crieur-chatbot",
]
@ -18,9 +17,7 @@ publish = false
[dependencies]
anyhow = "1.0.40"
crieur-retrieve = {version = "0.1", path="crieur-retrieve"}
crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
dotenv = "0.15.0"
env_logger = "0.8.3"
log = "0.4.14"
tokio = { version = "1.6.0", features = ["full"] }
tracing-subscriber = "0.2.18"
tokio = { version = "1.5.0", features = ["full"] }

View File

@ -1,41 +1,17 @@
Tools to retrieve articles from multiple newspapers you subscribe to, all from
the same place.
Tools to retrieve articles from multiple newspapers you subscribe to.
**This is a prototype, it isn't stable at all and you may not want to use it if
you expect it to just work!**
**This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work!**
# How to use it
First retrieve login cookies for websites and put them in a `.env` file, as
explained in the [newspaper source configuration
documentation](./documentation/reference/newspaper_configuration.md)
Then you can run [an example](./examples/cli_downloader.rs) using
First retrieve login cookies for websites and put them in a `.env` file
```
cargo run --example=cli_downloader
cargo run --example=retrive_html_articles
```
You can also specify the URL using
```
cargo run --example=cli_downloader -- [your url]
```
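As a concrete sketch, the `.env` can contain entries like the following (the value is a placeholder; the exact keys are listed in the newspaper source configuration reference further down in this diff):
```
MEDIAPART_COOKIE=placeholder-mpruuid-value
```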
To know how to run the chatbot, please read the [chatbot
guide](./documentation/guides/run_chatbot.md)
# Documentation
1. Design
1. [Scope of the project and roadmap](./documentation/design/scope.md)
2. [Retrieve](./documentation/design/retrieve.md)
3. [Tooling](./documentation/design/tooling.md)
2. Guides
1. [Add a newspaper source](./documentation/guides/add_a_newspaper_source.md)
2. [Build and run the chatbot](./documentation/guides/run_chatbot.md)
3. Reference
1. [Newspaper source
configuration](./documentation/reference/newspaper_configuration.md)
2. [Chatbot
configuration](./documentation/reference/chatbot_configuration.md)
- 1. [Design](documentation/design/index.md)
- a. [Scope of the project](documentation/design/scope.md)
- b. [Retrieve](documentation/design/retrieve.md)

View File

@ -1,26 +0,0 @@
FROM docker.io/rust:1.51-alpine as build
WORKDIR /app
RUN apk add \
cmake \
musl-dev \
make \
g++ \
&& rustup target add x86_64-unknown-linux-musl
COPY Cargo.lock Cargo.toml ./
COPY crieur-chatbot crieur-chatbot
COPY crieur-retrieve crieur-retrieve
COPY src src
RUN RUSTFLAGS=-Ctarget-feature=-crt-static cargo build --target x86_64-unknown-linux-musl --release --bin=crieur-chatbot
FROM scratch
WORKDIR /
COPY --from=build /usr/lib/libstdc++.so.6 /usr/lib/libstdc++.so.6
COPY --from=build /usr/lib/libgcc_s.so.1 /usr/lib/libgcc_s.so.1
COPY --from=build /lib/ld-musl-x86_64.so.1 /lib/ld-musl-x86_64.so.1
COPY --from=build /app/target/x86_64-unknown-linux-musl/release/crieur-chatbot /crieur-chatbot
CMD ["./crieur-chatbot"]

View File

@ -1,21 +0,0 @@
[package]
name = "crieur-chatbot"
version = "0.1.0"
authors = ["koalp <koalp@alpaga.dev>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.40"
dotenv = "0.15.0"
crieur-retrieve = {version = "0.1.0", path = "../crieur-retrieve"}
mime = "0.3.16"
log = "0.4.14"
[dependencies.matrix-sdk]
git = "https://github.com/matrix-org/matrix-rust-sdk"
rev = "242d46c9a1bf40fa15b5892c2ee81cb0f4508ff4"
version = "0.2.0"
default-features = false
features = ["encryption", "rustls-tls", "require_auth_for_profile_requests"]

View File

@ -1,84 +0,0 @@
//! Chatbot
use std::convert::TryInto;
use anyhow::Result;
use matrix_sdk::{self, Client, SyncSettings};
use crate::Html;
#[derive(Debug, Clone, Default)]
pub(crate) struct Builder {
user: String,
password: String,
homeserver: String,
//TODO: rooms
rooms: Vec<String>,
}
impl Builder {
fn new() -> Self {
Default::default()
}
pub(crate) async fn connect(&self) -> Result<Chatbot> {
let client = Client::new(self.homeserver.as_str())?;
client
.login(self.user.as_str(), self.password.as_str(), None, None)
.await?;
assert!(client.logged_in().await);
for room in &self.rooms {
client.join_room_by_id(&room.as_str().try_into()?).await?;
}
Ok(Chatbot { client })
}
pub(crate) fn login(
&mut self,
user: &impl AsRef<str>,
password: &impl AsRef<str>,
) -> &mut Self {
self.user = user.as_ref().into();
self.password = password.as_ref().into();
self
}
pub(crate) fn homeserver(&mut self, homeserver: &impl AsRef<str>) -> &mut Self {
self.homeserver = homeserver.as_ref().into();
self
}
pub(crate) fn room(&mut self, room: impl AsRef<str>) -> &mut Self {
self.rooms.push(room.as_ref().into());
self
}
pub(crate) fn rooms(&mut self, rooms: Vec<String>) -> &mut Self {
for room in rooms {
self.room(room);
}
self
}
}
#[derive(Debug, Clone)]
pub(crate) struct Chatbot {
client: Client,
}
impl Chatbot {
pub(crate) fn builder() -> Builder {
Builder::new()
}
pub(crate) async fn run(&self) -> Result<()> {
self.client.set_event_handler(Box::new(Html::new())).await;
let mut settings = SyncSettings::default();
if let Some(token) = self.client.sync_token().await {
settings = settings.token(token);
}
self.client.sync(settings).await;
Ok(())
}
}

View File

@ -1,40 +0,0 @@
use std::env;
use anyhow::{bail, Result};
use dotenv::dotenv;
use crate::Chatbot;
/// Runs the chatbot
pub async fn run() -> Result<()> {
dotenv().ok();
let (user, password, homeserver, rooms) = match (
env::var("CRIEUR_MATRIX_USER"),
env::var("CRIEUR_MATRIX_PASSWORD"),
env::var("CRIEUR_MATRIX_HOMESERVER"),
env::var("CRIEUR_MATRIX_ROOMS"),
) {
(Ok(user), Ok(password), Ok(homeserver), Ok(rooms)) => (
user,
password,
homeserver,
rooms
.split(",")
.map(|s| s.to_string())
.collect::<Vec<String>>(),
),
_ => bail!("Configuration incomplete, please set all required environment variables"),
};
let chatbot = Chatbot::builder()
.login(&user, &password)
.homeserver(&homeserver)
.rooms(rooms)
.connect()
.await?;
chatbot.run().await?;
Ok(())
}

View File

@ -1,122 +0,0 @@
use std::convert::TryInto;
use log::error;
use matrix_sdk::{
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
EventHandler,
};
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};
pub(crate) struct Html {}
impl Html {
pub fn new() -> Self {
Self {}
}
}
async fn send_article<U, E>(url: U, room: matrix_sdk::room::Joined)
where
U: TryInto<Url, Error = E> + Send,
E: std::error::Error + Sync + Send + 'static,
{
//TODO: replace by async block when async block is stable
async fn article_html<U, E>(url: U) -> Result<String>
where
U: TryInto<Url, Error = E> + Send,
E: std::error::Error + Sync + Send + 'static,
{
let article_str = ArticleLocation::builder()
.url(url)?
.build()?
.retrieve_html()
.await?;
Ok(article_str)
}
let text_message =
|message| AnyMessageEventContent::RoomMessage(MessageEventContent::text_plain(message));
//TODO: replace occurrences of ok() by an async and logging block when async blocks are stable
let article_html = match article_html(url).await {
Ok(url) => url,
Err(Error::MalformedUrl) => {
room.send(text_message("Error: Given url is malformed"), None)
.await
.ok();
return;
}
Err(Error::UnknownNewspaper) => {
room.send(
text_message("Error: Given url is do not correspond to a known newspaper"),
None,
)
.await
.ok();
return;
}
Err(Error::Misconfiguration(key)) => {
error!(
"Error in configuration : {} key is missing or malformed",
&key
);
room.send(
text_message("Error: configuration error, please contact your admin"),
None,
)
.await
.ok();
return;
}
Err(_) => {
room.send(
text_message("Unknown error =/, can't download the file"),
None,
)
.await
.ok();
return;
}
};
room.send_attachment(
"article.html",
&mime::TEXT_HTML_UTF_8,
&mut article_html.as_bytes(),
None,
)
.await
.ok();
}
#[async_trait]
impl EventHandler for Html {
async fn on_room_message(&self, room: Room, event: &SyncMessageEvent<MessageEventContent>) {
if let Room::Joined(room) = room {
let msg_body = if let SyncMessageEvent {
content:
MessageEventContent {
msgtype: MessageType::Text(TextMessageEventContent { body: msg_body, .. }),
..
},
..
} = event
{
msg_body
} else {
return;
};
match msg_body.split(' ').collect::<Vec<_>>().as_slice() {
["!html", url, ..] => send_article(*url, room).await,
_ => return,
}
}
}
}
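As a usage sketch, the handler reacts to room messages of this shape (the bracketed value is a placeholder):
```
!html [article url]
```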

View File

@ -1,2 +0,0 @@
mod html;
pub(crate) use html::Html;

View File

@ -1,10 +0,0 @@
//! Provides a matrix chatbot to download newspaper articles
mod cli;
pub use cli::run;
mod chatbot;
use chatbot::Chatbot;
mod handlers;
use handlers::Html;

View File

@ -8,22 +8,23 @@ publish = false
[dependencies]
anyhow = "1.0.40"
async-trait = "0.1.50"
async-trait = "0.1.48"
thiserror = "1.0.24"
url = "2.2.2"
hyper = { version = "0.14.7", features = ["full"] }
url = "2.2.1"
hyper = { version = "0.14.5", features = ["full"] }
hyper-rustls = "0.22.1"
cookie = "0.15.0"
lol_html = "0.3.0"
indoc = "1.0.3"
html-minifier = "3.0.13"
html-minifier = "3.0.9"
bytes = "1.0.1"
base64 = "0.13.0"
futures = "0.3.15"
derive_builder = "0.10.2"
futures = "0.3.14"
derive_builder = "0.10.0"
nipper = "0.1.9"
log = "0.4.14"
env_logger = "0.8.3"
itertools = "0.10.0"
[dev-dependencies]
tokio = "1.6.0"
tokio = "1.5.0"

View File

@ -1,87 +1,21 @@
use std::boxed::Box;
use std::convert::TryInto;
use std::env;
use anyhow::{anyhow, Result};
use log::info;
use url::{Host, Url};
use crate::newspaper::Newspaper;
use crate::newspapers::courrier_international::{self, CourrierInternational};
use crate::newspapers::mediapart::{self, Mediapart};
use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
/// Enumerate all errors that can be encountered when using ArticleLocation
#[derive(thiserror::Error, Debug)]
pub enum Error {
/// The url was not set. Therefore, the article location can't be deduced
#[error("No url set")]
NoUrl,
/// The given URL isn't an accepted Url
#[error("Malformed URL")]
MalformedUrl,
/// The given url doesn't correspond to a newspaper.
#[error("The given url doesn't link to a known newspaper")]
UnknownNewspaper,
/// Error in configuration: used for missing or malformed configuration
#[error("Error in configuration (configuration key {0} malformed or missing)")]
Misconfiguration(String),
/// Other errors
#[error(transparent)]
Other(#[from] anyhow::Error),
}
type Newspapers = Vec<Box<dyn Newspaper>>;
pub type Result<T, E = Error> = core::result::Result<T, E>;
fn default_newspapers() -> Result<Newspapers> {
// TODO: same thing is written too many times: how to DRY?
let config_key = "MEDIAPART_COOKIE".to_string();
let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
let mediapart = Mediapart::builder()
.login(mediapart::Login::Mpruuid(mpruiid))
.build()?;
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
let spip_session =
env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
let monde_diplo = MondeDiplo::builder()
.login(monde_diplomatique::Login::Cookies {
lmd_a_m,
phpsessid,
spip_session,
})
.build()?;
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
let courrier_international = CourrierInternational::builder()
.login(courrier_international::Login::Cookies { lmd_a_m, ssess })
.build()?;
Ok(vec![
Box::new(mediapart),
Box::new(monde_diplo),
Box::new(courrier_international),
])
}
type Newspapers<'a> = Vec<Box<&'a dyn Newspaper>>;
#[derive(Default)]
pub struct Builder {
pub struct ArticleLocationBuilder<'a> {
url: Option<Url>,
newspapers: Option<Newspapers>,
newspapers: Option<Newspapers<'a>>,
}
impl Builder {
impl<'a> ArticleLocationBuilder<'a> {
pub fn new() -> Self {
Self::default()
}
@ -91,20 +25,21 @@ impl Builder {
/// # Errors
///
/// An error is returned if the given url could not be converted into an Url
// TODO: move this to a defined error, remove anyhow !
pub fn url<U, E>(mut self, url: U) -> Result<Self>
where
U: TryInto<Url, Error = E> + Send,
E: std::error::Error + Sync + Send + 'static,
{
let url = url.try_into().map_err(|_| Error::MalformedUrl)?;
let url = url.try_into()?;
self.url = Some(url);
Ok(self)
}
/// Adds a newspaper to the list
pub fn newspaper<T>(mut self, newspaper: T) -> Self
pub fn newspaper<T>(&mut self, newspaper: &'a T) -> &mut Self
where
T: 'static + Newspaper,
T: 'a + Newspaper,
{
match &mut self.newspapers {
Some(newspapers) => newspapers.push(Box::new(newspaper)),
@ -114,13 +49,18 @@ impl Builder {
}
/// Adds several newspapers to the list of accepted newspapers
pub fn newspapers(mut self, newspapers: Newspapers) -> Self {
match &mut self.newspapers {
Some(current_newspapers) => current_newspapers.extend(newspapers),
None => self.newspapers = Some(newspapers.into_iter().collect::<Vec<_>>()),
};
self
}
//fn newspapers(&mut self, newspapers: Newspapers) -> Result<&mut Self> {
// let newspapers = match &self.newspapers {
// Some(current_newspapers) => newspapers
// .iter()
// .chain(current_newspapers.iter())
// .map(|s| *(s.clone()))
// .collect::<Newspapers>(),
// None => newspapers.into_iter().collect::<Vec<_>>(),
// };
// self.newspapers = Some(newspapers);
// Ok(self)
//}
/// Builds the ArticleLocation by looking which newspaper
///
@ -131,32 +71,41 @@ impl Builder {
/// - no newspaper is given
/// - the url is not set
/// - the given url has no host
pub fn build(self) -> Result<ArticleLocation> {
let url = Clone::clone(self.url.as_ref().ok_or(Error::NoUrl)?);
let host = url.host_str().ok_or(Error::MalformedUrl)?;
let host = Host::parse(host).map_err(|_| Error::MalformedUrl)?;
// TODO: move this to a defined error, remove anyhow !
pub fn build(&self) -> Result<ArticleLocation<'a>> {
let url = Clone::clone(self.url.as_ref().ok_or(anyhow!(
"No url set. You can set it with the url() function"
))?);
let host = url.host_str().ok_or(anyhow!("Given url has no host"))?;
let host = Host::parse(host)?;
let newspaper = self
.newspapers
.unwrap_or(default_newspapers()?)
.into_iter()
.as_ref()
.ok_or(anyhow!(
"A list of NewsPaper must be set. It can be set with newspapers() function"
))?
.iter()
.find(|c| c.metadata().hosts.contains(&host))
.ok_or(Error::UnknownNewspaper)?;
Ok(ArticleLocation { newspaper, url })
.ok_or(anyhow!("Newspaper couldn't be found"))?;
Ok(ArticleLocation {
newspaper: newspaper.clone(),
url,
})
}
}
pub struct ArticleLocation {
newspaper: Box<dyn Newspaper>,
pub struct ArticleLocation<'a> {
newspaper: Box<&'a dyn Newspaper>,
pub url: Url,
}
impl ArticleLocation {
pub fn builder() -> Builder {
Builder::new()
impl<'a> ArticleLocation<'a> {
pub fn builder() -> ArticleLocationBuilder<'a> {
ArticleLocationBuilder::new()
}
pub async fn retrieve_html(&self) -> Result<String> {
// TODO: modify when retrieve_html returns a specific Error type
Ok(self.newspaper.retrieve_html(&self.url).await?)
info!("It will download from {}", self.url);
self.newspaper.retrieve_html(&self.url).await
}
}
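A usage sketch of this builder, mirroring examples/cli_downloader.rs shown later in this diff (the URL is illustrative, and the cookie environment variables from the configuration reference must be set for `default_newspapers()` to succeed):
```rust
// Sketch only: assumes an async context returning anyhow::Result<()>.
let article_location = ArticleLocation::builder()
    .url("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?
    .build()?;
let html = article_location.retrieve_html().await?;
```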

View File

@ -8,9 +8,10 @@ pub use tools::{Download, Downloader};
pub mod newspaper;
// TODO: move to another crate
pub mod newspapers;
mod newspapers;
pub use newspapers::Mediapart;
pub mod article_location;
mod article_location;
pub use article_location::ArticleLocation;
mod consts;

View File

@ -1,10 +1,17 @@
use anyhow::Result;
use async_trait::async_trait;
use derive_builder::Builder;
use url::Host;
pub use url::Url;
enum Login {
Username(String, String),
Cookie(String),
}
/// Contains metadata about a newspaper
#[derive(Debug, PartialEq, Default, derive_builder::Builder)]
// TODO: provide builder
#[derive(Debug, PartialEq, Default, Builder)]
#[builder(default)]
pub struct Metadata {
/// The hosts that can correspond to this newspaper
@ -21,14 +28,13 @@ pub struct Metadata {
}
impl Metadata {
/// Get metadata builder
pub fn builder() -> MetadataBuilder {
MetadataBuilder::default()
}
}
#[async_trait]
pub trait Newspaper: Send + Sync {
pub trait Newspaper {
/// Returns a list of hosts that correspond to the newspaper
fn metadata(&self) -> Metadata;
@ -43,7 +49,7 @@ pub trait Newspaper: Send + Sync {
}
/// Returns a newspaper structure
fn new() -> Self
async fn new() -> Self
where
Self: Sized;

View File

@ -1,144 +0,0 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use indoc::indoc;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
pub enum Login {
Username(String, String),
Cookies { lmd_a_m: String, ssess: String },
}
#[derive(Debug, Clone, Default)]
pub struct CourrierInternational {
login_cookies: Vec<(String, String)>,
}
fn str_to_host<S: Into<String>>(host: S) -> Host {
Host::Domain(host.into())
}
#[derive(Debug, Clone, Default)]
pub struct Builder {
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
pub fn login(&mut self, login: Login) -> &mut Self {
self.login_cookies = match login {
Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented")
}
Login::Cookies { lmd_a_m, ssess } => Some(vec![
("lmd_a_m".into(), lmd_a_m),
("SSESS862c7003d721c672d39f161b1456b890".into(), ssess),
]),
};
self
}
pub fn build(&self) -> Result<CourrierInternational> {
match &self.login_cookies {
Some(login_cookies) => Ok(CourrierInternational {
login_cookies: login_cookies.clone(),
}),
None => Err(anyhow!("You have to log in to access this newspaper")),
}
}
}
#[async_trait]
impl Newspaper for CourrierInternational {
fn metadata(&self) -> Metadata {
Metadata::builder()
.hosts(vec![
str_to_host("courrierinternational.com"),
str_to_host("www.courrierinternational.com"),
])
.lower_case_name("courrier-international")
.name("Courrier international")
.build()
.unwrap_or_default()
}
async fn retrieve_html(&self, url: &Url) -> Result<String> {
let cookies = self
.login_cookies
.iter()
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
.collect::<Vec<_>>();
// TODO: replace by builder
let downloader = Downloader { cookies };
let body = downloader.download(&url).await?;
let html = match body {
Some(body) => String::from_utf8(body.to_vec())?,
None => bail!("404 not found"),
};
let elements_to_remove = &[
// navigation elements
"header.site-header",
"footer.site-footer",
// Social buttons
"#toolbox-share",
".toolbox-share",
".toolbox-print",
".toolbox-respond",
".toolbox-zen",
".toolbox-newsletter",
".toolbox-offer",
".box-article-offer-friend-abo",
// unused services
".article-aside",
".article-secondary",
".article-subject-readmore",
// misc
".element-invisible",
".gptcontainer",
];
// FIXME: it doesn't work because the aside is in the article body
//
let toolbox_style = indoc! {"
aside.article-toolbox {
position: sticky;
top: 1em;
}
"};
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove,
styles_to_add: &[toolbox_style],
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html)
}
fn new() -> Self {
Self {
..Default::default()
}
}
async fn has_complete_access(&self) -> bool {
// TODO: check if we are logged using the cookie
true
}
}
impl CourrierInternational {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -1,4 +1,4 @@
use anyhow::{anyhow, bail, Result};
use anyhow::Result;
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
@ -8,46 +8,16 @@ use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
pub enum Login {
Username(String, String),
Mpruuid(String),
}
#[derive(Debug, Clone, Default)]
pub struct Mediapart {
login_cookie: (String, String),
// TODO: remove this pub !!
pub login_cookie: Option<(String, String)>,
}
fn str_to_host<S: Into<String>>(host: S) -> Host {
Host::Domain(host.into())
}
#[derive(Debug, Clone, Default)]
pub struct Builder {
login_cookie: Option<(String, String)>,
}
impl Builder {
pub fn login(&mut self, login: Login) -> &mut Self {
self.login_cookie = match login {
Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented")
}
Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
};
self
}
pub fn build(&self) -> Result<Mediapart> {
match &self.login_cookie {
Some(login_cookie) => Ok(Mediapart {
login_cookie: login_cookie.clone(),
}),
None => Err(anyhow!("You have to log in to access this newspaper")),
}
}
}
#[async_trait]
impl Newspaper for Mediapart {
fn metadata(&self) -> Metadata {
@ -71,22 +41,22 @@ impl Newspaper for Mediapart {
let mut url = url.clone();
url.set_query(Some(&query));
let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
.secure(true)
.finish();
let cookies = vec![cookie];
// TODO: add "?onglet=full" to the url if not
let cookies = if let Some((name, value)) = &self.login_cookie {
let cookie = Cookie::build(name, value).secure(true).finish();
vec![cookie]
} else {
vec![]
};
// TODO: replace by builder
let downloader = Downloader { cookies };
let body = downloader.download(&url).await?;
let html = match body {
Some(body) => String::from_utf8(body.to_vec())?,
None => bail!("404 not found"),
};
let html = String::from_utf8(body.to_vec())?;
// TODO: Move to const
let elements_to_remove = &[
let element_to_remove = [
// header
".fb-root",
".skipLinks",
@ -104,18 +74,13 @@ impl Newspaper for Mediapart {
"aside.cc-modal",
];
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove,
..Default::default()
}
.run(&html)
.await;
// TODO: correction of usage of relative urls, and replace "" by the url
let single_page_html =
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
Ok(single_page_html)
}
fn new() -> Self {
async fn new() -> Self {
Self {
..Default::default()
}
@ -126,9 +91,3 @@ impl Newspaper for Mediapart {
true
}
}
impl Mediapart {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -1,3 +1,3 @@
pub mod courrier_international;
pub mod mediapart;
pub mod monde_diplomatique;
mod mediapart;
pub use mediapart::Mediapart;

View File

@ -1,137 +0,0 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
pub enum Login {
Username(String, String),
Cookies {
lmd_a_m: String,
phpsessid: String,
spip_session: String,
},
}
#[derive(Debug, Clone, Default)]
pub struct MondeDiplo {
login_cookies: Vec<(String, String)>,
}
fn str_to_host<S: Into<String>>(host: S) -> Host {
Host::Domain(host.into())
}
#[derive(Debug, Clone, Default)]
pub struct Builder {
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
pub fn login(&mut self, login: Login) -> &mut Self {
self.login_cookies = match login {
Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented")
}
Login::Cookies {
lmd_a_m,
phpsessid,
spip_session,
} => Some(vec![
("lmd_a_m".into(), lmd_a_m),
("PHPSESSID".into(), phpsessid),
("spip_session".into(), spip_session),
]),
};
self
}
pub fn build(&self) -> Result<MondeDiplo> {
match &self.login_cookies {
Some(login_cookies) => Ok(MondeDiplo {
login_cookies: login_cookies.clone(),
}),
None => Err(anyhow!("You have to log in to access this newspaper")),
}
}
}
#[async_trait]
impl Newspaper for MondeDiplo {
fn metadata(&self) -> Metadata {
Metadata::builder()
.hosts(vec![
str_to_host("monde-diplomatique.fr"),
str_to_host("www.monde-diplomatique.fr"),
])
.lower_case_name("monde-diplomatique")
.name("Le Monde Diplomatique")
.build()
.unwrap_or_default()
}
async fn retrieve_html(&self, url: &Url) -> Result<String> {
let cookies = self
.login_cookies
.iter()
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
.collect::<Vec<_>>();
// TODO: replace by builder
let downloader = Downloader { cookies };
let body = downloader.download(&url).await?;
let html = match body {
Some(body) => String::from_utf8(body.to_vec())?,
None => bail!("404 not found"),
};
// TODO: Move to const
let elements_to_remove = &[
// navigation elements
"#tout-en-haut.preentete",
"#entete.connecte",
"#navigation",
"#pied",
".bloc-connexion",
// unused features
"#ecouter",
// Social buttons
".actions-article",
"#partage",
// misc
"noscript",
];
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html)
}
fn new() -> Self {
Self {
..Default::default()
}
}
async fn has_complete_access(&self) -> bool {
// TODO: check if we are logged using the cookie
true
}
}
impl MondeDiplo {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -4,7 +4,7 @@ use anyhow::Result;
use async_trait::async_trait;
use bytes::Bytes;
use cookie::Cookie;
use hyper::{header, Body, Client, Method, Request, StatusCode};
use hyper::{header, Body, Client, Method, Request};
use thiserror::Error;
use url::Url;
@ -22,9 +22,7 @@ pub trait Download {
type Error: StdError;
/// Downloads a file from an url and returns the result as bytes
///
/// If the file is not found, returns None
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>;
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>;
}
/// Store several cookies
@ -38,8 +36,7 @@ pub struct Downloader<'c> {
impl<'c> Download for Downloader<'c> {
type Error = DownloadError;
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
log::debug!("downloading url {:?}", file_link);
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> {
let https = hyper_rustls::HttpsConnector::with_native_roots();
let client: Client<_, hyper::Body> = Client::builder().build(https);
@ -47,26 +44,14 @@ impl<'c> Download for Downloader<'c> {
.method(Method::GET)
.uri(file_link.as_str());
req = req.header(
header::COOKIE,
self.cookies
.iter()
.map(Cookie::to_string)
.collect::<Vec<_>>()
.join(";"),
);
log::debug!("headers : {:?}", req.headers_ref());
for cookie in &self.cookies {
req = req.header(header::COOKIE, cookie.to_string());
}
let req = req.body(Body::empty())?;
let resp = client.request(req).await?;
log::debug!("Response status : {:?}", resp.status());
let body = match resp.status() {
StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
StatusCode::NOT_FOUND => None,
// TODO: enhance this by handling more error codes
_ => None,
};
let body = hyper::body::to_bytes(resp).await?;
Ok(body)
}
}

View File

@ -1,4 +1,5 @@
mod download;
pub mod self_contained_html;
mod self_contained_html;
pub use download::{Download, DownloadError, Downloader};
pub use self_contained_html::self_contained_html;

View File

@ -8,198 +8,142 @@ use url::Url;
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download;
/// Stores configuration for the self_contained_html function
// TODO: write a builder
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
/// Makes an html page self-contained
///
/// The `downloader` must implement `Download` and is used to download resources that are
/// needed to make this page self-contained, such as stylesheets or images.
///
/// The function also removes all scripts on the page
pub async fn self_contained_html<E, D>(
html: impl AsRef<str>,
downloader: &D,
base_url: &Url,
elements_to_remove: &[impl AsRef<str>],
) -> String
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// the downloader that will be used to retrieve resources on the page
pub downloader: Option<&'t D>,
/// Base url for downloading resources; it is probably the url of the page
pub base_url: Option<&'t Url>,
pub elements_to_remove: &'t [S1],
pub styles_to_add: &'t [S2],
}
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - `element_to_remove`, `base_url` and `downloader` should be in a configuration structure
// - ¿ should be function of a trait ? or only of the configuration struct ?
let (style_urls, html) = {
let document = Document::from(html.as_ref());
impl<'t, E, D> Default for Config<'t, E, D>
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
fn default() -> Self {
Self {
downloader: None,
base_url: None,
elements_to_remove: &[],
styles_to_add: &[],
// ---- Remove scripts ----
//
document.select("script").remove();
for event in EVENT_HANDLERS {
document
.select(format!("[{}]", event).as_str())
.remove_attr(event);
}
}
}
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// Makes an html page self-contained
///
/// The `downloader` must implement `Download` and is used to download ressources that are
/// needed to make this page self-contained such as stylesheets or images.
///
/// The function also removes all scripts on the page
pub async fn run(&self, html: impl AsRef<str>) -> String {
//TODO: don't panic
let base_url = self.base_url.expect("Base url not defined");
let downloader = self.downloader.expect("Downloader not defined");
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - put each modification (ex: style in the `foreach`) in functions, maybe using
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
// - `element_to_remove`, `base_url` and `downloader` should be in a configuration structure
let (style_urls, html) = {
let document = Document::from(html.as_ref());
for rel in LINK_REL_EXTERNAL_RESOURCES {
document
.select(format!("link[rel=\"{}\"]", rel).as_str())
.remove();
}
// ---- Remove scripts ----
//
document.select("script").remove();
for event in EVENT_HANDLERS {
document
.select(format!("[{}]", event).as_str())
.remove_attr(event);
}
for rel in LINK_REL_EXTERNAL_RESOURCES {
document
.select(format!("link[rel=\"{}\"]", rel).as_str())
.remove();
}
// ---- Replace stylesheets ----
//
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|style_link| {
if let Some(src) = style_link.attr("href") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>();
(styles_url, String::from(document.html()))
};
let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
)
});
let downloaded_styles = futures::future::join_all(style_urls).await;
let html = {
let document = Document::from(&html);
let styles = document.select("link[href][rel=\"stylesheet\"]");
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut style_link, inner_css)| {
if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let media_query = style_link.attr("media");
let css = match media_query {
Some(media_query) => {
format!("<style media=\"{}\">{}</style>", media_query, css)
}
None => format!("<style>{}</style>", css),
};
style_link.replace_with_html(css);
} else {
style_link.remove();
}
});
String::from(document.html())
};
// ---- Replace imgs ----
// ---- Replace stylesheets ----
//
let image_urls = {
let document = Document::from(&html);
let imgs = document.select("img:not([src^=\"data:\"])");
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|stylesheet| {
if let Some(src) = stylesheet.attr("href") {
//TODO: does it work with absolute urls ?
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>();
(styles_url, String::from(document.html()))
};
imgs.iter()
.map(|image| {
if let Some(src) = image.attr("src") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>()
};
let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
});
let downloaded_styles = futures::future::join_all(style_urls).await;
let downloaded_images = image_urls.into_iter().map(|image_url| {
OptionFuture::from(image_url.map(|url| async move {
let data = downloader.download(&url).await.unwrap();
(url, data)
}))
});
let downloaded_images = futures::future::join_all(downloaded_images).await;
let html = {
let document = Document::from(&html);
let styles = document.select("link[href][rel=\"stylesheet\"]");
let html = {
let document = Document::from(&html);
let imgs = document.select("img:not([src^=\"data:\"])");
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| {
if let Some(inner_css) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
} else {
stylesheet.remove();
}
});
String::from(document.html())
};
imgs.iter()
.zip(downloaded_images.iter())
.for_each(|(mut img, data)| {
if let Some((url, Some(data))) = data {
let data = base64::encode(data);
//TODO: use an extension hashmap
let extension =
Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else {
img.remove()
}
});
// ---- Remove unwanted html elements -----
//
for element in self.elements_to_remove {
document.select(element.as_ref()).remove();
}
// ---- Replace imgs ----
//
let image_urls = {
let document = Document::from(&html);
let imgs = document.select("img");
// ---- Add additional styles ----
//
for style in self.styles_to_add {
document
.select("head")
.append_html(format!("\n<style>{}</style>\n", style.as_ref()));
}
imgs.iter()
.map(|image| {
if let Some(src) = image.attr("src") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>()
};
String::from(document.html())
};
let downloaded_images = image_urls.into_iter().map(|image_url| {
OptionFuture::from(image_url.map(|url| async move {
let data = downloader.download(&url).await.unwrap();
(url, data)
}))
});
let downloaded_images = futures::future::join_all(downloaded_images).await;
// ---- output ----
let html = {
let document = Document::from(&html);
let imgs = document.select("img");
imgs.iter()
.zip(downloaded_images.iter())
.for_each(|(mut img, data)| {
if let Some((url, data)) = data {
let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
}
});
// ---- Remove unwanted html elements -----
//
let mut minifier = HTMLMinifier::new();
minifier.digest(html.as_str()).unwrap();
for element in elements_to_remove {
document.select(element.as_ref()).remove();
}
String::from(document.html())
};
String::from_utf8(minifier.get_html().into()).unwrap()
}
// ---- output ----
//
let mut minifier = HTMLMinifier::new();
minifier.digest(html.as_str()).unwrap();
String::from_utf8(minifier.get_html().into()).unwrap()
}
#[cfg(test)]
mod tests {
// TODO: reduce boilerplate, DRY
use super::*;
@ -224,8 +168,8 @@ mod tests {
#[async_trait]
impl Download for DummyDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
Ok(Some(Bytes::from("")))
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
Ok(Bytes::from(""))
}
}
@ -234,14 +178,9 @@ mod tests {
let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {};
let to_remove: &[&str] = &[];
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
self_contained_html(html, &downloader, &base_url, to_remove).await,
"<html><head></head><body></body></html>"
);
Ok(())
@ -265,13 +204,10 @@ mod tests {
};
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in EVENT_HANDLERS {
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}.run(html(s)).await,
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
);
}
@ -296,15 +232,10 @@ mod tests {
};
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in LINK_REL_EXTERNAL_RESOURCES {
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html(s))
.await,
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
"<html><head>\n</head>\n<body>\n</body></html>"
);
}
@ -315,14 +246,12 @@ mod tests {
#[async_trait]
impl Download for CssDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
Ok(Some(
indoc! {"
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
Ok(indoc! {"
section#warning {
color: red;
}"}
.into(),
))
.into())
}
}
@ -357,57 +286,9 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
}
#[tokio::test]
async fn download_css_with_media_query() -> Result<()> {
let downloader = CssDownloader {};
let html = indoc! {"
<html>
<head>
<link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
</head>
<body>
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head>
<style media=\"print\">
section#warning {
color: red;
}
</style>
</head>
<body>
</body></html>
"};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
self_contained_html(html, &downloader, &base_url, to_remove).await,
minified
);
Ok(())
@ -417,12 +298,12 @@ mod tests {
#[async_trait]
impl Download for PngDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
let image_path = Path::new("test_data/home.png");
let mut image_file = File::open(&image_path).unwrap();
let mut image_buf: Vec<u8> = vec![];
image_file.read_to_end(&mut image_buf).unwrap();
Ok(Some(image_buf.into()))
Ok(image_buf.into())
}
}
@ -452,14 +333,9 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
self_contained_html(html, &downloader, &base_url, to_remove).await,
minified
);
Ok(())
@ -492,67 +368,12 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
elements_to_remove: &["header", ".placeholder", "article > span.huge"],
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
}
#[tokio::test]
async fn add_style() -> Result<()> {
let html = indoc! {"
<html>
<head>
<meta charset=\"UTF-8\">
</head>
<body>
The body
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head>
<meta charset=\"UTF-8\">
<style>
body {
margin: 3em;
}
</style>
</head>
<body>
The body
</body></html>
"};
let style_to_add = indoc! {"
body {
margin: 3em;
}
"};
let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
styles_to_add: &[style_to_add],
..Default::default()
}
.run(html)
self_contained_html(
html,
&downloader,
&base_url,
&["header", ".placeholder", "article > span.huge"]
)
.await,
minified
);

View File

@ -1,98 +1,48 @@
---
title: Scope of the project
---
This project mainly aims at providing a unified interface for several newspapers. Side
objectives are to provide a web API and different clients like a webUI or chatbots.
This project mainly aims at providing a unified interface for several
newspapers. Side objectives are to provide a web API and different clients like a
webUI or chatbots.
Several big components are planned for this project
Several big components are planned for this project (it is an initial draft and
may change later):
```dot
digraph G {
rankdir=TB
node [shape=rectangle, style=filled, color="#779988"]
```plantuml
@startuml
frame "backend" {
[Retrieval tools] as retrieval_tools
[Article representation] as article_repr
[Automatic retrieval] as auto_retrieve
[Atom/RSS adapters] as rss
[Cache DB] as cache
[Newspaper\n(Mediapart, …)] as newspaper
() "Newspaper" as np_i
newspaper -up- np_i
subgraph cluster_frontend {
color = transparent
webui
chatbot
}
[Article location] as article_location
webui -> api [color = red]
chatbot -> api [color = red]
[API] as api
() "API" as api_i
api -up- api_i
subgraph cluster_backend {
label = "Backend\ncrieur binary"
labelloc = b
style=filled
article_location ..> np_i
retrieve_tools [label="retrieve-tools"]
retrieve_adapters [label="retrieve-adapters"]
retrieve [label="retrieve-interface"]
auto_retrieve [label="automatic-retrieve"]
article_repr [label="article-representation\nRepresentation for articles"]
api
cache [label="Cache database"]
rss [label="Atom/RSS adapters"]
api -> article_location
api -> rss
retrieve_tools -> retrieve_adapters
retrieve_adapters -> retrieve
retrieve_tools -> retrieve
rss -> auto_retrieve
article_repr -> retrieve_adapters
newspaper -> retrieval_tools: uses to implement
retrieve -> api
auto_retrieve -> api
cache -> api
article_location --> article_repr: uses
retrieval_tools -up-> article_repr: uses
auto_retrieve --> rss: watches
auto_retrieve --> article_location
auto_retrieve --> cache: stores in
}
}
frame "Web ui" {
[Web UI] as webui
[HTML renderer] as html_rend
[Pdf exporter] as pdf_rend
[Articles] as articles
webui --> html_rend
webui --> pdf_rend
webui -> articles
articles ..> api_i
}
[Chatbot] as chatbot
chatbot ..> api_i
actor User
User ..> webui
User ..> chatbot
actor "Newspaper programmer" as newspaper_programmer
newspaper_programmer ..> newspaper: implements
@enduml
```
A task queue could be added later to space out requests.
# Implementation plan
## Phase I
- [x] `Newspaper` interface: used to retrieve from newspaper websites
- [ ] minimal chatbot (uses libraries directly)
- [x] `ArticleLocation`: library for using several `Newspaper` and retrieving from
a given url.
## Phase II
- [ ] Article Representation: having a (beta) unified representation for downloaded
articles
- [ ] adding this representation to Newspaper
## Phase III
- [ ] Cache
- [ ] Atom/RSS adapters
- [ ] automatic retrieve
## Phase IV
- [ ] API
- [ ] chatbot (uses api)
## Phase V
- [ ] web ui

View File

@ -1,36 +0,0 @@
---
title: Project tooling
---
# Container image
## Chatbot release
The [chatbot containerfile](../../containers/chatbot.containerfile) intends to
be as small as possible, in order to ease distribution and reduce the storage
needed in registries.
In order to provide a minimal image, the rust-alpine container image is used.
This image uses the `x86_64-unknown-linux-musl` target that provides static
linking with `musl`.
However, the `olm-sys` crate couldn't be linked statically[^oml-sys-static-error].
The workaround has been to introduce the
`RUSTFLAGS=-Ctarget-feature=-crt-static` environment variable, which disables
static linking.
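Concretely, the flag is set inline on the build command; the same line appears in the chatbot containerfile earlier in this diff:
```containerfile
RUN RUSTFLAGS=-Ctarget-feature=-crt-static cargo build --target x86_64-unknown-linux-musl --release --bin=crieur-chatbot
```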
The following lines have been added to copy the needed libraries.
```containerfile
COPY --from=build /usr/lib/libstdc++.so.6 /usr/lib/libstdc++.so.6
COPY --from=build /usr/lib/libgcc_s.so.1 /usr/lib/libgcc_s.so.1
COPY --from=build /lib/ld-musl-x86_64.so.1 /lib/ld-musl-x86_64.so.1
```
## Development
An image aimed at providing a development environment for developers may
be added later.
[^oml-sys-static-error]: with `olm-sys` v1.1.1, in March 2021

View File

@ -1,27 +0,0 @@
---
title: Build and run the chatbot
---
1. You must first configure the Matrix login; every variable in [the reference](../reference/chatbot_configuration.md) is mandatory.
```env
CRIEUR_MATRIX_USER=user
CRIEUR_MATRIX_PASSWORD=password
CRIEUR_MATRIX_HOMESERVER=https://homeserv.er
CRIEUR_MATRIX_ROOMS=roomid1,roomid2,
```
You can put it in a `.env` file.
2. Run the chatbot
**Using `podman` (or another container tool, like `docker`)**
```
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
podman run --env-file .env --rm -i -t crieur-chatbot
```
**Using `cargo` (for development)**
```
cargo run --release --bin crieur-chatbot
```

View File

@ -1,17 +0,0 @@
---
Title: Chatbot configuration reference
---
The chatbot is configured using environment variables
CRIEUR_MATRIX_USER
: username of the matrix bot account
CRIEUR_MATRIX_PASSWORD
: password of the matrix bot account
CRIEUR_MATRIX_HOMESERVER
: homeserver of the matrix bot account
CRIEUR_MATRIX_ROOMS
: rooms in which to listen to events
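Putting these together, a configuration sketch with placeholder values (matching the example in the chatbot guide above):
```env
CRIEUR_MATRIX_USER=user
CRIEUR_MATRIX_PASSWORD=password
CRIEUR_MATRIX_HOMESERVER=https://homeserv.er
CRIEUR_MATRIX_ROOMS=roomid1,roomid2
```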

View File

@ -1,31 +0,0 @@
---
title: Newspapers configuration
---
The newspapers are configured using environment variables
# Mediapart
MEDIAPART_COOKIE
: sets the `MPRUUID` cookie, used to log in
# Le Monde Diplomatique
All cookies are mandatory to log in
MONDE_DIPLO_LMD_A_M
: sets the `lmd_a_m` cookie
MONDE_DIPLO_PHPSESSID
: sets the `PHPSESSID` cookie
MONDE_DIPLO_SPIP_SESSION
: sets the `spip_session` cookie
# Courrier international
COURRIER_INTERNATIONAL_LMD_A_M
: sets the `lmd_a_m` cookie
COURRIER_INTERNATIONAL_SSESS
: sets the `ssess` cookie
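A `.env` sketch combining all of the above (all values are placeholders):
```env
MEDIAPART_COOKIE=placeholder
MONDE_DIPLO_LMD_A_M=placeholder
MONDE_DIPLO_PHPSESSID=placeholder
MONDE_DIPLO_SPIP_SESSION=placeholder
COURRIER_INTERNATIONAL_LMD_A_M=placeholder
COURRIER_INTERNATIONAL_SSESS=placeholder
```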

View File

@ -2,27 +2,34 @@ use std::convert::TryInto;
use std::env;
use anyhow::Result;
use crieur_retrieve::{ArticleLocation, Url};
use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
use dotenv::dotenv;
use log::info;
#[tokio::main]
async fn main() -> Result<()> {
dotenv().ok();
tracing_subscriber::fmt()
.with_writer(std::io::stderr)
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
env_logger::init();
let url = match env::args().nth(1) {
Some(url) => Url::parse(&url)?,
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
};
// TODO: remove this in favor of default newspapers
let mut mediapart = Mediapart::new().await
//.login(USERNAME, PASSWORD)
//
;
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
info!("Trying to download article from {}", url);
// TODO: shorten this, maybe a helper function?
let article_location = ArticleLocation::builder().url(url)?.build()?;
let article_location = ArticleLocation::builder()
.url(url)?
.newspaper(&mediapart)
.build()?;
let article_str = article_location.retrieve_html().await?;

View File

@ -1,29 +1,16 @@
@build:
cargo build
@build-container:
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
cargo build
@clean:
cargo clean
cargo clean
@run:
cargo run
@test:
cargo test --all
@clippy:
cargo clippy
@fmt:
cargo fmt
@simulate-ci: fmt clippy test
cargo run
@audit:
cargo audit
cargo audit
@crev:
cargo crev verify
cargo crev verify
@verify: audit crev

View File

@ -1,11 +0,0 @@
use anyhow::Result;
use crieur_chatbot::run;
use dotenv::dotenv;
#[tokio::main]
async fn main() -> Result<()> {
env_logger::init();
dotenv().ok();
run().await?;
Ok(())
}

src/main.rs (new file, 19 lines)
View File

@ -0,0 +1,19 @@
use anyhow::Result;
use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
use dotenv::dotenv;
use std::env;
#[tokio::main]
async fn main() -> Result<()> {
dotenv().ok();
let mut mediapart = Mediapart::new().await
//.login(USERNAME, PASSWORD)
//
;
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
println!("{}", mediapart.retrieve_html(&url).await?);
Ok(())
}