diff --git a/.drone.yml b/.drone.yml
index 0a95e84..e304ae4 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -8,6 +8,7 @@ steps:
   pull: true
   errignore: true
   commands:
+  - apt-get update && apt-get install -y cmake
  - rustup component add rustfmt
  - rustup component add clippy
  - cargo clippy
diff --git a/README.md b/README.md
index fc0677f..f5970e5 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,33 @@
 Tools to retrieve articles from multiple newspaper you subscribed to.
 
-**This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work !**
+**This is a prototype, it isn't stable at all and you may not want to use it if
+you expect it to just work!**
 
 # How to use it
 
-First retrieve login cookies for websites and put it in a `.env`
+First retrieve login cookies for the websites and put them in a `.env`, as
+explained in the [newspaper source configuration
+documentation](./documentation/reference/newspaper_configuration.md).
+
+Then you can run:
 
 ```
 cargo run --example=cli_downloader
 ```
 
+To know how to run the chatbot, please read the [chatbot
+guide](./documentation/guides/run_chatbot.md).
+
 # Documentation
 
-- 1. [Design](documentation/design/index.md)
-    - a. [Scope of the project](documentation/design/scope.md)
-    - b. [Retrieve](documentation/design/retrieve.md)
+- 1. Design
+    - a. [Scope of the project and roadmap](./documentation/design/scope.md)
+    - b. [Retrieve](./documentation/design/retrieve.md)
+- 2. Guides
+    - a. [Add a newspaper
+      source](./documentation/guides/add_a_newspaper_source.md)
+- 3. Reference
+    - a. [Newspaper source
+      configuration](./documentation/reference/newspaper_configuration.md)
+    - b. [Chatbot
+      configuration](./documentation/reference/chatbot_configuration.md)
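Note on the `.env` workflow the README describes: both the CLI example and the new chatbot binary load it through the dotenv crate before reading variables, as the examples later in this diff show. A minimal standalone sketch of that loading step (assumes an anyhow dependency; `MEDIAPART_COOKIE` is the only newspaper variable this changeset defines):

```rust
use dotenv::dotenv;
use std::env;

fn main() -> anyhow::Result<()> {
    // Load ./.env into the process environment, if the file exists.
    dotenv().ok();
    // Fails with a clear error when the variable is missing.
    let cookie = env::var("MEDIAPART_COOKIE")?;
    println!("MEDIAPART_COOKIE loaded ({} bytes)", cookie.len());
    Ok(())
}
```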
diff --git a/crieur-chatbot/src/handlers/html.rs b/crieur-chatbot/src/handlers/html.rs
index 4c7ef7f..5cad197 100644
--- a/crieur-chatbot/src/handlers/html.rs
+++ b/crieur-chatbot/src/handlers/html.rs
@@ -13,7 +13,7 @@ use matrix_sdk::{
     Client, ClientConfig, EventHandler, SyncSettings,
 };
 
-use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
+use crieur_retrieve::{ArticleLocation, Url};
 
 pub(crate) struct Html {}
diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs
index 734c947..1cd6178 100644
--- a/crieur-retrieve/src/article_location.rs
+++ b/crieur-retrieve/src/article_location.rs
@@ -7,19 +7,17 @@ use log::info;
 use url::{Host, Url};
 
 use crate::newspaper::Newspaper;
-use crate::newspapers::Mediapart;
+use crate::newspapers::mediapart::{self, Mediapart};
 
 type Newspapers = Vec<Box<dyn Newspaper>>;
 
-fn default_newpapers() -> Newspapers {
-    let mut mediapart = Mediapart::new();
+fn default_newpapers() -> Result<Newspapers> {
+    let mpruuid = env::var("MEDIAPART_COOKIE")?.into();
+    let mediapart = Mediapart::builder()
+        .login(mediapart::Login::MPRUUID(mpruuid))
+        .build()?;
 
-    mediapart.login_cookie = Some((
-        "MPRUUID".into(),
-        env::var("MEDIAPART_COOKIE").unwrap().into(),
-    ));
-
-    vec![Box::new(mediapart)]
+    Ok(vec![Box::new(mediapart)])
 }
 
 #[derive(Default)]
@@ -93,7 +91,7 @@ impl Builder {
         let host = Host::parse(host)?;
         let newspaper = self
             .newspapers
-            .unwrap_or(default_newpapers())
+            .unwrap_or(default_newpapers()?)
             .into_iter()
             .find(|c| c.metadata().hosts.contains(&host))
             .ok_or(anyhow!("Newspaper couldn't be found"))?;
diff --git a/crieur-retrieve/src/lib.rs b/crieur-retrieve/src/lib.rs
index 7b9d058..31dc3a8 100644
--- a/crieur-retrieve/src/lib.rs
+++ b/crieur-retrieve/src/lib.rs
@@ -8,8 +8,7 @@ pub use tools::{Download, Downloader};
 pub mod newspaper;
 
 // TODO: move to another crate
-mod newspapers;
-pub use newspapers::Mediapart;
+pub mod newspapers;
 
 mod article_location;
 pub use article_location::ArticleLocation;
diff --git a/crieur-retrieve/src/newspaper.rs b/crieur-retrieve/src/newspaper.rs
index 532cc62..d78c81c 100644
--- a/crieur-retrieve/src/newspaper.rs
+++ b/crieur-retrieve/src/newspaper.rs
@@ -1,17 +1,10 @@
 use anyhow::Result;
 use async_trait::async_trait;
-use derive_builder::Builder;
 use url::Host;
 
 pub use url::Url;
 
-enum Login {
-    Username(String, String),
-    Cookie(String),
-}
-
 /// Contains metadata about a newspaper
-// TODO: provide builder
-#[derive(Debug, PartialEq, Default, Builder)]
+#[derive(Debug, PartialEq, Default, derive_builder::Builder)]
 #[builder(default)]
 pub struct Metadata {
     /// The hosts that can be corresponds to this newspaper
@@ -28,6 +21,7 @@ pub struct Metadata {
 }
 
 impl Metadata {
+    /// Get a metadata builder
     pub fn builder() -> MetadataBuilder {
         MetadataBuilder::default()
     }
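With the `TODO: provide builder` above resolved by `derive_builder`, a new newspaper source can assemble its `Metadata` fluently. A sketch under assumptions: only the `hosts` field is visible in this diff, its type is inferred as `Vec<Host>` from its use in `article_location.rs`, and `derive_builder` generates one by-value setter per field.

```rust
use crieur_retrieve::newspaper::Metadata;
use url::Host;

fn example_metadata() -> Metadata {
    // #[builder(default)] means every field has a default,
    // so only the fields we care about need to be set.
    Metadata::builder()
        .hosts(vec![
            Host::Domain("www.example.org".to_string()),
            Host::Domain("example.org".to_string()),
        ])
        .build()
        .expect("infallible here: all fields are defaulted")
}
```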
diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs
index 678a8ac..0933b6e 100644
--- a/crieur-retrieve/src/newspapers/mediapart.rs
+++ b/crieur-retrieve/src/newspapers/mediapart.rs
@@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{anyhow, Result};
 use async_trait::async_trait;
 use cookie::Cookie;
 use url::Host;
@@ -8,16 +8,46 @@ use crate::tools;
 use crate::Url;
 use crate::{Download, Downloader};
 
+pub enum Login {
+    Username(String, String),
+    MPRUUID(String),
+}
+
 #[derive(Debug, Clone, Default)]
 pub struct Mediapart {
-    // TODO: remove this pub !!
-    pub login_cookie: Option<(String, String)>,
+    login_cookie: (String, String),
 }
 
 fn str_to_host<S: Into<String>>(host: S) -> Host {
     Host::Domain(host.into())
 }
 
+#[derive(Debug, Clone, Default)]
+pub struct Builder {
+    login_cookie: Option<(String, String)>,
+}
+
+impl Builder {
+    pub fn login(&mut self, login: Login) -> &mut Self {
+        self.login_cookie = match login {
+            Login::Username(_username, _password) => {
+                unimplemented!("login using username and password not implemented")
+            }
+            Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
+        };
+        self
+    }
+
+    pub fn build(&self) -> Result<Mediapart> {
+        match &self.login_cookie {
+            Some(login_cookie) => Ok(Mediapart {
+                login_cookie: login_cookie.clone(),
+            }),
+            None => Err(anyhow!("You have to log in to access this newspaper")),
+        }
+    }
+}
+
 #[async_trait]
 impl Newspaper for Mediapart {
     fn metadata(&self) -> Metadata {
@@ -41,13 +71,10 @@ impl Newspaper for Mediapart {
         let mut url = url.clone();
         url.set_query(Some(&query));
 
-        // TODO: add "?onglet=full" to the url if not
-        let cookies = if let Some((name, value)) = &self.login_cookie {
-            let cookie = Cookie::build(name, value).secure(true).finish();
-            vec![cookie]
-        } else {
-            vec![]
-        };
+        let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
+            .secure(true)
+            .finish();
+        let cookies = vec![cookie];
 
         // TODO: replace by builder
         let downloader = Downloader { cookies };
@@ -74,7 +101,6 @@ impl Newspaper for Mediapart {
             "aside.cc-modal",
         ];
 
-        // TODO: correction of usage of relative urls, and replace "" by the url
         let single_page_html =
             tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
         Ok(single_page_html)
@@ -91,3 +117,9 @@ impl Newspaper for Mediapart {
         true
     }
 }
+
+impl Mediapart {
+    pub fn builder() -> Builder {
+        Builder::default()
+    }
+}
diff --git a/crieur-retrieve/src/newspapers/mod.rs b/crieur-retrieve/src/newspapers/mod.rs
index b7c9e05..7f44529 100644
--- a/crieur-retrieve/src/newspapers/mod.rs
+++ b/crieur-retrieve/src/newspapers/mod.rs
@@ -1,3 +1 @@
-mod mediapart;
-
-pub use mediapart::Mediapart;
+pub mod mediapart;
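The new `Builder` makes the login mandatory at construction time instead of a `pub` field that could be forgotten. A minimal sketch of both paths, based only on the API added above (the cookie value is a placeholder; `Login::Username` still panics via `unimplemented!`):

```rust
use crieur_retrieve::newspapers::mediapart::{self, Mediapart};

fn demo() -> anyhow::Result<()> {
    // Without a login, build() now returns an error
    // instead of yielding a half-configured newspaper.
    assert!(Mediapart::builder().build().is_err());

    // The MPRUUID cookie variant is the only implemented login for now.
    let _mediapart = Mediapart::builder()
        .login(mediapart::Login::MPRUUID("placeholder-cookie-value".into()))
        .build()?;
    Ok(())
}
```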
diff --git a/documentation/design/scope.md b/documentation/design/scope.md
index bc8ca23..6417bf7 100644
--- a/documentation/design/scope.md
+++ b/documentation/design/scope.md
@@ -1,48 +1,97 @@
-This project mainly aims at providing an unified interface for several newspapers. Side
-objectives are to provide web API and different clients like a webUI or chatbots.
+---
+title: Scope of the project
+---
 
-Several big components are planned for this project
+This project mainly aims at providing a unified interface for several
+newspapers. Side objectives are to provide a web API and different clients such
+as a web UI or chatbots.
+
+Several big components are planned for this project (this is an initial draft
+and may change later):
 
-```dot
-digraph G {
-    rankdir=TB
-    node [shape=rectangle, style=filled, color="#779988"]
-
-    subgraph cluster_frontend {
-        color = transparent
-        webui
-        chatbot
-    }
-
-    webui -> api [color = red]
-    chatbot -> api [color = red]
-
-    subgraph cluster_backend {
-        label = "Backend\ncrieur binary"
-        labelloc = b
-        style=filled
-
-        retrieve_tools [label="retrieve-tools"]
-        retrieve_adapters [label="retrieve-adapters"]
-        retrieve [label="retrieve-interface"]
-        auto_retrieve [label="automatic-retrieve"]
-        article_repr [label="article-representation\nRepresentation for articles"]
-        api
-        cache [label="Cache database"]
-        rss [label="Atom/RSS adapters"]
-
-        retrieve_tools -> retrieve_adapters
-        retrieve_adapters -> retrieve
-        retrieve_tools -> retrieve
-        rss -> auto_retrieve
-        article_repr -> retrieve_adapters
-
-        retrieve -> api
-        auto_retrieve -> api
-        cache -> api
-    }
-}
+```plantuml
+@startuml
+
+frame "backend" {
+    [Retrieval tools] as retrieval_tools
+    [Article representation] as article_repr
+    [Automatic retrieval] as auto_retrieve
+    [Atom/RSS adapters] as rss
+    [Cache DB] as cache
+
+    [Newspaper\n(Mediapart, …)] as newspaper
+    () "Newspaper" as np_i
+    newspaper -up- np_i
+
+    [Article location] as article_location
+
+    [API] as api
+    () "API" as api_i
+    api -up- api_i
+
+    article_location ..> np_i
+
+    api -> article_location
+    api -> rss
+
+    newspaper -> retrieval_tools: uses to implement
+
+    article_location --> article_repr: uses
+
+    auto_retrieve --> rss: watches
+    auto_retrieve --> article_location
+    auto_retrieve --> cache: stores in
+}
+
+frame "Web ui" {
+    [Web UI] as webui
+    [HTML renderer] as html_rend
+    [Pdf exporter] as pdf_rend
+    [Articles] as articles
+    webui --> html_rend
+    webui --> pdf_rend
+    webui -> articles
+    articles ..> api_i
+}
+
+[Chatbot] as chatbot
+
+chatbot ..> api_i
+
+actor User
+User ..> webui
+User ..> chatbot
+
+actor "Newspaper programmer" as newspaper_programmer
+newspaper_programmer ..> newspaper: implements
+@enduml
 ```
+
+A task queue could be added later to space out requests.
+
+# Implementation plan
+
+## Phase I
+
+- [x] `Newspaper` interface: used to retrieve articles from newspaper websites
+- [ ] minimal chatbot (uses the libraries directly)
+- [x] `ArticleLocation`: library for using several `Newspaper`s and retrieving
+  an article from a given URL
+
+## Phase II
+
+- [ ] article representation: a (beta) unified representation for downloaded
+  articles
+    - [ ] adding this representation to `Newspaper`
+
+## Phase III
+
+- [ ] cache
+- [ ] Atom/RSS adapters
+- [ ] automatic retrieval
+
+## Phase IV
+
+- [ ] API
+- [ ] chatbot (uses the API)
+
+## Phase V
+
+- [ ] web UI
diff --git a/documentation/guides/run_chatbot.md b/documentation/guides/run_chatbot.md
new file mode 100644
index 0000000..b6f9322
--- /dev/null
+++ b/documentation/guides/run_chatbot.md
@@ -0,0 +1,19 @@
+---
+title: Run the chatbot
+---
+
+1. First configure the matrix login; every variable in [the
+   reference](../reference/chatbot_configuration.md) is mandatory.
+
+```env
+CRIEUR_MATRIX_USER=user
+CRIEUR_MATRIX_PASSWORD=password
+CRIEUR_MATRIX_HOMESERVER=https://homeserv.er
+CRIEUR_MATRIX_ROOM=roomid
+```
+
+You can put these in a `.env` file.
+
+2. Run the chatbot:
+
+```
+cargo run --release --bin crieur-chatbot
+```
diff --git a/documentation/reference/chatbot_configuration.md b/documentation/reference/chatbot_configuration.md
new file mode 100644
index 0000000..3467dd9
--- /dev/null
+++ b/documentation/reference/chatbot_configuration.md
@@ -0,0 +1,17 @@
+---
+title: Chatbot configuration reference
+---
+
+The chatbot is configured using environment variables.
+
+CRIEUR_MATRIX_USER
+: username of the matrix bot account
+
+CRIEUR_MATRIX_PASSWORD
+: password of the matrix bot account
+
+CRIEUR_MATRIX_HOMESERVER
+: homeserver of the matrix bot account
+
+CRIEUR_MATRIX_ROOM
+: the room in which to listen to events
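These four variables could be consumed along the following lines. This is a hypothetical sketch: the actual crieur-chatbot internals are not part of this diff, and the `MatrixConfig` struct below is invented for illustration; only the variable names come from the reference above.

```rust
use std::env;

// Hypothetical settings container; the real crieur-chatbot types are not shown here.
struct MatrixConfig {
    user: String,
    password: String,
    homeserver: String,
    room: String,
}

fn matrix_config_from_env() -> anyhow::Result<MatrixConfig> {
    Ok(MatrixConfig {
        user: env::var("CRIEUR_MATRIX_USER")?,
        password: env::var("CRIEUR_MATRIX_PASSWORD")?,
        homeserver: env::var("CRIEUR_MATRIX_HOMESERVER")?,
        room: env::var("CRIEUR_MATRIX_ROOM")?,
    })
}
```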
diff --git a/documentation/reference/newspaper_configuration.md b/documentation/reference/newspaper_configuration.md
new file mode 100644
index 0000000..8658087
--- /dev/null
+++ b/documentation/reference/newspaper_configuration.md
@@ -0,0 +1,10 @@
+---
+title: Newspapers configuration
+---
+
+The newspapers are configured using environment variables.
+
+# Mediapart
+
+MEDIAPART_COOKIE
+: sets the `MPRUUID` cookie, used to log in
diff --git a/examples/cli_downloader.rs b/examples/cli_downloader.rs
index 24341ba..4a4eefe 100644
--- a/examples/cli_downloader.rs
+++ b/examples/cli_downloader.rs
@@ -2,7 +2,11 @@ use std::convert::TryInto;
 use std::env;
 
 use anyhow::Result;
-use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
+use crieur_retrieve::{
+    newspaper::Newspaper,
+    newspapers::mediapart::{self, Mediapart},
+    ArticleLocation, Url,
+};
 use dotenv::dotenv;
 use log::info;
 
@@ -17,12 +21,12 @@ async fn main() -> Result<()> {
     };
 
     // TODO: remove this in favor of default newspapers
-    let mut mediapart = Mediapart::new()
-        //.login(USERNAME, PASSWORD)
-        //
-        ;
-    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
+    let mpruuid = env::var("MEDIAPART_COOKIE")?.into();
+    let mediapart = Mediapart::builder()
+        .login(mediapart::Login::MPRUUID(mpruuid))
+        .build()?;
 
+    info!("Trying to download article from {}", url);
 
     // TODO: shorten this, maybe an helper function ?
diff --git a/src/bin/crieur-chatbot.rs b/src/bin/crieur-chatbot.rs
new file mode 100644
index 0000000..d0179be
--- /dev/null
+++ b/src/bin/crieur-chatbot.rs
@@ -0,0 +1,10 @@
+use anyhow::Result;
+use crieur_chatbot::run;
+use dotenv::dotenv;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    dotenv().ok();
+    run().await?;
+    Ok(())
+}
diff --git a/src/main.rs b/src/main.rs
deleted file mode 100644
index 3b7e283..0000000
--- a/src/main.rs
+++ /dev/null
@@ -1,19 +0,0 @@
-use anyhow::Result;
-use crieur_chatbot::run;
-use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
-use dotenv::dotenv;
-use std::env;
-
-#[tokio::main]
-async fn main() -> Result<()> {
-    dotenv().ok();
-
-    let mut mediapart = Mediapart::new()
-        //.login(USERNAME, PASSWORD)
-        //
-        ;
-
-    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
-    run().await?;
-    Ok(())
-}
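Taken together, a `.env` covering both the CLI example and the chatbot holds the five variables this changeset documents. All values below are placeholders; the matrix ones mirror the run-the-chatbot guide:

```env
MEDIAPART_COOKIE=your-mpruuid-cookie-value
CRIEUR_MATRIX_USER=user
CRIEUR_MATRIX_PASSWORD=password
CRIEUR_MATRIX_HOMESERVER=https://homeserv.er
CRIEUR_MATRIX_ROOM=roomid
```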