feat: add builder for mediapart, document chatbot
All checks were successful
continuous-integration/drone/pr Build is passing

A builder for mediapart has been added. No generic builder has been
created as there is no use case yet.

Some documentation has been added, the roadmap and scope have been
clarified and the chatbot has been lightly documented.
This commit is contained in:
koalp 2021-04-29 02:06:14 +02:00
parent a16dbbc790
commit 865b949b5f
Signed by: koalp
GPG Key ID: 35B21047DEB09A81
15 changed files with 228 additions and 100 deletions

View File

@ -8,6 +8,7 @@ steps:
pull: true
errignore: true
commands:
- apt-get update && apt-get install -y cmake
- rustup component add rustfmt
- rustup component add clippy
- cargo clippy

View File

@ -1,17 +1,33 @@
Tools to retrieve articles from multiple newspapers you subscribed to.
**This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work !**
**This is a prototype, it isn't stable at all and you may not want to use it if
you expect it to just work !**
# How to use it
First retrieve login cookies for websites and put it in a `.env`
First retrieve login cookies for websites and put it in a `.env` such as
explained in the [newspaper source configuration
documentation](./documentation/reference/newspaper_configuration.md)
Then you can run
```
cargo run --example=cli_downloader
```
To know how to run the chatbot, please read the [chatbot
guide](./documentation/guides/run_chatbot.md)
# Documentation
- 1. [Design](documentation/design/index.md)
- a. [Scope of the project](documentation/design/scope.md)
- b. [Retrieve](documentation/design/retrieve.md)
- 1. Design
- a. [Scope of the project and roadmap](./documentation/design/scope.md)
- b. [Retrieve](./documentation/design/retrieve.md)
- 2. Guides
- a. [Add a newspaper a
source](./documentation/guides/add_a_newspaper_source.md)
- 3. Reference
- a. [Newspaper source
configuration](./documentation/reference/newspaper_configuration.md)
- b. [Chatbot
configuration](./documentation/reference/chatbot_configuration.md)

View File

@ -13,7 +13,7 @@ use matrix_sdk::{
Client, ClientConfig, EventHandler, SyncSettings,
};
use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
use crieur_retrieve::{ArticleLocation, Url};
// NOTE(review): empty marker type used by the chatbot's HTML handling;
// it currently carries no state — confirm intended role against its impls.
pub(crate) struct Html {}

View File

@ -7,19 +7,17 @@ use log::info;
use url::{Host, Url};
use crate::newspaper::Newspaper;
use crate::newspapers::Mediapart;
use crate::newspapers::mediapart::{self, Mediapart};
/// Collection of trait objects, one per supported newspaper source.
type Newspapers = Vec<Box<dyn Newspaper>>;
fn default_newpapers() -> Newspapers {
let mut mediapart = Mediapart::new();
fn default_newpapers() -> Result<Newspapers> {
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid))
.build()?;
mediapart.login_cookie = Some((
"MPRUUID".into(),
env::var("MEDIAPART_COOKIE").unwrap().into(),
));
vec![Box::new(mediapart)]
Ok(vec![Box::new(mediapart)])
}
#[derive(Default)]
@ -93,7 +91,7 @@ impl Builder {
let host = Host::parse(host)?;
let newspaper = self
.newspapers
.unwrap_or(default_newpapers())
.unwrap_or(default_newpapers()?)
.into_iter()
.find(|c| c.metadata().hosts.contains(&host))
.ok_or(anyhow!("Newspaper couldn't be found"))?;

View File

@ -8,8 +8,7 @@ pub use tools::{Download, Downloader};
pub mod newspaper;
// TODO: move to another crate
mod newspapers;
pub use newspapers::Mediapart;
pub mod newspapers;
mod article_location;
pub use article_location::ArticleLocation;

View File

@ -1,17 +1,10 @@
use anyhow::Result;
use async_trait::async_trait;
use derive_builder::Builder;
use url::Host;
pub use url::Url;
/// Ways to authenticate against a newspaper website.
enum Login {
    /// Username and password credentials.
    Username(String, String),
    /// A raw session cookie value.
    Cookie(String),
}
/// Contains metadata about a newspaper
// TODO: provide builder
#[derive(Debug, PartialEq, Default, Builder)]
#[derive(Debug, PartialEq, Default, derive_builder::Builder)]
#[builder(default)]
pub struct Metadata {
/// The hosts that can be corresponds to this newspaper
@ -28,6 +21,7 @@ pub struct Metadata {
}
impl Metadata {
/// Get metadata builder
pub fn builder() -> MetadataBuilder {
MetadataBuilder::default()
}

View File

@ -1,4 +1,4 @@
use anyhow::Result;
use anyhow::{anyhow, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
@ -8,16 +8,46 @@ use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Supported ways to authenticate against Mediapart.
pub enum Login {
    /// Username and password credentials (currently unimplemented; see
    /// `Builder::login`).
    Username(String, String),
    /// Value of the `MPRUUID` session cookie taken from a logged-in browser.
    MPRUUID(String),
}
#[derive(Debug, Clone, Default)]
pub struct Mediapart {
// TODO: remove this pub !!
pub login_cookie: Option<(String, String)>,
login_cookie: (String, String),
}
/// Wraps any string-like value into a domain-name [`Host`].
fn str_to_host<S: Into<String>>(host: S) -> Host {
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Builder for [`Mediapart`].
///
/// Configure authentication with `login`, then call `build`.
#[derive(Debug, Clone, Default)]
pub struct Builder {
    // Cookie (name, value) pair set by `login`; `build` fails while `None`.
    login_cookie: Option<(String, String)>,
}
impl Builder {
    /// Sets the authentication method used by the built [`Mediapart`].
    ///
    /// Only [`Login::MPRUUID`] is currently supported; passing
    /// [`Login::Username`] panics with `unimplemented!`.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookie = match login {
            Login::Username(_username, _password) => {
                // Fixed typo in the panic message ("passwond" -> "password").
                unimplemented!("login using username and password is not implemented")
            }
            Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
        };
        self
    }

    /// Builds the configured [`Mediapart`].
    ///
    /// # Errors
    ///
    /// Returns an error if no login information was provided via
    /// [`Builder::login`].
    pub fn build(&self) -> Result<Mediapart> {
        self.login_cookie
            .as_ref()
            .map(|login_cookie| Mediapart {
                login_cookie: login_cookie.clone(),
            })
            // Lazily construct the error only when login is missing.
            .ok_or_else(|| anyhow!("You have to log in to access this newspaper"))
    }
}
#[async_trait]
impl Newspaper for Mediapart {
fn metadata(&self) -> Metadata {
@ -41,13 +71,10 @@ impl Newspaper for Mediapart {
let mut url = url.clone();
url.set_query(Some(&query));
// TODO: add "?onglet=full" to the url if not
let cookies = if let Some((name, value)) = &self.login_cookie {
let cookie = Cookie::build(name, value).secure(true).finish();
vec![cookie]
} else {
vec![]
};
let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
.secure(true)
.finish();
let cookies = vec![cookie];
// TODO: replace by builder
let downloader = Downloader { cookies };
@ -74,7 +101,6 @@ impl Newspaper for Mediapart {
"aside.cc-modal",
];
// TODO: correction of usage of relative urls, and replace "" by the url
let single_page_html =
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
Ok(single_page_html)
@ -91,3 +117,9 @@ impl Newspaper for Mediapart {
true
}
}
impl Mediapart {
    /// Returns a [`Builder`] used to configure and create a `Mediapart`
    /// newspaper instance.
    pub fn builder() -> Builder {
        Builder::default()
    }
}

View File

@ -1,3 +1 @@
mod mediapart;
pub use mediapart::Mediapart;
pub mod mediapart;

View File

@ -1,48 +1,97 @@
This project mainly aims at providing an unified interface for several newspapers. Side
objectives are to provide web API and different clients like a webUI or chatbots.
---
title: Scope of the project
---
Several big components are planned for this project
This project mainly aims at providing a unified interface for several
newspapers. Side objectives are to provide a web API and different clients like
a web UI or chatbots.
```dot
digraph G {
rankdir=TB
node [shape=rectangle, style=filled, color="#779988"]
Several big components are planned for this project (it is an initial draft and
may change later) :
subgraph cluster_frontend {
color = transparent
webui
chatbot
}
```plantuml
@startuml
frame "backend" {
[Retrieval tools] as retrieval_tools
[Article representation] as article_repr
[Automatic retrieval] as auto_retrieve
[Atom/RSS adapters] as rss
[Cache DB] as cache
[Newspaper\n(Mediapart, …)] as newspaper
() "Newspaper" as np_i
newspaper -up- np_i
webui -> api [color = red]
chatbot -> api [color = red]
[Article location] as article_location
subgraph cluster_backend {
label = "Backend\ncrieur binary"
labelloc = b
style=filled
[API] as api
() "API" as api_i
api -up- api_i
retrieve_tools [label="retrieve-tools"]
retrieve_adapters [label="retrieve-adapters"]
retrieve [label="retrieve-interface"]
auto_retrieve [label="automatic-retrieve"]
article_repr [label="article-representation\nRepresentation for articles"]
api
cache [label="Cache database"]
rss [label="Atom/RSS adapters"]
article_location ..> np_i
retrieve_tools -> retrieve_adapters
retrieve_adapters -> retrieve
retrieve_tools -> retrieve
rss -> auto_retrieve
article_repr -> retrieve_adapters
api -> article_location
api -> rss
retrieve -> api
auto_retrieve -> api
cache -> api
newspaper -> retrieval_tools: uses to implement
}
article_location --> article_repr :uses
auto_retrieve --> rss: watches
auto_retrieve --> article_location
auto_retrieve --> cache: stores in
}
frame "Web ui" {
[Web UI] as webui
[HTML renderer] as html_rend
[Pdf exporter] as pdf_rend
[Articles] as articles
webui --> html_rend
webui --> pdf_rend
webui -> articles
articles ..> api_i
}
[Chatbot] as chatbot
chatbot ..> api_i
actor User
User ..> webui
User ..> chatbot
actor "Newspaper programmer" as newspaper_programmer
newspaper_programmer ..> newspaper: implements
@enduml
```
A task queue could be added later to space requests.
# Implementation plan
## Phase I
- [x] `Newspaper` interface : used to retrieve from newspaper websites
- [ ] minimal chatbot (uses libraries directly)
- [x] `ArticleLocation` : library for using several `Newspaper` and retrieving from
a given url.
## Phase II
- [ ] Article Representation : having a (beta) unified representation for downloaded
articles
- [ ] adding this representation to Newspaper
## Phase III
- [ ] Cache
- [ ] Atom/rss adapters
- [ ] automatic retrieve
## Phase IV
- [ ] API
- [ ] chatbot (uses api)
## Phase V
- [ ] web ui

View File

@ -0,0 +1,19 @@
---
title: run the chatbot
---
1. You must first configure matrix login, every variable in [the reference](../reference/chatbot_configuration.md) is mandatory.
```env
CRIEUR_MATRIX_USER=user
CRIEUR_MATRIX_PASSWORD=password
CRIEUR_MATRIX_HOMESERVER=https://homeserv.er
CRIEUR_MATRIX_ROOM=roomid
```
You can put it in a `.env` file.
2. run the chatbot
```
cargo run --release --bin crieur-chatbot
```

View File

@ -0,0 +1,17 @@
---
title: Chatbot configuration reference
---
The chatbot is configured using environment variables
CRIEUR_MATRIX_USER
: username of the matrix bot account
CRIEUR_MATRIX_PASSWORD
: password of the matrix bot account
CRIEUR_MATRIX_HOMESERVER
: homeserver of the matrix bot account
CRIEUR_MATRIX_ROOM
: the room in which to listen to events

View File

@ -0,0 +1,10 @@
---
title: Newspapers configuration
---
The newspapers are configured using environment variables
# Mediapart
MEDIAPART_COOKIE
: sets the `MPRUUID` cookie, used to log in

View File

@ -2,7 +2,11 @@ use std::convert::TryInto;
use std::env;
use anyhow::Result;
use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
use crieur_retrieve::{
newspaper::Newspaper,
newspapers::mediapart::{self, Mediapart},
ArticleLocation, Url,
};
use dotenv::dotenv;
use log::info;
@ -17,12 +21,12 @@ async fn main() -> Result<()> {
};
// TODO: remove this in favor of default newspapers
let mut mediapart = Mediapart::new()
//.login(USERNAME, PASSWORD)
//
;
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid))
.build()?;
info!("Trying to download article from {}", url);
// TODO: shorten this, maybe an helper function ?

10
src/bin/crieur-chatbot.rs Normal file
View File

@ -0,0 +1,10 @@
use anyhow::Result;
use crieur_chatbot::run;
use dotenv::dotenv;
/// Entry point of the chatbot binary: loads the environment, then runs the bot.
#[tokio::main]
async fn main() -> Result<()> {
    // Load variables from an optional `.env` file; a missing file is not an error.
    dotenv().ok();
    // Run the chatbot until it terminates, propagating any failure.
    run().await?;
    Ok(())
}

View File

@ -1,19 +0,0 @@
use anyhow::Result;
use crieur_chatbot::run;
use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
use dotenv::dotenv;
use std::env;
/// Entry point: configures a Mediapart login from the `MEDIAPART_COOKIE`
/// environment variable, then starts the chatbot.
#[tokio::main]
async fn main() -> Result<()> {
    // Load variables from an optional `.env` file.
    dotenv().ok();
    let mut mediapart = Mediapart::new()
        //.login(USERNAME, PASSWORD)
        //
        ;
    // Sets the `MPRUUID` session cookie directly on a public field; fails
    // early if `MEDIAPART_COOKIE` is unset.
    // NOTE(review): `mediapart` is not passed to `run()` below — presumably
    // picked up elsewhere; confirm it is actually used.
    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
    run().await?;
    Ok(())
}