feat: add builder for mediapart, document chatbot
All checks were successful
continuous-integration/drone/pr Build is passing
All checks were successful
continuous-integration/drone/pr Build is passing
A builder for Mediapart has been added. No generic builder has been created, as there is no use case yet. Some documentation has been added, the roadmap and scope have been clarified, and the chatbot has been lightly documented.
This commit is contained in:
parent
a16dbbc790
commit
865b949b5f
@ -8,6 +8,7 @@ steps:
|
|||||||
pull: true
|
pull: true
|
||||||
errignore: true
|
errignore: true
|
||||||
commands:
|
commands:
|
||||||
|
- apt-get update && apt-get install -y cmake
|
||||||
- rustup component add rustfmt
|
- rustup component add rustfmt
|
||||||
- rustup component add clippy
|
- rustup component add clippy
|
||||||
- cargo clippy
|
- cargo clippy
|
||||||
|
26
README.md
26
README.md
@ -1,17 +1,33 @@
|
|||||||
Tools to retrieve articles from multiple newspaper you subscribed to.
|
Tools to retrieve articles from multiple newspaper you subscribed to.
|
||||||
|
|
||||||
**This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work !**
|
**This is a prototype, it isn't stable at all and you may not want to use it if
|
||||||
|
you expect it to just work !**
|
||||||
|
|
||||||
# How to use it
|
# How to use it
|
||||||
|
|
||||||
First retrieve login cookies for websites and put it in a `.env`
|
First retrieve login cookies for websites and put them in a `.env` file, as
|
||||||
|
explained in the [newspaper source configuration
|
||||||
|
documentation](./documentation/reference/newspaper_configuration.md)
|
||||||
|
|
||||||
|
Then you can run
|
||||||
|
|
||||||
```
|
```
|
||||||
cargo run --example=cli_downloader
|
cargo run --example=cli_downloader
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To know how to run the chatbot, please read the [chatbot
|
||||||
|
guide](./documentation/guides/run_chatbot.md)
|
||||||
|
|
||||||
# Documentation
|
# Documentation
|
||||||
|
|
||||||
- 1. [Design](documentation/design/index.md)
|
- 1. Design
|
||||||
- a. [Scope of the project](documentation/design/scope.md)
|
- a. [Scope of the project and roadmap](./documentation/design/scope.md)
|
||||||
- b. [Retrieve](documentation/design/retrieve.md)
|
- b. [Retrieve](./documentation/design/retrieve.md)
|
||||||
|
- 2. Guides
|
||||||
|
- a. [Add a newspaper
|
||||||
|
source](./documentation/guides/add_a_newspaper_source.md)
|
||||||
|
- 3. Reference
|
||||||
|
- a. [Newspaper source
|
||||||
|
configuration](./documentation/reference/newspaper_configuration.md)
|
||||||
|
- b. [Chatbot
|
||||||
|
configuration](./documentation/reference/chatbot_configuration.md)
|
||||||
|
@ -13,7 +13,7 @@ use matrix_sdk::{
|
|||||||
Client, ClientConfig, EventHandler, SyncSettings,
|
Client, ClientConfig, EventHandler, SyncSettings,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
|
use crieur_retrieve::{ArticleLocation, Url};
|
||||||
|
|
||||||
pub(crate) struct Html {}
|
pub(crate) struct Html {}
|
||||||
|
|
||||||
|
@ -7,19 +7,17 @@ use log::info;
|
|||||||
use url::{Host, Url};
|
use url::{Host, Url};
|
||||||
|
|
||||||
use crate::newspaper::Newspaper;
|
use crate::newspaper::Newspaper;
|
||||||
use crate::newspapers::Mediapart;
|
use crate::newspapers::mediapart::{self, Mediapart};
|
||||||
|
|
||||||
type Newspapers = Vec<Box<dyn Newspaper>>;
|
type Newspapers = Vec<Box<dyn Newspaper>>;
|
||||||
|
|
||||||
fn default_newpapers() -> Newspapers {
|
fn default_newpapers() -> Result<Newspapers> {
|
||||||
let mut mediapart = Mediapart::new();
|
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
|
||||||
|
let mediapart = Mediapart::builder()
|
||||||
|
.login(mediapart::Login::MPRUUID(mpruiid))
|
||||||
|
.build()?;
|
||||||
|
|
||||||
mediapart.login_cookie = Some((
|
Ok(vec![Box::new(mediapart)])
|
||||||
"MPRUUID".into(),
|
|
||||||
env::var("MEDIAPART_COOKIE").unwrap().into(),
|
|
||||||
));
|
|
||||||
|
|
||||||
vec![Box::new(mediapart)]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@ -93,7 +91,7 @@ impl Builder {
|
|||||||
let host = Host::parse(host)?;
|
let host = Host::parse(host)?;
|
||||||
let newspaper = self
|
let newspaper = self
|
||||||
.newspapers
|
.newspapers
|
||||||
.unwrap_or(default_newpapers())
|
.unwrap_or(default_newpapers()?)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.find(|c| c.metadata().hosts.contains(&host))
|
.find(|c| c.metadata().hosts.contains(&host))
|
||||||
.ok_or(anyhow!("Newspaper couldn't be found"))?;
|
.ok_or(anyhow!("Newspaper couldn't be found"))?;
|
||||||
|
@ -8,8 +8,7 @@ pub use tools::{Download, Downloader};
|
|||||||
pub mod newspaper;
|
pub mod newspaper;
|
||||||
|
|
||||||
// TODO: move to another crate
|
// TODO: move to another crate
|
||||||
mod newspapers;
|
pub mod newspapers;
|
||||||
pub use newspapers::Mediapart;
|
|
||||||
|
|
||||||
mod article_location;
|
mod article_location;
|
||||||
pub use article_location::ArticleLocation;
|
pub use article_location::ArticleLocation;
|
||||||
|
@ -1,17 +1,10 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use derive_builder::Builder;
|
|
||||||
use url::Host;
|
use url::Host;
|
||||||
pub use url::Url;
|
pub use url::Url;
|
||||||
|
|
||||||
enum Login {
|
|
||||||
Username(String, String),
|
|
||||||
Cookie(String),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Contains metadata about a newspaper
|
/// Contains metadata about a newspaper
|
||||||
// TODO: provide builder
|
#[derive(Debug, PartialEq, Default, derive_builder::Builder)]
|
||||||
#[derive(Debug, PartialEq, Default, Builder)]
|
|
||||||
#[builder(default)]
|
#[builder(default)]
|
||||||
pub struct Metadata {
|
pub struct Metadata {
|
||||||
/// The hosts that can be corresponds to this newspaper
|
/// The hosts that can be corresponds to this newspaper
|
||||||
@ -28,6 +21,7 @@ pub struct Metadata {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Metadata {
|
impl Metadata {
|
||||||
|
/// Get metadata builder
|
||||||
pub fn builder() -> MetadataBuilder {
|
pub fn builder() -> MetadataBuilder {
|
||||||
MetadataBuilder::default()
|
MetadataBuilder::default()
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use anyhow::Result;
|
use anyhow::{anyhow, Result};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cookie::Cookie;
|
use cookie::Cookie;
|
||||||
use url::Host;
|
use url::Host;
|
||||||
@ -8,16 +8,46 @@ use crate::tools;
|
|||||||
use crate::Url;
|
use crate::Url;
|
||||||
use crate::{Download, Downloader};
|
use crate::{Download, Downloader};
|
||||||
|
|
||||||
|
pub enum Login {
|
||||||
|
Username(String, String),
|
||||||
|
MPRUUID(String),
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Default)]
|
#[derive(Debug, Clone, Default)]
|
||||||
pub struct Mediapart {
|
pub struct Mediapart {
|
||||||
// TODO: remove this pub !!
|
login_cookie: (String, String),
|
||||||
pub login_cookie: Option<(String, String)>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn str_to_host<S: Into<String>>(host: S) -> Host {
|
fn str_to_host<S: Into<String>>(host: S) -> Host {
|
||||||
Host::Domain(host.into())
|
Host::Domain(host.into())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Default)]
|
||||||
|
pub struct Builder {
|
||||||
|
login_cookie: Option<(String, String)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Builder {
|
||||||
|
pub fn login(&mut self, login: Login) -> &mut Self {
|
||||||
|
self.login_cookie = match login {
|
||||||
|
Login::Username(_username, _password) => {
|
||||||
|
unimplemented!("login using username and passwond not implemented")
|
||||||
|
}
|
||||||
|
Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
|
||||||
|
};
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build(&self) -> Result<Mediapart> {
|
||||||
|
match &self.login_cookie {
|
||||||
|
Some(login_cookie) => Ok(Mediapart {
|
||||||
|
login_cookie: login_cookie.clone(),
|
||||||
|
}),
|
||||||
|
None => Err(anyhow!("You have to log in to access this newspaper")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl Newspaper for Mediapart {
|
impl Newspaper for Mediapart {
|
||||||
fn metadata(&self) -> Metadata {
|
fn metadata(&self) -> Metadata {
|
||||||
@ -41,13 +71,10 @@ impl Newspaper for Mediapart {
|
|||||||
let mut url = url.clone();
|
let mut url = url.clone();
|
||||||
url.set_query(Some(&query));
|
url.set_query(Some(&query));
|
||||||
|
|
||||||
// TODO: add "?onglet=full" to the url if not
|
let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
|
||||||
let cookies = if let Some((name, value)) = &self.login_cookie {
|
.secure(true)
|
||||||
let cookie = Cookie::build(name, value).secure(true).finish();
|
.finish();
|
||||||
vec![cookie]
|
let cookies = vec![cookie];
|
||||||
} else {
|
|
||||||
vec![]
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO: replace by builder
|
// TODO: replace by builder
|
||||||
let downloader = Downloader { cookies };
|
let downloader = Downloader { cookies };
|
||||||
@ -74,7 +101,6 @@ impl Newspaper for Mediapart {
|
|||||||
"aside.cc-modal",
|
"aside.cc-modal",
|
||||||
];
|
];
|
||||||
|
|
||||||
// TODO: correction of usage of relative urls, and replace "" by the url
|
|
||||||
let single_page_html =
|
let single_page_html =
|
||||||
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
|
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
|
||||||
Ok(single_page_html)
|
Ok(single_page_html)
|
||||||
@ -91,3 +117,9 @@ impl Newspaper for Mediapart {
|
|||||||
true
|
true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Mediapart {
|
||||||
|
pub fn builder() -> Builder {
|
||||||
|
Builder::default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1,3 +1 @@
|
|||||||
mod mediapart;
|
pub mod mediapart;
|
||||||
|
|
||||||
pub use mediapart::Mediapart;
|
|
||||||
|
@ -1,48 +1,97 @@
|
|||||||
This project mainly aims at providing an unified interface for several newspapers. Side
|
---
|
||||||
objectives are to provide web API and different clients like a webUI or chatbots.
|
title: Scope of the project
|
||||||
|
---
|
||||||
|
|
||||||
Several big components are planned for this project
|
This project mainly aims at providing a unified interface for several
|
||||||
|
newspapers. Side objectives are to provide web API and different clients like a
|
||||||
|
webUI or chatbots.
|
||||||
|
|
||||||
```dot
|
Several big components are planned for this project (it is an initial draft and
|
||||||
digraph G {
|
may change later) :
|
||||||
rankdir=TB
|
|
||||||
node [shape=rectangle, style=filled, color="#779988"]
|
|
||||||
|
|
||||||
subgraph cluster_frontend {
|
```plantuml
|
||||||
color = transparent
|
@startuml
|
||||||
webui
|
|
||||||
chatbot
|
frame "backend" {
|
||||||
}
|
[Retrieval tools] as retrieval_tools
|
||||||
|
[Article representation] as article_repr
|
||||||
|
[Automatic retrieval] as auto_retrieve
|
||||||
|
[Atom/RSS adapters] as rss
|
||||||
|
[Cache DB] as cache
|
||||||
|
|
||||||
|
[Newspaper\n(Mediapart, …)] as newspaper
|
||||||
|
() "Newspaper" as np_i
|
||||||
|
newspaper -up- np_i
|
||||||
|
|
||||||
|
|
||||||
webui -> api [color = red]
|
[Article location] as article_location
|
||||||
chatbot -> api [color = red]
|
|
||||||
|
|
||||||
subgraph cluster_backend {
|
[API] as api
|
||||||
label = "Backend\ncrieur binary"
|
() "API" as api_i
|
||||||
labelloc = b
|
api -up- api_i
|
||||||
style=filled
|
|
||||||
|
|
||||||
retrieve_tools [label="retrieve-tools"]
|
article_location ..> np_i
|
||||||
retrieve_adapters [label="retrieve-adapters"]
|
|
||||||
retrieve [label="retrieve-interface"]
|
|
||||||
auto_retrieve [label="automatic-retrieve"]
|
|
||||||
article_repr [label="article-representation\nRepresentation for articles"]
|
|
||||||
api
|
|
||||||
cache [label="Cache database"]
|
|
||||||
rss [label="Atom/RSS adapters"]
|
|
||||||
|
|
||||||
retrieve_tools -> retrieve_adapters
|
api -> article_location
|
||||||
retrieve_adapters -> retrieve
|
api -> rss
|
||||||
retrieve_tools -> retrieve
|
|
||||||
rss -> auto_retrieve
|
|
||||||
article_repr -> retrieve_adapters
|
|
||||||
|
|
||||||
retrieve -> api
|
newspaper -> retrieval_tools: uses to implement
|
||||||
auto_retrieve -> api
|
|
||||||
cache -> api
|
|
||||||
|
|
||||||
}
|
article_location --> article_repr :uses
|
||||||
|
|
||||||
|
auto_retrieve --> rss: watches
|
||||||
|
auto_retrieve --> article_location
|
||||||
|
auto_retrieve --> cache: stores in
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
frame "Web ui" {
|
||||||
|
[Web UI] as webui
|
||||||
|
[HTML renderer] as html_rend
|
||||||
|
[Pdf exporter] as pdf_rend
|
||||||
|
[Articles] as articles
|
||||||
|
webui --> html_rend
|
||||||
|
webui --> pdf_rend
|
||||||
|
webui -> articles
|
||||||
|
articles ..> api_i
|
||||||
|
}
|
||||||
|
|
||||||
|
[Chatbot] as chatbot
|
||||||
|
|
||||||
|
chatbot ..> api_i
|
||||||
|
|
||||||
|
actor User
|
||||||
|
User ..> webui
|
||||||
|
User ..> chatbot
|
||||||
|
|
||||||
|
actor "Newspaper programmer" as newspaper_programmer
|
||||||
|
newspaper_programmer ..> newspaper: implements
|
||||||
|
@enduml
|
||||||
```
|
```
|
||||||
|
|
||||||
|
A task queue could be added later to space requests.
|
||||||
|
|
||||||
|
# Implementation plan
|
||||||
|
|
||||||
|
## Phase I
|
||||||
|
- [x] `Newspaper` interface : used to retrieve from newspaper websites
|
||||||
|
- [ ] minimal chatbot (uses libraries directly)
|
||||||
|
- [x] `ArticleLocation` : library for using several `Newspaper` and retrieving from
|
||||||
|
a given url.
|
||||||
|
|
||||||
|
## Phase II
|
||||||
|
- [ ] Article Representation : having a (beta) unified representation for downloaded
|
||||||
|
articles
|
||||||
|
- [ ] adding this representation to Newspaper
|
||||||
|
|
||||||
|
## Phase III
|
||||||
|
- [ ] Cache
|
||||||
|
- [ ] Atom/rss adapters
|
||||||
|
- [ ] automatic retrieve
|
||||||
|
|
||||||
|
## Phase IV
|
||||||
|
- [ ] API
|
||||||
|
- [ ] chatbot (uses api)
|
||||||
|
|
||||||
|
## Phase V
|
||||||
|
- [ ] web ui
|
||||||
|
19
documentation/guides/run_chatbot.md
Normal file
19
documentation/guides/run_chatbot.md
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
---
|
||||||
|
title: run the chatbot
|
||||||
|
---
|
||||||
|
|
||||||
|
1. You must first configure the Matrix login; every variable in [the reference](../reference/chatbot_configuration.md) is mandatory.
|
||||||
|
|
||||||
|
```env
|
||||||
|
CRIEUR_MATRIX_USER=user
|
||||||
|
CRIEUR_MATRIX_PASSWORD=password
|
||||||
|
CRIEUR_MATRIX_HOMESERVER=https://homeserv.er
|
||||||
|
CRIEUR_MATRIX_ROOM=roomid
|
||||||
|
```
|
||||||
|
|
||||||
|
You can put it in a `.env` file.
|
||||||
|
|
||||||
|
2. run the chatbot
|
||||||
|
```
|
||||||
|
cargo run --release --bin crieur-chatbot
|
||||||
|
```
|
17
documentation/reference/chatbot_configuration.md
Normal file
17
documentation/reference/chatbot_configuration.md
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
---
|
||||||
|
title: Chatbot configuration reference
|
||||||
|
---
|
||||||
|
|
||||||
|
The chatbot is configured using environment variables
|
||||||
|
|
||||||
|
CRIEUR_MATRIX_USER
|
||||||
|
: username of the matrix bot account
|
||||||
|
|
||||||
|
CRIEUR_MATRIX_PASSWORD
|
||||||
|
: password of the matrix bot account
|
||||||
|
|
||||||
|
CRIEUR_MATRIX_HOMESERVER
|
||||||
|
: homeserver of the matrix bot account
|
||||||
|
|
||||||
|
CRIEUR_MATRIX_ROOM
|
||||||
|
: the room in which to listen to events
|
10
documentation/reference/newspaper_configuration.md
Normal file
10
documentation/reference/newspaper_configuration.md
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
---
|
||||||
|
title: Newspapers configuration
|
||||||
|
---
|
||||||
|
|
||||||
|
The newspapers are configured using environment variables
|
||||||
|
|
||||||
|
# Mediapart
|
||||||
|
|
||||||
|
MEDIAPART_COOKIE
|
||||||
|
: sets the `MPRUUID` cookie, used to log in
|
@ -2,7 +2,11 @@ use std::convert::TryInto;
|
|||||||
use std::env;
|
use std::env;
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
|
use crieur_retrieve::{
|
||||||
|
newspaper::Newspaper,
|
||||||
|
newspapers::mediapart::{self, Mediapart},
|
||||||
|
ArticleLocation, Url,
|
||||||
|
};
|
||||||
use dotenv::dotenv;
|
use dotenv::dotenv;
|
||||||
use log::info;
|
use log::info;
|
||||||
|
|
||||||
@ -17,12 +21,12 @@ async fn main() -> Result<()> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// TODO: remove this in favor of default newspapers
|
// TODO: remove this in favor of default newspapers
|
||||||
let mut mediapart = Mediapart::new()
|
|
||||||
//.login(USERNAME, PASSWORD)
|
|
||||||
//
|
|
||||||
;
|
|
||||||
|
|
||||||
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
|
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
|
||||||
|
let mediapart = Mediapart::builder()
|
||||||
|
.login(mediapart::Login::MPRUUID(mpruiid))
|
||||||
|
.build()?;
|
||||||
|
|
||||||
info!("Trying to download article from {}", url);
|
info!("Trying to download article from {}", url);
|
||||||
|
|
||||||
// TODO: shorten this, maybe an helper function ?
|
// TODO: shorten this, maybe an helper function ?
|
||||||
|
10
src/bin/crieur-chatbot.rs
Normal file
10
src/bin/crieur-chatbot.rs
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use crieur_chatbot::run;
|
||||||
|
use dotenv::dotenv;
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<()> {
|
||||||
|
dotenv().ok();
|
||||||
|
run().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
19
src/main.rs
19
src/main.rs
@ -1,19 +0,0 @@
|
|||||||
use anyhow::Result;
|
|
||||||
use crieur_chatbot::run;
|
|
||||||
use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
|
|
||||||
use dotenv::dotenv;
|
|
||||||
use std::env;
|
|
||||||
|
|
||||||
#[tokio::main]
|
|
||||||
async fn main() -> Result<()> {
|
|
||||||
dotenv().ok();
|
|
||||||
|
|
||||||
let mut mediapart = Mediapart::new()
|
|
||||||
//.login(USERNAME, PASSWORD)
|
|
||||||
//
|
|
||||||
;
|
|
||||||
|
|
||||||
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
|
|
||||||
run().await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user