Compare commits

..

No commits in common. "9655b086f04838494f4458ef7182f8924c772f87" and "9aa2b5f07be6ac2db8d9d68db837ef97fd549c19" have entirely different histories.

24 changed files with 124 additions and 1573 deletions

View File

@ -8,7 +8,6 @@ steps:
pull: true
errignore: true
commands:
- apt-get update && apt-get install -y cmake
- rustup component add rustfmt
- rustup component add clippy
- cargo clippy
@ -18,6 +17,5 @@ steps:
pull: true
errignore: true
commands:
- apt-get update && apt-get install -y cmake
- cargo test --all
- cargo build

View File

@ -17,3 +17,6 @@ labels:
**Expected behavior**
*describe what you expected to happen*
**Configuration**
*paste the result of `stage --version`*

View File

@ -3,12 +3,13 @@ name: "Feature request"
about: "This template is for requesting a new feature"
title: ""
labels:
- "type::enhancement"
- "type::feature"
- "status::review_needed"
---
*(if applicable) describe what problem or frustration you have currently*
*describe what you would like to be able to do, or what solution you would like*
*describe what you would like to be able to do, or what solution you would like (you can propose several)*
*(optional) additional context, comments or implementation propositions*

1079
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,6 @@
members = [
"crieur-retrieve",
"crieur-chatbot",
]
@ -18,7 +17,6 @@ publish = false
[dependencies]
anyhow = "1.0.40"
crieur-retrieve = {version = "0.1", path="crieur-retrieve"}
crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
dotenv = "0.15.0"
env_logger = "0.8.3"
log = "0.4.14"

View File

@ -1,33 +1,17 @@
Tools to retrieve articles from the multiple newspapers you are subscribed to.
**This is a prototype, it isn't stable at all and you may not want to use it if
you expect it to just work !**
**This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work !**
# How to use it
First retrieve login cookies for websites and put them in a `.env` file, as
explained in the [newspaper source configuration
documentation](./documentation/reference/newspaper_configuration.md)
Then you can run
First retrieve login cookies for websites and put them in a `.env` file
```
cargo run --example=cli_downloader
cargo run --example=retrive_html_articles
```
To know how to run the chatbot, please read the [chatbot
guide](./documentation/guides/run_chatbot.md)
# Documentation
- 1. Design
- a. [Scope of the project and roadmap](./documentation/design/scope.md)
- b. [Retrieve](./documentation/design/retrieve.md)
- 2. Guides
- a. [Add a newspaper
source](./documentation/guides/add_a_newspaper_source.md)
- 3. Reference
- a. [Newspaper source
configuration](./documentation/reference/newspaper_configuration.md)
- b. [Chatbot
configuration](./documentation/reference/chatbot_configuration.md)
- 1. [Design](documentation/design/index.md)
- a. [Scope of the project](documentation/design/scope.md)
- b. [Retrieve](documentation/design/retrieve.md)

View File

@ -1,19 +0,0 @@
[package]
name = "crieur-chatbot"
version = "0.1.0"
authors = ["koalp <koalp@alpaga.dev>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.40"
dotenv = "0.15.0"
crieur-retrieve = {version = "0.1.0", path = "../crieur-retrieve"}
mime = "0.3.16"
log = "0.4.14"
[dependencies.matrix-sdk]
git = "https://github.com/matrix-org/matrix-rust-sdk"
rev = "ab180362c931606385dd53b73620d82ef2c3166d"
version = "0.2.0"

View File

@ -1,85 +0,0 @@
//! Chatbot
use std::convert::TryInto;
use anyhow::Result;
use matrix_sdk::{
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
};
use crate::Html;
/// Collects the settings used by `connect` to log in to a matrix homeserver
/// and join a room. Every field starts empty (`Default`) and is filled in
/// through the setter methods.
#[derive(Debug, Clone, Default)]
pub(crate) struct Builder {
/// Account name passed to `Client::login`.
user: String,
/// Account password passed to `Client::login`.
password: String,
/// Homeserver address passed to `Client::new`.
homeserver: String,
//TODO: rooms
/// Identifier of the single room joined by `connect`.
room: String,
}
impl Builder {
    /// Creates a builder with every setting left empty.
    fn new() -> Self {
        Self::default()
    }

    /// Logs in to the configured homeserver, joins the configured room and
    /// returns the ready-to-run [`Chatbot`].
    ///
    /// # Errors
    /// Fails when the client cannot be created, the login is rejected, the
    /// room identifier cannot be converted, or the room cannot be joined.
    ///
    /// # Panics
    /// Panics if the client does not report itself as logged in after a
    /// successful login call.
    pub(crate) async fn connect(&self) -> Result<Chatbot> {
        let Self {
            user,
            password,
            homeserver,
            room,
        } = self;

        let client = Client::new(homeserver.as_str())?;
        client
            .login(user.as_str(), password.as_str(), None, None)
            .await?;
        assert!(client.logged_in().await);

        client
            .join_room_by_id(&room.as_str().try_into()?)
            .await?;

        Ok(Chatbot { client })
    }

    /// Sets the account name and password used to log in.
    pub(crate) fn login(
        &mut self,
        user: &impl AsRef<str>,
        password: &impl AsRef<str>,
    ) -> &mut Self {
        self.user = user.as_ref().to_owned();
        self.password = password.as_ref().to_owned();
        self
    }

    /// Sets the homeserver the client connects to.
    pub(crate) fn homeserver(&mut self, homeserver: &impl AsRef<str>) -> &mut Self {
        self.homeserver = homeserver.as_ref().to_owned();
        self
    }

    /// Sets the room joined once logged in.
    pub(crate) fn room(&mut self, room: &impl AsRef<str>) -> &mut Self {
        self.room = room.as_ref().to_owned();
        self
    }
}
/// Matrix chatbot wrapping a `Client`; instances are produced by
/// `Builder::connect`.
#[derive(Debug, Clone)]
pub(crate) struct Chatbot {
/// matrix-sdk client used by `run` to register the event handler and sync.
client: Client,
}
impl Chatbot {
    /// Returns an empty [`Builder`] used to configure and connect a chatbot.
    pub(crate) fn builder() -> Builder {
        Builder::new()
    }

    /// Registers the [`Html`] event handler and then syncs with the
    /// homeserver, resuming from the client's sync token when it has one.
    pub(crate) async fn run(&self) -> Result<()> {
        let handler = Box::new(Html::new());
        self.client.set_event_handler(handler).await;

        let settings = match self.client.sync_token().await {
            Some(token) => SyncSettings::default().token(token),
            None => SyncSettings::default(),
        };
        self.client.sync(settings).await;

        Ok(())
    }
}

View File

@ -1,32 +0,0 @@
use std::env;
use anyhow::{bail, Result};
use dotenv::dotenv;
use crate::Chatbot;
/// Runs the chatbot.
///
/// Reads the matrix settings from the environment (after loading a `.env`
/// file when one is present), connects and then runs the bot.
///
/// # Errors
/// Fails when any of `CRIEUR_MATRIX_USER`, `CRIEUR_MATRIX_PASSWORD`,
/// `CRIEUR_MATRIX_HOMESERVER` or `CRIEUR_MATRIX_ROOM` cannot be read, or
/// when connecting or running the chatbot fails.
pub async fn run() -> Result<()> {
    dotenv().ok();

    let settings = (
        env::var("CRIEUR_MATRIX_USER"),
        env::var("CRIEUR_MATRIX_PASSWORD"),
        env::var("CRIEUR_MATRIX_HOMESERVER"),
        env::var("CRIEUR_MATRIX_ROOM"),
    );

    // All four variables are mandatory: a single missing one aborts the run.
    if let (Ok(user), Ok(password), Ok(homeserver), Ok(room)) = settings {
        let chatbot = Chatbot::builder()
            .login(&user, &password)
            .homeserver(&homeserver)
            .room(&room)
            .connect()
            .await?;
        chatbot.run().await
    } else {
        bail!("Configuration incomplete, please set all required environment variables")
    }
}

View File

@ -1,94 +0,0 @@
use std::convert::TryInto;
use std::env;
use anyhow::Result;
use log::info;
use matrix_sdk::{
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
};
use crieur_retrieve::{ArticleLocation, Url};
/// Event handler answering `!html` commands; it carries no state.
pub(crate) struct Html {}

impl Html {
    /// Creates the handler.
    pub fn new() -> Self {
        Html {}
    }
}
/// Retrieves the article at `url` and posts it to `room` as an HTML
/// attachment named `test.html`.
///
/// When retrieval fails, a plain-text notice is posted instead. Failures to
/// send either the notice or the attachment are ignored.
async fn send_article<U, E>(url: U, room: matrix_sdk::room::Joined)
where
    U: TryInto<Url, Error = E> + Send,
    E: std::error::Error + Sync + Send + 'static,
{
    // Helper so that `?` can be used while resolving and retrieving the article.
    //TODO: replace by async block when async block is stable
    async fn article_html<U, E>(url: U) -> Result<String>
    where
        U: TryInto<Url, Error = E> + Send,
        E: std::error::Error + Sync + Send + 'static,
    {
        let location = ArticleLocation::builder().url(url)?.build()?;
        let html = location.retrieve_html().await?;
        Ok(html)
    }

    //TODO: replace occurrences of ok() by async and logging block when async block is stable
    let html = match article_html(url).await {
        Ok(html) => html,
        Err(_) => {
            let notice = AnyMessageEventContent::RoomMessage(MessageEventContent::text_plain(
                "Can't download the file",
            ));
            room.send(notice, None).await.ok();
            return;
        }
    };

    let mut payload = html.as_bytes();
    room.send_attachment("test.html", &mime::TEXT_HTML_UTF_8, &mut payload, None)
        .await
        .ok();
}
#[async_trait]
impl EventHandler for Html {
    /// Handles room messages of the form `!html <url>` by sending the
    /// article found at `<url>` back to the room.
    ///
    /// Messages from rooms the bot has not joined, non-text messages and
    /// texts that are not an `!html` command followed by an argument are
    /// ignored.
    async fn on_room_message(&self, room: Room, event: &SyncMessageEvent<MessageEventContent>) {
        let room = match room {
            Room::Joined(room) => room,
            _ => return,
        };

        let msg_body = if let SyncMessageEvent {
            content:
                MessageEventContent {
                    msgtype: MessageType::Text(TextMessageEventContent { body: msg_body, .. }),
                    ..
                },
            ..
        } = event
        {
            msg_body
        } else {
            return;
        };

        // Split on single spaces: the first word is the command, the second
        // its argument; anything after that is ignored.
        let mut words = msg_body.split(' ');
        if let (Some("!html"), Some(url)) = (words.next(), words.next()) {
            // Only log once the command is recognised, so that ordinary chat
            // messages do not produce misleading "sending file" entries.
            info!("sending file");
            send_article(url, room).await;
        }
    }
}

View File

@ -1,2 +0,0 @@
mod html;
pub(crate) use html::Html;

View File

@ -1,10 +0,0 @@
//! Provides a matrix chatbot to download newspaper articles
mod cli;
pub use cli::run;
mod chatbot;
use chatbot::Chatbot;
mod handlers;
use handlers::Html;

View File

@ -1,32 +1,21 @@
use std::boxed::Box;
use std::convert::TryInto;
use std::env;
use anyhow::{anyhow, Result};
use log::info;
use url::{Host, Url};
use crate::newspaper::Newspaper;
use crate::newspapers::mediapart::{self, Mediapart};
type Newspapers = Vec<Box<dyn Newspaper>>;
/// Builds the default newspaper list: a single Mediapart instance logged in
/// with the cookie value read from `MEDIAPART_COOKIE`.
///
/// # Errors
/// Fails when `MEDIAPART_COOKIE` cannot be read or the Mediapart builder
/// rejects the configuration.
fn default_newpapers() -> Result<Newspapers> {
    let cookie_value = env::var("MEDIAPART_COOKIE")?;
    let login = mediapart::Login::MPRUUID(cookie_value);
    let mediapart = Mediapart::builder().login(login).build()?;
    Ok(vec![Box::new(mediapart)])
}
type Newspapers<'a> = Vec<Box<&'a dyn Newspaper>>;
#[derive(Default)]
pub struct Builder {
pub struct ArticleLocationBuilder<'a> {
url: Option<Url>,
newspapers: Option<Newspapers>,
newspapers: Option<Newspapers<'a>>,
}
impl Builder {
impl<'a> ArticleLocationBuilder<'a> {
pub fn new() -> Self {
Self::default()
}
@ -48,9 +37,9 @@ impl Builder {
}
/// Adds a newspaper to the list
pub fn newspaper<T>(mut self, newspaper: T) -> Self
pub fn newspaper<T>(&mut self, newspaper: &'a T) -> &mut Self
where
T: 'static + Newspaper,
T: 'a + Newspaper,
{
match &mut self.newspapers {
Some(newspapers) => newspapers.push(Box::new(newspaper)),
@ -83,7 +72,7 @@ impl Builder {
/// - the url is not set
/// - the given url has no host
// TODO: move this to a defined error, remove anyhow !
pub fn build(self) -> Result<ArticleLocation> {
pub fn build(&self) -> Result<ArticleLocation<'a>> {
let url = Clone::clone(self.url.as_ref().ok_or(anyhow!(
"No url set. You can set it with the url() function"
))?);
@ -91,22 +80,28 @@ impl Builder {
let host = Host::parse(host)?;
let newspaper = self
.newspapers
.unwrap_or(default_newpapers()?)
.into_iter()
.as_ref()
.ok_or(anyhow!(
"A list of NewsPaper must be set. It can be set with newspapers() function"
))?
.iter()
.find(|c| c.metadata().hosts.contains(&host))
.ok_or(anyhow!("Newspaper couldn't be found"))?;
Ok(ArticleLocation { newspaper, url })
Ok(ArticleLocation {
newspaper: newspaper.clone(),
url,
})
}
}
pub struct ArticleLocation {
newspaper: Box<dyn Newspaper>,
pub struct ArticleLocation<'a> {
newspaper: Box<&'a dyn Newspaper>,
pub url: Url,
}
impl ArticleLocation {
pub fn builder() -> Builder {
Builder::new()
impl<'a> ArticleLocation<'a> {
pub fn builder() -> ArticleLocationBuilder<'a> {
ArticleLocationBuilder::new()
}
pub async fn retrieve_html(&self) -> Result<String> {

View File

@ -8,7 +8,8 @@ pub use tools::{Download, Downloader};
pub mod newspaper;
// TODO: move to another crate
pub mod newspapers;
mod newspapers;
pub use newspapers::Mediapart;
mod article_location;
pub use article_location::ArticleLocation;

View File

@ -1,10 +1,17 @@
use anyhow::Result;
use async_trait::async_trait;
use derive_builder::Builder;
use url::Host;
pub use url::Url;
/// Login information for a newspaper.
// NOTE(review): nothing visible here constructs or reads this enum — confirm its intended use.
enum Login {
/// Presumably a username followed by a password — TODO confirm the field order.
Username(String, String),
/// Presumably the value of a login cookie — TODO confirm.
Cookie(String),
}
/// Contains metadata about a newspaper
#[derive(Debug, PartialEq, Default, derive_builder::Builder)]
// TODO: provide builder
#[derive(Debug, PartialEq, Default, Builder)]
#[builder(default)]
pub struct Metadata {
/// The hosts that correspond to this newspaper
@ -21,14 +28,13 @@ pub struct Metadata {
}
impl Metadata {
/// Get metadata builder
pub fn builder() -> MetadataBuilder {
MetadataBuilder::default()
}
}
#[async_trait]
pub trait Newspaper: Send + Sync {
pub trait Newspaper {
/// Returns a list of hosts that corresponds to the newspapers
fn metadata(&self) -> Metadata;
@ -43,7 +49,7 @@ pub trait Newspaper: Send + Sync {
}
/// Returns a newspaper structure
fn new() -> Self
async fn new() -> Self
where
Self: Sized;

View File

@ -1,4 +1,4 @@
use anyhow::{anyhow, Result};
use anyhow::Result;
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
@ -8,46 +8,16 @@ use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Ways to log in to Mediapart, consumed by `Builder::login`.
pub enum Login {
/// Two strings (named username and password by `Builder::login`);
/// `Builder::login` currently panics with `unimplemented!` for this variant.
Username(String, String),
/// Value stored in the `MPRUUID` login cookie.
MPRUUID(String),
}
#[derive(Debug, Clone, Default)]
pub struct Mediapart {
login_cookie: (String, String),
// TODO: remove this pub !!
pub login_cookie: Option<(String, String)>,
}
/// Wraps `host` in `Host::Domain` as-is; the string is not parsed or checked.
fn str_to_host<S: Into<String>>(host: S) -> Host {
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Builder for [`Mediapart`]; a login must be provided before `build`.
#[derive(Debug, Clone, Default)]
pub struct Builder {
    /// Name and value of the login cookie, once a login has been provided.
    login_cookie: Option<(String, String)>,
}

impl Builder {
    /// Records the login used to access Mediapart.
    ///
    /// `Login::MPRUUID` stores its value under the `MPRUUID` cookie name.
    ///
    /// # Panics
    /// Panics for `Login::Username`: logging in with a username and a
    /// password is not implemented.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookie = match login {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and password not implemented")
            }
            Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
        };
        self
    }

    /// Builds the newspaper from the recorded login.
    ///
    /// # Errors
    /// Fails when `login` was never called.
    pub fn build(&self) -> Result<Mediapart> {
        match &self.login_cookie {
            Some(login_cookie) => Ok(Mediapart {
                login_cookie: login_cookie.clone(),
            }),
            None => Err(anyhow!("You have to log in to access this newspaper")),
        }
    }
}
#[async_trait]
impl Newspaper for Mediapart {
fn metadata(&self) -> Metadata {
@ -71,10 +41,13 @@ impl Newspaper for Mediapart {
let mut url = url.clone();
url.set_query(Some(&query));
let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
.secure(true)
.finish();
let cookies = vec![cookie];
// TODO: add "?onglet=full" to the url if not
let cookies = if let Some((name, value)) = &self.login_cookie {
let cookie = Cookie::build(name, value).secure(true).finish();
vec![cookie]
} else {
vec![]
};
// TODO: replace by builder
let downloader = Downloader { cookies };
@ -101,12 +74,13 @@ impl Newspaper for Mediapart {
"aside.cc-modal",
];
// TODO: correction of usage of relative urls, and replace "" by the url
let single_page_html =
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
Ok(single_page_html)
}
fn new() -> Self {
async fn new() -> Self {
Self {
..Default::default()
}
@ -117,9 +91,3 @@ impl Newspaper for Mediapart {
true
}
}
impl Mediapart {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -1 +1,3 @@
pub mod mediapart;
mod mediapart;
pub use mediapart::Mediapart;

View File

@ -1,97 +1,48 @@
---
title: Scope of the project
---
This project mainly aims at providing a unified interface for several newspapers. Side
objectives are to provide a web API and different clients like a webUI or chatbots.
This project mainly aims at providing a unified interface for several
newspapers. Side objectives are to provide a web API and different clients like a
webUI or chatbots.
Several big components are planned for this project
Several big components are planned for this project (it is an initial draft and
may change later) :
```dot
digraph G {
rankdir=TB
node [shape=rectangle, style=filled, color="#779988"]
```plantuml
@startuml
frame "backend" {
[Retrieval tools] as retrieval_tools
[Article representation] as article_repr
[Automatic retrieval] as auto_retrieve
[Atom/RSS adapters] as rss
[Cache DB] as cache
[Newspaper\n(Mediapart, …)] as newspaper
() "Newspaper" as np_i
newspaper -up- np_i
subgraph cluster_frontend {
color = transparent
webui
chatbot
}
[Article location] as article_location
webui -> api [color = red]
chatbot -> api [color = red]
[API] as api
() "API" as api_i
api -up- api_i
subgraph cluster_backend {
label = "Backend\ncrieur binary"
labelloc = b
style=filled
article_location ..> np_i
retrieve_tools [label="retrieve-tools"]
retrieve_adapters [label="retrieve-adapters"]
retrieve [label="retrieve-interface"]
auto_retrieve [label="automatic-retrieve"]
article_repr [label="article-representation\nRepresentation for articles"]
api
cache [label="Cache database"]
rss [label="Atom/RSS adapters"]
api -> article_location
api -> rss
retrieve_tools -> retrieve_adapters
retrieve_adapters -> retrieve
retrieve_tools -> retrieve
rss -> auto_retrieve
article_repr -> retrieve_adapters
newspaper -> retrieval_tools: uses to implement
article_location --> article_repr :uses
auto_retrieve --> rss: watches
auto_retrieve --> article_location
auto_retrieve --> cache: stores in
retrieve -> api
auto_retrieve -> api
cache -> api
}
frame "Web ui" {
[Web UI] as webui
[HTML renderer] as html_rend
[Pdf exporter] as pdf_rend
[Articles] as articles
webui --> html_rend
webui --> pdf_rend
webui -> articles
articles ..> api_i
}
[Chatbot] as chatbot
chatbot ..> api_i
actor User
User ..> webui
User ..> chatbot
actor "Newspaper programmer" as newspaper_programmer
newspaper_programmer ..> newspaper: implements
@enduml
```
A task queue could be added later to space requests.
# Implementation plan
## Phase I
- [x] `Newspaper` interface : used to retrieve articles from newspaper websites
- [ ] minimal chatbot (uses libraries directly)
- [x] `ArticleLocation` : library for using several `Newspaper` and retrieving from
a given url.
## Phase II
- [ ] Article Representation : having a (beta) unified representation for downloaded
articles
- [ ] adding this representation to `Newspaper`
## Phase III
- [ ] Cache
- [ ] Atom/rss adapters
- [ ] automatic retrieve
## Phase IV
- [ ] API
- [ ] chatbot (uses api)
## Phase V
- [ ] web ui

View File

@ -1,19 +0,0 @@
---
title: run the chatbot
---
1. You must first configure matrix login, every variable in [the_reference](../reference/chatbot_configuration.md) is mandatory.
```env
CRIEUR_MATRIX_USER=user
CRIEUR_MATRIX_PASSWORD=password
CRIEUR_MATRIX_HOMESERVER=https://homeserv.er
CRIEUR_MATRIX_ROOM=roomid
```
You can put it in a `.env` file.
2. run the chatbot
```
cargo run --release --bin crieur-chatbot
```

View File

@ -1,17 +0,0 @@
---
Title: Chatbot configuration reference
---
The chatbot is configured using environment variables
CRIEUR_MATRIX_USER
: username of the matrix bot account
CRIEUR_MATRIX_PASSWORD
: password of the matrix bot account
CRIEUR_MATRIX_HOMESERVER
: homeserver of the matrix bot account
CRIEUR_MATRIX_ROOM
: the room in which to listen to events

View File

@ -1,10 +0,0 @@
---
title: Newspapers configuration
---
The newspapers are configured using environment variables
# Mediapart
MEDIAPART_COOKIE
: sets the `MPRUUID` cookie, used to log in

View File

@ -2,11 +2,7 @@ use std::convert::TryInto;
use std::env;
use anyhow::Result;
use crieur_retrieve::{
newspaper::Newspaper,
newspapers::mediapart::{self, Mediapart},
ArticleLocation, Url,
};
use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
use dotenv::dotenv;
use log::info;
@ -21,18 +17,18 @@ async fn main() -> Result<()> {
};
// TODO: remove this in favor of default newspapers
let mut mediapart = Mediapart::new().await
//.login(USERNAME, PASSWORD)
//
;
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid))
.build()?;
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
info!("Trying to download article from {}", url);
// TODO: shorten this, maybe an helper function ?
let article_location = ArticleLocation::builder()
.url(url)?
.newspaper(mediapart)
.newspaper(&mediapart)
.build()?;
let article_str = article_location.retrieve_html().await?;

View File

@ -1,10 +0,0 @@
use anyhow::Result;
use crieur_chatbot::run;
use dotenv::dotenv;
/// Entry point of the chatbot binary: loads a `.env` file when one is
/// present, then hands over to `crieur_chatbot::run`.
#[tokio::main]
async fn main() -> Result<()> {
    dotenv().ok();
    run().await
}

19
src/main.rs Normal file
View File

@ -0,0 +1,19 @@
use anyhow::Result;
use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
use dotenv::dotenv;
use std::env;
/// Downloads one hard-coded Mediapart article and prints its HTML to stdout.
///
/// The `MPRUUID` login cookie value is read from `MEDIAPART_COOKIE`, after
/// loading a `.env` file when one is present.
#[tokio::main]
async fn main() -> Result<()> {
    dotenv().ok();

    // TODO: the original carried a commented-out `.login(USERNAME, PASSWORD)`
    // placeholder here; the cookie is set on the public field instead.
    let mut mediapart = Mediapart::new().await;
    let cookie_value = env::var("MEDIAPART_COOKIE")?;
    mediapart.login_cookie = Some(("MPRUUID".into(), cookie_value));

    let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
    let html = mediapart.retrieve_html(&url).await?;
    println!("{}", html);

    Ok(())
}