Compare commits

18 commits: feature/me...development

Commits (SHA1):

16ad14467e
e34edf0b21
40ebc1ddea
6e091a32fc
5d0872b4d9
cee0af6c3c
970f510cd1

8afd74995b
dd26be54b5
ac5ef59dfa

a64096fa87
b1d025a23c

2e6aed97ef
48485c3097

9655b086f0
9dd501418e
865b949b5f
a16dbbc790
@@ -8,6 +8,7 @@ steps:
 pull: true
 errignore: true
 commands:
+- apt-get update && apt-get install -y cmake
 - rustup component add rustfmt
 - rustup component add clippy
 - cargo clippy
@@ -17,5 +18,6 @@ steps:
 pull: true
 errignore: true
 commands:
+- apt-get update && apt-get install -y cmake
 - cargo test --all
 - cargo build
@@ -1,6 +1,6 @@
 ---
-name: "Bug report"
-about: "This template is for reporting a bug"
+name: "🐛 Bug report"
+about: "For reporting bugs"
 title: ""
 labels:
 - "type::bug"
@@ -17,6 +17,3 @@ labels:
 
 **Expected behavior**
 *describe what you expected to happen*
-
-**Configuration**
-*paste the result of `stage --version`
@@ -1,6 +1,6 @@
 ---
-name: "Design discussion"
-about: "For discussion about the design of features in the application, when there are several possibilities for implementation"
+name: "🗣 Discussion"
+about: "For discussion about the software, when you want to discuss about several conception possibilities"
 title: ""
 labels:
 - "type::discussion"
@@ -8,12 +8,8 @@ labels:
 
 ---
 
-*describe shortly the problem*
+*describe the problem *
 
-## Requirements
-
-*list requirements that the feature have*
-
 ## Propositions
 
-*explain the different implementation that you would propose for the feature*
+*(optionnal) explain the different implementation that you would propose*
@@ -1,15 +1,14 @@
 ---
-name: "Feature request"
-about: "This template is for requesting a new feature"
+name: "💡 Feature request"
+about: "For requesting a new feature, with an implementation plan"
 title: ""
 labels:
-- "type::feature"
+- "type::enhancement"
 - "status::review_needed"
 
 ---
 
 *(if applicable) describe what problem or frustration you have currently*
 
-*describe what you would like to be able to do, or what solution you would like (you can propose several)*
+*describe what you would like to be able to do, or what solution you would like*
 
-*(optional) additional context, comments or implementation propositions*
+*(optional) additional context, comments
@@ -1,5 +1,5 @@
 ---
-name: "Ask a question"
+name: "❓ Ask a question"
 about: "If you have a question about the usage of the libraries or the tool"
 title: ""
 labels:
@@ -1,5 +1,5 @@
 ---
-name: "Refactor"
+name: "🚧 Refactor"
 about: "For refactoring propositions"
 title: ""
 labels:
Cargo.lock (generated, 1241 lines changed): file diff suppressed because it is too large.
@@ -2,6 +2,7 @@
 
 members = [
 "crieur-retrieve",
+"crieur-chatbot",
 ]
 
 
@@ -17,7 +18,9 @@ publish = false
 [dependencies]
 anyhow = "1.0.40"
 crieur-retrieve = {version = "0.1", path="crieur-retrieve"}
+crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
 dotenv = "0.15.0"
 env_logger = "0.8.3"
 log = "0.4.14"
-tokio = { version = "1.5.0", features = ["full"] }
+tokio = { version = "1.6.0", features = ["full"] }
+tracing-subscriber = "0.2.18"
README.md (38 lines changed)

@@ -1,17 +1,41 @@
-Tools to retrieve articles from multiple newspaper you subscribed to.
+Tools to retrieve articles from multiple newspaper you subscribed to, all from
+the same place.
 
-**This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work !**
+**This is a prototype, it isn't stable at all and you may not want to use it if
+you expect it to just work !**
 
 # How to use it
 
-First retrieve login cookies for websites and put it in a `.env`
+First retrieve login cookies for websites and put it in a `.env` such as
+explained in the [newspaper source configuration
+documentation](./documentation/reference/newspaper_configuration.md)
+
+Then you can run [an example](./examples/cli_downloader.rs) using
 
 ```
-cargo run --example=retrive_html_articles
+cargo run --example=cli_downloader
 ```
 
+You can also specify the URL using
+```
+cargo run --example=cli_downloader -- [your url]
+```
+
+To know how to run the chatbot, please read the [chatbot
+guide](./documentation/guides/run_chatbot.md)
+
 # Documentation
 
-- 1. [Design](documentation/design/index.md)
-    - a. [Scope of the project](documentation/design/scope.md)
-    - b. [Retrieve](documentation/design/retrieve.md)
+1. Design
+    1. [Scope of the project and roadmap](./documentation/design/scope.md)
+    2. [Retrieve](./documentation/design/retrieve.md)
+    3. [Tooling](./documentation/design/tooling.md)
+2. Guides
+    1. [Add a newspaper a source
+       ](./documentation/guides/add_a_newspaper_source.md)
+    2. [Build and run the chatbot](./documentation/guides/run_chatbot.md)
+3. Reference
+    1. [Newspaper source
+       configuration](./documentation/reference/newspaper_configuration.md)
+    2. [Chatbot
+       configuration](./documentation/reference/chatbot_configuration.md)
containers/chatbot.containerfile (new file, 26 lines)

@@ -0,0 +1,26 @@
+FROM docker.io/rust:1.51-alpine as build
+WORKDIR /app
+
+RUN apk add \
+    cmake \
+    musl-dev \
+    make \
+    g++ \
+    && rustup target add x86_64-unknown-linux-musl
+
+COPY Cargo.lock Cargo.toml .
+COPY crieur-chatbot crieur-chatbot
+COPY crieur-retrieve crieur-retrieve
+COPY src src
+
+RUN RUSTFLAGS=-Ctarget-feature=-crt-static cargo build --target x86_64-unknown-linux-musl --release --bin=crieur-chatbot
+
+FROM scratch
+WORKDIR /
+
+COPY --from=build /usr/lib/libstdc++.so.6 /usr/lib/libstdc++.so.6
+COPY --from=build /usr/lib/libgcc_s.so.1 /usr/lib/libgcc_s.so.1
+COPY --from=build /lib/ld-musl-x86_64.so.1 /lib/ld-musl-x86_64.so.1
+COPY --from=build /app/target/x86_64-unknown-linux-musl/release/crieur-chatbot /crieur-chatbot
+
+CMD ["./crieur-chatbot"]
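For reference, one plausible way to build and run this image locally. The exact invocation is an assumption, not part of this changeset (the `.containerfile` suffix suggests Podman; `docker` accepts the same flags):

```
# hypothetical invocation, not included in this diff
podman build -t crieur-chatbot -f containers/chatbot.containerfile .
podman run --rm --env-file .env crieur-chatbot
```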
crieur-chatbot/Cargo.toml (new file, 21 lines)

@@ -0,0 +1,21 @@
+[package]
+name = "crieur-chatbot"
+version = "0.1.0"
+authors = ["koalp <koalp@alpaga.dev>"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow = "1.0.40"
+dotenv = "0.15.0"
+crieur-retrieve = {version = "0.1.0", path = "../crieur-retrieve"}
+mime = "0.3.16"
+log = "0.4.14"
+
+[dependencies.matrix-sdk]
+git = "https://github.com/matrix-org/matrix-rust-sdk"
+rev = "242d46c9a1bf40fa15b5892c2ee81cb0f4508ff4"
+version = "0.2.0"
+default-features = false
+features = ["encryption", "rustls-tls", "require_auth_for_profile_requests"]
crieur-chatbot/src/chatbot.rs (new file, 84 lines)

@@ -0,0 +1,84 @@
+//! Chatbot
+use std::convert::TryInto;
+
+use anyhow::Result;
+use matrix_sdk::{self, Client, SyncSettings};
+
+use crate::Html;
+
+#[derive(Debug, Clone, Default)]
+pub(crate) struct Builder {
+    user: String,
+    password: String,
+    homeserver: String,
+    //TODO: rooms
+    rooms: Vec<String>,
+}
+
+impl Builder {
+    fn new() -> Self {
+        Default::default()
+    }
+
+    pub(crate) async fn connect(&self) -> Result<Chatbot> {
+        let client = Client::new(self.homeserver.as_str())?;
+        client
+            .login(self.user.as_str(), self.password.as_str(), None, None)
+            .await?;
+        assert!(client.logged_in().await);
+        for room in &self.rooms {
+            client.join_room_by_id(&room.as_str().try_into()?).await?;
+        }
+
+        Ok(Chatbot { client })
+    }
+
+    pub(crate) fn login(
+        &mut self,
+        user: &impl AsRef<str>,
+        password: &impl AsRef<str>,
+    ) -> &mut Self {
+        self.user = user.as_ref().into();
+        self.password = password.as_ref().into();
+        self
+    }
+
+    pub(crate) fn homeserver(&mut self, homeserver: &impl AsRef<str>) -> &mut Self {
+        self.homeserver = homeserver.as_ref().into();
+        self
+    }
+
+    pub(crate) fn room(&mut self, room: impl AsRef<str>) -> &mut Self {
+        self.rooms.push(room.as_ref().into());
+        self
+    }
+
+    pub(crate) fn rooms(&mut self, rooms: Vec<String>) -> &mut Self {
+        for room in rooms {
+            self.room(room);
+        }
+        self
+    }
+}
+
+#[derive(Debug, Clone)]
+pub(crate) struct Chatbot {
+    client: Client,
+}
+
+impl Chatbot {
+    pub(crate) fn builder() -> Builder {
+        Builder::new()
+    }
+
+    pub(crate) async fn run(&self) -> Result<()> {
+        self.client.set_event_handler(Box::new(Html::new())).await;
+
+        let mut settings = SyncSettings::default();
+        if let Some(token) = self.client.sync_token().await {
+            settings = settings.token(token);
+        }
+        self.client.sync(settings).await;
+        Ok(())
+    }
+}
crieur-chatbot/src/cli.rs (new file, 40 lines)

@@ -0,0 +1,40 @@
+use std::env;
+
+use anyhow::{bail, Result};
+use dotenv::dotenv;
+
+use crate::Chatbot;
+
+/// Runs the chatbot
+pub async fn run() -> Result<()> {
+    dotenv().ok();
+
+    let (user, password, homeserver, rooms) = match (
+        env::var("CRIEUR_MATRIX_USER"),
+        env::var("CRIEUR_MATRIX_PASSWORD"),
+        env::var("CRIEUR_MATRIX_HOMESERVER"),
+        env::var("CRIEUR_MATRIX_ROOMS"),
+    ) {
+        (Ok(user), Ok(password), Ok(homeserver), Ok(rooms)) => (
+            user,
+            password,
+            homeserver,
+            rooms
+                .split(",")
+                .map(|s| s.to_string())
+                .collect::<Vec<String>>(),
+        ),
+        _ => bail!("Configuration incomplete, please set all required environment variables"),
+    };
+
+    let chatbot = Chatbot::builder()
+        .login(&user, &password)
+        .homeserver(&homeserver)
+        .rooms(rooms)
+        .connect()
+        .await?;
+
+    chatbot.run().await?;
+
+    Ok(())
+}
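For illustration, a hypothetical `.env` matching the four variables read above; only the variable names come from the code, every value is a placeholder:

```
CRIEUR_MATRIX_USER=crieur-bot
CRIEUR_MATRIX_PASSWORD=<bot password>
CRIEUR_MATRIX_HOMESERVER=https://matrix.example.org
CRIEUR_MATRIX_ROOMS=!roomid1:example.org,!roomid2:example.org
```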
crieur-chatbot/src/handlers/html.rs (new file, 122 lines)

@@ -0,0 +1,122 @@
+use std::convert::TryInto;
+
+use log::error;
+use matrix_sdk::{
+    self, async_trait,
+    events::{
+        room::message::{MessageEventContent, MessageType, TextMessageEventContent},
+        AnyMessageEventContent, SyncMessageEvent,
+    },
+    room::Room,
+    EventHandler,
+};
+
+use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};
+
+pub(crate) struct Html {}
+
+impl Html {
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+async fn send_article<U, E>(url: U, room: matrix_sdk::room::Joined)
+where
+    U: TryInto<Url, Error = E> + Send,
+    E: std::error::Error + Sync + Send + 'static,
+{
+    //TODO: replace by async block when async block is stable
+    async fn article_html<U, E>(url: U) -> Result<String>
+    where
+        U: TryInto<Url, Error = E> + Send,
+        E: std::error::Error + Sync + Send + 'static,
+    {
+        let article_str = ArticleLocation::builder()
+            .url(url)?
+            .build()?
+            .retrieve_html()
+            .await?;
+        Ok(article_str)
+    }
+
+    let text_message =
+        |message| AnyMessageEventContent::RoomMessage(MessageEventContent::text_plain(message));
+
+    //TODO: replace occurences ok() by async and logging block when async block is stable
+    let article_html = match article_html(url).await {
+        Ok(url) => url,
+        Err(Error::MalformedUrl) => {
+            room.send(text_message("Error: Given url is malformed"), None)
+                .await
+                .ok();
+            return;
+        }
+        Err(Error::UnknownNewspaper) => {
+            room.send(
+                text_message("Error: Given url is do not correspond to a known newspaper"),
+                None,
+            )
+            .await
+            .ok();
+            return;
+        }
+        Err(Error::Misconfiguration(key)) => {
+            error!(
+                "Error in configuration : {} key is missing or malformed",
+                &key
+            );
+            room.send(
+                text_message("Error: configuration error, please contact your admin"),
+                None,
+            )
+            .await
+            .ok();
+            return;
+        }
+        Err(_) => {
+            room.send(
+                text_message("Unknown error =/, can't download the file"),
+                None,
+            )
+            .await
+            .ok();
+            return;
+        }
+    };
+
+    room.send_attachment(
+        "article.html",
+        &mime::TEXT_HTML_UTF_8,
+        &mut article_html.as_bytes(),
+        None,
+    )
+    .await
+    .ok();
+}
+
+#[async_trait]
+impl EventHandler for Html {
+    async fn on_room_message(&self, room: Room, event: &SyncMessageEvent<MessageEventContent>) {
+        if let Room::Joined(room) = room {
+            let msg_body = if let SyncMessageEvent {
+                content:
+                    MessageEventContent {
+                        msgtype: MessageType::Text(TextMessageEventContent { body: msg_body, .. }),
+                        ..
+                    },
+                ..
+            } = event
+            {
+                msg_body
+            } else {
+                return;
+            };
+
+            match msg_body.split(' ').collect::<Vec<_>>().as_slice() {
+                ["!html", url, ..] => send_article(*url, room).await,
+                _ => return,
+            }
+        }
+    }
+}
crieur-chatbot/src/handlers/mod.rs (new file, 2 lines)

@@ -0,0 +1,2 @@
+mod html;
+pub(crate) use html::Html;
crieur-chatbot/src/lib.rs (new file, 10 lines)

@@ -0,0 +1,10 @@
+//! Provides a matrix chatbot to download newspaper articles
+
+mod cli;
+pub use cli::run;
+
+mod chatbot;
+use chatbot::Chatbot;
+
+mod handlers;
+use handlers::Html;
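The crate only re-exports `cli::run`, so a binary needs very little glue. A minimal sketch of a hypothetical `main.rs` (not part of this changeset; `tokio` and `env_logger` are already workspace dependencies per the Cargo.toml hunk above):

```rust
// Hypothetical main.rs, not included in this diff.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Logging setup is an assumption; run() itself loads .env via dotenv.
    env_logger::init();
    crieur_chatbot::run().await
}
```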
@@ -8,23 +8,22 @@ publish = false
 
 [dependencies]
 anyhow = "1.0.40"
-async-trait = "0.1.48"
+async-trait = "0.1.50"
 thiserror = "1.0.24"
-url = "2.2.1"
-hyper = { version = "0.14.5", features = ["full"] }
+url = "2.2.2"
+hyper = { version = "0.14.7", features = ["full"] }
 hyper-rustls = "0.22.1"
 cookie = "0.15.0"
 lol_html = "0.3.0"
 indoc = "1.0.3"
-html-minifier = "3.0.9"
+html-minifier = "3.0.13"
 bytes = "1.0.1"
 base64 = "0.13.0"
-futures = "0.3.14"
-derive_builder = "0.10.0"
+futures = "0.3.15"
+derive_builder = "0.10.2"
 nipper = "0.1.9"
 log = "0.4.14"
 env_logger = "0.8.3"
-itertools = "0.10.0"
 
 [dev-dependencies]
-tokio = "1.5.0"
+tokio = "1.6.0"
@@ -1,21 +1,87 @@
 use std::boxed::Box;
 use std::convert::TryInto;
+use std::env;
 
-use anyhow::{anyhow, Result};
-use log::info;
 use url::{Host, Url};
 
 use crate::newspaper::Newspaper;
+use crate::newspapers::courrier_international::{self, CourrierInternational};
+use crate::newspapers::mediapart::{self, Mediapart};
+use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
 
-type Newspapers<'a> = Vec<Box<&'a dyn Newspaper>>;
-
-#[derive(Default)]
-pub struct ArticleLocationBuilder<'a> {
-    url: Option<Url>,
-    newspapers: Option<Newspapers<'a>>,
-}
-
-impl<'a> ArticleLocationBuilder<'a> {
+/// Enumerate all errors that can be encountered when using ArticleLocation
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    /// The url was not set. Therefore, the article location can't be deduced
+    #[error("No url set")]
+    NoUrl,
+    /// The given URL isn't an accepted Url
+    #[error("Malformed URL")]
+    MalformedUrl,
+    /// The given url doesn't correspond to a newspaper.
+    #[error("The given url doesn't link to a known newspaper")]
+    UnknownNewspaper,
+    /// Error in configuration : used for missing or malformed configuration
+    #[error("Error in configuration (configuration key {0} malformed or missing)")]
+    Misconfiguration(String),
+    /// Other errors
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+type Newspapers = Vec<Box<dyn Newspaper>>;
+pub type Result<T, E = Error> = core::result::Result<T, E>;
+
+fn default_newpapers() -> Result<Newspapers> {
+    // TODO: same thing is written too much times : how to DRY ?
+    let config_key = "MEDIAPART_COOKIE".to_string();
+    let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
+
+    let mediapart = Mediapart::builder()
+        .login(mediapart::Login::Mpruuid(mpruiid))
+        .build()?;
+
+    let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
+    let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
+    let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
+
+    let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
+    let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
+    let spip_session =
+        env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
+
+    let monde_diplo = MondeDiplo::builder()
+        .login(monde_diplomatique::Login::Cookies {
+            lmd_a_m,
+            phpsessid,
+            spip_session,
+        })
+        .build()?;
+
+    let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
+    let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
+
+    let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
+    let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
+
+    let courrier_international = CourrierInternational::builder()
+        .login(courrier_international::Login::Cookies { lmd_a_m, ssess })
+        .build()?;
+
+    Ok(vec![
+        Box::new(mediapart),
+        Box::new(monde_diplo),
+        Box::new(courrier_international),
+    ])
+}
+
+#[derive(Default)]
+pub struct Builder {
+    url: Option<Url>,
+    newspapers: Option<Newspapers>,
+}
+
+impl Builder {
     pub fn new() -> Self {
         Self::default()
     }
@@ -25,21 +91,20 @@ impl<'a> ArticleLocationBuilder<'a> {
     /// # Errors
     ///
     /// An error is returned if the could not be converted into an url
-    // TODO: move this to a defined error, remove anyhow !
     pub fn url<U, E>(mut self, url: U) -> Result<Self>
     where
         U: TryInto<Url, Error = E> + Send,
         E: std::error::Error + Sync + Send + 'static,
     {
-        let url = url.try_into()?;
+        let url = url.try_into().map_err(|_| Error::MalformedUrl)?;
         self.url = Some(url);
         Ok(self)
     }
 
     /// Adds a newspaper to the list
-    pub fn newspaper<T>(&mut self, newspaper: &'a T) -> &mut Self
+    pub fn newspaper<T>(mut self, newspaper: T) -> Self
     where
-        T: 'a + Newspaper,
+        T: 'static + Newspaper,
     {
         match &mut self.newspapers {
             Some(newspapers) => newspapers.push(Box::new(newspaper)),
@@ -49,18 +114,13 @@ impl<'a> ArticleLocationBuilder<'a> {
     }
 
     /// Adds several newspapers to the list of accepted newspapers
-    //fn newspapers(&mut self, newspapers: Newspapers) -> Result<&mut Self> {
-    //    let newspapers = match &self.newspapers {
-    //        Some(current_newspapers) => newspapers
-    //            .iter()
-    //            .chain(current_newspapers.iter())
-    //            .map(|s| *(s.clone()))
-    //            .collect::<Newspapers>(),
-    //        None => newspapers.into_iter().collect::<Vec<_>>(),
-    //    };
-    //    self.newspapers = Some(newspapers);
-    //    Ok(self)
-    //}
+    pub fn newspapers(mut self, newspapers: Newspapers) -> Self {
+        match &mut self.newspapers {
+            Some(current_newspapers) => current_newspapers.extend(newspapers),
+            None => self.newspapers = Some(newspapers.into_iter().collect::<Vec<_>>()),
+        };
+        self
+    }
 
     /// Builds the ArticleLocation by looking which newspaper
     ///
@@ -71,41 +131,32 @@ impl<'a> ArticleLocationBuilder<'a> {
     /// - no newpspaper is given
     /// - the url is not set
     /// - the given url has no host
-    // TODO: move this to a defined error, remove anyhow !
-    pub fn build(&self) -> Result<ArticleLocation<'a>> {
-        let url = Clone::clone(self.url.as_ref().ok_or(anyhow!(
-            "No url set. You can set it with the url() function"
-        ))?);
-        let host = url.host_str().ok_or(anyhow!("Given url has no host"))?;
-        let host = Host::parse(host)?;
+    pub fn build(self) -> Result<ArticleLocation> {
+        let url = Clone::clone(self.url.as_ref().ok_or(Error::NoUrl)?);
+        let host = url.host_str().ok_or(Error::MalformedUrl)?;
+        let host = Host::parse(host).map_err(|_| Error::MalformedUrl)?;
         let newspaper = self
             .newspapers
-            .as_ref()
-            .ok_or(anyhow!(
-                "A list of NewsPaper must be set. It can be set with newspapers() function"
-            ))?
-            .iter()
+            .unwrap_or(default_newpapers()?)
+            .into_iter()
             .find(|c| c.metadata().hosts.contains(&host))
-            .ok_or(anyhow!("Newspaper couldn't be found"))?;
-        Ok(ArticleLocation {
-            newspaper: newspaper.clone(),
-            url,
-        })
+            .ok_or(Error::UnknownNewspaper)?;
+        Ok(ArticleLocation { newspaper, url })
     }
 }
 
-pub struct ArticleLocation<'a> {
-    newspaper: Box<&'a dyn Newspaper>,
+pub struct ArticleLocation {
+    newspaper: Box<dyn Newspaper>,
     pub url: Url,
 }
 
-impl<'a> ArticleLocation<'a> {
-    pub fn builder() -> ArticleLocationBuilder<'a> {
-        ArticleLocationBuilder::new()
+impl ArticleLocation {
+    pub fn builder() -> Builder {
+        Builder::new()
     }
 
     pub async fn retrieve_html(&self) -> Result<String> {
-        info!("It will download from {}", self.url);
-        self.newspaper.retrieve_html(&self.url).await
+        // TODO: modify when retrieve_html returns a specific Error type
+        Ok(self.newspaper.retrieve_html(&self.url).await?)
    }
 }
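Taken together, the new error type and owned builder give callers a compact flow. A minimal sketch of the resulting API, assuming the `pub` re-exports shown in the next hunk (the surrounding helper function is hypothetical; the call chain mirrors the one in the chatbot handler above):

```rust
use crieur_retrieve::{article_location::Result, ArticleLocation};

// Hypothetical helper; not part of this changeset.
async fn fetch_article(url: &str) -> Result<String> {
    ArticleLocation::builder()
        .url(url)? // Error::MalformedUrl if the string does not parse as a URL
        .build()?  // falls back to default_newpapers() configured via env vars
        .retrieve_html()
        .await
}
```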
@@ -8,10 +8,9 @@ pub use tools::{Download, Downloader};
 pub mod newspaper;
 
 // TODO: move to another crate
-mod newspapers;
-pub use newspapers::Mediapart;
+pub mod newspapers;
 
-mod article_location;
+pub mod article_location;
 pub use article_location::ArticleLocation;
 
 mod consts;
@@ -1,17 +1,10 @@
 use anyhow::Result;
 use async_trait::async_trait;
-use derive_builder::Builder;
 use url::Host;
 pub use url::Url;
 
-enum Login {
-    Username(String, String),
-    Cookie(String),
-}
-
 /// Contains metadata about a newspaper
-// TODO: provide builder
-#[derive(Debug, PartialEq, Default, Builder)]
+#[derive(Debug, PartialEq, Default, derive_builder::Builder)]
 #[builder(default)]
 pub struct Metadata {
     /// The hosts that can be corresponds to this newspaper
@@ -28,13 +21,14 @@ pub struct Metadata {
 }
 
 impl Metadata {
+    /// Get metadata builder
     pub fn builder() -> MetadataBuilder {
         MetadataBuilder::default()
     }
 }
 
 #[async_trait]
-pub trait Newspaper {
+pub trait Newspaper: Send + Sync {
     /// Returns a list of hosts that corresponds to the newspapers
     fn metadata(&self) -> Metadata;
 
@@ -49,7 +43,7 @@ pub trait Newspaper {
     }
 
     /// Returns a newspaper structure
-    async fn new() -> Self
+    fn new() -> Self
     where
         Self: Sized;
 
crieur-retrieve/src/newspapers/courrier_international.rs (new file, 144 lines)

@@ -0,0 +1,144 @@
+use anyhow::{anyhow, bail, Result};
+use async_trait::async_trait;
+use cookie::Cookie;
+use indoc::indoc;
+use url::Host;
+
+use crate::newspaper::{Metadata, Newspaper};
+use crate::tools;
+use crate::Url;
+use crate::{Download, Downloader};
+
+pub enum Login {
+    Username(String, String),
+    Cookies { lmd_a_m: String, ssess: String },
+}
+
+#[derive(Debug, Clone, Default)]
+pub struct CourrierInternational {
+    login_cookies: Vec<(String, String)>,
+}
+
+fn str_to_host<S: Into<String>>(host: S) -> Host {
+    Host::Domain(host.into())
+}
+
+#[derive(Debug, Clone, Default)]
+pub struct Builder {
+    login_cookies: Option<Vec<(String, String)>>,
+}
+
+impl Builder {
+    pub fn login(&mut self, login: Login) -> &mut Self {
+        self.login_cookies = match login {
+            Login::Username(_username, _password) => {
+                unimplemented!("login using username and passwond not implemented")
+            }
+            Login::Cookies { lmd_a_m, ssess } => Some(vec![
+                ("lmd_a_m".into(), lmd_a_m),
+                ("SSESS862c7003d721c672d39f161b1456b890".into(), ssess),
+            ]),
+        };
+        self
+    }
+
+    pub fn build(&self) -> Result<CourrierInternational> {
+        match &self.login_cookies {
+            Some(login_cookies) => Ok(CourrierInternational {
+                login_cookies: login_cookies.clone(),
+            }),
+            None => Err(anyhow!("You have to log in to access this newspaper")),
+        }
+    }
+}
+
+#[async_trait]
+impl Newspaper for CourrierInternational {
+    fn metadata(&self) -> Metadata {
+        Metadata::builder()
+            .hosts(vec![
+                str_to_host("courrierinternational.com"),
+                str_to_host("www.courrierinternational.com"),
+            ])
+            .lower_case_name("courrier-international")
+            .name("Courrier international")
+            .build()
+            .unwrap_or_default()
+    }
+
+    async fn retrieve_html(&self, url: &Url) -> Result<String> {
+        let cookies = self
+            .login_cookies
+            .iter()
+            .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
+            .collect::<Vec<_>>();
+
+        // TODO: replace by builder
+        let downloader = Downloader { cookies };
+
+        let body = downloader.download(&url).await?;
+        let html = match body {
+            Some(body) => String::from_utf8(body.to_vec())?,
+            None => bail!("404 not found"),
+        };
+
+        let elements_to_remove = &[
+            // navigation elements
+            "header.site-header",
+            "footer.site-footer",
+            // Social buttons
+            "#toolbox-share",
+            ".toolbox-share",
+            ".toolbox-print",
+            ".toolbox-respond",
+            ".toolbox-zen",
+            ".toolbox-newsletter",
+            ".toolbox-offer",
+            ".box-article-offer-friend-abo",
+            // unused services
+            ".article-aside",
+            ".article-secondary",
+            ".article-subject-readmore",
+            // misc
+            ".element-invisible",
+            ".gptcontainer",
+        ];
+
+        // FIXME: it doesn't work because the aside is in the article body
+        //
+        let toolbox_style = indoc! {"
+            aside.article-toolbox {
+                position: sticky;
+                top: 1em;
+            }
+        "};
+
+        let single_page_html = tools::self_contained_html::Config {
+            downloader: Some(&downloader),
+            base_url: Some(&url),
+            elements_to_remove,
+            styles_to_add: &[toolbox_style],
+            ..Default::default()
+        }
+        .run(&html)
+        .await;
+        Ok(single_page_html)
+    }
+
+    fn new() -> Self {
+        Self {
+            ..Default::default()
+        }
+    }
+
+    async fn has_complete_access(&self) -> bool {
+        // TODO: check if we are logged using the cookie
+        true
+    }
+}
+
+impl CourrierInternational {
+    pub fn builder() -> Builder {
+        Builder::default()
+    }
+}
@@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{anyhow, bail, Result};
 use async_trait::async_trait;
 use cookie::Cookie;
 use url::Host;
@@ -8,16 +8,46 @@ use crate::tools;
 use crate::Url;
 use crate::{Download, Downloader};
 
+pub enum Login {
+    Username(String, String),
+    Mpruuid(String),
+}
+
 #[derive(Debug, Clone, Default)]
 pub struct Mediapart {
-    // TODO: remove this pub !!
-    pub login_cookie: Option<(String, String)>,
+    login_cookie: (String, String),
 }
 
 fn str_to_host<S: Into<String>>(host: S) -> Host {
     Host::Domain(host.into())
 }
 
+#[derive(Debug, Clone, Default)]
+pub struct Builder {
+    login_cookie: Option<(String, String)>,
+}
+
+impl Builder {
+    pub fn login(&mut self, login: Login) -> &mut Self {
+        self.login_cookie = match login {
+            Login::Username(_username, _password) => {
+                unimplemented!("login using username and passwond not implemented")
+            }
+            Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
+        };
+        self
+    }
+
+    pub fn build(&self) -> Result<Mediapart> {
+        match &self.login_cookie {
+            Some(login_cookie) => Ok(Mediapart {
+                login_cookie: login_cookie.clone(),
+            }),
+            None => Err(anyhow!("You have to log in to access this newspaper")),
+        }
+    }
+}
+
 #[async_trait]
 impl Newspaper for Mediapart {
     fn metadata(&self) -> Metadata {
@@ -41,22 +71,22 @@ impl Newspaper for Mediapart {
         let mut url = url.clone();
         url.set_query(Some(&query));
 
-        // TODO: add "?onglet=full" to the url if not
-        let cookies = if let Some((name, value)) = &self.login_cookie {
-            let cookie = Cookie::build(name, value).secure(true).finish();
-            vec![cookie]
-        } else {
-            vec![]
-        };
+        let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
+            .secure(true)
+            .finish();
+        let cookies = vec![cookie];
 
         // TODO: replace by builder
         let downloader = Downloader { cookies };
 
         let body = downloader.download(&url).await?;
-        let html = String::from_utf8(body.to_vec())?;
+        let html = match body {
+            Some(body) => String::from_utf8(body.to_vec())?,
+            None => bail!("404 not found"),
+        };
 
         // TODO: Move to const
-        let element_to_remove = [
+        let elements_to_remove = &[
             // header
             ".fb-root",
             ".skipLinks",
@@ -74,13 +104,18 @@ impl Newspaper for Mediapart {
             "aside.cc-modal",
         ];
 
-        // TODO: correction of usage of relative urls, and replace "" by the url
-        let single_page_html =
-            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
+        let single_page_html = tools::self_contained_html::Config {
+            downloader: Some(&downloader),
+            base_url: Some(&url),
+            elements_to_remove,
+            ..Default::default()
+        }
+        .run(&html)
+        .await;
         Ok(single_page_html)
     }
 
-    async fn new() -> Self {
+    fn new() -> Self {
         Self {
             ..Default::default()
         }
@@ -91,3 +126,9 @@ impl Newspaper for Mediapart {
         true
     }
 }
+
+impl Mediapart {
+    pub fn builder() -> Builder {
+        Builder::default()
+    }
+}
@@ -1,3 +1,3 @@
-mod mediapart;
-
-pub use mediapart::Mediapart;
+pub mod courrier_international;
+pub mod mediapart;
+pub mod monde_diplomatique;
crieur-retrieve/src/newspapers/monde_diplomatique.rs (new file, 137 lines)

@@ -0,0 +1,137 @@
+use anyhow::{anyhow, bail, Result};
+use async_trait::async_trait;
+use cookie::Cookie;
+use url::Host;
+
+use crate::newspaper::{Metadata, Newspaper};
+use crate::tools;
+use crate::Url;
+use crate::{Download, Downloader};
+
+pub enum Login {
+    Username(String, String),
+    Cookies {
+        lmd_a_m: String,
+        phpsessid: String,
+        spip_session: String,
+    },
+}
+
+#[derive(Debug, Clone, Default)]
+pub struct MondeDiplo {
+    login_cookies: Vec<(String, String)>,
+}
+
+fn str_to_host<S: Into<String>>(host: S) -> Host {
+    Host::Domain(host.into())
+}
+
+#[derive(Debug, Clone, Default)]
+pub struct Builder {
+    login_cookies: Option<Vec<(String, String)>>,
+}
+
+impl Builder {
+    pub fn login(&mut self, login: Login) -> &mut Self {
+        self.login_cookies = match login {
+            Login::Username(_username, _password) => {
+                unimplemented!("login using username and passwond not implemented")
+            }
+            Login::Cookies {
+                lmd_a_m,
+                phpsessid,
+                spip_session,
+            } => Some(vec![
+                ("lmd_a_m".into(), lmd_a_m),
+                ("PHPSESSID".into(), phpsessid),
+                ("spip_session".into(), spip_session),
+            ]),
+        };
+        self
+    }
+
+    pub fn build(&self) -> Result<MondeDiplo> {
+        match &self.login_cookies {
+            Some(login_cookies) => Ok(MondeDiplo {
+                login_cookies: login_cookies.clone(),
+            }),
+            None => Err(anyhow!("You have to log in to access this newspaper")),
+        }
+    }
+}
+
+#[async_trait]
+impl Newspaper for MondeDiplo {
+    fn metadata(&self) -> Metadata {
+        Metadata::builder()
+            .hosts(vec![
+                str_to_host("monde-diplomatique.fr"),
+                str_to_host("www.monde-diplomatique.fr"),
+            ])
+            .lower_case_name("monde-diplomatique")
+            .name("Le Monde Diplomatique")
+            .build()
+            .unwrap_or_default()
+    }
+
+    async fn retrieve_html(&self, url: &Url) -> Result<String> {
+        let cookies = self
+            .login_cookies
+            .iter()
+            .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
+            .collect::<Vec<_>>();
+
+        // TODO: replace by builder
+        let downloader = Downloader { cookies };
+
+        let body = downloader.download(&url).await?;
+        let html = match body {
+            Some(body) => String::from_utf8(body.to_vec())?,
+            None => bail!("404 not found"),
+        };
+
+        // TODO: Move to const
+        let elements_to_remove = &[
+            // navigation elements
+            "#tout-en-haut.preentete",
+            "#entete.connecte",
+            "#navigation",
+            "#pied",
+            ".bloc-connexion",
+            // unused features
+            "#ecouter",
+            // Social buttons
+            ".actions-article",
+            "#partage",
+            // misc
+            "noscript",
+        ];
+
+        let single_page_html = tools::self_contained_html::Config {
+            downloader: Some(&downloader),
+            base_url: Some(&url),
+            elements_to_remove,
+            ..Default::default()
+        }
+        .run(&html)
+        .await;
+        Ok(single_page_html)
+    }
+
+    fn new() -> Self {
+        Self {
+            ..Default::default()
+        }
+    }
+
+    async fn has_complete_access(&self) -> bool {
+        // TODO: check if we are logged using the cookie
+        true
+    }
+}
+
+impl MondeDiplo {
+    pub fn builder() -> Builder {
+        Builder::default()
+    }
+}
@@ -4,7 +4,7 @@ use anyhow::Result;
 use async_trait::async_trait;
 use bytes::Bytes;
 use cookie::Cookie;
-use hyper::{header, Body, Client, Method, Request};
+use hyper::{header, Body, Client, Method, Request, StatusCode};
 use thiserror::Error;
 use url::Url;
 
@@ -22,7 +22,9 @@ pub trait Download {
     type Error: StdError;
 
     /// Downloads a file from an url and returns the result as bytes
-    async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>;
+    ///
+    /// If the file is not found, returns None
+    async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>;
 }
 
 /// Store several cookies
@@ -36,7 +38,8 @@ pub struct Downloader<'c> {
 impl<'c> Download for Downloader<'c> {
     type Error = DownloadError;
 
-    async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> {
+    async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
+        log::debug!("downloading url {:?}", file_link);
         let https = hyper_rustls::HttpsConnector::with_native_roots();
         let client: Client<_, hyper::Body> = Client::builder().build(https);
 
@@ -44,14 +47,26 @@ impl<'c> Download for Downloader<'c> {
             .method(Method::GET)
             .uri(file_link.as_str());
 
-        for cookie in &self.cookies {
-            req = req.header(header::COOKIE, cookie.to_string());
-        }
+        req = req.header(
+            header::COOKIE,
+            self.cookies
+                .iter()
+                .map(Cookie::to_string)
+                .collect::<Vec<_>>()
+                .join(";"),
+        );
+        log::debug!("headers : {:?}", req.headers_ref());
 
         let req = req.body(Body::empty())?;
 
         let resp = client.request(req).await?;
-        let body = hyper::body::to_bytes(resp).await?;
+        log::debug!("Response status : {:?}", resp.status());
+        let body = match resp.status() {
+            StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
+            StatusCode::NOT_FOUND => None,
+            // TODO: enhance this by handling more error codes
+            _ => None,
+        };
         Ok(body)
     }
 }
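Since `download` now yields `Option<Bytes>`, callers can distinguish a missing page from a transport error. A sketch of the resulting calling convention (the helper is hypothetical; the match mirrors the one used in the newspaper implementations above):

```rust
use anyhow::{bail, Result};
use crieur_retrieve::{Download, Downloader, Url};

// Hypothetical helper; not part of this changeset.
async fn fetch_page(downloader: &Downloader<'_>, url: &Url) -> Result<String> {
    match downloader.download(url).await? {
        // 200 OK: decode the body as UTF-8
        Some(body) => Ok(String::from_utf8(body.to_vec())?),
        // 404 (and, for now, other non-200 statuses) arrive as None
        None => bail!("404 not found"),
    }
}
```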
@@ -1,5 +1,4 @@
 mod download;
-mod self_contained_html;
+pub mod self_contained_html;
 
 pub use download::{Download, DownloadError, Downloader};
-pub use self_contained_html::self_contained_html;
@ -8,142 +8,198 @@ use url::Url;
|
|||||||
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
|
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
|
||||||
use crate::Download;
|
use crate::Download;
|
||||||
|
|
||||||
/// Makes an html page self-contained
|
/// Stores configuration for the self_contained_html function
|
||||||
///
|
// TODO: write a builder
|
||||||
/// The `downloader` must implement `Download` and is used to download ressources that are
|
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
|
||||||
/// needed to make this page self-contained such as stylesheets or images.
|
where
|
||||||
///
|
E: std::error::Error,
|
||||||
/// The function also removes all scripts on the page
|
D: Download<Error = E> + Send,
|
||||||
pub async fn self_contained_html<E, D>(
|
S1: AsRef<str>,
|
||||||
html: impl AsRef<str>,
|
S2: AsRef<str>,
|
||||||
downloader: &D,
|
{
|
||||||
base_url: &Url,
|
/// the downloader that will be used to retrieve ressources on the page
|
||||||
elements_to_remove: &[impl AsRef<str>],
|
pub downloader: Option<&'t D>,
|
||||||
) -> String
|
/// Base url for downloading ressources, it probably the
|
||||||
|
pub base_url: Option<&'t Url>,
|
||||||
|
pub elements_to_remove: &'t [S1],
|
||||||
|
pub styles_to_add: &'t [S2],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, E, D> Default for Config<'t, E, D>
|
||||||
where
|
where
|
||||||
E: std::error::Error,
|
E: std::error::Error,
|
||||||
D: Download<Error = E> + Send,
|
D: Download<Error = E> + Send,
|
||||||
{
|
{
|
||||||
// TODO: split/refactor this function :
|
fn default() -> Self {
|
||||||
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
|
Self {
|
||||||
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
|
downloader: None,
|
||||||
// - ¿ should be function of a trait ? or only of the configuration struct ?
|
base_url: None,
|
||||||
let (style_urls, html) = {
|
elements_to_remove: &[],
|
||||||
let document = Document::from(html.as_ref());
|
styles_to_add: &[],
|
||||||
|
|
||||||
// ---- Remove scripts ----
|
|
||||||
//
|
|
||||||
document.select("script").remove();
|
|
||||||
|
|
||||||
for event in EVENT_HANDLERS {
|
|
||||||
document
|
|
||||||
.select(format!("[{}]", event).as_str())
|
|
||||||
.remove_attr(event);
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for rel in LINK_REL_EXTERNAL_RESOURCES {
|
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
|
||||||
document
|
where
|
||||||
.select(format!("link[rel=\"{}\"]", rel).as_str())
|
E: std::error::Error,
|
||||||
.remove();
|
D: Download<Error = E> + Send,
|
||||||
}
|
S1: AsRef<str>,
|
||||||
|
S2: AsRef<str>,
|
||||||
|
{
|
||||||
|
/// Makes an html page self-contained
|
||||||
|
///
|
||||||
|
/// The `downloader` must implement `Download` and is used to download ressources that are
|
||||||
|
/// needed to make this page self-contained such as stylesheets or images.
|
||||||
|
///
|
||||||
|
/// The function also removes all scripts on the page
|
||||||
|
pub async fn run(&self, html: impl AsRef<str>) -> String {
|
||||||
|
//TODO: don't panic
|
||||||
|
let base_url = self.base_url.expect("Base url not defined");
|
||||||
|
let downloader = self.downloader.expect("Downloader not defined");
|
||||||
|
// TODO: split/refactor this function :
|
||||||
|
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
|
||||||
|
// - put each modification (ex: style in the `foreach`) in functions, maybe using
|
||||||
|
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
|
||||||
|
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
|
||||||
|
let (style_urls, html) = {
|
||||||
|
let document = Document::from(html.as_ref());
|
||||||
|
|
||||||
// ---- Replace stylesheets ----
|
// ---- Remove scripts ----
|
||||||
|
//
|
||||||
|
document.select("script").remove();
|
||||||
|
|
||||||
|
for event in EVENT_HANDLERS {
|
||||||
|
document
|
||||||
|
.select(format!("[{}]", event).as_str())
|
||||||
|
.remove_attr(event);
|
||||||
|
}
|
||||||
|
|
||||||
|
for rel in LINK_REL_EXTERNAL_RESOURCES {
|
||||||
|
document
|
||||||
|
.select(format!("link[rel=\"{}\"]", rel).as_str())
|
||||||
|
.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Replace stylesheets ----
|
||||||
|
//
|
||||||
|
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
|
||||||
|
let styles_url = stylesheets
|
||||||
|
.iter()
|
||||||
|
.map(|style_link| {
|
||||||
|
if let Some(src) = style_link.attr("href") {
|
||||||
|
base_url.join(src.as_ref()).ok()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
(styles_url, String::from(document.html()))
|
||||||
|
};
|
||||||
|
|
||||||
|
let style_urls = style_urls.into_iter().map(|style_url| {
|
||||||
|
OptionFuture::from(
|
||||||
|
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
|
||||||
|
)
|
||||||
|
});
|
||||||
|
let downloaded_styles = futures::future::join_all(style_urls).await;
|
||||||
|
|
||||||
|
let html = {
|
||||||
|
let document = Document::from(&html);
|
||||||
|
let styles = document.select("link[href][rel=\"stylesheet\"]");
|
||||||
|
|
||||||
|
styles
|
||||||
|
.iter()
|
||||||
|
.zip(downloaded_styles.iter())
|
||||||
|
.for_each(|(mut style_link, inner_css)| {
|
||||||
|
if let Some(Some(inner_css)) = inner_css {
|
||||||
|
let css = String::from_utf8(inner_css.to_vec()).unwrap();
|
||||||
|
let media_query = style_link.attr("media");
|
||||||
|
let css = match media_query {
|
||||||
|
Some(media_query) => {
|
||||||
|
format!("<style media=\"{}\">{}</style>", media_query, css)
|
||||||
|
}
|
||||||
|
None => format!("<style>{}</style>", css),
|
||||||
|
};
|
||||||
|
style_link.replace_with_html(css);
|
||||||
|
} else {
|
||||||
|
style_link.remove();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
String::from(document.html())
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Replace imgs ----
|
||||||
//
|
//
|
||||||
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
|
let image_urls = {
|
||||||
```diff
-        let styles_url = stylesheets
-            .iter()
-            .map(|stylesheet| {
-                if let Some(src) = stylesheet.attr("href") {
-                    //TODO: does it work with absolute urls ?
-                    base_url.join(src.as_ref()).ok()
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>();
-        (styles_url, String::from(document.html()))
-    };
+        let document = Document::from(&html);
+        let imgs = document.select("img:not([src^=\"data:\"])");
 
-    let style_urls = style_urls.into_iter().map(|style_url| {
-        OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
-    });
-    let downloaded_styles = futures::future::join_all(style_urls).await;
+        imgs.iter()
+            .map(|image| {
+                if let Some(src) = image.attr("src") {
+                    base_url.join(src.as_ref()).ok()
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>()
+    };
 
-    let html = {
-        let document = Document::from(&html);
-        let styles = document.select("link[href][rel=\"stylesheet\"]");
-
-        styles
-            .iter()
-            .zip(downloaded_styles.iter())
-            .for_each(|(mut stylesheet, inner_css)| {
-                if let Some(inner_css) = inner_css {
-                    let css = String::from_utf8(inner_css.to_vec()).unwrap();
-                    let css = format!("<style>{}</style>", css);
-                    stylesheet.replace_with_html(css);
-                } else {
-                    stylesheet.remove();
-                }
-            });
-        String::from(document.html())
-    };
-
-    // ---- Replace imgs ----
-    //
-    let image_urls = {
-        let document = Document::from(&html);
-        let imgs = document.select("img");
-
-        imgs.iter()
-            .map(|image| {
-                if let Some(src) = image.attr("src") {
-                    base_url.join(src.as_ref()).ok()
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>()
-    };
-
     let downloaded_images = image_urls.into_iter().map(|image_url| {
         OptionFuture::from(image_url.map(|url| async move {
             let data = downloader.download(&url).await.unwrap();
             (url, data)
         }))
     });
     let downloaded_images = futures::future::join_all(downloaded_images).await;
 
     let html = {
         let document = Document::from(&html);
-        let imgs = document.select("img");
+        let imgs = document.select("img:not([src^=\"data:\"])");
 
         imgs.iter()
             .zip(downloaded_images.iter())
             .for_each(|(mut img, data)| {
-                if let Some((url, data)) = data {
+                if let Some((url, Some(data))) = data {
                     let data = base64::encode(data);
-                    let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
+                    //TODO: use an extension hashmap
+                    let extension =
+                        Path::new(url.path()).extension().unwrap().to_str().unwrap();
                     img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
+                } else {
+                    img.remove()
                 }
             });
         // ---- Remove unwanted html elements -----
         //
-        for element in elements_to_remove {
+        for element in self.elements_to_remove {
             document.select(element.as_ref()).remove();
         }
+
+        // ---- Add additional styles ----
+        //
+        for style in self.styles_to_add {
+            document
+                .select("head")
+                .append_html(format!("\n<style>{}</style>\n", style.as_ref()));
+        }
+
         String::from(document.html())
     };
 
     // ---- output ----
     //
     let mut minifier = HTMLMinifier::new();
     minifier.digest(html.as_str()).unwrap();
 
     String::from_utf8(minifier.get_html().into()).unwrap()
 }
 
 #[cfg(test)]
 mod tests {
+    // TODO: reduce boilerplate, DRY
 
     use super::*;
 
```
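Taken together with the test updates below, this diff replaces the old free-function call `self_contained_html(html, &downloader, &base_url, to_remove)` with a `Config` struct whose fields are optional or defaultable. A minimal sketch of the new call site, using only field names visible in this diff (the surrounding `downloader`, `base_url`, and `html` bindings are set up exactly as in the tests):

```rust
// Sketch based on the tests in this diff: `downloader` implements the
// crate's `Download` trait, `base_url` resolves relative links, and all
// remaining fields fall back to their `Default` values.
let new_html = Config {
    downloader: Some(&downloader),
    base_url: Some(&base_url),
    // CSS selectors to strip, plus extra CSS appended to <head>:
    elements_to_remove: &["header", ".placeholder"],
    styles_to_add: &["body { margin: 3em; }"],
    ..Default::default()
}
.run(html)
.await;
```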
```diff
@@ -168,8 +224,8 @@ mod tests {
     #[async_trait]
     impl Download for DummyDownloader {
         type Error = errors::Error;
-        async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
-            Ok(Bytes::from(""))
+        async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
+            Ok(Some(Bytes::from("")))
         }
     }
 
```
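The `Download` trait's return type changes here from `errors::Result<Bytes>` to `errors::Result<Option<Bytes>>`, so a downloader can report a missing resource as `Ok(None)` instead of an error; the main diff above then removes the corresponding `<img>` element. A hypothetical sketch of an implementation using that distinction (the type name `FlakyDownloader` is illustrative, not from the repository):

```rust
// Hypothetical sketch: with the new signature a downloader can
// distinguish "fetched" (Ok(Some(..))) from "absent" (Ok(None)).
struct FlakyDownloader;

#[async_trait]
impl Download for FlakyDownloader {
    type Error = errors::Error;
    async fn download(&self, file_link: &Url) -> errors::Result<Option<Bytes>> {
        if file_link.path().ends_with(".png") {
            Ok(Some(Bytes::from_static(b"\x89PNG...")))
        } else {
            // Resource unavailable: the caller drops the element
            // instead of failing the whole page assembly.
            Ok(None)
        }
    }
}
```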
```diff
@@ -178,9 +234,14 @@ mod tests {
         let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
         let base_url = Url::parse("http://example.com")?;
         let downloader = DummyDownloader {};
-        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url, to_remove).await,
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                ..Default::default()
+            }
+            .run(html)
+            .await,
             "<html><head></head><body></body></html>"
         );
         Ok(())
```
```diff
@@ -204,10 +265,13 @@ mod tests {
         };
 
         let base_url = Url::parse("http://example.com")?;
-        let to_remove: &[&str] = &[];
         for s in EVENT_HANDLERS {
             assert_eq!(
-                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
+                Config {
+                    downloader: Some(&downloader),
+                    base_url: Some(&base_url),
+                    ..Default::default()
+                }.run(html(s)).await,
                 "<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
             );
         }
```
```diff
@@ -232,10 +296,15 @@ mod tests {
         };
 
         let base_url = Url::parse("http://example.com")?;
-        let to_remove: &[&str] = &[];
         for s in LINK_REL_EXTERNAL_RESOURCES {
             assert_eq!(
-                self_contained_html(html(s), &downloader, &base_url, to_remove).await,
+                Config {
+                    downloader: Some(&downloader),
+                    base_url: Some(&base_url),
+                    ..Default::default()
+                }
+                .run(html(s))
+                .await,
                 "<html><head>\n</head>\n<body>\n</body></html>"
             );
         }
```
```diff
@@ -246,12 +315,14 @@ mod tests {
     #[async_trait]
     impl Download for CssDownloader {
         type Error = errors::Error;
-        async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
-            Ok(indoc! {"
-                section#warning {
-                    color: red;
-                }"}
-            .into())
+        async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
+            Ok(Some(
+                indoc! {"
+                    section#warning {
+                        color: red;
+                    }"}
+                .into(),
+            ))
         }
     }
 
```
```diff
@@ -286,9 +357,57 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         let base_url = Url::parse("http://example.com")?;
-        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url, to_remove).await,
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                ..Default::default()
+            }
+            .run(html)
+            .await,
+            minified
+        );
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn download_css_with_media_query() -> Result<()> {
+        let downloader = CssDownloader {};
+
+        let html = indoc! {"
+            <html>
+                <head>
+                    <link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
+                </head>
+                <body>
+                </body>
+            </html>
+        "};
+
+        let wanted_html = indoc! {"
+            <html><head>
+            <style media=\"print\">
+            section#warning {
+                color: red;
+            }
+            </style>
+            </head>
+            <body>
+            </body></html>
+        "};
+        let mut minifier = HTMLMinifier::new();
+        minifier.digest(wanted_html)?;
+        let minified = String::from_utf8(minifier.get_html().into())?;
+
+        let base_url = Url::parse("http://example.com")?;
+        assert_eq!(
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                ..Default::default()
+            }
+            .run(html)
+            .await,
             minified
         );
         Ok(())
```
```diff
@@ -298,12 +417,12 @@ mod tests {
     #[async_trait]
     impl Download for PngDownloader {
         type Error = errors::Error;
-        async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
+        async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
             let image_path = Path::new("test_data/home.png");
             let mut image_file = File::open(&image_path).unwrap();
             let mut image_buf: Vec<u8> = vec![];
             image_file.read_to_end(&mut image_buf).unwrap();
-            Ok(image_buf.into())
+            Ok(Some(image_buf.into()))
         }
     }
 
```
```diff
@@ -333,9 +452,14 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         let base_url = Url::parse("http://example.com")?;
-        let to_remove: &[&str] = &[];
         assert_eq!(
-            self_contained_html(html, &downloader, &base_url, to_remove).await,
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                ..Default::default()
+            }
+            .run(html)
+            .await,
             minified
         );
         Ok(())
```
```diff
@@ -368,12 +492,67 @@ mod tests {
         let minified = String::from_utf8(minifier.get_html().into())?;
 
         assert_eq!(
-            self_contained_html(
-                html,
-                &downloader,
-                &base_url,
-                &["header", ".placeholder", "article > span.huge"]
-            )
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                elements_to_remove: &["header", ".placeholder", "article > span.huge"],
+                ..Default::default()
+            }
+            .run(html)
+            .await,
+            minified
+        );
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn add_style() -> Result<()> {
+        let html = indoc! {"
+            <html>
+                <head>
+                    <meta charset=\"UTF-8\">
+                </head>
+                <body>
+                The body
+                </body>
+            </html>
+        "};
+
+        let wanted_html = indoc! {"
+            <html><head>
+            <meta charset=\"UTF-8\">
+            <style>
+            body {
+                margin: 3em;
+            }
+            </style>
+            </head>
+            <body>
+            The body
+            </body></html>
+        "};
+
+        let style_to_add = indoc! {"
+            body {
+                margin: 3em;
+            }
+        "};
+
+        let base_url = Url::parse("http://example.com")?;
+        let downloader = DummyDownloader {};
+
+        let mut minifier = HTMLMinifier::new();
+        minifier.digest(wanted_html)?;
+        let minified = String::from_utf8(minifier.get_html().into())?;
+
+        assert_eq!(
+            Config {
+                downloader: Some(&downloader),
+                base_url: Some(&base_url),
+                styles_to_add: &[style_to_add],
+                ..Default::default()
+            }
+            .run(html)
             .await,
             minified
         );
```

````diff
@@ -1,48 +1,98 @@
-This project mainly aims at providing an unified interface for several newspapers. Side
-objectives are to provide web API and different clients like a webUI or chatbots.
+---
+title: Scope of the project
+---
+
+This project mainly aims at providing a unified interface for several
+newspapers. Side objectives are to provide a web API and different clients
+like a web UI or chatbots.
 
-Several big components are planned for this project
+Several big components are planned for this project (it is an initial draft
+and may change later):
 
-```dot
-digraph G {
-    rankdir=TB
-    node [shape=rectangle, style=filled, color="#779988"]
-
-    subgraph cluster_frontend {
-        color = transparent
-        webui
-        chatbot
-    }
-
-    webui -> api [color = red]
-    chatbot -> api [color = red]
-
-    subgraph cluster_backend {
-        label = "Backend\ncrieur binary"
-        labelloc = b
-        style=filled
-
-        retrieve_tools [label="retrieve-tools"]
-        retrieve_adapters [label="retrieve-adapters"]
-        retrieve [label="retrieve-interface"]
-        auto_retrieve [label="automatic-retrieve"]
-        article_repr [label="article-representation\nRepresentation for articles"]
-        api
-        cache [label="Cache database"]
-        rss [label="Atom/RSS adapters"]
-
-        retrieve_tools -> retrieve_adapters
-        retrieve_adapters -> retrieve
-        retrieve_tools -> retrieve
-        rss -> auto_retrieve
-        article_repr -> retrieve_adapters
-
-        retrieve -> api
-        auto_retrieve -> api
-        cache -> api
-    }
-}
-```
+```plantuml
+@startuml
+
+frame "backend" {
+    [Retrieval tools] as retrieval_tools
+    [Article representation] as article_repr
+    [Automatic retrieval] as auto_retrieve
+    [Atom/RSS adapters] as rss
+    [Cache DB] as cache
+
+    [Newspaper\n(Mediapart, …)] as newspaper
+    () "Newspaper" as np_i
+    newspaper -up- np_i
+
+    [Article location] as article_location
+
+    [API] as api
+    () "API" as api_i
+    api -up- api_i
+
+    article_location ..> np_i
+
+    api -> article_location
+    api -> rss
+
+    newspaper -> retrieval_tools: uses to implement
+
+    article_location --> article_repr: uses
+    retrieval_tools -up-> article_repr: uses
+
+    auto_retrieve --> rss: watches
+    auto_retrieve --> article_location
+    auto_retrieve --> cache: stores in
+}
+
+frame "Web ui" {
+    [Web UI] as webui
+    [HTML renderer] as html_rend
+    [Pdf exporter] as pdf_rend
+    [Articles] as articles
+    webui --> html_rend
+    webui --> pdf_rend
+    webui -> articles
+    articles ..> api_i
+}
+
+[Chatbot] as chatbot
+
+chatbot ..> api_i
+
+actor User
+User ..> webui
+User ..> chatbot
+
+actor "Newspaper programmer" as newspaper_programmer
+newspaper_programmer ..> newspaper: implements
+@enduml
+```
+
+A task queue could be added later to space requests.
+
+# Implementation plan
+
+## Phase I
+- [x] `Newspaper` interface: used to retrieve articles from newspaper websites
+- [ ] minimal chatbot (uses the libraries directly)
+- [x] `ArticleLocation`: library for using several `Newspaper`s and retrieving
+  from a given url
+
+## Phase II
+- [ ] article representation: a (beta) unified representation for downloaded
+  articles
+- [ ] adding this representation to `Newspaper`
+
+## Phase III
+- [ ] cache
+- [ ] Atom/RSS adapters
+- [ ] automatic retrieval
+
+## Phase IV
+- [ ] API
+- [ ] chatbot (uses the API)
+
+## Phase V
+- [ ] web UI
````

documentation/design/tooling.md (new file, 36 lines):

````markdown
---
title: Project tooling
---

# Container image

## Chatbot release

The [chatbot containerfile](../../containers/chatbot.containerfile) is
intended to be as small as possible, to ease distribution and reduce the
storage needed in registries.

In order to provide a minimal image, the rust-alpine container image is used.
This image uses the `x86_64-unknown-linux-musl` target, which provides static
linking with `musl`.

However, the `olm-sys` crate couldn't be linked statically[^olm-sys-static-error].
The workaround has been to introduce the
`RUSTFLAGS=-Ctarget-feature=-crt-static` environment variable, which disables
static linking.

The following lines have been added to copy the needed shared libraries:

```containerfile
COPY --from=build /usr/lib/libstdc++.so.6 /usr/lib/libstdc++.so.6
COPY --from=build /usr/lib/libgcc_s.so.1 /usr/lib/libgcc_s.so.1
COPY --from=build /lib/ld-musl-x86_64.so.1 /lib/ld-musl-x86_64.so.1
```

## Development

An image aimed at providing a development environment for developers may be
added later.

[^olm-sys-static-error]: with `olm-sys` v1.1.1, in March 2021
````
documentation/guides/run_chatbot.md (new file, 27 lines):

````markdown
---
title: Build and run the chatbot
---

1. You must first configure the Matrix login; every variable in
   [the reference](../reference/chatbot_configuration.md) is mandatory.

    ```env
    CRIEUR_MATRIX_USER=user
    CRIEUR_MATRIX_PASSWORD=password
    CRIEUR_MATRIX_HOMESERVER=https://homeserv.er
    CRIEUR_MATRIX_ROOMS=roomid1,roomid2,
    ```

    You can put these in a `.env` file.

2. Run the chatbot.

    **Using `podman` (or another container tool, like `docker`)**

    ```
    podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
    podman run --env-file .env --rm -i -t crieur-chatbot
    ```

    **Using `cargo` (for development)**

    ```
    cargo run --release --bin crieur-chatbot
    ```
````
documentation/reference/chatbot_configuration.md (new file, 17 lines):

```markdown
---
title: Chatbot configuration reference
---

The chatbot is configured using environment variables:

CRIEUR_MATRIX_USER
: username of the matrix bot account

CRIEUR_MATRIX_PASSWORD
: password of the matrix bot account

CRIEUR_MATRIX_HOMESERVER
: homeserver of the matrix bot account

CRIEUR_MATRIX_ROOMS
: rooms in which to listen to events
```
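A sketch of how these variables might be read at startup; the helper below is hypothetical (not part of the repository) and only illustrates that all four variables are mandatory and that `CRIEUR_MATRIX_ROOMS` is a comma-separated list:

```rust
use std::env;

/// Hypothetical helper: collects the mandatory chatbot settings,
/// returning an error if any variable is missing.
fn matrix_settings() -> Result<(String, String, String, Vec<String>), env::VarError> {
    let user = env::var("CRIEUR_MATRIX_USER")?;
    let password = env::var("CRIEUR_MATRIX_PASSWORD")?;
    let homeserver = env::var("CRIEUR_MATRIX_HOMESERVER")?;
    // Trailing commas are allowed ("roomid1,roomid2,"), so drop empty items.
    let rooms = env::var("CRIEUR_MATRIX_ROOMS")?
        .split(',')
        .filter(|room| !room.is_empty())
        .map(str::to_owned)
        .collect();
    Ok((user, password, homeserver, rooms))
}
```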
documentation/reference/newspaper_configuration.md (new file, 31 lines):

```markdown
---
title: Newspapers configuration
---

The newspapers are configured using environment variables:

# Mediapart

MEDIAPART_COOKIE
: sets the `MPRUUID` cookie, used to log in

# Le Monde Diplomatique

All cookies are mandatory to log in.

MONDE_DIPLO_LMD_A_M
: sets the `lmd_a_m` cookie

MONDE_DIPLO_PHPSESSID
: sets the `PHPSESSID` cookie

MONDE_DIPLO_SPIP_SESSION
: sets the `spip_session` cookie

# Courrier international

COURRIER_INTERNATIONAL_LMD_A_M
: sets the `lmd_a_m` cookie

COURRIER_INTERNATIONAL_SSESS
: sets the `ssess` cookie
```
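As an illustration of how such a cookie is consumed, the old `src/main.rs` (deleted at the end of this compare) wired `MEDIAPART_COOKIE` into the `Mediapart` newspaper. A minimal sketch modeled on that code (the `Mediapart` API may have changed on this branch):

```rust
use std::env;

use anyhow::Result;
use crieur_retrieve::Mediapart;

// Sketch modeled on the deleted src/main.rs shown below: the MPRUUID
// login cookie is taken from the MEDIAPART_COOKIE environment variable.
async fn mediapart_from_env() -> Result<Mediapart> {
    let mut mediapart = Mediapart::new().await;
    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
    Ok(mediapart)
}
```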
```diff
@@ -2,34 +2,27 @@ use std::convert::TryInto;
 use std::env;
 
 use anyhow::Result;
-use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
+use crieur_retrieve::{ArticleLocation, Url};
 use dotenv::dotenv;
 use log::info;
 
 #[tokio::main]
 async fn main() -> Result<()> {
     dotenv().ok();
-    env_logger::init();
+    tracing_subscriber::fmt()
+        .with_writer(std::io::stderr)
+        .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
+        .init();
 
     let url = match env::args().nth(1) {
         Some(url) => Url::parse(&url)?,
         None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
     };
 
-    // TODO: remove this in favor of default newspapers
-    let mut mediapart = Mediapart::new().await
-    //.login(USERNAME, PASSWORD)
-    //
-    ;
-
-    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
     info!("Trying to download article from {}", url);
 
     // TODO: shorten this, maybe an helper function ?
-    let article_location = ArticleLocation::builder()
-        .url(url)?
-        .newspaper(&mediapart)
-        .build()?;
+    let article_location = ArticleLocation::builder().url(url)?.build()?;
 
     let article_str = article_location.retrieve_html().await?;
 
```
justfile:

```diff
@@ -1,16 +1,29 @@
 @build:
     cargo build
 
+@build-container:
+    podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
+
 @clean:
     cargo clean
 
 @run:
     cargo run
 
+@test:
+    cargo test --all
+
+@clippy:
+    cargo clippy
+
+@fmt:
+    cargo fmt
+
+@simulate-ci: fmt clippy test
+
 @audit:
     cargo audit
 
 @crev:
     cargo crev verify
 
-@verify: audit crev
```
src/bin/crieur-chatbot.rs (new file, 11 lines):

```rust
use anyhow::Result;
use crieur_chatbot::run;
use dotenv::dotenv;

#[tokio::main]
async fn main() -> Result<()> {
    env_logger::init();
    dotenv().ok();
    run().await?;
    Ok(())
}
```
src/main.rs (deleted, 19 lines):

```diff
@@ -1,19 +0,0 @@
-use anyhow::Result;
-use crieur_retrieve::{newspaper::Newspaper, Mediapart, Url};
-use dotenv::dotenv;
-use std::env;
-
-#[tokio::main]
-async fn main() -> Result<()> {
-    dotenv().ok();
-
-    let mut mediapart = Mediapart::new().await
-    //.login(USERNAME, PASSWORD)
-    //
-    ;
-
-    mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?));
-    let url = Url::parse("https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long")?;
-    println!("{}", mediapart.retrieve_html(&url).await?);
-    Ok(())
-}
```