A feature to remove elements of HTML pages based on CSS selectors has been added. The removal of link elements that load external JS has also been added.
40 lines
1.1 KiB
Rust
40 lines
1.1 KiB
Rust
use std::convert::TryInto;
|
|
use std::env;
|
|
|
|
use anyhow::Result;
|
|
use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
|
|
use dotenv::dotenv;
|
|
use log::info;
|
|
|
|
#[tokio::main]
|
|
async fn main() -> Result<()> {
|
|
dotenv().ok();
|
|
env_logger::init();
|
|
|
|
let url = match env::args().nth(1) {
|
|
Some(url) => Url::parse(&url)?,
|
|
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
|
|
};
|
|
|
|
// TODO: remove this in favor of default newspapers
|
|
let mut mediapart = Mediapart::new().await
|
|
//.login(USERNAME, PASSWORD)
|
|
//
|
|
;
|
|
|
|
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
|
|
info!("Trying to download article from {}", url);
|
|
|
|
// TODO: shorten this, maybe an helper function ?
|
|
let article_location = ArticleLocation::builder()
|
|
.url(url)?
|
|
.newspaper(&mediapart)
|
|
.build()?;
|
|
|
|
let article_str = article_location.retrieve_html().await?;
|
|
|
|
println!("{}", article_str);
|
|
|
|
Ok(())
|
|
}
|