crieur/examples/cli_downloader.rs
koalp 756b1592b7
feat: allows to remove elements of html pages
A feature to remove elements of HTML pages based on CSS selectors has
been added.

The removal of link elements that load external JS has been added.
2021-04-24 03:45:13 +02:00

40 lines
1.1 KiB
Rust

use std::convert::TryInto;
use std::env;
use anyhow::Result;
use crieur_retrieve::{newspaper::Newspaper, ArticleLocation, Mediapart, Url};
use dotenv::dotenv;
use log::info;
#[tokio::main]
async fn main() -> Result<()> {
dotenv().ok();
env_logger::init();
let url = match env::args().nth(1) {
Some(url) => Url::parse(&url)?,
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
};
// TODO: remove this in favor of default newspapers
let mut mediapart = Mediapart::new().await
//.login(USERNAME, PASSWORD)
//
;
mediapart.login_cookie = Some(("MPRUUID".into(), env::var("MEDIAPART_COOKIE")?.into()));
info!("Trying to download article from {}", url);
// TODO: shorten this, maybe an helper function ?
let article_location = ArticleLocation::builder()
.url(url)?
.newspaper(&mediapart)
.build()?;
let article_str = article_location.retrieve_html().await?;
println!("{}", article_str);
Ok(())
}