add several newspapers #36

Merged
koalp merged 6 commits from feature/additional_newspapers into development 2021-05-22 04:50:43 +02:00
12 changed files with 375 additions and 88 deletions
Showing only changes of commit 970f510cd1 - Show all commits

209
Cargo.lock generated
View File

@ -74,13 +74,22 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "0.7.15" version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi",
]
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.40" version = "1.0.40"
@ -213,6 +222,18 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
dependencies = [
"libc",
"num-integer",
"num-traits",
"winapi",
]
[[package]] [[package]]
name = "cipher" name = "cipher"
version = "0.2.5" version = "0.2.5"
@ -285,10 +306,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
[[package]] [[package]]
name = "cpuid-bool" name = "cpufeatures"
version = "0.1.2" version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "cpuid-bool" name = "cpuid-bool"
@ -307,6 +331,7 @@ dependencies = [
"env_logger", "env_logger",
"log", "log",
"tokio", "tokio",
"tracing-subscriber",
] ]
[[package]] [[package]]
@ -821,9 +846,9 @@ dependencies = [
[[package]] [[package]]
name = "h2" name = "h2"
version = "0.3.2" version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc018e188373e2777d0ef2467ebff62a08e66c3f5857b23c8fbec3018210dc00" checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726"
dependencies = [ dependencies = [
"bytes", "bytes",
"fnv", "fnv",
@ -877,9 +902,9 @@ dependencies = [
[[package]] [[package]]
name = "html-minifier" name = "html-minifier"
version = "3.0.10" version = "3.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "108452631307790510cde91282fc706ae70076bd68200add8638773f06d5e122" checksum = "9876ed2cad2fd2f78bad42688bcac8a87cebce9b2381c5b05efc30b0c8429af0"
dependencies = [ dependencies = [
"cow-utils", "cow-utils",
"educe", "educe",
@ -914,9 +939,9 @@ dependencies = [
[[package]] [[package]]
name = "http-body" name = "http-body"
version = "0.4.1" version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737" checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9"
dependencies = [ dependencies = [
"bytes", "bytes",
"http", "http",
@ -1055,9 +1080,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.50" version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c" checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062"
dependencies = [ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
@ -1091,9 +1116,9 @@ checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.3" version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176" checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb"
dependencies = [ dependencies = [
"scopeguard", "scopeguard",
] ]
@ -1158,6 +1183,15 @@ dependencies = [
"tendril", "tendril",
] ]
[[package]]
name = "matchers"
version = "0.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
dependencies = [
"regex-automata",
]
[[package]] [[package]]
name = "matches" name = "matches"
version = "0.1.8" version = "0.1.8"
@ -1254,9 +1288,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.3.4" version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
[[package]] [[package]]
name = "mime" name = "mime"
@ -1585,7 +1619,7 @@ version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd" checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd"
dependencies = [ dependencies = [
"cpuid-bool 0.2.0", "cpuid-bool",
"opaque-debug", "opaque-debug",
"universal-hash", "universal-hash",
] ]
@ -1856,18 +1890,18 @@ dependencies = [
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.2.7" version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2" checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc"
dependencies = [ dependencies = [
"bitflags", "bitflags",
] ]
[[package]] [[package]]
name = "regex" name = "regex"
version = "1.4.6" version = "1.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
@ -1875,10 +1909,20 @@ dependencies = [
] ]
[[package]] [[package]]
name = "regex-syntax" name = "regex-automata"
version = "0.6.23" version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
dependencies = [
"byteorder",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
[[package]] [[package]]
name = "reqwest" name = "reqwest"
@ -2319,17 +2363,26 @@ checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d"
[[package]] [[package]]
name = "sha2" name = "sha2"
version = "0.9.3" version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de" checksum = "d8f6b75b17576b792bef0db1bcc4b8b8bcdf9506744cf34b974195487af6cff2"
dependencies = [ dependencies = [
"block-buffer", "block-buffer",
"cfg-if 1.0.0", "cfg-if 1.0.0",
"cpuid-bool 0.1.2", "cpufeatures",
"digest", "digest",
"opaque-debug", "opaque-debug",
] ]
[[package]]
name = "sharded-slab"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79c719719ee05df97490f80a45acfc99e5a30ce98a1e4fb67aee422745ae14e3"
dependencies = [
"lazy_static",
]
[[package]] [[package]]
name = "signal-hook-registry" name = "signal-hook-registry"
version = "1.3.0" version = "1.3.0"
@ -2491,9 +2544,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2"
[[package]] [[package]]
name = "syn" name = "syn"
version = "1.0.71" version = "1.0.72"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373" checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -2558,6 +2611,15 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "thread_local"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd"
dependencies = [
"once_cell",
]
[[package]] [[package]]
name = "time" name = "time"
version = "0.2.26" version = "0.2.26"
@ -2684,9 +2746,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
[[package]] [[package]]
name = "tracing" name = "tracing"
version = "0.1.25" version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f" checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
dependencies = [ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"pin-project-lite", "pin-project-lite",
@ -2707,9 +2769,9 @@ dependencies = [
[[package]] [[package]]
name = "tracing-core" name = "tracing-core"
version = "0.1.17" version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
dependencies = [ dependencies = [
"lazy_static", "lazy_static",
] ]
@ -2724,6 +2786,49 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "tracing-log"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
dependencies = [
"lazy_static",
"log",
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa5553bf0883ba7c9cbe493b085c29926bd41b66afc31ff72cf17ff4fb60dcd5"
dependencies = [
"ansi_term",
"chrono",
"lazy_static",
"matchers",
"regex",
"serde",
"serde_json",
"sharded-slab",
"smallvec 1.6.1",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]] [[package]]
name = "try-lock" name = "try-lock"
version = "0.2.3" version = "0.2.3"
@ -2756,9 +2861,9 @@ dependencies = [
[[package]] [[package]]
name = "unicode-xid" name = "unicode-xid"
version = "0.2.1" version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]] [[package]]
name = "unindent" name = "unindent"
@ -2784,9 +2889,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]] [[package]]
name = "url" name = "url"
version = "2.2.1" version = "2.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c"
dependencies = [ dependencies = [
"form_urlencoded", "form_urlencoded",
"idna", "idna",
@ -2846,9 +2951,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]] [[package]]
name = "wasm-bindgen" name = "wasm-bindgen"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9" checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd"
dependencies = [ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"serde", "serde",
@ -2858,9 +2963,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-backend" name = "wasm-bindgen-backend"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae" checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900"
dependencies = [ dependencies = [
"bumpalo", "bumpalo",
"lazy_static", "lazy_static",
@ -2873,9 +2978,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-futures" name = "wasm-bindgen-futures"
version = "0.4.23" version = "0.4.24"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea" checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1"
dependencies = [ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"js-sys", "js-sys",
@ -2885,9 +2990,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-macro" name = "wasm-bindgen-macro"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f" checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4"
dependencies = [ dependencies = [
"quote", "quote",
"wasm-bindgen-macro-support", "wasm-bindgen-macro-support",
@ -2895,9 +3000,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-macro-support" name = "wasm-bindgen-macro-support"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c" checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -2908,15 +3013,15 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-shared" name = "wasm-bindgen-shared"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489" checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f"
[[package]] [[package]]
name = "web-sys" name = "web-sys"
version = "0.3.50" version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be" checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582"
dependencies = [ dependencies = [
"js-sys", "js-sys",
"wasm-bindgen", "wasm-bindgen",

View File

@ -23,3 +23,4 @@ dotenv = "0.15.0"
env_logger = "0.8.3" env_logger = "0.8.3"
log = "0.4.14" log = "0.4.14"
tokio = { version = "1.5.0", features = ["full"] } tokio = { version = "1.5.0", features = ["full"] }
tracing-subscriber = "0.2.18"

View File

@ -1,4 +1,5 @@
Tools to retrieve articles from multiple newspaper you subscribed to. Tools to retrieve articles from multiple newspaper you subscribed to, all from
the same place.
**This is a prototype, it isn't stable at all and you may not want to use it if **This is a prototype, it isn't stable at all and you may not want to use it if
you expect it to just work !** you expect it to just work !**

View File

@ -8,19 +8,19 @@ publish = false
[dependencies] [dependencies]
anyhow = "1.0.40" anyhow = "1.0.40"
async-trait = "0.1.48" async-trait = "0.1.50"
thiserror = "1.0.24" thiserror = "1.0.24"
url = "2.2.1" url = "2.2.2"
hyper = { version = "0.14.5", features = ["full"] } hyper = { version = "0.14.7", features = ["full"] }
hyper-rustls = "0.22.1" hyper-rustls = "0.22.1"
cookie = "0.15.0" cookie = "0.15.0"
lol_html = "0.3.0" lol_html = "0.3.0"
indoc = "1.0.3" indoc = "1.0.3"
html-minifier = "3.0.9" html-minifier = "3.0.11"
bytes = "1.0.1" bytes = "1.0.1"
base64 = "0.13.0" base64 = "0.13.0"
futures = "0.3.14" futures = "0.3.14"
derive_builder = "0.10.0" derive_builder = "0.10.2"
nipper = "0.1.9" nipper = "0.1.9"
log = "0.4.14" log = "0.4.14"
env_logger = "0.8.3" env_logger = "0.8.3"

View File

@ -2,12 +2,12 @@ use std::boxed::Box;
use std::convert::TryInto; use std::convert::TryInto;
use std::env; use std::env;
use anyhow::anyhow;
use log::info; use log::info;
use url::{Host, Url}; use url::{Host, Url};
use crate::newspaper::Newspaper; use crate::newspaper::Newspaper;
use crate::newspapers::mediapart::{self, Mediapart}; use crate::newspapers::mediapart::{self, Mediapart};
use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
/// Enumerate all errors that can be encountered when using ArticleLocation /// Enumerate all errors that can be encountered when using ArticleLocation
#[derive(thiserror::Error, Debug)] #[derive(thiserror::Error, Debug)]
@ -33,6 +33,7 @@ type Newspapers = Vec<Box<dyn Newspaper>>;
pub type Result<T, E = Error> = core::result::Result<T, E>; pub type Result<T, E = Error> = core::result::Result<T, E>;
fn default_newpapers() -> Result<Newspapers> { fn default_newpapers() -> Result<Newspapers> {
// TODO: the same thing is written too many times: how to DRY?
let config_key = "MEDIAPART_COOKIE".to_string(); let config_key = "MEDIAPART_COOKIE".to_string();
let mpruiid = env::var(&config_key) let mpruiid = env::var(&config_key)
.map_err(|_| Error::Misconfiguration(config_key))? .map_err(|_| Error::Misconfiguration(config_key))?
@ -42,7 +43,29 @@ fn default_newpapers() -> Result<Newspapers> {
.login(mediapart::Login::MPRUUID(mpruiid)) .login(mediapart::Login::MPRUUID(mpruiid))
.build()?; .build()?;
Ok(vec![Box::new(mediapart)]) let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
let lmd_a_m = env::var(&lmd_a_m)
.map_err(|_| Error::Misconfiguration(lmd_a_m))?
.into();
let phpsessid = env::var(&phpsessid)
.map_err(|_| Error::Misconfiguration(phpsessid))?
.into();
let spip_session = env::var(&spip_session)
.map_err(|_| Error::Misconfiguration(spip_session))?
.into();
let monde_diplo = MondeDiplo::builder()
.login(monde_diplomatique::Login::Cookies {
lmd_a_m,
phpsessid,
spip_session,
})
.build()?;
Ok(vec![Box::new(mediapart), Box::new(monde_diplo)])
} }
#[derive(Default)] #[derive(Default)]

View File

@ -1,4 +1,4 @@
use anyhow::{anyhow, Result}; use anyhow::{anyhow, bail, Result};
use async_trait::async_trait; use async_trait::async_trait;
use cookie::Cookie; use cookie::Cookie;
use url::Host; use url::Host;
@ -80,7 +80,10 @@ impl Newspaper for Mediapart {
let downloader = Downloader { cookies }; let downloader = Downloader { cookies };
let body = downloader.download(&url).await?; let body = downloader.download(&url).await?;
let html = String::from_utf8(body.to_vec())?; let html = match body {
Some(body) => String::from_utf8(body.to_vec())?,
None => bail!("404 not found"),
};
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let element_to_remove = [

View File

@ -1 +1,2 @@
pub mod mediapart; pub mod mediapart;
pub mod monde_diplomatique;

View File

@ -0,0 +1,135 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Credentials that can be handed to [`Builder::login`].
pub enum Login {
    /// A username followed by a password.
    ///
    /// This variant is accepted by the type system but is not supported by
    /// [`Builder::login`] yet.
    Username(String, String),
    /// The values of the cookies used to log in.
    Cookies {
        /// Value of the `lmd_a_m` cookie.
        lmd_a_m: String,
        /// Value of the `PHPSESSID` cookie.
        phpsessid: String,
        /// Value of the `spip_session` cookie.
        spip_session: String,
    },
}
/// Newspaper implementation for "Le Monde Diplomatique".
///
/// The default value holds no login cookie at all.
#[derive(Debug, Clone, Default)]
pub struct MondeDiplo {
    /// `(name, value)` pairs, each turned into a cookie when an article is
    /// retrieved.
    login_cookies: Vec<(String, String)>,
}
/// Wraps anything convertible into a `String` in a [`Host::Domain`].
fn str_to_host<S>(host: S) -> Host
where
    S: Into<String>,
{
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Step-by-step constructor for [`MondeDiplo`].
///
/// A freshly created builder carries no credentials.
#[derive(Debug, Clone, Default)]
pub struct Builder {
    /// `(name, value)` cookie pairs; `None` until credentials were provided.
    login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
    /// Registers the credentials used to access the newspaper.
    ///
    /// With [`Login::Cookies`], the three values are stored under the cookie
    /// names `lmd_a_m`, `PHPSESSID` and `spip_session`. Calling this method
    /// again replaces the previously stored credentials.
    ///
    /// Returns the builder itself so that calls can be chained.
    ///
    /// # Panics
    ///
    /// Panics when given [`Login::Username`]: logging in with a username and
    /// a password is not implemented.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookies = match login {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and password not implemented")
            }
            Login::Cookies {
                lmd_a_m,
                phpsessid,
                spip_session,
            } => Some(vec![
                ("lmd_a_m".into(), lmd_a_m),
                ("PHPSESSID".into(), phpsessid),
                ("spip_session".into(), spip_session),
            ]),
        };
        self
    }

    /// Builds a [`MondeDiplo`] from the credentials registered so far.
    ///
    /// The builder is left untouched: the stored cookies are cloned into the
    /// returned value.
    ///
    /// # Errors
    ///
    /// Returns an error when no credentials were registered with
    /// [`Builder::login`].
    pub fn build(&self) -> Result<MondeDiplo> {
        self.login_cookies
            .clone()
            .map(|login_cookies| MondeDiplo { login_cookies })
            .ok_or_else(|| anyhow!("You have to log in to access this newspaper"))
    }
}
#[async_trait]
impl Newspaper for MondeDiplo {
    /// Describes the newspaper: the hosts it is served from
    /// (`monde-diplomatique.fr` and `www.monde-diplomatique.fr`) and its names.
    ///
    /// Falls back to the default `Metadata` if the metadata builder fails.
    fn metadata(&self) -> Metadata {
        Metadata::builder()
            .hosts(vec![
                str_to_host("monde-diplomatique.fr"),
                str_to_host("www.monde-diplomatique.fr"),
            ])
            .lower_case_name("monde-diplomatique")
            .name("Le Monde Diplomatique")
            .build()
            .unwrap_or_default()
    }

    /// Downloads the page at `url` with the stored login cookies and hands it
    /// to `tools::self_contained_html`, along with the selectors of the
    /// elements to remove.
    ///
    /// # Errors
    ///
    /// Returns an error when the download fails, when the downloader yields
    /// no body (reported as "404 not found"), or when the body is not valid
    /// UTF-8.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        let cookies = self
            .login_cookies
            .iter()
            .map(|(name, value)| Cookie::build(name, value).finish())
            .collect::<Vec<_>>();

        // TODO: replace by builder
        let downloader = Downloader { cookies };

        let html = match downloader.download(url).await? {
            Some(body) => String::from_utf8(body.to_vec())?,
            None => bail!("404 not found"),
        };

        // TODO: Move to const
        let element_to_remove = [
            // navigation elements
            "#tout-en-haut.preentete",
            "#entete.connecte",
            "#navigation",
            "#pied",
            ".bloc-connexion",
            // unused features
            "#ecouter",
            // Social buttons
            ".actions-article",
            "#partage",
            // misc
            "noscript",
        ];

        let single_page_html =
            tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
        Ok(single_page_html)
    }

    /// Creates an instance without any login cookie.
    fn new() -> Self {
        Self::default()
    }

    /// Currently always returns `true`: no check is performed.
    async fn has_complete_access(&self) -> bool {
        // TODO: check if we are logged using the cookie
        true
    }
}
impl MondeDiplo {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -4,7 +4,7 @@ use anyhow::Result;
use async_trait::async_trait; use async_trait::async_trait;
use bytes::Bytes; use bytes::Bytes;
use cookie::Cookie; use cookie::Cookie;
use hyper::{header, Body, Client, Method, Request}; use hyper::{header, Body, Client, Method, Request, StatusCode};
use thiserror::Error; use thiserror::Error;
use url::Url; use url::Url;
@ -22,7 +22,9 @@ pub trait Download {
type Error: StdError; type Error: StdError;
/// Downloads a file from an url and returns the result as bytes /// Downloads a file from an url and returns the result as bytes
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>; ///
/// If the file is not found, returns None
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>;
} }
/// Store several cookies /// Store several cookies
@ -36,7 +38,8 @@ pub struct Downloader<'c> {
impl<'c> Download for Downloader<'c> { impl<'c> Download for Downloader<'c> {
type Error = DownloadError; type Error = DownloadError;
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> { async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
log::info!("downloading url {:?}", file_link);
let https = hyper_rustls::HttpsConnector::with_native_roots(); let https = hyper_rustls::HttpsConnector::with_native_roots();
let client: Client<_, hyper::Body> = Client::builder().build(https); let client: Client<_, hyper::Body> = Client::builder().build(https);
@ -44,14 +47,25 @@ impl<'c> Download for Downloader<'c> {
.method(Method::GET) .method(Method::GET)
.uri(file_link.as_str()); .uri(file_link.as_str());
for cookie in &self.cookies { req = req.header(
req = req.header(header::COOKIE, cookie.to_string()); header::COOKIE,
} self.cookies
.iter()
.map(Cookie::to_string)
.collect::<Vec<_>>()
.join(";"),
);
log::info!("headers : {:?}", req.headers_ref());
let req = req.body(Body::empty())?; let req = req.body(Body::empty())?;
let resp = client.request(req).await?; let resp = client.request(req).await?;
let body = hyper::body::to_bytes(resp).await?; let body = match resp.status() {
StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
StatusCode::NOT_FOUND => None,
// TODO: enhance this by handling more error codes
_ => None,
};
Ok(body) Ok(body)
} }
} }

View File

@ -77,7 +77,7 @@ where
.iter() .iter()
.zip(downloaded_styles.iter()) .zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| { .for_each(|(mut stylesheet, inner_css)| {
if let Some(inner_css) = inner_css { if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap(); let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css); let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css); stylesheet.replace_with_html(css);
@ -120,10 +120,12 @@ where
imgs.iter() imgs.iter()
.zip(downloaded_images.iter()) .zip(downloaded_images.iter())
.for_each(|(mut img, data)| { .for_each(|(mut img, data)| {
if let Some((url, data)) = data { if let Some((url, Some(data))) = data {
let data = base64::encode(data); let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else {
img.remove()
} }
}); });
// ---- Remove unwanted html elements ----- // ---- Remove unwanted html elements -----

View File

@ -8,3 +8,16 @@ The newspapers are configured using environment variables
MEDIAPART_COOKIE MEDIAPART_COOKIE
: sets the `MPRUUID` cookie, used to log in : sets the `MPRUUID` cookie, used to log in
# Le Monde Diplomatique
All cookies are mandatory to log in
MONDE_DIPLO_LMD_A_M
: sets the `lmd_a_m` cookie
MONDE_DIPLO_PHPSESSID
: sets the `PHPSESSID` cookie
MONDE_DIPLO_SPIP_SESSION
: sets the `spip_session` cookie

View File

@ -2,38 +2,27 @@ use std::convert::TryInto;
use std::env; use std::env;
use anyhow::Result; use anyhow::Result;
use crieur_retrieve::{ use crieur_retrieve::{ArticleLocation, Url};
newspaper::Newspaper,
newspapers::mediapart::{self, Mediapart},
ArticleLocation, Url,
};
use dotenv::dotenv; use dotenv::dotenv;
use log::info; use log::info;
#[tokio::main] #[tokio::main]
async fn main() -> Result<()> { async fn main() -> Result<()> {
dotenv().ok(); dotenv().ok();
env_logger::init(); tracing_subscriber::fmt()
.with_writer(std::io::stderr)
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let url = match env::args().nth(1) { let url = match env::args().nth(1) {
Some(url) => Url::parse(&url)?, Some(url) => Url::parse(&url)?,
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?, None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
}; };
// TODO: remove this in favor of default newspapers
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid))
.build()?;
info!("Trying to download article from {}", url); info!("Trying to download article from {}", url);
// TODO: shorten this, maybe an helper function ? // TODO: shorten this, maybe an helper function ?
let article_location = ArticleLocation::builder() let article_location = ArticleLocation::builder().url(url)?.build()?;
.url(url)?
.newspaper(mediapart)
.build()?;
let article_str = article_location.retrieve_html().await?; let article_str = article_location.retrieve_html().await?;