feat: add retrieval from le monde diplomatique
Add retrieval from Le Monde diplomatique. Previously, 404 pages were injected into the document when downloading styles. Now, the downloader returns None when documents are not found.
This commit is contained in:
parent
8afd74995b
commit
970f510cd1
209
Cargo.lock
generated
209
Cargo.lock
generated
@ -74,13 +74,22 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "0.7.15"
|
||||
version = "0.7.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5"
|
||||
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ansi_term"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.40"
|
||||
@ -213,6 +222,18 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cipher"
|
||||
version = "0.2.5"
|
||||
@ -285,10 +306,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
|
||||
|
||||
[[package]]
|
||||
name = "cpuid-bool"
|
||||
version = "0.1.2"
|
||||
name = "cpufeatures"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634"
|
||||
checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cpuid-bool"
|
||||
@ -307,6 +331,7 @@ dependencies = [
|
||||
"env_logger",
|
||||
"log",
|
||||
"tokio",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -821,9 +846,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.3.2"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc018e188373e2777d0ef2467ebff62a08e66c3f5857b23c8fbec3018210dc00"
|
||||
checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
@ -877,9 +902,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "html-minifier"
|
||||
version = "3.0.10"
|
||||
version = "3.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "108452631307790510cde91282fc706ae70076bd68200add8638773f06d5e122"
|
||||
checksum = "9876ed2cad2fd2f78bad42688bcac8a87cebce9b2381c5b05efc30b0c8429af0"
|
||||
dependencies = [
|
||||
"cow-utils",
|
||||
"educe",
|
||||
@ -914,9 +939,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "http-body"
|
||||
version = "0.4.1"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737"
|
||||
checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http",
|
||||
@ -1055,9 +1080,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.50"
|
||||
version = "0.3.51"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c"
|
||||
checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062"
|
||||
dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
@ -1091,9 +1116,9 @@ checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.3"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176"
|
||||
checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb"
|
||||
dependencies = [
|
||||
"scopeguard",
|
||||
]
|
||||
@ -1158,6 +1183,15 @@ dependencies = [
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
|
||||
dependencies = [
|
||||
"regex-automata",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matches"
|
||||
version = "0.1.8"
|
||||
@ -1254,9 +1288,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.3.4"
|
||||
version = "2.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
|
||||
checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
|
||||
|
||||
[[package]]
|
||||
name = "mime"
|
||||
@ -1585,7 +1619,7 @@ version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd"
|
||||
dependencies = [
|
||||
"cpuid-bool 0.2.0",
|
||||
"cpuid-bool",
|
||||
"opaque-debug",
|
||||
"universal-hash",
|
||||
]
|
||||
@ -1856,18 +1890,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.7"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2"
|
||||
checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.4.6"
|
||||
version = "1.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759"
|
||||
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
@ -1875,10 +1909,20 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.23"
|
||||
name = "regex-automata"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
|
||||
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
@ -2319,17 +2363,26 @@ checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d"
|
||||
|
||||
[[package]]
|
||||
name = "sha2"
|
||||
version = "0.9.3"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de"
|
||||
checksum = "d8f6b75b17576b792bef0db1bcc4b8b8bcdf9506744cf34b974195487af6cff2"
|
||||
dependencies = [
|
||||
"block-buffer",
|
||||
"cfg-if 1.0.0",
|
||||
"cpuid-bool 0.1.2",
|
||||
"cpufeatures",
|
||||
"digest",
|
||||
"opaque-debug",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79c719719ee05df97490f80a45acfc99e5a30ce98a1e4fb67aee422745ae14e3"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.3.0"
|
||||
@ -2491,9 +2544,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.71"
|
||||
version = "1.0.72"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373"
|
||||
checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -2558,6 +2611,15 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.2.26"
|
||||
@ -2684,9 +2746,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.25"
|
||||
version = "0.1.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f"
|
||||
checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"pin-project-lite",
|
||||
@ -2707,9 +2769,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.17"
|
||||
version = "0.1.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f"
|
||||
checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
@ -2724,6 +2786,49 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-log"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-serde"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.2.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa5553bf0883ba7c9cbe493b085c29926bd41b66afc31ff72cf17ff4fb60dcd5"
|
||||
dependencies = [
|
||||
"ansi_term",
|
||||
"chrono",
|
||||
"lazy_static",
|
||||
"matchers",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sharded-slab",
|
||||
"smallvec 1.6.1",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "try-lock"
|
||||
version = "0.2.3"
|
||||
@ -2756,9 +2861,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
|
||||
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
|
||||
|
||||
[[package]]
|
||||
name = "unindent"
|
||||
@ -2784,9 +2889,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.2.1"
|
||||
version = "2.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b"
|
||||
checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"idna",
|
||||
@ -2846,9 +2951,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9"
|
||||
checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"serde",
|
||||
@ -2858,9 +2963,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae"
|
||||
checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"lazy_static",
|
||||
@ -2873,9 +2978,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-futures"
|
||||
version = "0.4.23"
|
||||
version = "0.4.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea"
|
||||
checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"js-sys",
|
||||
@ -2885,9 +2990,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f"
|
||||
checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
@ -2895,9 +3000,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c"
|
||||
checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -2908,15 +3013,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489"
|
||||
checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f"
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.50"
|
||||
version = "0.3.51"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be"
|
||||
checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
|
@ -23,3 +23,4 @@ dotenv = "0.15.0"
|
||||
env_logger = "0.8.3"
|
||||
log = "0.4.14"
|
||||
tokio = { version = "1.5.0", features = ["full"] }
|
||||
tracing-subscriber = "0.2.18"
|
||||
|
@ -1,4 +1,5 @@
|
||||
Tools to retrieve articles from multiple newspaper you subscribed to.
|
||||
Tools to retrieve articles from multiple newspapers you subscribed to, all from
|
||||
the same place.
|
||||
|
||||
**This is a prototype, it isn't stable at all and you may not want to use it if
|
||||
you expect it to just work !**
|
||||
|
@ -8,19 +8,19 @@ publish = false
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.40"
|
||||
async-trait = "0.1.48"
|
||||
async-trait = "0.1.50"
|
||||
thiserror = "1.0.24"
|
||||
url = "2.2.1"
|
||||
hyper = { version = "0.14.5", features = ["full"] }
|
||||
url = "2.2.2"
|
||||
hyper = { version = "0.14.7", features = ["full"] }
|
||||
hyper-rustls = "0.22.1"
|
||||
cookie = "0.15.0"
|
||||
lol_html = "0.3.0"
|
||||
indoc = "1.0.3"
|
||||
html-minifier = "3.0.9"
|
||||
html-minifier = "3.0.11"
|
||||
bytes = "1.0.1"
|
||||
base64 = "0.13.0"
|
||||
futures = "0.3.14"
|
||||
derive_builder = "0.10.0"
|
||||
derive_builder = "0.10.2"
|
||||
nipper = "0.1.9"
|
||||
log = "0.4.14"
|
||||
env_logger = "0.8.3"
|
||||
|
@ -2,12 +2,12 @@ use std::boxed::Box;
|
||||
use std::convert::TryInto;
|
||||
use std::env;
|
||||
|
||||
use anyhow::anyhow;
|
||||
use log::info;
|
||||
use url::{Host, Url};
|
||||
|
||||
use crate::newspaper::Newspaper;
|
||||
use crate::newspapers::mediapart::{self, Mediapart};
|
||||
use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
|
||||
|
||||
/// Enumerate all errors that can be encountered when using ArticleLocation
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
@ -33,6 +33,7 @@ type Newspapers = Vec<Box<dyn Newspaper>>;
|
||||
pub type Result<T, E = Error> = core::result::Result<T, E>;
|
||||
|
||||
fn default_newpapers() -> Result<Newspapers> {
|
||||
// TODO: the same pattern is repeated too many times: how to DRY it?
|
||||
let config_key = "MEDIAPART_COOKIE".to_string();
|
||||
let mpruiid = env::var(&config_key)
|
||||
.map_err(|_| Error::Misconfiguration(config_key))?
|
||||
@ -42,7 +43,29 @@ fn default_newpapers() -> Result<Newspapers> {
|
||||
.login(mediapart::Login::MPRUUID(mpruiid))
|
||||
.build()?;
|
||||
|
||||
Ok(vec![Box::new(mediapart)])
|
||||
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
|
||||
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
|
||||
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
|
||||
|
||||
let lmd_a_m = env::var(&lmd_a_m)
|
||||
.map_err(|_| Error::Misconfiguration(lmd_a_m))?
|
||||
.into();
|
||||
let phpsessid = env::var(&phpsessid)
|
||||
.map_err(|_| Error::Misconfiguration(phpsessid))?
|
||||
.into();
|
||||
let spip_session = env::var(&spip_session)
|
||||
.map_err(|_| Error::Misconfiguration(spip_session))?
|
||||
.into();
|
||||
|
||||
let monde_diplo = MondeDiplo::builder()
|
||||
.login(monde_diplomatique::Login::Cookies {
|
||||
lmd_a_m,
|
||||
phpsessid,
|
||||
spip_session,
|
||||
})
|
||||
.build()?;
|
||||
|
||||
Ok(vec![Box::new(mediapart), Box::new(monde_diplo)])
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
|
@ -1,4 +1,4 @@
|
||||
use anyhow::{anyhow, Result};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use async_trait::async_trait;
|
||||
use cookie::Cookie;
|
||||
use url::Host;
|
||||
@ -80,7 +80,10 @@ impl Newspaper for Mediapart {
|
||||
let downloader = Downloader { cookies };
|
||||
|
||||
let body = downloader.download(&url).await?;
|
||||
let html = String::from_utf8(body.to_vec())?;
|
||||
let html = match body {
|
||||
Some(body) => String::from_utf8(body.to_vec())?,
|
||||
None => bail!("404 not found"),
|
||||
};
|
||||
|
||||
// TODO: Move to const
|
||||
let element_to_remove = [
|
||||
|
@ -1 +1,2 @@
|
||||
pub mod mediapart;
|
||||
pub mod monde_diplomatique;
|
||||
|
135
crieur-retrieve/src/newspapers/monde_diplomatique.rs
Normal file
135
crieur-retrieve/src/newspapers/monde_diplomatique.rs
Normal file
@ -0,0 +1,135 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use async_trait::async_trait;
|
||||
use cookie::Cookie;
|
||||
use url::Host;
|
||||
|
||||
use crate::newspaper::{Metadata, Newspaper};
|
||||
use crate::tools;
|
||||
use crate::Url;
|
||||
use crate::{Download, Downloader};
|
||||
|
||||
/// Ways of authenticating against Le Monde diplomatique.
///
/// The derived `Debug` output contains the credential values verbatim, so
/// avoid writing a `Login` to logs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Login {
    /// Two strings, bound as username and password by [`Builder::login`].
    ///
    /// Not supported yet: [`Builder::login`] panics with `unimplemented!`
    /// when given this variant.
    Username(String, String),
    /// Log in by replaying the session cookies of an already logged-in
    /// browser session.
    Cookies {
        /// Value of the `lmd_a_m` cookie.
        lmd_a_m: String,
        /// Value of the `PHPSESSID` cookie.
        phpsessid: String,
        /// Value of the `spip_session` cookie.
        spip_session: String,
    },
}
|
||||
|
||||
/// Retriever for articles published by Le Monde diplomatique.
///
/// Build it with [`MondeDiplo::builder`]. The `Default` value carries no
/// login cookies.
#[derive(Debug, Clone, Default)]
pub struct MondeDiplo {
    /// `(name, value)` pairs turned into cookies for every download.
    login_cookies: Vec<(String, String)>,
}
|
||||
|
||||
fn str_to_host<S: Into<String>>(host: S) -> Host {
|
||||
Host::Domain(host.into())
|
||||
}
|
||||
|
||||
/// Builder for [`MondeDiplo`].
///
/// Obtained through [`MondeDiplo::builder`]; call [`Builder::login`] before
/// [`Builder::build`].
#[derive(Debug, Clone, Default)]
pub struct Builder {
    /// Cookies recorded by [`Builder::login`]; `None` until it has been called.
    login_cookies: Option<Vec<(String, String)>>,
}
|
||||
|
||||
impl Builder {
|
||||
pub fn login(&mut self, login: Login) -> &mut Self {
|
||||
self.login_cookies = match login {
|
||||
Login::Username(_username, _password) => {
|
||||
unimplemented!("login using username and passwond not implemented")
|
||||
}
|
||||
Login::Cookies {
|
||||
lmd_a_m,
|
||||
phpsessid,
|
||||
spip_session,
|
||||
} => Some(vec![
|
||||
("lmd_a_m".into(), lmd_a_m),
|
||||
("PHPSESSID".into(), phpsessid),
|
||||
("spip_session".into(), spip_session),
|
||||
]),
|
||||
};
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(&self) -> Result<MondeDiplo> {
|
||||
match &self.login_cookies {
|
||||
Some(login_cookies) => Ok(MondeDiplo {
|
||||
login_cookies: login_cookies.clone(),
|
||||
}),
|
||||
None => Err(anyhow!("You have to log in to access this newspaper")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Newspaper for MondeDiplo {
|
||||
fn metadata(&self) -> Metadata {
|
||||
Metadata::builder()
|
||||
.hosts(vec![
|
||||
str_to_host("monde-diplomatique.fr"),
|
||||
str_to_host("www.monde-diplomatique.fr"),
|
||||
])
|
||||
.lower_case_name("monde-diplomatique")
|
||||
.name("Le Monde Diplomatique")
|
||||
.build()
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
async fn retrieve_html(&self, url: &Url) -> Result<String> {
|
||||
let cookies = self
|
||||
.login_cookies
|
||||
.iter()
|
||||
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
|
||||
.collect::<Vec<_>>();
|
||||
//let cookie = Cookie::build(&self.login_cookie.0, &self.login_cookie.1)
|
||||
// .secure(true)
|
||||
// .finish();
|
||||
//let cookies = vec![cookie];
|
||||
|
||||
// TODO: replace by builder
|
||||
let downloader = Downloader { cookies };
|
||||
|
||||
let body = downloader.download(&url).await?;
|
||||
let html = match body {
|
||||
Some(body) => String::from_utf8(body.to_vec())?,
|
||||
None => bail!("404 not found"),
|
||||
};
|
||||
|
||||
// TODO: Move to const
|
||||
let element_to_remove = [
|
||||
// navigation elements
|
||||
"#tout-en-haut.preentete",
|
||||
"#entete.connecte",
|
||||
"#navigation",
|
||||
"#pied",
|
||||
".bloc-connexion",
|
||||
// unused features
|
||||
"#ecouter",
|
||||
// Social buttons
|
||||
".actions-article",
|
||||
"#partage",
|
||||
// misc
|
||||
"noscript",
|
||||
];
|
||||
|
||||
let single_page_html =
|
||||
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
|
||||
Ok(single_page_html)
|
||||
}
|
||||
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
async fn has_complete_access(&self) -> bool {
|
||||
// TODO: check if we are logged using the cookie
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl MondeDiplo {
|
||||
pub fn builder() -> Builder {
|
||||
Builder::default()
|
||||
}
|
||||
}
|
@ -4,7 +4,7 @@ use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use cookie::Cookie;
|
||||
use hyper::{header, Body, Client, Method, Request};
|
||||
use hyper::{header, Body, Client, Method, Request, StatusCode};
|
||||
use thiserror::Error;
|
||||
use url::Url;
|
||||
|
||||
@ -22,7 +22,9 @@ pub trait Download {
|
||||
type Error: StdError;
|
||||
|
||||
/// Downloads a file from an url and returns the result as bytes
|
||||
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>;
|
||||
///
|
||||
/// If the file is not found, returns None
|
||||
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>;
|
||||
}
|
||||
|
||||
/// Store several cookies
|
||||
@ -36,7 +38,8 @@ pub struct Downloader<'c> {
|
||||
impl<'c> Download for Downloader<'c> {
|
||||
type Error = DownloadError;
|
||||
|
||||
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> {
|
||||
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
|
||||
log::info!("downloading url {:?}", file_link);
|
||||
let https = hyper_rustls::HttpsConnector::with_native_roots();
|
||||
let client: Client<_, hyper::Body> = Client::builder().build(https);
|
||||
|
||||
@ -44,14 +47,25 @@ impl<'c> Download for Downloader<'c> {
|
||||
.method(Method::GET)
|
||||
.uri(file_link.as_str());
|
||||
|
||||
for cookie in &self.cookies {
|
||||
req = req.header(header::COOKIE, cookie.to_string());
|
||||
}
|
||||
req = req.header(
|
||||
header::COOKIE,
|
||||
self.cookies
|
||||
.iter()
|
||||
.map(Cookie::to_string)
|
||||
.collect::<Vec<_>>()
|
||||
.join(";"),
|
||||
);
|
||||
log::info!("headers : {:?}", req.headers_ref());
|
||||
|
||||
let req = req.body(Body::empty())?;
|
||||
|
||||
let resp = client.request(req).await?;
|
||||
let body = hyper::body::to_bytes(resp).await?;
|
||||
let body = match resp.status() {
|
||||
StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
|
||||
StatusCode::NOT_FOUND => None,
|
||||
// TODO: enhance this by handling more error codes
|
||||
_ => None,
|
||||
};
|
||||
Ok(body)
|
||||
}
|
||||
}
|
||||
|
@ -77,7 +77,7 @@ where
|
||||
.iter()
|
||||
.zip(downloaded_styles.iter())
|
||||
.for_each(|(mut stylesheet, inner_css)| {
|
||||
if let Some(inner_css) = inner_css {
|
||||
if let Some(Some(inner_css)) = inner_css {
|
||||
let css = String::from_utf8(inner_css.to_vec()).unwrap();
|
||||
let css = format!("<style>{}</style>", css);
|
||||
stylesheet.replace_with_html(css);
|
||||
@ -120,10 +120,12 @@ where
|
||||
imgs.iter()
|
||||
.zip(downloaded_images.iter())
|
||||
.for_each(|(mut img, data)| {
|
||||
if let Some((url, data)) = data {
|
||||
if let Some((url, Some(data))) = data {
|
||||
let data = base64::encode(data);
|
||||
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
|
||||
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
|
||||
} else {
|
||||
img.remove()
|
||||
}
|
||||
});
|
||||
// ---- Remove unwanted html elements -----
|
||||
|
@ -8,3 +8,16 @@ The newspapers are configured using environment variables
|
||||
|
||||
MEDIAPART_COOKIE
|
||||
: sets the `MPRUUID` cookie, used to log in
|
||||
|
||||
# Le Monde Diplomatique
|
||||
|
||||
All cookies are mandatory to log in
|
||||
|
||||
MONDE_DIPLO_LMD_A_M
|
||||
: sets the `lmd_a_m` cookie
|
||||
|
||||
MONDE_DIPLO_PHPSESSID
|
||||
: sets the `PHPSESSID` cookie
|
||||
|
||||
MONDE_DIPLO_SPIP_SESSION
|
||||
: sets the `spip_session` cookie
|
||||
|
@ -2,38 +2,27 @@ use std::convert::TryInto;
|
||||
use std::env;
|
||||
|
||||
use anyhow::Result;
|
||||
use crieur_retrieve::{
|
||||
newspaper::Newspaper,
|
||||
newspapers::mediapart::{self, Mediapart},
|
||||
ArticleLocation, Url,
|
||||
};
|
||||
use crieur_retrieve::{ArticleLocation, Url};
|
||||
use dotenv::dotenv;
|
||||
use log::info;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
dotenv().ok();
|
||||
env_logger::init();
|
||||
tracing_subscriber::fmt()
|
||||
.with_writer(std::io::stderr)
|
||||
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
|
||||
.init();
|
||||
|
||||
let url = match env::args().nth(1) {
|
||||
Some(url) => Url::parse(&url)?,
|
||||
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
|
||||
};
|
||||
|
||||
// TODO: remove this in favor of default newspapers
|
||||
|
||||
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
|
||||
let mediapart = Mediapart::builder()
|
||||
.login(mediapart::Login::MPRUUID(mpruiid))
|
||||
.build()?;
|
||||
|
||||
info!("Trying to download article from {}", url);
|
||||
|
||||
// TODO: shorten this, maybe with a helper function?
|
||||
let article_location = ArticleLocation::builder()
|
||||
.url(url)?
|
||||
.newspaper(mediapart)
|
||||
.build()?;
|
||||
let article_location = ArticleLocation::builder().url(url)?.build()?;
|
||||
|
||||
let article_str = article_location.retrieve_html().await?;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user