diff --git a/Cargo.lock b/Cargo.lock index 25be411..95f5f16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -74,13 +74,22 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e" [[package]] name = "aho-corasick" -version = "0.7.15" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anyhow" version = "1.0.40" @@ -213,6 +222,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "winapi", +] + [[package]] name = "cipher" version = "0.2.5" @@ -285,10 +306,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" [[package]] -name = "cpuid-bool" -version = "0.1.2" +name = "cpufeatures" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" +checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4" +dependencies = [ + "libc", +] [[package]] name = "cpuid-bool" @@ -307,6 +331,7 @@ dependencies = [ "env_logger", "log", "tokio", + "tracing-subscriber", ] [[package]] @@ -337,7 +362,6 @@ dependencies = [ "hyper", "hyper-rustls", "indoc", - "itertools", "log", "lol_html", "nipper", @@ -569,12 +593,6 @@ dependencies = [ "syn", ] -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - [[package]] name = "encoding_rs" version = "0.8.28" @@ -644,9 +662,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" +checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27" dependencies = [ "futures-channel", "futures-core", @@ -659,9 +677,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" +checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" dependencies = [ "futures-core", "futures-sink", @@ -669,15 +687,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" +checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" [[package]] name = "futures-executor" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" +checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79" dependencies = [ "futures-core", "futures-task", @@ -686,9 +704,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" +checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" [[package]] name = "futures-locks" @@ -701,10 +719,11 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" +checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" dependencies = [ + "autocfg 1.0.1", "proc-macro-hack", "proc-macro2", "quote", @@ -713,15 +732,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" +checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" [[package]] name = "futures-task" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" +checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" [[package]] name = "futures-timer" @@ -735,10 +754,11 @@ dependencies = [ [[package]] name = "futures-util" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" +checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" dependencies = [ + "autocfg 1.0.1", "futures-channel", "futures-core", "futures-io", @@ -821,9 +841,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc018e188373e2777d0ef2467ebff62a08e66c3f5857b23c8fbec3018210dc00" +checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" dependencies = [ "bytes", "fnv", @@ -877,9 +897,9 @@ dependencies = [ [[package]] name = "html-minifier" -version = "3.0.10" +version = "3.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "108452631307790510cde91282fc706ae70076bd68200add8638773f06d5e122" +checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf" dependencies = [ "cow-utils", "educe", @@ -914,9 +934,9 @@ dependencies = [ [[package]] name = "http-body" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737" +checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" dependencies = [ "bytes", "http", @@ -1038,15 +1058,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" -[[package]] -name = "itertools" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" -dependencies = [ - "either", -] - [[package]] name = "itoa" version = "0.4.7" @@ -1055,9 +1066,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "js-sys" -version = "0.3.50" +version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c" +checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062" dependencies = [ "wasm-bindgen", ] @@ -1091,9 +1102,9 @@ checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e" [[package]] name = "lock_api" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176" +checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb" dependencies = [ "scopeguard", ] @@ -1158,6 +1169,15 @@ dependencies = [ "tendril", ] +[[package]] +name = "matchers" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +dependencies = [ + "regex-automata", +] + [[package]] name = "matches" version = "0.1.8" @@ -1254,9 +1274,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" [[package]] name = "memchr" -version = "2.3.4" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "mime" @@ -1266,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "minifier" -version = "0.0.39" +version = "0.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6cdf618de5c9c98d4a7b2e0d1f1e44f82a19196cfd94040bb203621c25d28d98" +checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08" dependencies = [ "macro-utils", ] @@ -1585,7 +1605,7 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd" dependencies = [ - "cpuid-bool 0.2.0", + "cpuid-bool", "opaque-debug", "universal-hash", ] @@ -1856,18 +1876,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2" +checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc" dependencies = [ "bitflags", ] [[package]] name = "regex" -version = "1.4.6" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ "aho-corasick", "memchr", @@ -1875,10 +1895,20 @@ dependencies = [ ] [[package]] -name = "regex-syntax" -version = "0.6.23" +name = "regex-automata" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" +checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" +dependencies = [ + "byteorder", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "reqwest" @@ -2319,17 +2349,26 @@ checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d" [[package]] name = "sha2" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de" +checksum = "d8f6b75b17576b792bef0db1bcc4b8b8bcdf9506744cf34b974195487af6cff2" dependencies = [ "block-buffer", "cfg-if 1.0.0", - "cpuid-bool 0.1.2", + "cpufeatures", "digest", "opaque-debug", ] +[[package]] +name = "sharded-slab" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79c719719ee05df97490f80a45acfc99e5a30ce98a1e4fb67aee422745ae14e3" +dependencies = [ + "lazy_static", +] + [[package]] name = "signal-hook-registry" version = "1.3.0" @@ -2491,9 +2530,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2" [[package]] name = "syn" -version = "1.0.71" +version = "1.0.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373" +checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82" dependencies = [ "proc-macro2", "quote", @@ -2558,6 +2597,15 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd" +dependencies = [ + "once_cell", +] + [[package]] name = "time" version = "0.2.26" @@ -2613,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" +checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" dependencies = [ "autocfg 1.0.1", "bytes", @@ -2684,9 +2732,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" [[package]] name = "tracing" -version = "0.1.25" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f" +checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" dependencies = [ "cfg-if 1.0.0", "pin-project-lite", @@ -2707,9 +2755,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" +checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052" dependencies = [ "lazy_static", ] @@ -2724,6 +2772,49 @@ dependencies = [ "tracing", ] +[[package]] +name = "tracing-log" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa5553bf0883ba7c9cbe493b085c29926bd41b66afc31ff72cf17ff4fb60dcd5" +dependencies = [ + "ansi_term", + "chrono", + "lazy_static", + "matchers", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec 1.6.1", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + [[package]] name = "try-lock" version = "0.2.3" @@ -2756,9 +2847,9 @@ dependencies = [ [[package]] name = "unicode-xid" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" [[package]] name = "unindent" @@ -2784,9 +2875,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.2.1" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" dependencies = [ "form_urlencoded", "idna", @@ -2846,9 +2937,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasm-bindgen" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9" +checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd" dependencies = [ "cfg-if 1.0.0", "serde", @@ -2858,9 +2949,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae" +checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900" dependencies = [ "bumpalo", "lazy_static", @@ -2873,9 +2964,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.23" +version = "0.4.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea" +checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1" dependencies = [ "cfg-if 1.0.0", "js-sys", @@ -2885,9 +2976,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f" +checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2895,9 +2986,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c" +checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97" dependencies = [ "proc-macro2", "quote", @@ -2908,15 +2999,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.73" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489" +checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f" [[package]] name = "web-sys" -version = "0.3.50" +version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be" +checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 458df77..6d1c92b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,4 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"} dotenv = "0.15.0" env_logger = "0.8.3" log = "0.4.14" -tokio = { version = "1.5.0", features = ["full"] } +tokio = { version = "1.6.0", features = ["full"] } +tracing-subscriber = "0.2.18" diff --git a/README.md b/README.md index 455d599..8a77713 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -Tools to retrieve articles from multiple newspaper you subscribed to. +Tools to retrieve articles from multiple newspaper you subscribed to, all from +the same place. **This is a prototype, it isn't stable at all and you may not want to use it if you expect it to just work !** diff --git a/crieur-chatbot/src/chatbot.rs b/crieur-chatbot/src/chatbot.rs index 137b9ef..85018b2 100644 --- a/crieur-chatbot/src/chatbot.rs +++ b/crieur-chatbot/src/chatbot.rs @@ -2,15 +2,7 @@ use std::convert::TryInto; use anyhow::Result; -use matrix_sdk::{ - self, async_trait, - events::{ - room::message::{MessageEventContent, MessageType, TextMessageEventContent}, - AnyMessageEventContent, SyncMessageEvent, - }, - room::Room, - Client, ClientConfig, EventHandler, SyncSettings, -}; +use matrix_sdk::{self, Client, SyncSettings}; use crate::Html; diff --git a/crieur-chatbot/src/handlers/html.rs b/crieur-chatbot/src/handlers/html.rs index f8a73fd..3d5bf61 100644 --- a/crieur-chatbot/src/handlers/html.rs +++ b/crieur-chatbot/src/handlers/html.rs @@ -1,7 +1,6 @@ use std::convert::TryInto; -use std::env; -use log::{error, info}; +use log::error; use matrix_sdk::{ self, async_trait, events::{ @@ -9,7 +8,7 @@ use matrix_sdk::{ AnyMessageEventContent, SyncMessageEvent, }, room::Room, - Client, ClientConfig, EventHandler, SyncSettings, + EventHandler, }; use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url}; @@ -113,7 +112,6 @@ impl EventHandler for Html { } else { return; }; - info!("sending file"); match msg_body.split(' ').collect::>().as_slice() { ["!html", url, ..] => send_article(*url, room).await, diff --git a/crieur-retrieve/Cargo.toml b/crieur-retrieve/Cargo.toml index 7b35a3c..6c3c947 100644 --- a/crieur-retrieve/Cargo.toml +++ b/crieur-retrieve/Cargo.toml @@ -8,23 +8,22 @@ publish = false [dependencies] anyhow = "1.0.40" -async-trait = "0.1.48" +async-trait = "0.1.50" thiserror = "1.0.24" -url = "2.2.1" -hyper = { version = "0.14.5", features = ["full"] } +url = "2.2.2" +hyper = { version = "0.14.7", features = ["full"] } hyper-rustls = "0.22.1" cookie = "0.15.0" lol_html = "0.3.0" indoc = "1.0.3" -html-minifier = "3.0.9" +html-minifier = "3.0.13" bytes = "1.0.1" base64 = "0.13.0" -futures = "0.3.14" -derive_builder = "0.10.0" +futures = "0.3.15" +derive_builder = "0.10.2" nipper = "0.1.9" log = "0.4.14" env_logger = "0.8.3" -itertools = "0.10.0" [dev-dependencies] -tokio = "1.5.0" +tokio = "1.6.0" diff --git a/crieur-retrieve/src/article_location.rs b/crieur-retrieve/src/article_location.rs index d6a177a..2062c0b 100644 --- a/crieur-retrieve/src/article_location.rs +++ b/crieur-retrieve/src/article_location.rs @@ -2,12 +2,12 @@ use std::boxed::Box; use std::convert::TryInto; use std::env; -use anyhow::anyhow; -use log::info; use url::{Host, Url}; use crate::newspaper::Newspaper; +use crate::newspapers::courrier_international::{self, CourrierInternational}; use crate::newspapers::mediapart::{self, Mediapart}; +use crate::newspapers::monde_diplomatique::{self, MondeDiplo}; /// Enumerate all errors that can be encountered when using ArticleLocation #[derive(thiserror::Error, Debug)] @@ -33,16 +33,46 @@ type Newspapers = Vec>; pub type Result = core::result::Result; fn default_newpapers() -> Result { + // TODO: same thing is written too much times : how to DRY ? let config_key = "MEDIAPART_COOKIE".to_string(); - let mpruiid = env::var(&config_key) - .map_err(|_| Error::Misconfiguration(config_key))? - .into(); + let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?; let mediapart = Mediapart::builder() - .login(mediapart::Login::MPRUUID(mpruiid)) + .login(mediapart::Login::Mpruuid(mpruiid)) .build()?; - Ok(vec![Box::new(mediapart)]) + let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string(); + let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string(); + let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string(); + + let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?; + let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?; + let spip_session = + env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?; + + let monde_diplo = MondeDiplo::builder() + .login(monde_diplomatique::Login::Cookies { + lmd_a_m, + phpsessid, + spip_session, + }) + .build()?; + + let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string(); + let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string(); + + let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?; + let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?; + + let courrier_international = CourrierInternational::builder() + .login(courrier_international::Login::Cookies { lmd_a_m, ssess }) + .build()?; + + Ok(vec![ + Box::new(mediapart), + Box::new(monde_diplo), + Box::new(courrier_international), + ]) } #[derive(Default)] @@ -126,7 +156,6 @@ impl ArticleLocation { } pub async fn retrieve_html(&self) -> Result { - info!("It will download from {}", self.url); // TODO: modify when retrieve_html returns a specific Error type Ok(self.newspaper.retrieve_html(&self.url).await?) } diff --git a/crieur-retrieve/src/newspapers/courrier_international.rs b/crieur-retrieve/src/newspapers/courrier_international.rs new file mode 100644 index 0000000..7263d11 --- /dev/null +++ b/crieur-retrieve/src/newspapers/courrier_international.rs @@ -0,0 +1,144 @@ +use anyhow::{anyhow, bail, Result}; +use async_trait::async_trait; +use cookie::Cookie; +use indoc::indoc; +use url::Host; + +use crate::newspaper::{Metadata, Newspaper}; +use crate::tools; +use crate::Url; +use crate::{Download, Downloader}; + +pub enum Login { + Username(String, String), + Cookies { lmd_a_m: String, ssess: String }, +} + +#[derive(Debug, Clone, Default)] +pub struct CourrierInternational { + login_cookies: Vec<(String, String)>, +} + +fn str_to_host>(host: S) -> Host { + Host::Domain(host.into()) +} + +#[derive(Debug, Clone, Default)] +pub struct Builder { + login_cookies: Option>, +} + +impl Builder { + pub fn login(&mut self, login: Login) -> &mut Self { + self.login_cookies = match login { + Login::Username(_username, _password) => { + unimplemented!("login using username and passwond not implemented") + } + Login::Cookies { lmd_a_m, ssess } => Some(vec![ + ("lmd_a_m".into(), lmd_a_m), + ("SSESS862c7003d721c672d39f161b1456b890".into(), ssess), + ]), + }; + self + } + + pub fn build(&self) -> Result { + match &self.login_cookies { + Some(login_cookies) => Ok(CourrierInternational { + login_cookies: login_cookies.clone(), + }), + None => Err(anyhow!("You have to log in to access this newspaper")), + } + } +} + +#[async_trait] +impl Newspaper for CourrierInternational { + fn metadata(&self) -> Metadata { + Metadata::builder() + .hosts(vec![ + str_to_host("courrierinternational.com"), + str_to_host("www.courrierinternational.com"), + ]) + .lower_case_name("courrier-international") + .name("Courrier international") + .build() + .unwrap_or_default() + } + + async fn retrieve_html(&self, url: &Url) -> Result { + let cookies = self + .login_cookies + .iter() + .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish()) + .collect::>(); + + // TODO: replace by builder + let downloader = Downloader { cookies }; + + let body = downloader.download(&url).await?; + let html = match body { + Some(body) => String::from_utf8(body.to_vec())?, + None => bail!("404 not found"), + }; + + let elements_to_remove = &[ + // navigation elements + "header.site-header", + "footer.site-footer", + // Social buttons + "#toolbox-share", + ".toolbox-share", + ".toolbox-print", + ".toolbox-respond", + ".toolbox-zen", + ".toolbox-newsletter", + ".toolbox-offer", + ".box-article-offer-friend-abo", + // unused services + ".article-aside", + ".article-secondary", + ".article-subject-readmore", + // misc + ".element-invisible", + ".gptcontainer", + ]; + + // FIXME: it doesn't work because the aside is in the article body + // + let toolbox_style = indoc! {" + aside.article-toolbox { + position: sticky; + top: 1em; + } + "}; + + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove, + styles_to_add: &[toolbox_style], + ..Default::default() + } + .run(&html) + .await; + Ok(single_page_html) + } + + fn new() -> Self { + Self { + ..Default::default() + } + } + + async fn has_complete_access(&self) -> bool { + // TODO: check if we are logged using the cookie + true + } +} + +impl CourrierInternational { + pub fn builder() -> Builder { + Builder::default() + } +} diff --git a/crieur-retrieve/src/newspapers/mediapart.rs b/crieur-retrieve/src/newspapers/mediapart.rs index 0933b6e..40f17c1 100644 --- a/crieur-retrieve/src/newspapers/mediapart.rs +++ b/crieur-retrieve/src/newspapers/mediapart.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, bail, Result}; use async_trait::async_trait; use cookie::Cookie; use url::Host; @@ -10,7 +10,7 @@ use crate::{Download, Downloader}; pub enum Login { Username(String, String), - MPRUUID(String), + Mpruuid(String), } #[derive(Debug, Clone, Default)] @@ -33,7 +33,7 @@ impl Builder { Login::Username(_username, _password) => { unimplemented!("login using username and passwond not implemented") } - Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)), + Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)), }; self } @@ -80,10 +80,13 @@ impl Newspaper for Mediapart { let downloader = Downloader { cookies }; let body = downloader.download(&url).await?; - let html = String::from_utf8(body.to_vec())?; + let html = match body { + Some(body) => String::from_utf8(body.to_vec())?, + None => bail!("404 not found"), + }; // TODO: Move to const - let element_to_remove = [ + let elements_to_remove = &[ // header ".fb-root", ".skipLinks", @@ -101,8 +104,14 @@ impl Newspaper for Mediapart { "aside.cc-modal", ]; - let single_page_html = - tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove, + ..Default::default() + } + .run(&html) + .await; Ok(single_page_html) } diff --git a/crieur-retrieve/src/newspapers/mod.rs b/crieur-retrieve/src/newspapers/mod.rs index 7f44529..d07c868 100644 --- a/crieur-retrieve/src/newspapers/mod.rs +++ b/crieur-retrieve/src/newspapers/mod.rs @@ -1 +1,3 @@ +pub mod courrier_international; pub mod mediapart; +pub mod monde_diplomatique; diff --git a/crieur-retrieve/src/newspapers/monde_diplomatique.rs b/crieur-retrieve/src/newspapers/monde_diplomatique.rs new file mode 100644 index 0000000..e3a12df --- /dev/null +++ b/crieur-retrieve/src/newspapers/monde_diplomatique.rs @@ -0,0 +1,137 @@ +use anyhow::{anyhow, bail, Result}; +use async_trait::async_trait; +use cookie::Cookie; +use url::Host; + +use crate::newspaper::{Metadata, Newspaper}; +use crate::tools; +use crate::Url; +use crate::{Download, Downloader}; + +pub enum Login { + Username(String, String), + Cookies { + lmd_a_m: String, + phpsessid: String, + spip_session: String, + }, +} + +#[derive(Debug, Clone, Default)] +pub struct MondeDiplo { + login_cookies: Vec<(String, String)>, +} + +fn str_to_host>(host: S) -> Host { + Host::Domain(host.into()) +} + +#[derive(Debug, Clone, Default)] +pub struct Builder { + login_cookies: Option>, +} + +impl Builder { + pub fn login(&mut self, login: Login) -> &mut Self { + self.login_cookies = match login { + Login::Username(_username, _password) => { + unimplemented!("login using username and passwond not implemented") + } + Login::Cookies { + lmd_a_m, + phpsessid, + spip_session, + } => Some(vec![ + ("lmd_a_m".into(), lmd_a_m), + ("PHPSESSID".into(), phpsessid), + ("spip_session".into(), spip_session), + ]), + }; + self + } + + pub fn build(&self) -> Result { + match &self.login_cookies { + Some(login_cookies) => Ok(MondeDiplo { + login_cookies: login_cookies.clone(), + }), + None => Err(anyhow!("You have to log in to access this newspaper")), + } + } +} + +#[async_trait] +impl Newspaper for MondeDiplo { + fn metadata(&self) -> Metadata { + Metadata::builder() + .hosts(vec![ + str_to_host("monde-diplomatique.fr"), + str_to_host("www.monde-diplomatique.fr"), + ]) + .lower_case_name("monde-diplomatique") + .name("Le Monde Diplomatique") + .build() + .unwrap_or_default() + } + + async fn retrieve_html(&self, url: &Url) -> Result { + let cookies = self + .login_cookies + .iter() + .map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish()) + .collect::>(); + + // TODO: replace by builder + let downloader = Downloader { cookies }; + + let body = downloader.download(&url).await?; + let html = match body { + Some(body) => String::from_utf8(body.to_vec())?, + None => bail!("404 not found"), + }; + + // TODO: Move to const + let elements_to_remove = &[ + // navigation elements + "#tout-en-haut.preentete", + "#entete.connecte", + "#navigation", + "#pied", + ".bloc-connexion", + // unused features + "#ecouter", + // Social buttons + ".actions-article", + "#partage", + // misc + "noscript", + ]; + + let single_page_html = tools::self_contained_html::Config { + downloader: Some(&downloader), + base_url: Some(&url), + elements_to_remove, + ..Default::default() + } + .run(&html) + .await; + Ok(single_page_html) + } + + fn new() -> Self { + Self { + ..Default::default() + } + } + + async fn has_complete_access(&self) -> bool { + // TODO: check if we are logged using the cookie + true + } +} + +impl MondeDiplo { + pub fn builder() -> Builder { + Builder::default() + } +} diff --git a/crieur-retrieve/src/tools/download.rs b/crieur-retrieve/src/tools/download.rs index e00f77f..ff5096b 100644 --- a/crieur-retrieve/src/tools/download.rs +++ b/crieur-retrieve/src/tools/download.rs @@ -4,7 +4,7 @@ use anyhow::Result; use async_trait::async_trait; use bytes::Bytes; use cookie::Cookie; -use hyper::{header, Body, Client, Method, Request}; +use hyper::{header, Body, Client, Method, Request, StatusCode}; use thiserror::Error; use url::Url; @@ -22,7 +22,9 @@ pub trait Download { type Error: StdError; /// Downloads a file from an url and returns the result as bytes - async fn download(&self, file_link: &Url) -> Result; + /// + /// If the file is not found, returns None + async fn download(&self, file_link: &Url) -> Result, Self::Error>; } /// Store several cookies @@ -36,7 +38,8 @@ pub struct Downloader<'c> { impl<'c> Download for Downloader<'c> { type Error = DownloadError; - async fn download(&self, file_link: &Url) -> Result { + async fn download(&self, file_link: &Url) -> Result, Self::Error> { + log::debug!("downloading url {:?}", file_link); let https = hyper_rustls::HttpsConnector::with_native_roots(); let client: Client<_, hyper::Body> = Client::builder().build(https); @@ -44,14 +47,26 @@ impl<'c> Download for Downloader<'c> { .method(Method::GET) .uri(file_link.as_str()); - for cookie in &self.cookies { - req = req.header(header::COOKIE, cookie.to_string()); - } + req = req.header( + header::COOKIE, + self.cookies + .iter() + .map(Cookie::to_string) + .collect::>() + .join(";"), + ); + log::debug!("headers : {:?}", req.headers_ref()); let req = req.body(Body::empty())?; let resp = client.request(req).await?; - let body = hyper::body::to_bytes(resp).await?; + log::debug!("Response status : {:?}", resp.status()); + let body = match resp.status() { + StatusCode::OK => Some(hyper::body::to_bytes(resp).await?), + StatusCode::NOT_FOUND => None, + // TODO: enhance this by handling more error codes + _ => None, + }; Ok(body) } } diff --git a/crieur-retrieve/src/tools/mod.rs b/crieur-retrieve/src/tools/mod.rs index 59381b1..80f159e 100644 --- a/crieur-retrieve/src/tools/mod.rs +++ b/crieur-retrieve/src/tools/mod.rs @@ -1,5 +1,4 @@ mod download; -mod self_contained_html; +pub mod self_contained_html; pub use download::{Download, DownloadError, Downloader}; -pub use self_contained_html::self_contained_html; diff --git a/crieur-retrieve/src/tools/self_contained_html.rs b/crieur-retrieve/src/tools/self_contained_html.rs index 7283a0e..e04234f 100644 --- a/crieur-retrieve/src/tools/self_contained_html.rs +++ b/crieur-retrieve/src/tools/self_contained_html.rs @@ -8,142 +8,198 @@ use url::Url; use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES}; use crate::Download; -/// Makes an html page self-contained -/// -/// The `downloader` must implement `Download` and is used to download ressources that are -/// needed to make this page self-contained such as stylesheets or images. -/// -/// The function also removes all scripts on the page -pub async fn self_contained_html( - html: impl AsRef, - downloader: &D, - base_url: &Url, - elements_to_remove: &[impl AsRef], -) -> String +/// Stores configuration for the self_contained_html function +// TODO: write a builder +pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str> +where + E: std::error::Error, + D: Download + Send, + S1: AsRef, + S2: AsRef, +{ + /// the downloader that will be used to retrieve ressources on the page + pub downloader: Option<&'t D>, + /// Base url for downloading ressources, it probably the + pub base_url: Option<&'t Url>, + pub elements_to_remove: &'t [S1], + pub styles_to_add: &'t [S2], +} + +impl<'t, E, D> Default for Config<'t, E, D> where E: std::error::Error, D: Download + Send, { - // TODO: split/refactor this function : - // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? - // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure - // - ¿ should be function of a trait ? or only of the configuration struct ? - let (style_urls, html) = { - let document = Document::from(html.as_ref()); - - // ---- Remove scripts ---- - // - document.select("script").remove(); - - for event in EVENT_HANDLERS { - document - .select(format!("[{}]", event).as_str()) - .remove_attr(event); + fn default() -> Self { + Self { + downloader: None, + base_url: None, + elements_to_remove: &[], + styles_to_add: &[], } + } +} - for rel in LINK_REL_EXTERNAL_RESOURCES { - document - .select(format!("link[rel=\"{}\"]", rel).as_str()) - .remove(); - } +impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2> +where + E: std::error::Error, + D: Download + Send, + S1: AsRef, + S2: AsRef, +{ + /// Makes an html page self-contained + /// + /// The `downloader` must implement `Download` and is used to download ressources that are + /// needed to make this page self-contained such as stylesheets or images. + /// + /// The function also removes all scripts on the page + pub async fn run(&self, html: impl AsRef) -> String { + //TODO: don't panic + let base_url = self.base_url.expect("Base url not defined"); + let downloader = self.downloader.expect("Downloader not defined"); + // TODO: split/refactor this function : + // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? + // - put each modification (ex: style in the `foreach`) in functions, maybe using + // (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42 + // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure + let (style_urls, html) = { + let document = Document::from(html.as_ref()); - // ---- Replace stylesheets ---- + // ---- Remove scripts ---- + // + document.select("script").remove(); + + for event in EVENT_HANDLERS { + document + .select(format!("[{}]", event).as_str()) + .remove_attr(event); + } + + for rel in LINK_REL_EXTERNAL_RESOURCES { + document + .select(format!("link[rel=\"{}\"]", rel).as_str()) + .remove(); + } + + // ---- Replace stylesheets ---- + // + let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); + let styles_url = stylesheets + .iter() + .map(|style_link| { + if let Some(src) = style_link.attr("href") { + base_url.join(src.as_ref()).ok() + } else { + None + } + }) + .collect::>(); + (styles_url, String::from(document.html())) + }; + + let style_urls = style_urls.into_iter().map(|style_url| { + OptionFuture::from( + style_url.map(|s| async move { downloader.download(&s).await.unwrap() }), + ) + }); + let downloaded_styles = futures::future::join_all(style_urls).await; + + let html = { + let document = Document::from(&html); + let styles = document.select("link[href][rel=\"stylesheet\"]"); + + styles + .iter() + .zip(downloaded_styles.iter()) + .for_each(|(mut style_link, inner_css)| { + if let Some(Some(inner_css)) = inner_css { + let css = String::from_utf8(inner_css.to_vec()).unwrap(); + let media_query = style_link.attr("media"); + let css = match media_query { + Some(media_query) => { + format!("", media_query, css) + } + None => format!("", css), + }; + style_link.replace_with_html(css); + } else { + style_link.remove(); + } + }); + String::from(document.html()) + }; + + // ---- Replace imgs ---- // - let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); - let styles_url = stylesheets - .iter() - .map(|stylesheet| { - if let Some(src) = stylesheet.attr("href") { - //TODO: does it work with absolute urls ? - base_url.join(src.as_ref()).ok() - } else { - None - } - }) - .collect::>(); - (styles_url, String::from(document.html())) - }; + let image_urls = { + let document = Document::from(&html); + let imgs = document.select("img:not([src^=\"data:\"])"); - let style_urls = style_urls.into_iter().map(|style_url| { - OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() })) - }); - let downloaded_styles = futures::future::join_all(style_urls).await; + imgs.iter() + .map(|image| { + if let Some(src) = image.attr("src") { + base_url.join(src.as_ref()).ok() + } else { + None + } + }) + .collect::>() + }; - let html = { - let document = Document::from(&html); - let styles = document.select("link[href][rel=\"stylesheet\"]"); + let downloaded_images = image_urls.into_iter().map(|image_url| { + OptionFuture::from(image_url.map(|url| async move { + let data = downloader.download(&url).await.unwrap(); + (url, data) + })) + }); + let downloaded_images = futures::future::join_all(downloaded_images).await; - styles - .iter() - .zip(downloaded_styles.iter()) - .for_each(|(mut stylesheet, inner_css)| { - if let Some(inner_css) = inner_css { - let css = String::from_utf8(inner_css.to_vec()).unwrap(); - let css = format!("", css); - stylesheet.replace_with_html(css); - } else { - stylesheet.remove(); - } - }); - String::from(document.html()) - }; + let html = { + let document = Document::from(&html); + let imgs = document.select("img:not([src^=\"data:\"])"); - // ---- Replace imgs ---- - // - let image_urls = { - let document = Document::from(&html); - let imgs = document.select("img"); + imgs.iter() + .zip(downloaded_images.iter()) + .for_each(|(mut img, data)| { + if let Some((url, Some(data))) = data { + let data = base64::encode(data); + //TODO: use an extension hashmap + let extension = + Path::new(url.path()).extension().unwrap().to_str().unwrap(); + img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); + } else { + img.remove() + } + }); + // ---- Remove unwanted html elements ----- + // + for element in self.elements_to_remove { + document.select(element.as_ref()).remove(); + } - imgs.iter() - .map(|image| { - if let Some(src) = image.attr("src") { - base_url.join(src.as_ref()).ok() - } else { - None - } - }) - .collect::>() - }; + // ---- Add additional styles ---- + // + for style in self.styles_to_add { + document + .select("head") + .append_html(format!("\n\n", style.as_ref())); + } - let downloaded_images = image_urls.into_iter().map(|image_url| { - OptionFuture::from(image_url.map(|url| async move { - let data = downloader.download(&url).await.unwrap(); - (url, data) - })) - }); - let downloaded_images = futures::future::join_all(downloaded_images).await; + String::from(document.html()) + }; - let html = { - let document = Document::from(&html); - let imgs = document.select("img"); - - imgs.iter() - .zip(downloaded_images.iter()) - .for_each(|(mut img, data)| { - if let Some((url, data)) = data { - let data = base64::encode(data); - let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); - img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); - } - }); - // ---- Remove unwanted html elements ----- + // ---- output ---- // - for element in elements_to_remove { - document.select(element.as_ref()).remove(); - } - String::from(document.html()) - }; + let mut minifier = HTMLMinifier::new(); + minifier.digest(html.as_str()).unwrap(); - // ---- output ---- - // - let mut minifier = HTMLMinifier::new(); - minifier.digest(html.as_str()).unwrap(); - - String::from_utf8(minifier.get_html().into()).unwrap() + String::from_utf8(minifier.get_html().into()).unwrap() + } } #[cfg(test)] mod tests { + // TODO: reduce boilerplate, DRY use super::*; @@ -168,8 +224,8 @@ mod tests { #[async_trait] impl Download for DummyDownloader { type Error = errors::Error; - async fn download(&self, _file_link: &Url) -> errors::Result { - Ok(Bytes::from("")) + async fn download(&self, _file_link: &Url) -> errors::Result> { + Ok(Some(Bytes::from(""))) } } @@ -178,9 +234,14 @@ mod tests { let html = ""; let base_url = Url::parse("http://example.com")?; let downloader = DummyDownloader {}; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, "" ); Ok(()) @@ -204,10 +265,13 @@ mod tests { }; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; for s in EVENT_HANDLERS { assert_eq!( - self_contained_html(html(s), &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + }.run(html(s)).await, "\n\n\n\n" ); } @@ -232,10 +296,15 @@ mod tests { }; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; for s in LINK_REL_EXTERNAL_RESOURCES { assert_eq!( - self_contained_html(html(s), &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html(s)) + .await, "\n\n\n" ); } @@ -246,12 +315,14 @@ mod tests { #[async_trait] impl Download for CssDownloader { type Error = errors::Error; - async fn download(&self, _file_link: &Url) -> errors::Result { - Ok(indoc! {" + async fn download(&self, _file_link: &Url) -> errors::Result> { + Ok(Some( + indoc! {" section#warning { color: red; }"} - .into()) + .into(), + )) } } @@ -286,9 +357,57 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, + minified + ); + Ok(()) + } + + #[tokio::test] + async fn download_css_with_media_query() -> Result<()> { + let downloader = CssDownloader {}; + + let html = indoc! {" + + + + + + + + "}; + + let wanted_html = indoc! {" + + + + + + "}; + let mut minifier = HTMLMinifier::new(); + minifier.digest(wanted_html)?; + let minified = String::from_utf8(minifier.get_html().into())?; + + let base_url = Url::parse("http://example.com")?; + assert_eq!( + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, minified ); Ok(()) @@ -298,12 +417,12 @@ mod tests { #[async_trait] impl Download for PngDownloader { type Error = errors::Error; - async fn download(&self, _file_link: &Url) -> errors::Result { + async fn download(&self, _file_link: &Url) -> errors::Result> { let image_path = Path::new("test_data/home.png"); let mut image_file = File::open(&image_path).unwrap(); let mut image_buf: Vec = vec![]; image_file.read_to_end(&mut image_buf).unwrap(); - Ok(image_buf.into()) + Ok(Some(image_buf.into())) } } @@ -333,9 +452,14 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; let base_url = Url::parse("http://example.com")?; - let to_remove: &[&str] = &[]; assert_eq!( - self_contained_html(html, &downloader, &base_url, to_remove).await, + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + ..Default::default() + } + .run(html) + .await, minified ); Ok(()) @@ -368,12 +492,67 @@ mod tests { let minified = String::from_utf8(minifier.get_html().into())?; assert_eq!( - self_contained_html( - html, - &downloader, - &base_url, - &["header", ".placeholder", "article > span.huge"] - ) + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + elements_to_remove: &["header", ".placeholder", "article > span.huge"], + ..Default::default() + } + .run(html) + .await, + minified + ); + Ok(()) + } + + #[tokio::test] + async fn add_style() -> Result<()> { + let html = indoc! {" + + + + + + The body + + + "}; + + let wanted_html = indoc! {" + + + + + + The body + + "}; + + let style_to_add = indoc! {" + body { + margin: 3em; + } + "}; + + let base_url = Url::parse("http://example.com")?; + let downloader = DummyDownloader {}; + + let mut minifier = HTMLMinifier::new(); + minifier.digest(wanted_html)?; + let minified = String::from_utf8(minifier.get_html().into())?; + + assert_eq!( + Config { + downloader: Some(&downloader), + base_url: Some(&base_url), + styles_to_add: &[style_to_add], + ..Default::default() + } + .run(html) .await, minified ); diff --git a/documentation/design/scope.md b/documentation/design/scope.md index 6417bf7..cca4047 100644 --- a/documentation/design/scope.md +++ b/documentation/design/scope.md @@ -37,7 +37,8 @@ frame "backend" { newspaper -> retrieval_tools: uses to implement - article_location --> article_repr :uses + article_location --> article_repr: uses + retrieval_tools -up-> article_repr: uses auto_retrieve --> rss: watches auto_retrieve --> article_location diff --git a/documentation/reference/newspaper_configuration.md b/documentation/reference/newspaper_configuration.md index 8658087..fa7ab2c 100644 --- a/documentation/reference/newspaper_configuration.md +++ b/documentation/reference/newspaper_configuration.md @@ -8,3 +8,24 @@ The newspapers are configured using environment variables MEDIAPART_COOKIE : sets the `MPRUUID` cookie, used to log in + +# Le Monde Diplomatique + +All cookies are mandatory to log in + +MONDE_DIPLO_LMD_A_M +: sets the `lmd_a_m` cookie + +MONDE_DIPLO_PHPSESSID +: sets the `PHPSESSID` cookie + +MONDE_DIPLO_SPIP_SESSION +: sets the `spip_session` cookie + +# Courrier international + +COURRIER_INTERNATIONAL_LMD_A_M +: sets the `lmd_a_m` cookie + +COURRIER_INTERNATIONAL_SSESS +: sets the `ssess` cookie diff --git a/examples/cli_downloader.rs b/examples/cli_downloader.rs index 4a4eefe..84d1815 100644 --- a/examples/cli_downloader.rs +++ b/examples/cli_downloader.rs @@ -2,38 +2,27 @@ use std::convert::TryInto; use std::env; use anyhow::Result; -use crieur_retrieve::{ - newspaper::Newspaper, - newspapers::mediapart::{self, Mediapart}, - ArticleLocation, Url, -}; +use crieur_retrieve::{ArticleLocation, Url}; use dotenv::dotenv; use log::info; #[tokio::main] async fn main() -> Result<()> { dotenv().ok(); - env_logger::init(); + tracing_subscriber::fmt() + .with_writer(std::io::stderr) + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .init(); let url = match env::args().nth(1) { Some(url) => Url::parse(&url)?, None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?, }; - // TODO: remove this in favor of default newspapers - - let mpruiid = env::var("MEDIAPART_COOKIE")?.into(); - let mediapart = Mediapart::builder() - .login(mediapart::Login::MPRUUID(mpruiid)) - .build()?; - info!("Trying to download article from {}", url); // TODO: shorten this, maybe an helper function ? - let article_location = ArticleLocation::builder() - .url(url)? - .newspaper(mediapart) - .build()?; + let article_location = ArticleLocation::builder().url(url)?.build()?; let article_str = article_location.retrieve_html().await?; diff --git a/justfile b/justfile index a28b9d5..3b60ba7 100644 --- a/justfile +++ b/justfile @@ -1,19 +1,29 @@ @build: - cargo build + cargo build + +@build-container: + podman build --file ./containers/chatbot.containerfile -t crieur-chatbot . @clean: - cargo clean + cargo clean @run: - cargo run + cargo run + +@test: + cargo test --all + +@clippy: + cargo clippy + +@fmt: + cargo fmt + +@simulate-ci: fmt clippy test -@container: - podman build --file ./containers/chatbot.containerfile -t crieur-chatbot . @audit: - cargo audit + cargo audit @crev: - cargo crev verify - -@verify: audit crev + cargo crev verify