Merge pull request 'add several newspapers' (#36) from feature/additional_newspapers into development
Reviewed-on: #36
This commit is contained in:
commit
16ad14467e
271
Cargo.lock
generated
271
Cargo.lock
generated
@ -74,13 +74,22 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "0.7.15"
|
||||
version = "0.7.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5"
|
||||
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ansi_term"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.40"
|
||||
@ -213,6 +222,18 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cipher"
|
||||
version = "0.2.5"
|
||||
@ -285,10 +306,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
|
||||
|
||||
[[package]]
|
||||
name = "cpuid-bool"
|
||||
version = "0.1.2"
|
||||
name = "cpufeatures"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634"
|
||||
checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cpuid-bool"
|
||||
@ -307,6 +331,7 @@ dependencies = [
|
||||
"env_logger",
|
||||
"log",
|
||||
"tokio",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -337,7 +362,6 @@ dependencies = [
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"indoc",
|
||||
"itertools",
|
||||
"log",
|
||||
"lol_html",
|
||||
"nipper",
|
||||
@ -569,12 +593,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.28"
|
||||
@ -644,9 +662,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
|
||||
checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
@ -659,9 +677,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
|
||||
checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
@ -669,15 +687,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
|
||||
checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
|
||||
checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
@ -686,9 +704,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
|
||||
checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-locks"
|
||||
@ -701,10 +719,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
|
||||
checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
|
||||
dependencies = [
|
||||
"autocfg 1.0.1",
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -713,15 +732,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
|
||||
checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
|
||||
checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"
|
||||
|
||||
[[package]]
|
||||
name = "futures-timer"
|
||||
@ -735,10 +754,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.14"
|
||||
version = "0.3.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
|
||||
checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
|
||||
dependencies = [
|
||||
"autocfg 1.0.1",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
@ -821,9 +841,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.3.2"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc018e188373e2777d0ef2467ebff62a08e66c3f5857b23c8fbec3018210dc00"
|
||||
checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
@ -877,9 +897,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "html-minifier"
|
||||
version = "3.0.10"
|
||||
version = "3.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "108452631307790510cde91282fc706ae70076bd68200add8638773f06d5e122"
|
||||
checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf"
|
||||
dependencies = [
|
||||
"cow-utils",
|
||||
"educe",
|
||||
@ -914,9 +934,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "http-body"
|
||||
version = "0.4.1"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737"
|
||||
checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http",
|
||||
@ -1038,15 +1058,6 @@ version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.4.7"
|
||||
@ -1055,9 +1066,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.50"
|
||||
version = "0.3.51"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c"
|
||||
checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062"
|
||||
dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
@ -1091,9 +1102,9 @@ checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.3"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176"
|
||||
checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb"
|
||||
dependencies = [
|
||||
"scopeguard",
|
||||
]
|
||||
@ -1158,6 +1169,15 @@ dependencies = [
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
|
||||
dependencies = [
|
||||
"regex-automata",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matches"
|
||||
version = "0.1.8"
|
||||
@ -1254,9 +1274,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.3.4"
|
||||
version = "2.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
|
||||
checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
|
||||
|
||||
[[package]]
|
||||
name = "mime"
|
||||
@ -1266,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||
|
||||
[[package]]
|
||||
name = "minifier"
|
||||
version = "0.0.39"
|
||||
version = "0.0.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6cdf618de5c9c98d4a7b2e0d1f1e44f82a19196cfd94040bb203621c25d28d98"
|
||||
checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08"
|
||||
dependencies = [
|
||||
"macro-utils",
|
||||
]
|
||||
@ -1585,7 +1605,7 @@ version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd"
|
||||
dependencies = [
|
||||
"cpuid-bool 0.2.0",
|
||||
"cpuid-bool",
|
||||
"opaque-debug",
|
||||
"universal-hash",
|
||||
]
|
||||
@ -1856,18 +1876,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.7"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2"
|
||||
checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.4.6"
|
||||
version = "1.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759"
|
||||
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
@ -1875,10 +1895,20 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.23"
|
||||
name = "regex-automata"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
|
||||
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
@ -2319,17 +2349,26 @@ checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d"
|
||||
|
||||
[[package]]
|
||||
name = "sha2"
|
||||
version = "0.9.3"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de"
|
||||
checksum = "d8f6b75b17576b792bef0db1bcc4b8b8bcdf9506744cf34b974195487af6cff2"
|
||||
dependencies = [
|
||||
"block-buffer",
|
||||
"cfg-if 1.0.0",
|
||||
"cpuid-bool 0.1.2",
|
||||
"cpufeatures",
|
||||
"digest",
|
||||
"opaque-debug",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79c719719ee05df97490f80a45acfc99e5a30ce98a1e4fb67aee422745ae14e3"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.3.0"
|
||||
@ -2491,9 +2530,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.71"
|
||||
version = "1.0.72"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373"
|
||||
checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -2558,6 +2597,15 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.2.26"
|
||||
@ -2613,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.5.0"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
|
||||
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
|
||||
dependencies = [
|
||||
"autocfg 1.0.1",
|
||||
"bytes",
|
||||
@ -2684,9 +2732,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.25"
|
||||
version = "0.1.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f"
|
||||
checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"pin-project-lite",
|
||||
@ -2707,9 +2755,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.17"
|
||||
version = "0.1.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f"
|
||||
checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
@ -2724,6 +2772,49 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-log"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-serde"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.2.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa5553bf0883ba7c9cbe493b085c29926bd41b66afc31ff72cf17ff4fb60dcd5"
|
||||
dependencies = [
|
||||
"ansi_term",
|
||||
"chrono",
|
||||
"lazy_static",
|
||||
"matchers",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sharded-slab",
|
||||
"smallvec 1.6.1",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "try-lock"
|
||||
version = "0.2.3"
|
||||
@ -2756,9 +2847,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
|
||||
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
|
||||
|
||||
[[package]]
|
||||
name = "unindent"
|
||||
@ -2784,9 +2875,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.2.1"
|
||||
version = "2.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b"
|
||||
checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"idna",
|
||||
@ -2846,9 +2937,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9"
|
||||
checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"serde",
|
||||
@ -2858,9 +2949,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae"
|
||||
checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"lazy_static",
|
||||
@ -2873,9 +2964,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-futures"
|
||||
version = "0.4.23"
|
||||
version = "0.4.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea"
|
||||
checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"js-sys",
|
||||
@ -2885,9 +2976,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f"
|
||||
checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
@ -2895,9 +2986,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c"
|
||||
checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -2908,15 +2999,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.73"
|
||||
version = "0.2.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489"
|
||||
checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f"
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.50"
|
||||
version = "0.3.51"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be"
|
||||
checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
|
@ -22,4 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
|
||||
dotenv = "0.15.0"
|
||||
env_logger = "0.8.3"
|
||||
log = "0.4.14"
|
||||
tokio = { version = "1.5.0", features = ["full"] }
|
||||
tokio = { version = "1.6.0", features = ["full"] }
|
||||
tracing-subscriber = "0.2.18"
|
||||
|
@ -1,4 +1,5 @@
|
||||
Tools to retrieve articles from multiple newspaper you subscribed to.
|
||||
Tools to retrieve articles from the multiple newspapers you subscribe to, all from
|
||||
the same place.
|
||||
|
||||
**This is a prototype: it isn't stable at all, and you may not want to use it if
|
||||
you expect it to just work!**
|
||||
|
@ -2,15 +2,7 @@
|
||||
use std::convert::TryInto;
|
||||
|
||||
use anyhow::Result;
|
||||
use matrix_sdk::{
|
||||
self, async_trait,
|
||||
events::{
|
||||
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
|
||||
AnyMessageEventContent, SyncMessageEvent,
|
||||
},
|
||||
room::Room,
|
||||
Client, ClientConfig, EventHandler, SyncSettings,
|
||||
};
|
||||
use matrix_sdk::{self, Client, SyncSettings};
|
||||
|
||||
use crate::Html;
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
use std::convert::TryInto;
|
||||
use std::env;
|
||||
|
||||
use log::{error, info};
|
||||
use log::error;
|
||||
use matrix_sdk::{
|
||||
self, async_trait,
|
||||
events::{
|
||||
@ -9,7 +8,7 @@ use matrix_sdk::{
|
||||
AnyMessageEventContent, SyncMessageEvent,
|
||||
},
|
||||
room::Room,
|
||||
Client, ClientConfig, EventHandler, SyncSettings,
|
||||
EventHandler,
|
||||
};
|
||||
|
||||
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};
|
||||
@ -113,7 +112,6 @@ impl EventHandler for Html {
|
||||
} else {
|
||||
return;
|
||||
};
|
||||
info!("sending file");
|
||||
|
||||
match msg_body.split(' ').collect::<Vec<_>>().as_slice() {
|
||||
["!html", url, ..] => send_article(*url, room).await,
|
||||
|
@ -8,23 +8,22 @@ publish = false
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.40"
|
||||
async-trait = "0.1.48"
|
||||
async-trait = "0.1.50"
|
||||
thiserror = "1.0.24"
|
||||
url = "2.2.1"
|
||||
hyper = { version = "0.14.5", features = ["full"] }
|
||||
url = "2.2.2"
|
||||
hyper = { version = "0.14.7", features = ["full"] }
|
||||
hyper-rustls = "0.22.1"
|
||||
cookie = "0.15.0"
|
||||
lol_html = "0.3.0"
|
||||
indoc = "1.0.3"
|
||||
html-minifier = "3.0.9"
|
||||
html-minifier = "3.0.13"
|
||||
bytes = "1.0.1"
|
||||
base64 = "0.13.0"
|
||||
futures = "0.3.14"
|
||||
derive_builder = "0.10.0"
|
||||
futures = "0.3.15"
|
||||
derive_builder = "0.10.2"
|
||||
nipper = "0.1.9"
|
||||
log = "0.4.14"
|
||||
env_logger = "0.8.3"
|
||||
itertools = "0.10.0"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = "1.5.0"
|
||||
tokio = "1.6.0"
|
||||
|
@ -2,12 +2,12 @@ use std::boxed::Box;
|
||||
use std::convert::TryInto;
|
||||
use std::env;
|
||||
|
||||
use anyhow::anyhow;
|
||||
use log::info;
|
||||
use url::{Host, Url};
|
||||
|
||||
use crate::newspaper::Newspaper;
|
||||
use crate::newspapers::courrier_international::{self, CourrierInternational};
|
||||
use crate::newspapers::mediapart::{self, Mediapart};
|
||||
use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
|
||||
|
||||
/// Enumerate all errors that can be encountered when using ArticleLocation
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
@ -33,16 +33,46 @@ type Newspapers = Vec<Box<dyn Newspaper>>;
|
||||
pub type Result<T, E = Error> = core::result::Result<T, E>;
|
||||
|
||||
fn default_newpapers() -> Result<Newspapers> {
|
||||
// TODO: same thing is written too much times : how to DRY ?
|
||||
let config_key = "MEDIAPART_COOKIE".to_string();
|
||||
let mpruiid = env::var(&config_key)
|
||||
.map_err(|_| Error::Misconfiguration(config_key))?
|
||||
.into();
|
||||
let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
|
||||
|
||||
let mediapart = Mediapart::builder()
|
||||
.login(mediapart::Login::MPRUUID(mpruiid))
|
||||
.login(mediapart::Login::Mpruuid(mpruiid))
|
||||
.build()?;
|
||||
|
||||
Ok(vec![Box::new(mediapart)])
|
||||
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
|
||||
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
|
||||
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
|
||||
|
||||
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
|
||||
let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
|
||||
let spip_session =
|
||||
env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
|
||||
|
||||
let monde_diplo = MondeDiplo::builder()
|
||||
.login(monde_diplomatique::Login::Cookies {
|
||||
lmd_a_m,
|
||||
phpsessid,
|
||||
spip_session,
|
||||
})
|
||||
.build()?;
|
||||
|
||||
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
|
||||
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
|
||||
|
||||
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
|
||||
let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
|
||||
|
||||
let courrier_international = CourrierInternational::builder()
|
||||
.login(courrier_international::Login::Cookies { lmd_a_m, ssess })
|
||||
.build()?;
|
||||
|
||||
Ok(vec![
|
||||
Box::new(mediapart),
|
||||
Box::new(monde_diplo),
|
||||
Box::new(courrier_international),
|
||||
])
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@ -126,7 +156,6 @@ impl ArticleLocation {
|
||||
}
|
||||
|
||||
pub async fn retrieve_html(&self) -> Result<String> {
|
||||
info!("It will download from {}", self.url);
|
||||
// TODO: modify when retrieve_html returns a specific Error type
|
||||
Ok(self.newspaper.retrieve_html(&self.url).await?)
|
||||
}
|
||||
|
144
crieur-retrieve/src/newspapers/courrier_international.rs
Normal file
144
crieur-retrieve/src/newspapers/courrier_international.rs
Normal file
@ -0,0 +1,144 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use async_trait::async_trait;
|
||||
use cookie::Cookie;
|
||||
use indoc::indoc;
|
||||
use url::Host;
|
||||
|
||||
use crate::newspaper::{Metadata, Newspaper};
|
||||
use crate::tools;
|
||||
use crate::Url;
|
||||
use crate::{Download, Downloader};
|
||||
|
||||
/// Credentials accepted by the Courrier international [`Builder`].
///
/// Note that the derived `Debug` output contains the credential values, so
/// avoid logging it.
#[derive(Debug, Clone)]
pub enum Login {
    /// Username and password pair. `Builder::login` does not implement this
    /// variant yet and panics when it receives it.
    Username(String, String),
    /// Values of already-established session cookies.
    Cookies {
        /// Value sent as the `lmd_a_m` cookie.
        lmd_a_m: String,
        /// Value sent as the `SSESS…` cookie.
        ssess: String,
    },
}
|
||||
|
||||
/// Client for the Courrier international newspaper.
///
/// Holds the cookies that are attached to every download request.
#[derive(Clone, Debug, Default)]
pub struct CourrierInternational {
    /// `(cookie name, cookie value)` pairs used to authenticate requests.
    login_cookies: Vec<(String, String)>,
}
|
||||
|
||||
fn str_to_host<S: Into<String>>(host: S) -> Host {
|
||||
Host::Domain(host.into())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct Builder {
|
||||
login_cookies: Option<Vec<(String, String)>>,
|
||||
}
|
||||
|
||||
impl Builder {
|
||||
pub fn login(&mut self, login: Login) -> &mut Self {
|
||||
self.login_cookies = match login {
|
||||
Login::Username(_username, _password) => {
|
||||
unimplemented!("login using username and passwond not implemented")
|
||||
}
|
||||
Login::Cookies { lmd_a_m, ssess } => Some(vec![
|
||||
("lmd_a_m".into(), lmd_a_m),
|
||||
("SSESS862c7003d721c672d39f161b1456b890".into(), ssess),
|
||||
]),
|
||||
};
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(&self) -> Result<CourrierInternational> {
|
||||
match &self.login_cookies {
|
||||
Some(login_cookies) => Ok(CourrierInternational {
|
||||
login_cookies: login_cookies.clone(),
|
||||
}),
|
||||
None => Err(anyhow!("You have to log in to access this newspaper")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Newspaper for CourrierInternational {
|
||||
fn metadata(&self) -> Metadata {
|
||||
Metadata::builder()
|
||||
.hosts(vec![
|
||||
str_to_host("courrierinternational.com"),
|
||||
str_to_host("www.courrierinternational.com"),
|
||||
])
|
||||
.lower_case_name("courrier-international")
|
||||
.name("Courrier international")
|
||||
.build()
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
async fn retrieve_html(&self, url: &Url) -> Result<String> {
|
||||
let cookies = self
|
||||
.login_cookies
|
||||
.iter()
|
||||
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// TODO: replace by builder
|
||||
let downloader = Downloader { cookies };
|
||||
|
||||
let body = downloader.download(&url).await?;
|
||||
let html = match body {
|
||||
Some(body) => String::from_utf8(body.to_vec())?,
|
||||
None => bail!("404 not found"),
|
||||
};
|
||||
|
||||
let elements_to_remove = &[
|
||||
// navigation elements
|
||||
"header.site-header",
|
||||
"footer.site-footer",
|
||||
// Social buttons
|
||||
"#toolbox-share",
|
||||
".toolbox-share",
|
||||
".toolbox-print",
|
||||
".toolbox-respond",
|
||||
".toolbox-zen",
|
||||
".toolbox-newsletter",
|
||||
".toolbox-offer",
|
||||
".box-article-offer-friend-abo",
|
||||
// unused services
|
||||
".article-aside",
|
||||
".article-secondary",
|
||||
".article-subject-readmore",
|
||||
// misc
|
||||
".element-invisible",
|
||||
".gptcontainer",
|
||||
];
|
||||
|
||||
// FIXME: it doesn't work because the aside is in the article body
|
||||
//
|
||||
let toolbox_style = indoc! {"
|
||||
aside.article-toolbox {
|
||||
position: sticky;
|
||||
top: 1em;
|
||||
}
|
||||
"};
|
||||
|
||||
let single_page_html = tools::self_contained_html::Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&url),
|
||||
elements_to_remove,
|
||||
styles_to_add: &[toolbox_style],
|
||||
..Default::default()
|
||||
}
|
||||
.run(&html)
|
||||
.await;
|
||||
Ok(single_page_html)
|
||||
}
|
||||
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
async fn has_complete_access(&self) -> bool {
|
||||
// TODO: check if we are logged using the cookie
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl CourrierInternational {
|
||||
pub fn builder() -> Builder {
|
||||
Builder::default()
|
||||
}
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
use anyhow::{anyhow, Result};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use async_trait::async_trait;
|
||||
use cookie::Cookie;
|
||||
use url::Host;
|
||||
@ -10,7 +10,7 @@ use crate::{Download, Downloader};
|
||||
|
||||
pub enum Login {
|
||||
Username(String, String),
|
||||
MPRUUID(String),
|
||||
Mpruuid(String),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
@ -33,7 +33,7 @@ impl Builder {
|
||||
Login::Username(_username, _password) => {
|
||||
unimplemented!("login using username and passwond not implemented")
|
||||
}
|
||||
Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
|
||||
Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
|
||||
};
|
||||
self
|
||||
}
|
||||
@ -80,10 +80,13 @@ impl Newspaper for Mediapart {
|
||||
let downloader = Downloader { cookies };
|
||||
|
||||
let body = downloader.download(&url).await?;
|
||||
let html = String::from_utf8(body.to_vec())?;
|
||||
let html = match body {
|
||||
Some(body) => String::from_utf8(body.to_vec())?,
|
||||
None => bail!("404 not found"),
|
||||
};
|
||||
|
||||
// TODO: Move to const
|
||||
let element_to_remove = [
|
||||
let elements_to_remove = &[
|
||||
// header
|
||||
".fb-root",
|
||||
".skipLinks",
|
||||
@ -101,8 +104,14 @@ impl Newspaper for Mediapart {
|
||||
"aside.cc-modal",
|
||||
];
|
||||
|
||||
let single_page_html =
|
||||
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
|
||||
let single_page_html = tools::self_contained_html::Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&url),
|
||||
elements_to_remove,
|
||||
..Default::default()
|
||||
}
|
||||
.run(&html)
|
||||
.await;
|
||||
Ok(single_page_html)
|
||||
}
|
||||
|
||||
|
@ -1 +1,3 @@
|
||||
pub mod courrier_international;
|
||||
pub mod mediapart;
|
||||
pub mod monde_diplomatique;
|
||||
|
137
crieur-retrieve/src/newspapers/monde_diplomatique.rs
Normal file
137
crieur-retrieve/src/newspapers/monde_diplomatique.rs
Normal file
@ -0,0 +1,137 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use async_trait::async_trait;
|
||||
use cookie::Cookie;
|
||||
use url::Host;
|
||||
|
||||
use crate::newspaper::{Metadata, Newspaper};
|
||||
use crate::tools;
|
||||
use crate::Url;
|
||||
use crate::{Download, Downloader};
|
||||
|
||||
/// The ways to log in to the newspaper.
pub enum Login {
    /// A pair of credentials; the builder names them `_username` and `_password` and
    /// does not support them yet.
    Username(String, String),
    /// The values of the session cookies.
    Cookies {
        /// Value of the `lmd_a_m` cookie
        lmd_a_m: String,
        /// Value of the `PHPSESSID` cookie
        phpsessid: String,
        /// Value of the `spip_session` cookie
        spip_session: String,
    },
}
|
||||
|
||||
/// The newspaper "Le Monde Diplomatique".
#[derive(Debug, Clone, Default)]
pub struct MondeDiplo {
    /// `(name, value)` pairs of the cookies used to log in
    login_cookies: Vec<(String, String)>,
}
|
||||
|
||||
fn str_to_host<S: Into<String>>(host: S) -> Host {
|
||||
Host::Domain(host.into())
|
||||
}
|
||||
|
||||
/// Builder for the newspaper, see its `login` and `build` methods.
#[derive(Debug, Clone, Default)]
pub struct Builder {
    /// `(name, value)` pairs of the login cookies, `None` until a login is provided
    login_cookies: Option<Vec<(String, String)>>,
}
|
||||
|
||||
impl Builder {
|
||||
pub fn login(&mut self, login: Login) -> &mut Self {
|
||||
self.login_cookies = match login {
|
||||
Login::Username(_username, _password) => {
|
||||
unimplemented!("login using username and passwond not implemented")
|
||||
}
|
||||
Login::Cookies {
|
||||
lmd_a_m,
|
||||
phpsessid,
|
||||
spip_session,
|
||||
} => Some(vec![
|
||||
("lmd_a_m".into(), lmd_a_m),
|
||||
("PHPSESSID".into(), phpsessid),
|
||||
("spip_session".into(), spip_session),
|
||||
]),
|
||||
};
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(&self) -> Result<MondeDiplo> {
|
||||
match &self.login_cookies {
|
||||
Some(login_cookies) => Ok(MondeDiplo {
|
||||
login_cookies: login_cookies.clone(),
|
||||
}),
|
||||
None => Err(anyhow!("You have to log in to access this newspaper")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Newspaper for MondeDiplo {
|
||||
fn metadata(&self) -> Metadata {
|
||||
Metadata::builder()
|
||||
.hosts(vec![
|
||||
str_to_host("monde-diplomatique.fr"),
|
||||
str_to_host("www.monde-diplomatique.fr"),
|
||||
])
|
||||
.lower_case_name("monde-diplomatique")
|
||||
.name("Le Monde Diplomatique")
|
||||
.build()
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
async fn retrieve_html(&self, url: &Url) -> Result<String> {
|
||||
let cookies = self
|
||||
.login_cookies
|
||||
.iter()
|
||||
.map(|cookie| Cookie::build(&cookie.0, &cookie.1).finish())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// TODO: replace by builder
|
||||
let downloader = Downloader { cookies };
|
||||
|
||||
let body = downloader.download(&url).await?;
|
||||
let html = match body {
|
||||
Some(body) => String::from_utf8(body.to_vec())?,
|
||||
None => bail!("404 not found"),
|
||||
};
|
||||
|
||||
// TODO: Move to const
|
||||
let elements_to_remove = &[
|
||||
// navigation elements
|
||||
"#tout-en-haut.preentete",
|
||||
"#entete.connecte",
|
||||
"#navigation",
|
||||
"#pied",
|
||||
".bloc-connexion",
|
||||
// unused features
|
||||
"#ecouter",
|
||||
// Social buttons
|
||||
".actions-article",
|
||||
"#partage",
|
||||
// misc
|
||||
"noscript",
|
||||
];
|
||||
|
||||
let single_page_html = tools::self_contained_html::Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&url),
|
||||
elements_to_remove,
|
||||
..Default::default()
|
||||
}
|
||||
.run(&html)
|
||||
.await;
|
||||
Ok(single_page_html)
|
||||
}
|
||||
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
async fn has_complete_access(&self) -> bool {
|
||||
// TODO: check if we are logged using the cookie
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl MondeDiplo {
|
||||
pub fn builder() -> Builder {
|
||||
Builder::default()
|
||||
}
|
||||
}
|
@ -4,7 +4,7 @@ use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use cookie::Cookie;
|
||||
use hyper::{header, Body, Client, Method, Request};
|
||||
use hyper::{header, Body, Client, Method, Request, StatusCode};
|
||||
use thiserror::Error;
|
||||
use url::Url;
|
||||
|
||||
@ -22,7 +22,9 @@ pub trait Download {
|
||||
type Error: StdError;
|
||||
|
||||
/// Downloads a file from an url and returns the result as bytes
|
||||
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>;
|
||||
///
|
||||
/// If the file is not found, returns None
|
||||
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>;
|
||||
}
|
||||
|
||||
/// Store several cookies
|
||||
@ -36,7 +38,8 @@ pub struct Downloader<'c> {
|
||||
impl<'c> Download for Downloader<'c> {
|
||||
type Error = DownloadError;
|
||||
|
||||
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> {
|
||||
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
|
||||
log::debug!("downloading url {:?}", file_link);
|
||||
let https = hyper_rustls::HttpsConnector::with_native_roots();
|
||||
let client: Client<_, hyper::Body> = Client::builder().build(https);
|
||||
|
||||
@ -44,14 +47,26 @@ impl<'c> Download for Downloader<'c> {
|
||||
.method(Method::GET)
|
||||
.uri(file_link.as_str());
|
||||
|
||||
for cookie in &self.cookies {
|
||||
req = req.header(header::COOKIE, cookie.to_string());
|
||||
}
|
||||
req = req.header(
|
||||
header::COOKIE,
|
||||
self.cookies
|
||||
.iter()
|
||||
.map(Cookie::to_string)
|
||||
.collect::<Vec<_>>()
|
||||
.join(";"),
|
||||
);
|
||||
log::debug!("headers : {:?}", req.headers_ref());
|
||||
|
||||
let req = req.body(Body::empty())?;
|
||||
|
||||
let resp = client.request(req).await?;
|
||||
let body = hyper::body::to_bytes(resp).await?;
|
||||
log::debug!("Response status : {:?}", resp.status());
|
||||
let body = match resp.status() {
|
||||
StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
|
||||
StatusCode::NOT_FOUND => None,
|
||||
// TODO: enhance this by handling more error codes
|
||||
_ => None,
|
||||
};
|
||||
Ok(body)
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,4 @@
|
||||
mod download;
|
||||
mod self_contained_html;
|
||||
pub mod self_contained_html;
|
||||
|
||||
pub use download::{Download, DownloadError, Downloader};
|
||||
pub use self_contained_html::self_contained_html;
|
||||
|
@ -8,142 +8,198 @@ use url::Url;
|
||||
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
|
||||
use crate::Download;
|
||||
|
||||
/// Makes an html page self-contained
|
||||
///
|
||||
/// The `downloader` must implement `Download` and is used to download ressources that are
|
||||
/// needed to make this page self-contained such as stylesheets or images.
|
||||
///
|
||||
/// The function also removes all scripts on the page
|
||||
pub async fn self_contained_html<E, D>(
|
||||
html: impl AsRef<str>,
|
||||
downloader: &D,
|
||||
base_url: &Url,
|
||||
elements_to_remove: &[impl AsRef<str>],
|
||||
) -> String
|
||||
/// Stores configuration for the self_contained_html function
|
||||
// TODO: write a builder
|
||||
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
|
||||
where
|
||||
E: std::error::Error,
|
||||
D: Download<Error = E> + Send,
|
||||
S1: AsRef<str>,
|
||||
S2: AsRef<str>,
|
||||
{
|
||||
/// The downloader that will be used to retrieve resources on the page
|
||||
pub downloader: Option<&'t D>,
|
||||
/// Base url for downloading resources; the links found in the page are resolved against it
|
||||
pub base_url: Option<&'t Url>,
|
||||
pub elements_to_remove: &'t [S1],
|
||||
pub styles_to_add: &'t [S2],
|
||||
}
|
||||
|
||||
impl<'t, E, D> Default for Config<'t, E, D>
|
||||
where
|
||||
E: std::error::Error,
|
||||
D: Download<Error = E> + Send,
|
||||
{
|
||||
// TODO: split/refactor this function :
|
||||
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
|
||||
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
|
||||
// - ¿ should be function of a trait ? or only of the configuration struct ?
|
||||
let (style_urls, html) = {
|
||||
let document = Document::from(html.as_ref());
|
||||
|
||||
// ---- Remove scripts ----
|
||||
//
|
||||
document.select("script").remove();
|
||||
|
||||
for event in EVENT_HANDLERS {
|
||||
document
|
||||
.select(format!("[{}]", event).as_str())
|
||||
.remove_attr(event);
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
downloader: None,
|
||||
base_url: None,
|
||||
elements_to_remove: &[],
|
||||
styles_to_add: &[],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for rel in LINK_REL_EXTERNAL_RESOURCES {
|
||||
document
|
||||
.select(format!("link[rel=\"{}\"]", rel).as_str())
|
||||
.remove();
|
||||
}
|
||||
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
|
||||
where
|
||||
E: std::error::Error,
|
||||
D: Download<Error = E> + Send,
|
||||
S1: AsRef<str>,
|
||||
S2: AsRef<str>,
|
||||
{
|
||||
/// Makes an html page self-contained
|
||||
///
|
||||
/// The `downloader` must implement `Download` and is used to download resources that are
|
||||
/// needed to make this page self-contained such as stylesheets or images.
|
||||
///
|
||||
/// The function also removes all scripts on the page
|
||||
pub async fn run(&self, html: impl AsRef<str>) -> String {
|
||||
//TODO: don't panic
|
||||
let base_url = self.base_url.expect("Base url not defined");
|
||||
let downloader = self.downloader.expect("Downloader not defined");
|
||||
// TODO: split/refactor this function :
|
||||
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
|
||||
// - put each modification (ex: style in the `foreach`) in functions, maybe using
|
||||
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
|
||||
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
|
||||
let (style_urls, html) = {
|
||||
let document = Document::from(html.as_ref());
|
||||
|
||||
// ---- Replace stylesheets ----
|
||||
// ---- Remove scripts ----
|
||||
//
|
||||
document.select("script").remove();
|
||||
|
||||
for event in EVENT_HANDLERS {
|
||||
document
|
||||
.select(format!("[{}]", event).as_str())
|
||||
.remove_attr(event);
|
||||
}
|
||||
|
||||
for rel in LINK_REL_EXTERNAL_RESOURCES {
|
||||
document
|
||||
.select(format!("link[rel=\"{}\"]", rel).as_str())
|
||||
.remove();
|
||||
}
|
||||
|
||||
// ---- Replace stylesheets ----
|
||||
//
|
||||
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
|
||||
let styles_url = stylesheets
|
||||
.iter()
|
||||
.map(|style_link| {
|
||||
if let Some(src) = style_link.attr("href") {
|
||||
base_url.join(src.as_ref()).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
(styles_url, String::from(document.html()))
|
||||
};
|
||||
|
||||
let style_urls = style_urls.into_iter().map(|style_url| {
|
||||
OptionFuture::from(
|
||||
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
|
||||
)
|
||||
});
|
||||
let downloaded_styles = futures::future::join_all(style_urls).await;
|
||||
|
||||
let html = {
|
||||
let document = Document::from(&html);
|
||||
let styles = document.select("link[href][rel=\"stylesheet\"]");
|
||||
|
||||
styles
|
||||
.iter()
|
||||
.zip(downloaded_styles.iter())
|
||||
.for_each(|(mut style_link, inner_css)| {
|
||||
if let Some(Some(inner_css)) = inner_css {
|
||||
let css = String::from_utf8(inner_css.to_vec()).unwrap();
|
||||
let media_query = style_link.attr("media");
|
||||
let css = match media_query {
|
||||
Some(media_query) => {
|
||||
format!("<style media=\"{}\">{}</style>", media_query, css)
|
||||
}
|
||||
None => format!("<style>{}</style>", css),
|
||||
};
|
||||
style_link.replace_with_html(css);
|
||||
} else {
|
||||
style_link.remove();
|
||||
}
|
||||
});
|
||||
String::from(document.html())
|
||||
};
|
||||
|
||||
// ---- Replace imgs ----
|
||||
//
|
||||
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
|
||||
let styles_url = stylesheets
|
||||
.iter()
|
||||
.map(|stylesheet| {
|
||||
if let Some(src) = stylesheet.attr("href") {
|
||||
//TODO: does it work with absolute urls ?
|
||||
base_url.join(src.as_ref()).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
(styles_url, String::from(document.html()))
|
||||
};
|
||||
let image_urls = {
|
||||
let document = Document::from(&html);
|
||||
let imgs = document.select("img:not([src^=\"data:\"])");
|
||||
|
||||
let style_urls = style_urls.into_iter().map(|style_url| {
|
||||
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
|
||||
});
|
||||
let downloaded_styles = futures::future::join_all(style_urls).await;
|
||||
imgs.iter()
|
||||
.map(|image| {
|
||||
if let Some(src) = image.attr("src") {
|
||||
base_url.join(src.as_ref()).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
let html = {
|
||||
let document = Document::from(&html);
|
||||
let styles = document.select("link[href][rel=\"stylesheet\"]");
|
||||
let downloaded_images = image_urls.into_iter().map(|image_url| {
|
||||
OptionFuture::from(image_url.map(|url| async move {
|
||||
let data = downloader.download(&url).await.unwrap();
|
||||
(url, data)
|
||||
}))
|
||||
});
|
||||
let downloaded_images = futures::future::join_all(downloaded_images).await;
|
||||
|
||||
styles
|
||||
.iter()
|
||||
.zip(downloaded_styles.iter())
|
||||
.for_each(|(mut stylesheet, inner_css)| {
|
||||
if let Some(inner_css) = inner_css {
|
||||
let css = String::from_utf8(inner_css.to_vec()).unwrap();
|
||||
let css = format!("<style>{}</style>", css);
|
||||
stylesheet.replace_with_html(css);
|
||||
} else {
|
||||
stylesheet.remove();
|
||||
}
|
||||
});
|
||||
String::from(document.html())
|
||||
};
|
||||
let html = {
|
||||
let document = Document::from(&html);
|
||||
let imgs = document.select("img:not([src^=\"data:\"])");
|
||||
|
||||
// ---- Replace imgs ----
|
||||
//
|
||||
let image_urls = {
|
||||
let document = Document::from(&html);
|
||||
let imgs = document.select("img");
|
||||
imgs.iter()
|
||||
.zip(downloaded_images.iter())
|
||||
.for_each(|(mut img, data)| {
|
||||
if let Some((url, Some(data))) = data {
|
||||
let data = base64::encode(data);
|
||||
//TODO: use an extension hashmap
|
||||
let extension =
|
||||
Path::new(url.path()).extension().unwrap().to_str().unwrap();
|
||||
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
|
||||
} else {
|
||||
img.remove()
|
||||
}
|
||||
});
|
||||
// ---- Remove unwanted html elements -----
|
||||
//
|
||||
for element in self.elements_to_remove {
|
||||
document.select(element.as_ref()).remove();
|
||||
}
|
||||
|
||||
imgs.iter()
|
||||
.map(|image| {
|
||||
if let Some(src) = image.attr("src") {
|
||||
base_url.join(src.as_ref()).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
// ---- Add additional styles ----
|
||||
//
|
||||
for style in self.styles_to_add {
|
||||
document
|
||||
.select("head")
|
||||
.append_html(format!("\n<style>{}</style>\n", style.as_ref()));
|
||||
}
|
||||
|
||||
let downloaded_images = image_urls.into_iter().map(|image_url| {
|
||||
OptionFuture::from(image_url.map(|url| async move {
|
||||
let data = downloader.download(&url).await.unwrap();
|
||||
(url, data)
|
||||
}))
|
||||
});
|
||||
let downloaded_images = futures::future::join_all(downloaded_images).await;
|
||||
String::from(document.html())
|
||||
};
|
||||
|
||||
let html = {
|
||||
let document = Document::from(&html);
|
||||
let imgs = document.select("img");
|
||||
|
||||
imgs.iter()
|
||||
.zip(downloaded_images.iter())
|
||||
.for_each(|(mut img, data)| {
|
||||
if let Some((url, data)) = data {
|
||||
let data = base64::encode(data);
|
||||
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
|
||||
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
|
||||
}
|
||||
});
|
||||
// ---- Remove unwanted html elements -----
|
||||
// ---- output ----
|
||||
//
|
||||
for element in elements_to_remove {
|
||||
document.select(element.as_ref()).remove();
|
||||
}
|
||||
String::from(document.html())
|
||||
};
|
||||
let mut minifier = HTMLMinifier::new();
|
||||
minifier.digest(html.as_str()).unwrap();
|
||||
|
||||
// ---- output ----
|
||||
//
|
||||
let mut minifier = HTMLMinifier::new();
|
||||
minifier.digest(html.as_str()).unwrap();
|
||||
|
||||
String::from_utf8(minifier.get_html().into()).unwrap()
|
||||
String::from_utf8(minifier.get_html().into()).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
// TODO: reduce boilerplate, DRY
|
||||
|
||||
use super::*;
|
||||
|
||||
@ -168,8 +224,8 @@ mod tests {
|
||||
#[async_trait]
|
||||
impl Download for DummyDownloader {
|
||||
type Error = errors::Error;
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
|
||||
Ok(Bytes::from(""))
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
|
||||
Ok(Some(Bytes::from("")))
|
||||
}
|
||||
}
|
||||
|
||||
@ -178,9 +234,14 @@ mod tests {
|
||||
let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
let downloader = DummyDownloader {};
|
||||
let to_remove: &[&str] = &[];
|
||||
assert_eq!(
|
||||
self_contained_html(html, &downloader, &base_url, to_remove).await,
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
..Default::default()
|
||||
}
|
||||
.run(html)
|
||||
.await,
|
||||
"<html><head></head><body></body></html>"
|
||||
);
|
||||
Ok(())
|
||||
@ -204,10 +265,13 @@ mod tests {
|
||||
};
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
let to_remove: &[&str] = &[];
|
||||
for s in EVENT_HANDLERS {
|
||||
assert_eq!(
|
||||
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
..Default::default()
|
||||
}.run(html(s)).await,
|
||||
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
|
||||
);
|
||||
}
|
||||
@ -232,10 +296,15 @@ mod tests {
|
||||
};
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
let to_remove: &[&str] = &[];
|
||||
for s in LINK_REL_EXTERNAL_RESOURCES {
|
||||
assert_eq!(
|
||||
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
..Default::default()
|
||||
}
|
||||
.run(html(s))
|
||||
.await,
|
||||
"<html><head>\n</head>\n<body>\n</body></html>"
|
||||
);
|
||||
}
|
||||
@ -246,12 +315,14 @@ mod tests {
|
||||
#[async_trait]
|
||||
impl Download for CssDownloader {
|
||||
type Error = errors::Error;
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
|
||||
Ok(indoc! {"
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
|
||||
Ok(Some(
|
||||
indoc! {"
|
||||
section#warning {
|
||||
color: red;
|
||||
}"}
|
||||
.into())
|
||||
.into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@ -286,9 +357,57 @@ mod tests {
|
||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
let to_remove: &[&str] = &[];
|
||||
assert_eq!(
|
||||
self_contained_html(html, &downloader, &base_url, to_remove).await,
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
..Default::default()
|
||||
}
|
||||
.run(html)
|
||||
.await,
|
||||
minified
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_css_with_media_query() -> Result<()> {
|
||||
let downloader = CssDownloader {};
|
||||
|
||||
let html = indoc! {"
|
||||
<html>
|
||||
<head>
|
||||
<link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
||||
"};
|
||||
|
||||
let wanted_html = indoc! {"
|
||||
<html><head>
|
||||
<style media=\"print\">
|
||||
section#warning {
|
||||
color: red;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
</body></html>
|
||||
"};
|
||||
let mut minifier = HTMLMinifier::new();
|
||||
minifier.digest(wanted_html)?;
|
||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
assert_eq!(
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
..Default::default()
|
||||
}
|
||||
.run(html)
|
||||
.await,
|
||||
minified
|
||||
);
|
||||
Ok(())
|
||||
@ -298,12 +417,12 @@ mod tests {
|
||||
#[async_trait]
|
||||
impl Download for PngDownloader {
|
||||
type Error = errors::Error;
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
|
||||
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
|
||||
let image_path = Path::new("test_data/home.png");
|
||||
let mut image_file = File::open(&image_path).unwrap();
|
||||
let mut image_buf: Vec<u8> = vec![];
|
||||
image_file.read_to_end(&mut image_buf).unwrap();
|
||||
Ok(image_buf.into())
|
||||
Ok(Some(image_buf.into()))
|
||||
}
|
||||
}
|
||||
|
||||
@ -333,9 +452,14 @@ mod tests {
|
||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
let to_remove: &[&str] = &[];
|
||||
assert_eq!(
|
||||
self_contained_html(html, &downloader, &base_url, to_remove).await,
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
..Default::default()
|
||||
}
|
||||
.run(html)
|
||||
.await,
|
||||
minified
|
||||
);
|
||||
Ok(())
|
||||
@ -368,12 +492,67 @@ mod tests {
|
||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||
|
||||
assert_eq!(
|
||||
self_contained_html(
|
||||
html,
|
||||
&downloader,
|
||||
&base_url,
|
||||
&["header", ".placeholder", "article > span.huge"]
|
||||
)
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
elements_to_remove: &["header", ".placeholder", "article > span.huge"],
|
||||
..Default::default()
|
||||
}
|
||||
.run(html)
|
||||
.await,
|
||||
minified
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn add_style() -> Result<()> {
|
||||
let html = indoc! {"
|
||||
<html>
|
||||
<head>
|
||||
<meta charset=\"UTF-8\">
|
||||
</head>
|
||||
<body>
|
||||
The body
|
||||
</body>
|
||||
</html>
|
||||
"};
|
||||
|
||||
let wanted_html = indoc! {"
|
||||
<html><head>
|
||||
<meta charset=\"UTF-8\">
|
||||
<style>
|
||||
body {
|
||||
margin: 3em;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
The body
|
||||
</body></html>
|
||||
"};
|
||||
|
||||
let style_to_add = indoc! {"
|
||||
body {
|
||||
margin: 3em;
|
||||
}
|
||||
"};
|
||||
|
||||
let base_url = Url::parse("http://example.com")?;
|
||||
let downloader = DummyDownloader {};
|
||||
|
||||
let mut minifier = HTMLMinifier::new();
|
||||
minifier.digest(wanted_html)?;
|
||||
let minified = String::from_utf8(minifier.get_html().into())?;
|
||||
|
||||
assert_eq!(
|
||||
Config {
|
||||
downloader: Some(&downloader),
|
||||
base_url: Some(&base_url),
|
||||
styles_to_add: &[style_to_add],
|
||||
..Default::default()
|
||||
}
|
||||
.run(html)
|
||||
.await,
|
||||
minified
|
||||
);
|
||||
|
@ -37,7 +37,8 @@ frame "backend" {
|
||||
|
||||
newspaper -> retrieval_tools: uses to implement
|
||||
|
||||
article_location --> article_repr :uses
|
||||
article_location --> article_repr: uses
|
||||
retrieval_tools -up-> article_repr: uses
|
||||
|
||||
auto_retrieve --> rss: watches
|
||||
auto_retrieve --> article_location
|
||||
|
@ -8,3 +8,24 @@ The newspapers are configured using environment variables
|
||||
|
||||
MEDIAPART_COOKIE
|
||||
: sets the `MPRUUID` cookie, used to log in
|
||||
|
||||
# Le Monde Diplomatique
|
||||
|
||||
All cookies are mandatory to log in
|
||||
|
||||
MONDE_DIPLO_LMD_A_M
|
||||
: sets the `lmd_a_m` cookie
|
||||
|
||||
MONDE_DIPLO_PHPSESSID
|
||||
: sets the `PHPSESSID` cookie
|
||||
|
||||
MONDE_DIPLO_SPIP_SESSION
|
||||
: sets the `spip_session` cookie
|
||||
|
||||
# Courrier international
|
||||
|
||||
COURRIER_INTERNATIONAL_LMD_A_M
|
||||
: sets the `lmd_a_m` cookie
|
||||
|
||||
COURRIER_INTERNATIONAL_SSESS
|
||||
: sets the `ssess` cookie
|
||||
|
@ -2,38 +2,27 @@ use std::convert::TryInto;
|
||||
use std::env;
|
||||
|
||||
use anyhow::Result;
|
||||
use crieur_retrieve::{
|
||||
newspaper::Newspaper,
|
||||
newspapers::mediapart::{self, Mediapart},
|
||||
ArticleLocation, Url,
|
||||
};
|
||||
use crieur_retrieve::{ArticleLocation, Url};
|
||||
use dotenv::dotenv;
|
||||
use log::info;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
dotenv().ok();
|
||||
env_logger::init();
|
||||
tracing_subscriber::fmt()
|
||||
.with_writer(std::io::stderr)
|
||||
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
|
||||
.init();
|
||||
|
||||
let url = match env::args().nth(1) {
|
||||
Some(url) => Url::parse(&url)?,
|
||||
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
|
||||
};
|
||||
|
||||
// TODO: remove this in favor of default newspapers
|
||||
|
||||
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
|
||||
let mediapart = Mediapart::builder()
|
||||
.login(mediapart::Login::MPRUUID(mpruiid))
|
||||
.build()?;
|
||||
|
||||
info!("Trying to download article from {}", url);
|
||||
|
||||
// TODO: shorten this, maybe with a helper function?
|
||||
let article_location = ArticleLocation::builder()
|
||||
.url(url)?
|
||||
.newspaper(mediapart)
|
||||
.build()?;
|
||||
let article_location = ArticleLocation::builder().url(url)?.build()?;
|
||||
|
||||
let article_str = article_location.retrieve_html().await?;
|
||||
|
||||
|
28
justfile
28
justfile
@ -1,19 +1,29 @@
|
||||
@build:
|
||||
cargo build
|
||||
cargo build
|
||||
|
||||
@build-container:
|
||||
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
|
||||
|
||||
@clean:
|
||||
cargo clean
|
||||
cargo clean
|
||||
|
||||
@run:
|
||||
cargo run
|
||||
cargo run
|
||||
|
||||
@test:
|
||||
cargo test --all
|
||||
|
||||
@clippy:
|
||||
cargo clippy
|
||||
|
||||
@fmt:
|
||||
cargo fmt
|
||||
|
||||
@simulate-ci: fmt clippy test
|
||||
|
||||
@container:
|
||||
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
|
||||
|
||||
@audit:
|
||||
cargo audit
|
||||
cargo audit
|
||||
|
||||
@crev:
|
||||
cargo crev verify
|
||||
|
||||
@verify: audit crev
|
||||
cargo crev verify
|
||||
|
Loading…
Reference in New Issue
Block a user