Compare commits

...

7 Commits

Author SHA1 Message Date
koalp
16ad14467e Merge pull request 'add several newspapers' (#36) from feature/additional_newspapers into development
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
Reviewed-on: #36
2021-05-22 02:50:43 +00:00
e34edf0b21
fix: keep media queries in ref styles
All checks were successful
continuous-integration/drone/push Build is passing
Previously, media queries weren't kept when downloading styles from ref
tags.

It has been fixed so that media attributes are kept when creating style
tags from ref tags.
2021-05-22 04:41:08 +02:00
40ebc1ddea
feat: allow to inject styles 2021-05-22 04:41:08 +02:00
6e091a32fc
chore: use a config struct for self_contained_html
Previously, self_contained_html was a function taking all parameters as
arguments.
As new optional parameters are being added, the function had too many
arguments, and each usage of the function would have to be modified each
time an argument was added.

Therefore, it has been moved to a configuration structure with a `run`
function taking only one argument, the html string.
2021-05-22 04:41:08 +02:00
5d0872b4d9
feat : add retrieve from courrier international
Retrieval of articles from courrier international has been added
2021-05-22 04:41:08 +02:00
cee0af6c3c
fix: only select images that have non-data src
Previously, when the image url contained data, it tried to parse an url
and failed, instead of keeping data.

It has been fixed so that images whose url starts with 'data' are
not modified.
2021-05-22 04:41:08 +02:00
970f510cd1
feat: add retrieval from le monde diplomatique
Add retrieval from le monde diplomatique

Previously, 404 pages were injected in the document when downloading
styles
Now, the downloader returns None when documents are not found
2021-05-22 04:41:01 +02:00
18 changed files with 919 additions and 302 deletions

271
Cargo.lock generated
View File

@ -74,13 +74,22 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
[[package]]
name = "aho-corasick"
version = "0.7.15"
version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5"
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [
"memchr",
]
[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi",
]
[[package]]
name = "anyhow"
version = "1.0.40"
@ -213,6 +222,18 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
dependencies = [
"libc",
"num-integer",
"num-traits",
"winapi",
]
[[package]]
name = "cipher"
version = "0.2.5"
@ -285,10 +306,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
[[package]]
name = "cpuid-bool"
version = "0.1.2"
name = "cpufeatures"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634"
checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4"
dependencies = [
"libc",
]
[[package]]
name = "cpuid-bool"
@ -307,6 +331,7 @@ dependencies = [
"env_logger",
"log",
"tokio",
"tracing-subscriber",
]
[[package]]
@ -337,7 +362,6 @@ dependencies = [
"hyper",
"hyper-rustls",
"indoc",
"itertools",
"log",
"lol_html",
"nipper",
@ -569,12 +593,6 @@ dependencies = [
"syn",
]
[[package]]
name = "either"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "encoding_rs"
version = "0.8.28"
@ -644,9 +662,9 @@ dependencies = [
[[package]]
name = "futures"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
dependencies = [
"futures-channel",
"futures-core",
@ -659,9 +677,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
dependencies = [
"futures-core",
"futures-sink",
@ -669,15 +687,15 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"
[[package]]
name = "futures-executor"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
dependencies = [
"futures-core",
"futures-task",
@ -686,9 +704,9 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"
[[package]]
name = "futures-locks"
@ -701,10 +719,11 @@ dependencies = [
[[package]]
name = "futures-macro"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
dependencies = [
"autocfg 1.0.1",
"proc-macro-hack",
"proc-macro2",
"quote",
@ -713,15 +732,15 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"
[[package]]
name = "futures-task"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"
[[package]]
name = "futures-timer"
@ -735,10 +754,11 @@ dependencies = [
[[package]]
name = "futures-util"
version = "0.3.14"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
dependencies = [
"autocfg 1.0.1",
"futures-channel",
"futures-core",
"futures-io",
@ -821,9 +841,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.3.2"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc018e188373e2777d0ef2467ebff62a08e66c3f5857b23c8fbec3018210dc00"
checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726"
dependencies = [
"bytes",
"fnv",
@ -877,9 +897,9 @@ dependencies = [
[[package]]
name = "html-minifier"
version = "3.0.10"
version = "3.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "108452631307790510cde91282fc706ae70076bd68200add8638773f06d5e122"
checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf"
dependencies = [
"cow-utils",
"educe",
@ -914,9 +934,9 @@ dependencies = [
[[package]]
name = "http-body"
version = "0.4.1"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737"
checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9"
dependencies = [
"bytes",
"http",
@ -1038,15 +1058,6 @@ version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135"
[[package]]
name = "itertools"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "0.4.7"
@ -1055,9 +1066,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]]
name = "js-sys"
version = "0.3.50"
version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c"
checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062"
dependencies = [
"wasm-bindgen",
]
@ -1091,9 +1102,9 @@ checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
[[package]]
name = "lock_api"
version = "0.4.3"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176"
checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb"
dependencies = [
"scopeguard",
]
@ -1158,6 +1169,15 @@ dependencies = [
"tendril",
]
[[package]]
name = "matchers"
version = "0.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
dependencies = [
"regex-automata",
]
[[package]]
name = "matches"
version = "0.1.8"
@ -1254,9 +1274,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
[[package]]
name = "memchr"
version = "2.3.4"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
[[package]]
name = "mime"
@ -1266,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
[[package]]
name = "minifier"
version = "0.0.39"
version = "0.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6cdf618de5c9c98d4a7b2e0d1f1e44f82a19196cfd94040bb203621c25d28d98"
checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08"
dependencies = [
"macro-utils",
]
@ -1585,7 +1605,7 @@ version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd"
dependencies = [
"cpuid-bool 0.2.0",
"cpuid-bool",
"opaque-debug",
"universal-hash",
]
@ -1856,18 +1876,18 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.2.7"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2"
checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc"
dependencies = [
"bitflags",
]
[[package]]
name = "regex"
version = "1.4.6"
version = "1.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759"
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
dependencies = [
"aho-corasick",
"memchr",
@ -1875,10 +1895,20 @@ dependencies = [
]
[[package]]
name = "regex-syntax"
version = "0.6.23"
name = "regex-automata"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
dependencies = [
"byteorder",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
[[package]]
name = "reqwest"
@ -2319,17 +2349,26 @@ checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d"
[[package]]
name = "sha2"
version = "0.9.3"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de"
checksum = "d8f6b75b17576b792bef0db1bcc4b8b8bcdf9506744cf34b974195487af6cff2"
dependencies = [
"block-buffer",
"cfg-if 1.0.0",
"cpuid-bool 0.1.2",
"cpufeatures",
"digest",
"opaque-debug",
]
[[package]]
name = "sharded-slab"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79c719719ee05df97490f80a45acfc99e5a30ce98a1e4fb67aee422745ae14e3"
dependencies = [
"lazy_static",
]
[[package]]
name = "signal-hook-registry"
version = "1.3.0"
@ -2491,9 +2530,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2"
[[package]]
name = "syn"
version = "1.0.71"
version = "1.0.72"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373"
checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82"
dependencies = [
"proc-macro2",
"quote",
@ -2558,6 +2597,15 @@ dependencies = [
"syn",
]
[[package]]
name = "thread_local"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd"
dependencies = [
"once_cell",
]
[[package]]
name = "time"
version = "0.2.26"
@ -2613,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.5.0"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
dependencies = [
"autocfg 1.0.1",
"bytes",
@ -2684,9 +2732,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
[[package]]
name = "tracing"
version = "0.1.25"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f"
checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
dependencies = [
"cfg-if 1.0.0",
"pin-project-lite",
@ -2707,9 +2755,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.17"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f"
checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
dependencies = [
"lazy_static",
]
@ -2724,6 +2772,49 @@ dependencies = [
"tracing",
]
[[package]]
name = "tracing-log"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
dependencies = [
"lazy_static",
"log",
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa5553bf0883ba7c9cbe493b085c29926bd41b66afc31ff72cf17ff4fb60dcd5"
dependencies = [
"ansi_term",
"chrono",
"lazy_static",
"matchers",
"regex",
"serde",
"serde_json",
"sharded-slab",
"smallvec 1.6.1",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]]
name = "try-lock"
version = "0.2.3"
@ -2756,9 +2847,9 @@ dependencies = [
[[package]]
name = "unicode-xid"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]]
name = "unindent"
@ -2784,9 +2875,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]]
name = "url"
version = "2.2.1"
version = "2.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b"
checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c"
dependencies = [
"form_urlencoded",
"idna",
@ -2846,9 +2937,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]]
name = "wasm-bindgen"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9"
checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd"
dependencies = [
"cfg-if 1.0.0",
"serde",
@ -2858,9 +2949,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae"
checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900"
dependencies = [
"bumpalo",
"lazy_static",
@ -2873,9 +2964,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.23"
version = "0.4.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea"
checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1"
dependencies = [
"cfg-if 1.0.0",
"js-sys",
@ -2885,9 +2976,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f"
checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@ -2895,9 +2986,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c"
checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97"
dependencies = [
"proc-macro2",
"quote",
@ -2908,15 +2999,15 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.73"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489"
checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f"
[[package]]
name = "web-sys"
version = "0.3.50"
version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be"
checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582"
dependencies = [
"js-sys",
"wasm-bindgen",

View File

@ -22,4 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
dotenv = "0.15.0"
env_logger = "0.8.3"
log = "0.4.14"
tokio = { version = "1.5.0", features = ["full"] }
tokio = { version = "1.6.0", features = ["full"] }
tracing-subscriber = "0.2.18"

View File

@ -1,4 +1,5 @@
Tools to retrieve articles from multiple newspaper you subscribed to.
Tools to retrieve articles from multiple newspapers you subscribed to, all from
the same place.
**This is a prototype, it isn't stable at all and you may not want to use it if
you expect it to just work !**

View File

@ -2,15 +2,7 @@
use std::convert::TryInto;
use anyhow::Result;
use matrix_sdk::{
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
};
use matrix_sdk::{self, Client, SyncSettings};
use crate::Html;

View File

@ -1,7 +1,6 @@
use std::convert::TryInto;
use std::env;
use log::{error, info};
use log::error;
use matrix_sdk::{
self, async_trait,
events::{
@ -9,7 +8,7 @@ use matrix_sdk::{
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
EventHandler,
};
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};
@ -113,7 +112,6 @@ impl EventHandler for Html {
} else {
return;
};
info!("sending file");
match msg_body.split(' ').collect::<Vec<_>>().as_slice() {
["!html", url, ..] => send_article(*url, room).await,

View File

@ -8,23 +8,22 @@ publish = false
[dependencies]
anyhow = "1.0.40"
async-trait = "0.1.48"
async-trait = "0.1.50"
thiserror = "1.0.24"
url = "2.2.1"
hyper = { version = "0.14.5", features = ["full"] }
url = "2.2.2"
hyper = { version = "0.14.7", features = ["full"] }
hyper-rustls = "0.22.1"
cookie = "0.15.0"
lol_html = "0.3.0"
indoc = "1.0.3"
html-minifier = "3.0.9"
html-minifier = "3.0.13"
bytes = "1.0.1"
base64 = "0.13.0"
futures = "0.3.14"
derive_builder = "0.10.0"
futures = "0.3.15"
derive_builder = "0.10.2"
nipper = "0.1.9"
log = "0.4.14"
env_logger = "0.8.3"
itertools = "0.10.0"
[dev-dependencies]
tokio = "1.5.0"
tokio = "1.6.0"

View File

@ -2,12 +2,12 @@ use std::boxed::Box;
use std::convert::TryInto;
use std::env;
use anyhow::anyhow;
use log::info;
use url::{Host, Url};
use crate::newspaper::Newspaper;
use crate::newspapers::courrier_international::{self, CourrierInternational};
use crate::newspapers::mediapart::{self, Mediapart};
use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
/// Enumerate all errors that can be encountered when using ArticleLocation
#[derive(thiserror::Error, Debug)]
@ -33,16 +33,46 @@ type Newspapers = Vec<Box<dyn Newspaper>>;
pub type Result<T, E = Error> = core::result::Result<T, E>;
fn default_newpapers() -> Result<Newspapers> {
// TODO: same thing is written too much times : how to DRY ?
let config_key = "MEDIAPART_COOKIE".to_string();
let mpruiid = env::var(&config_key)
.map_err(|_| Error::Misconfiguration(config_key))?
.into();
let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid))
.login(mediapart::Login::Mpruuid(mpruiid))
.build()?;
Ok(vec![Box::new(mediapart)])
let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
let spip_session =
env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
let monde_diplo = MondeDiplo::builder()
.login(monde_diplomatique::Login::Cookies {
lmd_a_m,
phpsessid,
spip_session,
})
.build()?;
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
let courrier_international = CourrierInternational::builder()
.login(courrier_international::Login::Cookies { lmd_a_m, ssess })
.build()?;
Ok(vec![
Box::new(mediapart),
Box::new(monde_diplo),
Box::new(courrier_international),
])
}
#[derive(Default)]
@ -126,7 +156,6 @@ impl ArticleLocation {
}
pub async fn retrieve_html(&self) -> Result<String> {
info!("It will download from {}", self.url);
// TODO: modify when retrieve_html returns a specific Error type
Ok(self.newspaper.retrieve_html(&self.url).await?)
}

View File

@ -0,0 +1,144 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use indoc::indoc;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Credentials accepted by `Builder::login` for Courrier international.
pub enum Login {
/// Username and password pair. `Builder::login` currently panics with
/// `unimplemented!` when given this variant.
Username(String, String),
/// Values stored by `Builder::login` as the `lmd_a_m` cookie and the
/// `SSESS862c7003d721c672d39f161b1456b890` cookie respectively.
Cookies { lmd_a_m: String, ssess: String },
}
/// Courrier international newspaper, carrying the cookies sent when
/// downloading an article.
#[derive(Debug, Clone, Default)]
pub struct CourrierInternational {
/// `(name, value)` pairs that `retrieve_html` turns into `Cookie`s.
login_cookies: Vec<(String, String)>,
}
/// Wraps a domain name into a [`Host::Domain`].
fn str_to_host<S>(host: S) -> Host
where
    S: Into<String>,
{
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Builder for `CourrierInternational`.
#[derive(Debug, Clone, Default)]
pub struct Builder {
/// Cookies registered through `login`; `None` until `login` has been called,
/// in which case `build` returns an error.
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
    /// Registers the credentials used to access the newspaper.
    ///
    /// With [`Login::Cookies`], the two values are stored as the `lmd_a_m`
    /// cookie and the `SSESS862c7003d721c672d39f161b1456b890` cookie. Calling
    /// this method again replaces the previously registered credentials.
    ///
    /// # Panics
    ///
    /// Panics when given [`Login::Username`]: logging in with a username and
    /// a password is not implemented.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookies = match login {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and password not implemented")
            }
            Login::Cookies { lmd_a_m, ssess } => Some(vec![
                ("lmd_a_m".into(), lmd_a_m),
                ("SSESS862c7003d721c672d39f161b1456b890".into(), ssess),
            ]),
        };
        self
    }

    /// Builds a [`CourrierInternational`] from the registered credentials.
    ///
    /// # Errors
    ///
    /// Returns an error when no credentials were registered with
    /// [`Builder::login`].
    pub fn build(&self) -> Result<CourrierInternational> {
        self.login_cookies
            .clone()
            .map(|login_cookies| CourrierInternational { login_cookies })
            .ok_or_else(|| anyhow!("You have to log in to access this newspaper"))
    }
}
#[async_trait]
impl Newspaper for CourrierInternational {
    /// Describes the newspaper: the hosts it is served from and its names.
    fn metadata(&self) -> Metadata {
        let hosts = vec![
            str_to_host("courrierinternational.com"),
            str_to_host("www.courrierinternational.com"),
        ];

        Metadata::builder()
            .hosts(hosts)
            .lower_case_name("courrier-international")
            .name("Courrier international")
            .build()
            .unwrap_or_default()
    }

    /// Downloads the article at `url` with the login cookies and returns it as
    /// a single self-contained html page.
    ///
    /// Fails when the download fails, when the downloader returns no body or
    /// when the body is not valid UTF-8.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        // TODO: replace by builder
        let downloader = Downloader {
            cookies: self
                .login_cookies
                .iter()
                .map(|(name, value)| Cookie::build(name, value).finish())
                .collect::<Vec<_>>(),
        };

        let body = downloader.download(url).await?;
        let html = if let Some(bytes) = body {
            String::from_utf8(bytes.to_vec())?
        } else {
            bail!("404 not found")
        };

        // Selectors of the elements stripped from the page.
        let elements_to_remove = &[
            // navigation elements
            "header.site-header",
            "footer.site-footer",
            // Social buttons
            "#toolbox-share",
            ".toolbox-share",
            ".toolbox-print",
            ".toolbox-respond",
            ".toolbox-zen",
            ".toolbox-newsletter",
            ".toolbox-offer",
            ".box-article-offer-friend-abo",
            // unused services
            ".article-aside",
            ".article-secondary",
            ".article-subject-readmore",
            // misc
            ".element-invisible",
            ".gptcontainer",
        ];

        // FIXME: it doesn't work because the aside is in the article body
        let toolbox_style = indoc! {"
            aside.article-toolbox {
            position: sticky;
            top: 1em;
            }
        "};

        let self_contained = tools::self_contained_html::Config {
            downloader: Some(&downloader),
            base_url: Some(&url),
            elements_to_remove,
            styles_to_add: &[toolbox_style],
            ..Default::default()
        }
        .run(&html)
        .await;

        Ok(self_contained)
    }

    /// Creates the newspaper without any login cookie.
    fn new() -> Self {
        Self::default()
    }

    /// Currently always reports complete access.
    async fn has_complete_access(&self) -> bool {
        // TODO: check if we are logged using the cookie
        true
    }
}
impl CourrierInternational {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -1,4 +1,4 @@
use anyhow::{anyhow, Result};
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
@ -10,7 +10,7 @@ use crate::{Download, Downloader};
pub enum Login {
Username(String, String),
MPRUUID(String),
Mpruuid(String),
}
#[derive(Debug, Clone, Default)]
@ -33,7 +33,7 @@ impl Builder {
Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented")
}
Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
};
self
}
@ -80,10 +80,13 @@ impl Newspaper for Mediapart {
let downloader = Downloader { cookies };
let body = downloader.download(&url).await?;
let html = String::from_utf8(body.to_vec())?;
let html = match body {
Some(body) => String::from_utf8(body.to_vec())?,
None => bail!("404 not found"),
};
// TODO: Move to const
let element_to_remove = [
let elements_to_remove = &[
// header
".fb-root",
".skipLinks",
@ -101,8 +104,14 @@ impl Newspaper for Mediapart {
"aside.cc-modal",
];
let single_page_html =
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await;
let single_page_html = tools::self_contained_html::Config {
downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html)
}

View File

@ -1 +1,3 @@
pub mod courrier_international;
pub mod mediapart;
pub mod monde_diplomatique;

View File

@ -0,0 +1,137 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Credentials accepted by `Builder::login` for Le Monde Diplomatique.
pub enum Login {
/// Username and password pair. `Builder::login` currently panics with
/// `unimplemented!` when given this variant.
Username(String, String),
/// Values stored by `Builder::login` as the `lmd_a_m`, `PHPSESSID` and
/// `spip_session` cookies respectively.
Cookies {
lmd_a_m: String,
phpsessid: String,
spip_session: String,
},
}
/// Le Monde Diplomatique newspaper, carrying the cookies sent when
/// downloading an article.
#[derive(Debug, Clone, Default)]
pub struct MondeDiplo {
/// `(name, value)` pairs that `retrieve_html` turns into `Cookie`s.
login_cookies: Vec<(String, String)>,
}
/// Wraps a domain name into a [`Host::Domain`].
fn str_to_host<S>(host: S) -> Host
where
    S: Into<String>,
{
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Builder for `MondeDiplo`.
#[derive(Debug, Clone, Default)]
pub struct Builder {
/// Cookies registered through `login`; `None` until `login` has been called,
/// in which case `build` returns an error.
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
    /// Registers the credentials used to access the newspaper.
    ///
    /// With [`Login::Cookies`], the three values are stored as the `lmd_a_m`,
    /// `PHPSESSID` and `spip_session` cookies. Calling this method again
    /// replaces the previously registered credentials.
    ///
    /// # Panics
    ///
    /// Panics when given [`Login::Username`]: logging in with a username and
    /// a password is not implemented.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookies = match login {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and password not implemented")
            }
            Login::Cookies {
                lmd_a_m,
                phpsessid,
                spip_session,
            } => Some(vec![
                ("lmd_a_m".into(), lmd_a_m),
                ("PHPSESSID".into(), phpsessid),
                ("spip_session".into(), spip_session),
            ]),
        };
        self
    }

    /// Builds a [`MondeDiplo`] from the registered credentials.
    ///
    /// # Errors
    ///
    /// Returns an error when no credentials were registered with
    /// [`Builder::login`].
    pub fn build(&self) -> Result<MondeDiplo> {
        self.login_cookies
            .clone()
            .map(|login_cookies| MondeDiplo { login_cookies })
            .ok_or_else(|| anyhow!("You have to log in to access this newspaper"))
    }
}
#[async_trait]
impl Newspaper for MondeDiplo {
    /// Describes the newspaper: the hosts it is served from and its names.
    fn metadata(&self) -> Metadata {
        let hosts = vec![
            str_to_host("monde-diplomatique.fr"),
            str_to_host("www.monde-diplomatique.fr"),
        ];

        Metadata::builder()
            .hosts(hosts)
            .lower_case_name("monde-diplomatique")
            .name("Le Monde Diplomatique")
            .build()
            .unwrap_or_default()
    }

    /// Downloads the article at `url` with the login cookies and returns it as
    /// a single self-contained html page.
    ///
    /// Fails when the download fails, when the downloader returns no body or
    /// when the body is not valid UTF-8.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        // TODO: replace by builder
        let downloader = Downloader {
            cookies: self
                .login_cookies
                .iter()
                .map(|(name, value)| Cookie::build(name, value).finish())
                .collect::<Vec<_>>(),
        };

        let body = downloader.download(url).await?;
        let html = if let Some(bytes) = body {
            String::from_utf8(bytes.to_vec())?
        } else {
            bail!("404 not found")
        };

        // TODO: Move to const
        // Selectors of the elements stripped from the page.
        let elements_to_remove = &[
            // navigation elements
            "#tout-en-haut.preentete",
            "#entete.connecte",
            "#navigation",
            "#pied",
            ".bloc-connexion",
            // unused features
            "#ecouter",
            // Social buttons
            ".actions-article",
            "#partage",
            // misc
            "noscript",
        ];

        let self_contained = tools::self_contained_html::Config {
            downloader: Some(&downloader),
            base_url: Some(&url),
            elements_to_remove,
            ..Default::default()
        }
        .run(&html)
        .await;

        Ok(self_contained)
    }

    /// Creates the newspaper without any login cookie.
    fn new() -> Self {
        Self::default()
    }

    /// Currently always reports complete access.
    async fn has_complete_access(&self) -> bool {
        // TODO: check if we are logged using the cookie
        true
    }
}
impl MondeDiplo {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -4,7 +4,7 @@ use anyhow::Result;
use async_trait::async_trait;
use bytes::Bytes;
use cookie::Cookie;
use hyper::{header, Body, Client, Method, Request};
use hyper::{header, Body, Client, Method, Request, StatusCode};
use thiserror::Error;
use url::Url;
@ -22,7 +22,9 @@ pub trait Download {
type Error: StdError;
/// Downloads a file from an url and returns the result as bytes
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>;
///
/// If the file is not found, returns None
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>;
}
/// Store several cookies
@ -36,7 +38,8 @@ pub struct Downloader<'c> {
impl<'c> Download for Downloader<'c> {
type Error = DownloadError;
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> {
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
log::debug!("downloading url {:?}", file_link);
let https = hyper_rustls::HttpsConnector::with_native_roots();
let client: Client<_, hyper::Body> = Client::builder().build(https);
@ -44,14 +47,26 @@ impl<'c> Download for Downloader<'c> {
.method(Method::GET)
.uri(file_link.as_str());
for cookie in &self.cookies {
req = req.header(header::COOKIE, cookie.to_string());
}
req = req.header(
header::COOKIE,
self.cookies
.iter()
.map(Cookie::to_string)
.collect::<Vec<_>>()
.join(";"),
);
log::debug!("headers : {:?}", req.headers_ref());
let req = req.body(Body::empty())?;
let resp = client.request(req).await?;
let body = hyper::body::to_bytes(resp).await?;
log::debug!("Response status : {:?}", resp.status());
let body = match resp.status() {
StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
StatusCode::NOT_FOUND => None,
// TODO: enhance this by handling more error codes
_ => None,
};
Ok(body)
}
}

View File

@ -1,5 +1,4 @@
mod download;
mod self_contained_html;
pub mod self_contained_html;
pub use download::{Download, DownloadError, Downloader};
pub use self_contained_html::self_contained_html;

View File

@ -8,142 +8,198 @@ use url::Url;
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download;
/// Makes an html page self-contained
///
/// The `downloader` must implement `Download` and is used to download ressources that are
/// needed to make this page self-contained such as stylesheets or images.
///
/// The function also removes all scripts on the page
pub async fn self_contained_html<E, D>(
html: impl AsRef<str>,
downloader: &D,
base_url: &Url,
elements_to_remove: &[impl AsRef<str>],
) -> String
/// Stores configuration for the self_contained_html function
///
/// `downloader` and `base_url` are optional so that the struct can implement `Default`,
/// but `run` panics when either of them is `None`.
// TODO: write a builder
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// The downloader that will be used to retrieve resources on the page
/// (stylesheets and images)
pub downloader: Option<&'t D>,
/// Base url for downloading resources: the `href`/`src` attributes found in the
/// page are joined to it (presumably the url of the page itself; confirm with callers)
pub base_url: Option<&'t Url>,
/// Selectors of the elements that are removed from the page
pub elements_to_remove: &'t [S1],
/// Styles that are each appended to the `head` of the page inside a `<style>` tag
pub styles_to_add: &'t [S2],
}
impl<'t, E, D> Default for Config<'t, E, D>
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
// - ¿ should be function of a trait ? or only of the configuration struct ?
let (style_urls, html) = {
let document = Document::from(html.as_ref());
// ---- Remove scripts ----
//
document.select("script").remove();
for event in EVENT_HANDLERS {
document
.select(format!("[{}]", event).as_str())
.remove_attr(event);
fn default() -> Self {
Self {
downloader: None,
base_url: None,
elements_to_remove: &[],
styles_to_add: &[],
}
}
}
for rel in LINK_REL_EXTERNAL_RESOURCES {
document
.select(format!("link[rel=\"{}\"]", rel).as_str())
.remove();
}
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// Makes an html page self-contained
///
/// The `downloader` must implement `Download` and is used to download resources that are
/// needed to make this page self-contained, such as stylesheets or images.
///
/// The function also removes all scripts on the page
pub async fn run(&self, html: impl AsRef<str>) -> String {
//TODO: don't panic
let base_url = self.base_url.expect("Base url not defined");
let downloader = self.downloader.expect("Downloader not defined");
// TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - put each modification (ex: style in the `foreach`) in functions, maybe using
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
let (style_urls, html) = {
let document = Document::from(html.as_ref());
// ---- Replace stylesheets ----
// ---- Remove scripts ----
//
document.select("script").remove();
for event in EVENT_HANDLERS {
document
.select(format!("[{}]", event).as_str())
.remove_attr(event);
}
for rel in LINK_REL_EXTERNAL_RESOURCES {
document
.select(format!("link[rel=\"{}\"]", rel).as_str())
.remove();
}
// ---- Replace stylesheets ----
//
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|style_link| {
if let Some(src) = style_link.attr("href") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>();
(styles_url, String::from(document.html()))
};
let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
)
});
let downloaded_styles = futures::future::join_all(style_urls).await;
let html = {
let document = Document::from(&html);
let styles = document.select("link[href][rel=\"stylesheet\"]");
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut style_link, inner_css)| {
if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let media_query = style_link.attr("media");
let css = match media_query {
Some(media_query) => {
format!("<style media=\"{}\">{}</style>", media_query, css)
}
None => format!("<style>{}</style>", css),
};
style_link.replace_with_html(css);
} else {
style_link.remove();
}
});
String::from(document.html())
};
// ---- Replace imgs ----
//
let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets
.iter()
.map(|stylesheet| {
if let Some(src) = stylesheet.attr("href") {
//TODO: does it work with absolute urls ?
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>();
(styles_url, String::from(document.html()))
};
let image_urls = {
let document = Document::from(&html);
let imgs = document.select("img:not([src^=\"data:\"])");
let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() }))
});
let downloaded_styles = futures::future::join_all(style_urls).await;
imgs.iter()
.map(|image| {
if let Some(src) = image.attr("src") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>()
};
let html = {
let document = Document::from(&html);
let styles = document.select("link[href][rel=\"stylesheet\"]");
let downloaded_images = image_urls.into_iter().map(|image_url| {
OptionFuture::from(image_url.map(|url| async move {
let data = downloader.download(&url).await.unwrap();
(url, data)
}))
});
let downloaded_images = futures::future::join_all(downloaded_images).await;
styles
.iter()
.zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| {
if let Some(inner_css) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css);
stylesheet.replace_with_html(css);
} else {
stylesheet.remove();
}
});
String::from(document.html())
};
let html = {
let document = Document::from(&html);
let imgs = document.select("img:not([src^=\"data:\"])");
// ---- Replace imgs ----
//
let image_urls = {
let document = Document::from(&html);
let imgs = document.select("img");
imgs.iter()
.zip(downloaded_images.iter())
.for_each(|(mut img, data)| {
if let Some((url, Some(data))) = data {
let data = base64::encode(data);
//TODO: use an extension hashmap
let extension =
Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else {
img.remove()
}
});
// ---- Remove unwanted html elements -----
//
for element in self.elements_to_remove {
document.select(element.as_ref()).remove();
}
imgs.iter()
.map(|image| {
if let Some(src) = image.attr("src") {
base_url.join(src.as_ref()).ok()
} else {
None
}
})
.collect::<Vec<_>>()
};
// ---- Add additional styles ----
//
for style in self.styles_to_add {
document
.select("head")
.append_html(format!("\n<style>{}</style>\n", style.as_ref()));
}
let downloaded_images = image_urls.into_iter().map(|image_url| {
OptionFuture::from(image_url.map(|url| async move {
let data = downloader.download(&url).await.unwrap();
(url, data)
}))
});
let downloaded_images = futures::future::join_all(downloaded_images).await;
String::from(document.html())
};
let html = {
let document = Document::from(&html);
let imgs = document.select("img");
imgs.iter()
.zip(downloaded_images.iter())
.for_each(|(mut img, data)| {
if let Some((url, data)) = data {
let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
}
});
// ---- Remove unwanted html elements -----
// ---- output ----
//
for element in elements_to_remove {
document.select(element.as_ref()).remove();
}
String::from(document.html())
};
let mut minifier = HTMLMinifier::new();
minifier.digest(html.as_str()).unwrap();
// ---- output ----
//
let mut minifier = HTMLMinifier::new();
minifier.digest(html.as_str()).unwrap();
String::from_utf8(minifier.get_html().into()).unwrap()
String::from_utf8(minifier.get_html().into()).unwrap()
}
}
#[cfg(test)]
mod tests {
// TODO: reduce boilerplate, DRY
use super::*;
@ -168,8 +224,8 @@ mod tests {
#[async_trait]
impl Download for DummyDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
Ok(Bytes::from(""))
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
Ok(Some(Bytes::from("")))
}
}
@ -178,9 +234,14 @@ mod tests {
let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {};
let to_remove: &[&str] = &[];
assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
"<html><head></head><body></body></html>"
);
Ok(())
@ -204,10 +265,13 @@ mod tests {
};
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in EVENT_HANDLERS {
assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}.run(html(s)).await,
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
);
}
@ -232,10 +296,15 @@ mod tests {
};
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in LINK_REL_EXTERNAL_RESOURCES {
assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html(s))
.await,
"<html><head>\n</head>\n<body>\n</body></html>"
);
}
@ -246,12 +315,14 @@ mod tests {
#[async_trait]
impl Download for CssDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
Ok(indoc! {"
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
Ok(Some(
indoc! {"
section#warning {
color: red;
}"}
.into())
.into(),
))
}
}
@ -286,9 +357,57 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
}
#[tokio::test]
async fn download_css_with_media_query() -> Result<()> {
let downloader = CssDownloader {};
let html = indoc! {"
<html>
<head>
<link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
</head>
<body>
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head>
<style media=\"print\">
section#warning {
color: red;
}
</style>
</head>
<body>
</body></html>
"};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
@ -298,12 +417,12 @@ mod tests {
#[async_trait]
impl Download for PngDownloader {
type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> {
async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
let image_path = Path::new("test_data/home.png");
let mut image_file = File::open(&image_path).unwrap();
let mut image_buf: Vec<u8> = vec![];
image_file.read_to_end(&mut image_buf).unwrap();
Ok(image_buf.into())
Ok(Some(image_buf.into()))
}
}
@ -333,9 +452,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await,
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
@ -368,12 +492,67 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!(
self_contained_html(
html,
&downloader,
&base_url,
&["header", ".placeholder", "article > span.huge"]
)
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
elements_to_remove: &["header", ".placeholder", "article > span.huge"],
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
}
#[tokio::test]
async fn add_style() -> Result<()> {
let html = indoc! {"
<html>
<head>
<meta charset=\"UTF-8\">
</head>
<body>
The body
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head>
<meta charset=\"UTF-8\">
<style>
body {
margin: 3em;
}
</style>
</head>
<body>
The body
</body></html>
"};
let style_to_add = indoc! {"
body {
margin: 3em;
}
"};
let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
styles_to_add: &[style_to_add],
..Default::default()
}
.run(html)
.await,
minified
);

View File

@ -37,7 +37,8 @@ frame "backend" {
newspaper -> retrieval_tools: uses to implement
article_location --> article_repr :uses
article_location --> article_repr: uses
retrieval_tools -up-> article_repr: uses
auto_retrieve --> rss: watches
auto_retrieve --> article_location

View File

@ -8,3 +8,24 @@ The newspapers are configured using environment variables
MEDIAPART_COOKIE
: sets the `MPRUUID` cookie, used to log in
# Le Monde Diplomatique
All cookies are mandatory to log in
MONDE_DIPLO_LMD_A_M
: sets the `lmd_a_m` cookie
MONDE_DIPLO_PHPSESSID
: sets the `PHPSESSID` cookie
MONDE_DIPLO_SPIP_SESSION
: sets the `spip_session` cookie
# Courrier international
COURRIER_INTERNATIONAL_LMD_A_M
: sets the `lmd_a_m` cookie
COURRIER_INTERNATIONAL_SSESS
: sets the `ssess` cookie

View File

@ -2,38 +2,27 @@ use std::convert::TryInto;
use std::env;
use anyhow::Result;
use crieur_retrieve::{
newspaper::Newspaper,
newspapers::mediapart::{self, Mediapart},
ArticleLocation, Url,
};
use crieur_retrieve::{ArticleLocation, Url};
use dotenv::dotenv;
use log::info;
#[tokio::main]
async fn main() -> Result<()> {
dotenv().ok();
env_logger::init();
tracing_subscriber::fmt()
.with_writer(std::io::stderr)
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let url = match env::args().nth(1) {
Some(url) => Url::parse(&url)?,
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
};
// TODO: remove this in favor of default newspapers
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid))
.build()?;
info!("Trying to download article from {}", url);
// TODO: shorten this, maybe an helper function ?
let article_location = ArticleLocation::builder()
.url(url)?
.newspaper(mediapart)
.build()?;
let article_location = ArticleLocation::builder().url(url)?.build()?;
let article_str = article_location.retrieve_html().await?;

View File

@ -1,19 +1,29 @@
@build:
cargo build
cargo build
@build-container:
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
@clean:
cargo clean
cargo clean
@run:
cargo run
cargo run
@test:
cargo test --all
@clippy:
cargo clippy
@fmt:
cargo fmt
@simulate-ci: fmt clippy test
@container:
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
@audit:
cargo audit
cargo audit
@crev:
cargo crev verify
@verify: audit crev
cargo crev verify