Compare commits

..

8 Commits

Author SHA1 Message Date
koalp
16ad14467e Merge pull request 'add several newspapers' (#36) from feature/additional_newspapers into development
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
Reviewed-on: #36
2021-05-22 02:50:43 +00:00
e34edf0b21
fix: keep media queries in ref styles
All checks were successful
continuous-integration/drone/push Build is passing
Previously, media queries weren't kept when downloading styles from ref
tags.

It has been fixed so that media attributes are kept when creating style
tags from ref tags.
2021-05-22 04:41:08 +02:00
40ebc1ddea
feat: allow to inject styles 2021-05-22 04:41:08 +02:00
6e091a32fc
chore: use a config struct for self_contained_html
Previously, self_html_function was a function taking all parameters as
arguments.
As new optional parameters are being added, the function had too many
arguments and each usage of the function would have to be modified each
time an argument is added.

Therefore, it has been moved to a configuration structure with a `run`
function taking only one argument, the html string.
2021-05-22 04:41:08 +02:00
5d0872b4d9
feat : add retrieve from courrier international
Retrieval of articles from Courrier international has been added
2021-05-22 04:41:08 +02:00
cee0af6c3c
fix: only select images that have non-data src
Previously, when the image URL contained data, it tried to parse a URL
and failed, instead of keeping the data.

It has been fixed so that images whose URL starts with 'data' are
not modified.
2021-05-22 04:41:08 +02:00
970f510cd1
feat: add retrieval from le monde diplomatique
Add retrieval from le monde diplomatique

Previously, 404 pages were injected in the document when downloading
styles
Now, the downloader returns None when documents are not found
2021-05-22 04:41:01 +02:00
koalp
8afd74995b Merge pull request 'feature/several_rooms' (#32) from feature/several_rooms into development
All checks were successful
continuous-integration/drone/push Build is passing
Reviewed-on: #32
2021-05-08 01:34:08 +00:00
18 changed files with 919 additions and 302 deletions

271
Cargo.lock generated
View File

@ -74,13 +74,22 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "0.7.15" version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi",
]
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.40" version = "1.0.40"
@ -213,6 +222,18 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
dependencies = [
"libc",
"num-integer",
"num-traits",
"winapi",
]
[[package]] [[package]]
name = "cipher" name = "cipher"
version = "0.2.5" version = "0.2.5"
@ -285,10 +306,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
[[package]] [[package]]
name = "cpuid-bool" name = "cpufeatures"
version = "0.1.2" version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" checksum = "dec1028182c380cc45a2e2c5ec841134f2dfd0f8f5f0a5bcd68004f81b5efdf4"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "cpuid-bool" name = "cpuid-bool"
@ -307,6 +331,7 @@ dependencies = [
"env_logger", "env_logger",
"log", "log",
"tokio", "tokio",
"tracing-subscriber",
] ]
[[package]] [[package]]
@ -337,7 +362,6 @@ dependencies = [
"hyper", "hyper",
"hyper-rustls", "hyper-rustls",
"indoc", "indoc",
"itertools",
"log", "log",
"lol_html", "lol_html",
"nipper", "nipper",
@ -569,12 +593,6 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "either"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]] [[package]]
name = "encoding_rs" name = "encoding_rs"
version = "0.8.28" version = "0.8.28"
@ -644,9 +662,9 @@ dependencies = [
[[package]] [[package]]
name = "futures" name = "futures"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253" checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27"
dependencies = [ dependencies = [
"futures-channel", "futures-channel",
"futures-core", "futures-core",
@ -659,9 +677,9 @@ dependencies = [
[[package]] [[package]]
name = "futures-channel" name = "futures-channel"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25" checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2"
dependencies = [ dependencies = [
"futures-core", "futures-core",
"futures-sink", "futures-sink",
@ -669,15 +687,15 @@ dependencies = [
[[package]] [[package]]
name = "futures-core" name = "futures-core"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815" checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1"
[[package]] [[package]]
name = "futures-executor" name = "futures-executor"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d" checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79"
dependencies = [ dependencies = [
"futures-core", "futures-core",
"futures-task", "futures-task",
@ -686,9 +704,9 @@ dependencies = [
[[package]] [[package]]
name = "futures-io" name = "futures-io"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04" checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1"
[[package]] [[package]]
name = "futures-locks" name = "futures-locks"
@ -701,10 +719,11 @@ dependencies = [
[[package]] [[package]]
name = "futures-macro" name = "futures-macro"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b" checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121"
dependencies = [ dependencies = [
"autocfg 1.0.1",
"proc-macro-hack", "proc-macro-hack",
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -713,15 +732,15 @@ dependencies = [
[[package]] [[package]]
name = "futures-sink" name = "futures-sink"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23" checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282"
[[package]] [[package]]
name = "futures-task" name = "futures-task"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc" checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae"
[[package]] [[package]]
name = "futures-timer" name = "futures-timer"
@ -735,10 +754,11 @@ dependencies = [
[[package]] [[package]]
name = "futures-util" name = "futures-util"
version = "0.3.14" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025" checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967"
dependencies = [ dependencies = [
"autocfg 1.0.1",
"futures-channel", "futures-channel",
"futures-core", "futures-core",
"futures-io", "futures-io",
@ -821,9 +841,9 @@ dependencies = [
[[package]] [[package]]
name = "h2" name = "h2"
version = "0.3.2" version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc018e188373e2777d0ef2467ebff62a08e66c3f5857b23c8fbec3018210dc00" checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726"
dependencies = [ dependencies = [
"bytes", "bytes",
"fnv", "fnv",
@ -877,9 +897,9 @@ dependencies = [
[[package]] [[package]]
name = "html-minifier" name = "html-minifier"
version = "3.0.10" version = "3.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "108452631307790510cde91282fc706ae70076bd68200add8638773f06d5e122" checksum = "70f11cbdecf4fde3b74532e8d6d5926b6354adfda3fe259ed585402f9ae0dbaf"
dependencies = [ dependencies = [
"cow-utils", "cow-utils",
"educe", "educe",
@ -914,9 +934,9 @@ dependencies = [
[[package]] [[package]]
name = "http-body" name = "http-body"
version = "0.4.1" version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737" checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9"
dependencies = [ dependencies = [
"bytes", "bytes",
"http", "http",
@ -1038,15 +1058,6 @@ version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135"
[[package]]
name = "itertools"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
dependencies = [
"either",
]
[[package]] [[package]]
name = "itoa" name = "itoa"
version = "0.4.7" version = "0.4.7"
@ -1055,9 +1066,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.50" version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c" checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062"
dependencies = [ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
@ -1091,9 +1102,9 @@ checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.3" version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3c91c24eae6777794bb1997ad98bbb87daf92890acab859f7eaa4320333176" checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb"
dependencies = [ dependencies = [
"scopeguard", "scopeguard",
] ]
@ -1158,6 +1169,15 @@ dependencies = [
"tendril", "tendril",
] ]
[[package]]
name = "matchers"
version = "0.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
dependencies = [
"regex-automata",
]
[[package]] [[package]]
name = "matches" name = "matches"
version = "0.1.8" version = "0.1.8"
@ -1254,9 +1274,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.3.4" version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
[[package]] [[package]]
name = "mime" name = "mime"
@ -1266,9 +1286,9 @@ checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
[[package]] [[package]]
name = "minifier" name = "minifier"
version = "0.0.39" version = "0.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6cdf618de5c9c98d4a7b2e0d1f1e44f82a19196cfd94040bb203621c25d28d98" checksum = "5594542d20834f2b974f5e5fb8e0cf1c67a2119dcadc29ef5d93a081fb30cc08"
dependencies = [ dependencies = [
"macro-utils", "macro-utils",
] ]
@ -1585,7 +1605,7 @@ version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd" checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd"
dependencies = [ dependencies = [
"cpuid-bool 0.2.0", "cpuid-bool",
"opaque-debug", "opaque-debug",
"universal-hash", "universal-hash",
] ]
@ -1856,18 +1876,18 @@ dependencies = [
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.2.7" version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2" checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc"
dependencies = [ dependencies = [
"bitflags", "bitflags",
] ]
[[package]] [[package]]
name = "regex" name = "regex"
version = "1.4.6" version = "1.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
@ -1875,10 +1895,20 @@ dependencies = [
] ]
[[package]] [[package]]
name = "regex-syntax" name = "regex-automata"
version = "0.6.23" version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
dependencies = [
"byteorder",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
[[package]] [[package]]
name = "reqwest" name = "reqwest"
@ -2319,17 +2349,26 @@ checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d"
[[package]] [[package]]
name = "sha2" name = "sha2"
version = "0.9.3" version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de" checksum = "d8f6b75b17576b792bef0db1bcc4b8b8bcdf9506744cf34b974195487af6cff2"
dependencies = [ dependencies = [
"block-buffer", "block-buffer",
"cfg-if 1.0.0", "cfg-if 1.0.0",
"cpuid-bool 0.1.2", "cpufeatures",
"digest", "digest",
"opaque-debug", "opaque-debug",
] ]
[[package]]
name = "sharded-slab"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79c719719ee05df97490f80a45acfc99e5a30ce98a1e4fb67aee422745ae14e3"
dependencies = [
"lazy_static",
]
[[package]] [[package]]
name = "signal-hook-registry" name = "signal-hook-registry"
version = "1.3.0" version = "1.3.0"
@ -2491,9 +2530,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2"
[[package]] [[package]]
name = "syn" name = "syn"
version = "1.0.71" version = "1.0.72"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373" checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -2558,6 +2597,15 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "thread_local"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd"
dependencies = [
"once_cell",
]
[[package]] [[package]]
name = "time" name = "time"
version = "0.2.26" version = "0.2.26"
@ -2613,9 +2661,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]] [[package]]
name = "tokio" name = "tokio"
version = "1.5.0" version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5" checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
dependencies = [ dependencies = [
"autocfg 1.0.1", "autocfg 1.0.1",
"bytes", "bytes",
@ -2684,9 +2732,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
[[package]] [[package]]
name = "tracing" name = "tracing"
version = "0.1.25" version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f" checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
dependencies = [ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"pin-project-lite", "pin-project-lite",
@ -2707,9 +2755,9 @@ dependencies = [
[[package]] [[package]]
name = "tracing-core" name = "tracing-core"
version = "0.1.17" version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
dependencies = [ dependencies = [
"lazy_static", "lazy_static",
] ]
@ -2724,6 +2772,49 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "tracing-log"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
dependencies = [
"lazy_static",
"log",
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa5553bf0883ba7c9cbe493b085c29926bd41b66afc31ff72cf17ff4fb60dcd5"
dependencies = [
"ansi_term",
"chrono",
"lazy_static",
"matchers",
"regex",
"serde",
"serde_json",
"sharded-slab",
"smallvec 1.6.1",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]] [[package]]
name = "try-lock" name = "try-lock"
version = "0.2.3" version = "0.2.3"
@ -2756,9 +2847,9 @@ dependencies = [
[[package]] [[package]]
name = "unicode-xid" name = "unicode-xid"
version = "0.2.1" version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]] [[package]]
name = "unindent" name = "unindent"
@ -2784,9 +2875,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]] [[package]]
name = "url" name = "url"
version = "2.2.1" version = "2.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c"
dependencies = [ dependencies = [
"form_urlencoded", "form_urlencoded",
"idna", "idna",
@ -2846,9 +2937,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]] [[package]]
name = "wasm-bindgen" name = "wasm-bindgen"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9" checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd"
dependencies = [ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"serde", "serde",
@ -2858,9 +2949,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-backend" name = "wasm-bindgen-backend"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae" checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900"
dependencies = [ dependencies = [
"bumpalo", "bumpalo",
"lazy_static", "lazy_static",
@ -2873,9 +2964,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-futures" name = "wasm-bindgen-futures"
version = "0.4.23" version = "0.4.24"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81b8b767af23de6ac18bf2168b690bed2902743ddf0fb39252e36f9e2bfc63ea" checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1"
dependencies = [ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"js-sys", "js-sys",
@ -2885,9 +2976,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-macro" name = "wasm-bindgen-macro"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f" checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4"
dependencies = [ dependencies = [
"quote", "quote",
"wasm-bindgen-macro-support", "wasm-bindgen-macro-support",
@ -2895,9 +2986,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-macro-support" name = "wasm-bindgen-macro-support"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c" checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -2908,15 +2999,15 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-shared" name = "wasm-bindgen-shared"
version = "0.2.73" version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489" checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f"
[[package]] [[package]]
name = "web-sys" name = "web-sys"
version = "0.3.50" version = "0.3.51"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be" checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582"
dependencies = [ dependencies = [
"js-sys", "js-sys",
"wasm-bindgen", "wasm-bindgen",

View File

@ -22,4 +22,5 @@ crieur-chatbot = {version = "0.1", path="crieur-chatbot"}
dotenv = "0.15.0" dotenv = "0.15.0"
env_logger = "0.8.3" env_logger = "0.8.3"
log = "0.4.14" log = "0.4.14"
tokio = { version = "1.5.0", features = ["full"] } tokio = { version = "1.6.0", features = ["full"] }
tracing-subscriber = "0.2.18"

View File

@ -1,4 +1,5 @@
Tools to retrieve articles from multiple newspaper you subscribed to. Tools to retrieve articles from multiple newspaper you subscribed to, all from
the same place.
**This is a prototype, it isn't stable at all and you may not want to use it if **This is a prototype, it isn't stable at all and you may not want to use it if
you expect it to just work !** you expect it to just work !**

View File

@ -2,15 +2,7 @@
use std::convert::TryInto; use std::convert::TryInto;
use anyhow::Result; use anyhow::Result;
use matrix_sdk::{ use matrix_sdk::{self, Client, SyncSettings};
self, async_trait,
events::{
room::message::{MessageEventContent, MessageType, TextMessageEventContent},
AnyMessageEventContent, SyncMessageEvent,
},
room::Room,
Client, ClientConfig, EventHandler, SyncSettings,
};
use crate::Html; use crate::Html;

View File

@ -1,7 +1,6 @@
use std::convert::TryInto; use std::convert::TryInto;
use std::env;
use log::{error, info}; use log::error;
use matrix_sdk::{ use matrix_sdk::{
self, async_trait, self, async_trait,
events::{ events::{
@ -9,7 +8,7 @@ use matrix_sdk::{
AnyMessageEventContent, SyncMessageEvent, AnyMessageEventContent, SyncMessageEvent,
}, },
room::Room, room::Room,
Client, ClientConfig, EventHandler, SyncSettings, EventHandler,
}; };
use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url}; use crieur_retrieve::{article_location::Error, article_location::Result, ArticleLocation, Url};
@ -113,7 +112,6 @@ impl EventHandler for Html {
} else { } else {
return; return;
}; };
info!("sending file");
match msg_body.split(' ').collect::<Vec<_>>().as_slice() { match msg_body.split(' ').collect::<Vec<_>>().as_slice() {
["!html", url, ..] => send_article(*url, room).await, ["!html", url, ..] => send_article(*url, room).await,

View File

@ -8,23 +8,22 @@ publish = false
[dependencies] [dependencies]
anyhow = "1.0.40" anyhow = "1.0.40"
async-trait = "0.1.48" async-trait = "0.1.50"
thiserror = "1.0.24" thiserror = "1.0.24"
url = "2.2.1" url = "2.2.2"
hyper = { version = "0.14.5", features = ["full"] } hyper = { version = "0.14.7", features = ["full"] }
hyper-rustls = "0.22.1" hyper-rustls = "0.22.1"
cookie = "0.15.0" cookie = "0.15.0"
lol_html = "0.3.0" lol_html = "0.3.0"
indoc = "1.0.3" indoc = "1.0.3"
html-minifier = "3.0.9" html-minifier = "3.0.13"
bytes = "1.0.1" bytes = "1.0.1"
base64 = "0.13.0" base64 = "0.13.0"
futures = "0.3.14" futures = "0.3.15"
derive_builder = "0.10.0" derive_builder = "0.10.2"
nipper = "0.1.9" nipper = "0.1.9"
log = "0.4.14" log = "0.4.14"
env_logger = "0.8.3" env_logger = "0.8.3"
itertools = "0.10.0"
[dev-dependencies] [dev-dependencies]
tokio = "1.5.0" tokio = "1.6.0"

View File

@ -2,12 +2,12 @@ use std::boxed::Box;
use std::convert::TryInto; use std::convert::TryInto;
use std::env; use std::env;
use anyhow::anyhow;
use log::info;
use url::{Host, Url}; use url::{Host, Url};
use crate::newspaper::Newspaper; use crate::newspaper::Newspaper;
use crate::newspapers::courrier_international::{self, CourrierInternational};
use crate::newspapers::mediapart::{self, Mediapart}; use crate::newspapers::mediapart::{self, Mediapart};
use crate::newspapers::monde_diplomatique::{self, MondeDiplo};
/// Enumerate all errors that can be encountered when using ArticleLocation /// Enumerate all errors that can be encountered when using ArticleLocation
#[derive(thiserror::Error, Debug)] #[derive(thiserror::Error, Debug)]
@ -33,16 +33,46 @@ type Newspapers = Vec<Box<dyn Newspaper>>;
pub type Result<T, E = Error> = core::result::Result<T, E>; pub type Result<T, E = Error> = core::result::Result<T, E>;
fn default_newpapers() -> Result<Newspapers> { fn default_newpapers() -> Result<Newspapers> {
// TODO: same thing is written too much times : how to DRY ?
let config_key = "MEDIAPART_COOKIE".to_string(); let config_key = "MEDIAPART_COOKIE".to_string();
let mpruiid = env::var(&config_key) let mpruiid = env::var(&config_key).map_err(|_| Error::Misconfiguration(config_key))?;
.map_err(|_| Error::Misconfiguration(config_key))?
.into();
let mediapart = Mediapart::builder() let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid)) .login(mediapart::Login::Mpruuid(mpruiid))
.build()?; .build()?;
Ok(vec![Box::new(mediapart)]) let lmd_a_m = "MONDE_DIPLO_LMD_A_M".to_string();
let phpsessid = "MONDE_DIPLO_PHPSESSID".to_string();
let spip_session = "MONDE_DIPLO_SPIP_SESSION".to_string();
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
let phpsessid = env::var(&phpsessid).map_err(|_| Error::Misconfiguration(phpsessid))?;
let spip_session =
env::var(&spip_session).map_err(|_| Error::Misconfiguration(spip_session))?;
let monde_diplo = MondeDiplo::builder()
.login(monde_diplomatique::Login::Cookies {
lmd_a_m,
phpsessid,
spip_session,
})
.build()?;
let lmd_a_m = "COURRIER_INTERNATIONAL_LMD_A_M".to_string();
let ssess = "COURRIER_INTERNATIONAL_SSESS".to_string();
let lmd_a_m = env::var(&lmd_a_m).map_err(|_| Error::Misconfiguration(lmd_a_m))?;
let ssess = env::var(&ssess).map_err(|_| Error::Misconfiguration(ssess))?;
let courrier_international = CourrierInternational::builder()
.login(courrier_international::Login::Cookies { lmd_a_m, ssess })
.build()?;
Ok(vec![
Box::new(mediapart),
Box::new(monde_diplo),
Box::new(courrier_international),
])
} }
#[derive(Default)] #[derive(Default)]
@ -126,7 +156,6 @@ impl ArticleLocation {
} }
pub async fn retrieve_html(&self) -> Result<String> { pub async fn retrieve_html(&self) -> Result<String> {
info!("It will download from {}", self.url);
// TODO: modify when retrieve_html returns a specific Error type // TODO: modify when retrieve_html returns a specific Error type
Ok(self.newspaper.retrieve_html(&self.url).await?) Ok(self.newspaper.retrieve_html(&self.url).await?)
} }

View File

@ -0,0 +1,144 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use indoc::indoc;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Credentials that can be passed to `Builder::login` for Courrier international.
pub enum Login {
/// Username and password pair.
///
/// `Builder::login` currently hits `unimplemented!` when given this variant.
Username(String, String),
/// Cookie values: `lmd_a_m` is stored under the cookie name `lmd_a_m` and
/// `ssess` under the cookie name `SSESS862c7003d721c672d39f161b1456b890`.
Cookies { lmd_a_m: String, ssess: String },
}
/// `Newspaper` implementation for Courrier international.
#[derive(Debug, Clone, Default)]
pub struct CourrierInternational {
/// `(name, value)` cookie pairs; they are turned into `Cookie`s and handed to
/// the `Downloader` each time an article is retrieved.
login_cookies: Vec<(String, String)>,
}
/// Converts anything that can become a `String` into a `Host::Domain`.
fn str_to_host<S>(host: S) -> Host
where
    S: Into<String>,
{
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Builder for `CourrierInternational`.
#[derive(Debug, Clone, Default)]
pub struct Builder {
/// `(name, value)` cookie pairs set by `login`; stays `None` until `login` is called,
/// in which case `build` returns an error.
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
    /// Stores the credentials that the built newspaper will use.
    ///
    /// For `Login::Cookies`, the values are recorded under the cookie names
    /// `lmd_a_m` and `SSESS862c7003d721c672d39f161b1456b890`.
    ///
    /// # Panics
    ///
    /// Panics (`unimplemented!`) when given `Login::Username`: logging in with a
    /// username and a password is not supported yet.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookies = match login {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and password not implemented")
            }
            Login::Cookies { lmd_a_m, ssess } => Some(vec![
                ("lmd_a_m".into(), lmd_a_m),
                ("SSESS862c7003d721c672d39f161b1456b890".into(), ssess),
            ]),
        };
        self
    }

    /// Builds the newspaper from the recorded credentials.
    ///
    /// # Errors
    ///
    /// Returns an error when `login` has not been called beforehand.
    pub fn build(&self) -> Result<CourrierInternational> {
        self.login_cookies
            .as_ref()
            .map(|login_cookies| CourrierInternational {
                // The builder is only borrowed, so the cookies have to be copied.
                login_cookies: login_cookies.clone(),
            })
            .ok_or_else(|| anyhow!("You have to log in to access this newspaper"))
    }
}
#[async_trait]
impl Newspaper for CourrierInternational {
    /// Describes the newspaper: the hosts it serves and its names.
    fn metadata(&self) -> Metadata {
        let hosts = vec![
            str_to_host("courrierinternational.com"),
            str_to_host("www.courrierinternational.com"),
        ];

        Metadata::builder()
            .hosts(hosts)
            .lower_case_name("courrier-international")
            .name("Courrier international")
            .build()
            .unwrap_or_default()
    }

    /// Downloads the article at `url` with the login cookies and returns it as a
    /// self-contained html page, stripped of the elements listed below.
    ///
    /// Fails when the download fails, when the downloader returns no body, or
    /// when the body is not valid UTF-8.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        let mut cookies = Vec::with_capacity(self.login_cookies.len());
        for (name, value) in &self.login_cookies {
            cookies.push(Cookie::build(name, value).finish());
        }

        // TODO: replace by builder
        let downloader = Downloader { cookies };

        let html = match downloader.download(url).await? {
            None => bail!("404 not found"),
            Some(bytes) => String::from_utf8(bytes.to_vec())?,
        };

        let unwanted_selectors = &[
            // navigation elements
            "header.site-header",
            "footer.site-footer",
            // Social buttons
            "#toolbox-share",
            ".toolbox-share",
            ".toolbox-print",
            ".toolbox-respond",
            ".toolbox-zen",
            ".toolbox-newsletter",
            ".toolbox-offer",
            ".box-article-offer-friend-abo",
            // unused services
            ".article-aside",
            ".article-secondary",
            ".article-subject-readmore",
            // misc
            ".element-invisible",
            ".gptcontainer",
        ];

        // FIXME: it doesn't work because the aside is in the article body
        //
        let toolbox_style = indoc! {"
            aside.article-toolbox {
            position: sticky;
            top: 1em;
            }
        "};
        let extra_styles = [toolbox_style];

        let config = tools::self_contained_html::Config {
            downloader: Some(&downloader),
            base_url: Some(url),
            elements_to_remove: unwanted_selectors,
            styles_to_add: &extra_styles,
            ..Default::default()
        };

        Ok(config.run(&html).await)
    }

    /// Creates a newspaper without any login cookie.
    fn new() -> Self {
        Self::default()
    }

    /// Always reports complete access for now.
    async fn has_complete_access(&self) -> bool {
        // TODO: check if we are logged using the cookie
        true
    }
}
impl CourrierInternational {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -1,4 +1,4 @@
use anyhow::{anyhow, Result}; use anyhow::{anyhow, bail, Result};
use async_trait::async_trait; use async_trait::async_trait;
use cookie::Cookie; use cookie::Cookie;
use url::Host; use url::Host;
@ -10,7 +10,7 @@ use crate::{Download, Downloader};
pub enum Login { pub enum Login {
Username(String, String), Username(String, String),
MPRUUID(String), Mpruuid(String),
} }
#[derive(Debug, Clone, Default)] #[derive(Debug, Clone, Default)]
@ -33,7 +33,7 @@ impl Builder {
Login::Username(_username, _password) => { Login::Username(_username, _password) => {
unimplemented!("login using username and passwond not implemented") unimplemented!("login using username and passwond not implemented")
} }
Login::MPRUUID(cookie_value) => Some(("MPRUUID".into(), cookie_value)), Login::Mpruuid(cookie_value) => Some(("MPRUUID".into(), cookie_value)),
}; };
self self
} }
@ -80,10 +80,13 @@ impl Newspaper for Mediapart {
let downloader = Downloader { cookies }; let downloader = Downloader { cookies };
let body = downloader.download(&url).await?; let body = downloader.download(&url).await?;
let html = String::from_utf8(body.to_vec())?; let html = match body {
Some(body) => String::from_utf8(body.to_vec())?,
None => bail!("404 not found"),
};
// TODO: Move to const // TODO: Move to const
let element_to_remove = [ let elements_to_remove = &[
// header // header
".fb-root", ".fb-root",
".skipLinks", ".skipLinks",
@ -101,8 +104,14 @@ impl Newspaper for Mediapart {
"aside.cc-modal", "aside.cc-modal",
]; ];
let single_page_html = let single_page_html = tools::self_contained_html::Config {
tools::self_contained_html(&html, &downloader, &url, &element_to_remove).await; downloader: Some(&downloader),
base_url: Some(&url),
elements_to_remove,
..Default::default()
}
.run(&html)
.await;
Ok(single_page_html) Ok(single_page_html)
} }

View File

@ -1 +1,3 @@
pub mod courrier_international;
pub mod mediapart; pub mod mediapart;
pub mod monde_diplomatique;

View File

@ -0,0 +1,137 @@
use anyhow::{anyhow, bail, Result};
use async_trait::async_trait;
use cookie::Cookie;
use url::Host;
use crate::newspaper::{Metadata, Newspaper};
use crate::tools;
use crate::Url;
use crate::{Download, Downloader};
/// Credentials that can be passed to `Builder::login` for Le Monde Diplomatique.
pub enum Login {
/// Username and password pair.
///
/// `Builder::login` currently hits `unimplemented!` when given this variant.
Username(String, String),
/// Cookie values, stored under the cookie names `lmd_a_m`, `PHPSESSID` and
/// `spip_session` respectively.
Cookies {
lmd_a_m: String,
phpsessid: String,
spip_session: String,
},
}
/// `Newspaper` implementation for Le Monde Diplomatique.
#[derive(Debug, Clone, Default)]
pub struct MondeDiplo {
/// `(name, value)` cookie pairs; they are turned into `Cookie`s and handed to
/// the `Downloader` each time an article is retrieved.
login_cookies: Vec<(String, String)>,
}
/// Converts anything that can become a `String` into a `Host::Domain`.
fn str_to_host<S>(host: S) -> Host
where
    S: Into<String>,
{
    let domain: String = host.into();
    Host::Domain(domain)
}
/// Builder for `MondeDiplo`.
#[derive(Debug, Clone, Default)]
pub struct Builder {
/// `(name, value)` cookie pairs set by `login`; stays `None` until `login` is called,
/// in which case `build` returns an error.
login_cookies: Option<Vec<(String, String)>>,
}
impl Builder {
    /// Stores the credentials that the built newspaper will use.
    ///
    /// For `Login::Cookies`, the values are recorded under the cookie names
    /// `lmd_a_m`, `PHPSESSID` and `spip_session`.
    ///
    /// # Panics
    ///
    /// Panics (`unimplemented!`) when given `Login::Username`: logging in with a
    /// username and a password is not supported yet.
    pub fn login(&mut self, login: Login) -> &mut Self {
        self.login_cookies = match login {
            Login::Username(_username, _password) => {
                unimplemented!("login using username and password not implemented")
            }
            Login::Cookies {
                lmd_a_m,
                phpsessid,
                spip_session,
            } => Some(vec![
                ("lmd_a_m".into(), lmd_a_m),
                ("PHPSESSID".into(), phpsessid),
                ("spip_session".into(), spip_session),
            ]),
        };
        self
    }

    /// Builds the newspaper from the recorded credentials.
    ///
    /// # Errors
    ///
    /// Returns an error when `login` has not been called beforehand.
    pub fn build(&self) -> Result<MondeDiplo> {
        self.login_cookies
            .as_ref()
            .map(|login_cookies| MondeDiplo {
                // The builder is only borrowed, so the cookies have to be copied.
                login_cookies: login_cookies.clone(),
            })
            .ok_or_else(|| anyhow!("You have to log in to access this newspaper"))
    }
}
#[async_trait]
impl Newspaper for MondeDiplo {
    /// Describes the newspaper: the hosts it serves and its names.
    fn metadata(&self) -> Metadata {
        let hosts = vec![
            str_to_host("monde-diplomatique.fr"),
            str_to_host("www.monde-diplomatique.fr"),
        ];

        Metadata::builder()
            .hosts(hosts)
            .lower_case_name("monde-diplomatique")
            .name("Le Monde Diplomatique")
            .build()
            .unwrap_or_default()
    }

    /// Downloads the article at `url` with the login cookies and returns it as a
    /// self-contained html page, stripped of the elements listed below.
    ///
    /// Fails when the download fails, when the downloader returns no body, or
    /// when the body is not valid UTF-8.
    async fn retrieve_html(&self, url: &Url) -> Result<String> {
        let mut cookies = Vec::with_capacity(self.login_cookies.len());
        for (name, value) in &self.login_cookies {
            cookies.push(Cookie::build(name, value).finish());
        }

        // TODO: replace by builder
        let downloader = Downloader { cookies };

        let html = match downloader.download(url).await? {
            None => bail!("404 not found"),
            Some(bytes) => String::from_utf8(bytes.to_vec())?,
        };

        // TODO: Move to const
        let unwanted_selectors = &[
            // navigation elements
            "#tout-en-haut.preentete",
            "#entete.connecte",
            "#navigation",
            "#pied",
            ".bloc-connexion",
            // unused features
            "#ecouter",
            // Social buttons
            ".actions-article",
            "#partage",
            // misc
            "noscript",
        ];

        let config = tools::self_contained_html::Config {
            downloader: Some(&downloader),
            base_url: Some(url),
            elements_to_remove: unwanted_selectors,
            ..Default::default()
        };

        Ok(config.run(&html).await)
    }

    /// Creates a newspaper without any login cookie.
    fn new() -> Self {
        Self::default()
    }

    /// Always reports complete access for now.
    async fn has_complete_access(&self) -> bool {
        // TODO: check if we are logged using the cookie
        true
    }
}
impl MondeDiplo {
pub fn builder() -> Builder {
Builder::default()
}
}

View File

@ -4,7 +4,7 @@ use anyhow::Result;
use async_trait::async_trait; use async_trait::async_trait;
use bytes::Bytes; use bytes::Bytes;
use cookie::Cookie; use cookie::Cookie;
use hyper::{header, Body, Client, Method, Request}; use hyper::{header, Body, Client, Method, Request, StatusCode};
use thiserror::Error; use thiserror::Error;
use url::Url; use url::Url;
@ -22,7 +22,9 @@ pub trait Download {
type Error: StdError; type Error: StdError;
/// Downloads a file from an url and returns the result as bytes /// Downloads a file from an url and returns the result as bytes
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error>; ///
/// If the file is not found, returns None
async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error>;
} }
/// Store several cookies /// Store several cookies
@ -36,7 +38,8 @@ pub struct Downloader<'c> {
impl<'c> Download for Downloader<'c> { impl<'c> Download for Downloader<'c> {
type Error = DownloadError; type Error = DownloadError;
async fn download(&self, file_link: &Url) -> Result<Bytes, Self::Error> { async fn download(&self, file_link: &Url) -> Result<Option<Bytes>, Self::Error> {
log::debug!("downloading url {:?}", file_link);
let https = hyper_rustls::HttpsConnector::with_native_roots(); let https = hyper_rustls::HttpsConnector::with_native_roots();
let client: Client<_, hyper::Body> = Client::builder().build(https); let client: Client<_, hyper::Body> = Client::builder().build(https);
@ -44,14 +47,26 @@ impl<'c> Download for Downloader<'c> {
.method(Method::GET) .method(Method::GET)
.uri(file_link.as_str()); .uri(file_link.as_str());
for cookie in &self.cookies { req = req.header(
req = req.header(header::COOKIE, cookie.to_string()); header::COOKIE,
} self.cookies
.iter()
.map(Cookie::to_string)
.collect::<Vec<_>>()
.join(";"),
);
log::debug!("headers : {:?}", req.headers_ref());
let req = req.body(Body::empty())?; let req = req.body(Body::empty())?;
let resp = client.request(req).await?; let resp = client.request(req).await?;
let body = hyper::body::to_bytes(resp).await?; log::debug!("Response status : {:?}", resp.status());
let body = match resp.status() {
StatusCode::OK => Some(hyper::body::to_bytes(resp).await?),
StatusCode::NOT_FOUND => None,
// TODO: enhance this by handling more error codes
_ => None,
};
Ok(body) Ok(body)
} }
} }

View File

@ -1,5 +1,4 @@
mod download; mod download;
mod self_contained_html; pub mod self_contained_html;
pub use download::{Download, DownloadError, Downloader}; pub use download::{Download, DownloadError, Downloader};
pub use self_contained_html::self_contained_html;

View File

@ -8,26 +8,60 @@ use url::Url;
use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES}; use crate::consts::{EVENT_HANDLERS, LINK_REL_EXTERNAL_RESOURCES};
use crate::Download; use crate::Download;
/// Stores the configuration used by `run` to make an html page self-contained
// TODO: write a builder
pub struct Config<'t, E, D, S1 = &'static str, S2 = &'static str>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// The downloader that will be used to retrieve resources on the page.
///
/// `run` panics when this is `None`.
pub downloader: Option<&'t D>,
/// Base url against which the resource links found in the page are resolved
/// before being downloaded.
///
/// `run` panics when this is `None`.
pub base_url: Option<&'t Url>,
/// Selectors of the elements that `run` removes from the page.
pub elements_to_remove: &'t [S1],
/// Style sheets contents; `run` appends each of them to `head`, wrapped in its
/// own `<style>` tag.
pub styles_to_add: &'t [S2],
}
impl<'t, E, D> Default for Config<'t, E, D>
where
    E: std::error::Error,
    D: Download<Error = E> + Send,
{
    /// Builds a configuration with no downloader, no base url, nothing to
    /// remove and no style to add.
    fn default() -> Self {
        let elements_to_remove: &'t [&'static str] = &[];
        let styles_to_add: &'t [&'static str] = &[];

        Self {
            elements_to_remove,
            styles_to_add,
            base_url: None,
            downloader: None,
        }
    }
}
impl<'t, E, D, S1, S2> Config<'t, E, D, S1, S2>
where
E: std::error::Error,
D: Download<Error = E> + Send,
S1: AsRef<str>,
S2: AsRef<str>,
{
/// Makes an html page self-contained /// Makes an html page self-contained
/// ///
/// The `downloader` must implement `Download` and is used to download ressources that are /// The `downloader` must implement `Download` and is used to download ressources that are
/// needed to make this page self-contained such as stylesheets or images. /// needed to make this page self-contained such as stylesheets or images.
/// ///
/// The function also removes all scripts on the page /// The function also removes all scripts on the page
pub async fn self_contained_html<E, D>( pub async fn run(&self, html: impl AsRef<str>) -> String {
html: impl AsRef<str>, //TODO: don't panic
downloader: &D, let base_url = self.base_url.expect("Base url not defined");
base_url: &Url, let downloader = self.downloader.expect("Downloader not defined");
elements_to_remove: &[impl AsRef<str>],
) -> String
where
E: std::error::Error,
D: Download<Error = E> + Send,
{
// TODO: split/refactor this function : // TODO: split/refactor this function :
// - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ? // - ¿ find a way to ease the |get urls| -> |async download| -> |replace| workflow ?
// - put each modification (ex: style in the `foreach`) in functions, maybe using
// (¿sealed?) Traits : https://git.alpaga.dev/poc/crieur/issues/42
// - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure // - `element_to_remove` , `base_url` and `downloader` should be in a configuration structure
// - ¿ should be function of a trait ? or only of the configuration struct ?
let (style_urls, html) = { let (style_urls, html) = {
let document = Document::from(html.as_ref()); let document = Document::from(html.as_ref());
@ -52,9 +86,8 @@ where
let stylesheets = document.select("link[href][rel=\"stylesheet\"]"); let stylesheets = document.select("link[href][rel=\"stylesheet\"]");
let styles_url = stylesheets let styles_url = stylesheets
.iter() .iter()
.map(|stylesheet| { .map(|style_link| {
if let Some(src) = stylesheet.attr("href") { if let Some(src) = style_link.attr("href") {
//TODO: does it work with absolute urls ?
base_url.join(src.as_ref()).ok() base_url.join(src.as_ref()).ok()
} else { } else {
None None
@ -65,7 +98,9 @@ where
}; };
let style_urls = style_urls.into_iter().map(|style_url| { let style_urls = style_urls.into_iter().map(|style_url| {
OptionFuture::from(style_url.map(|s| async move { downloader.download(&s).await.unwrap() })) OptionFuture::from(
style_url.map(|s| async move { downloader.download(&s).await.unwrap() }),
)
}); });
let downloaded_styles = futures::future::join_all(style_urls).await; let downloaded_styles = futures::future::join_all(style_urls).await;
@ -76,13 +111,19 @@ where
styles styles
.iter() .iter()
.zip(downloaded_styles.iter()) .zip(downloaded_styles.iter())
.for_each(|(mut stylesheet, inner_css)| { .for_each(|(mut style_link, inner_css)| {
if let Some(inner_css) = inner_css { if let Some(Some(inner_css)) = inner_css {
let css = String::from_utf8(inner_css.to_vec()).unwrap(); let css = String::from_utf8(inner_css.to_vec()).unwrap();
let css = format!("<style>{}</style>", css); let media_query = style_link.attr("media");
stylesheet.replace_with_html(css); let css = match media_query {
Some(media_query) => {
format!("<style media=\"{}\">{}</style>", media_query, css)
}
None => format!("<style>{}</style>", css),
};
style_link.replace_with_html(css);
} else { } else {
stylesheet.remove(); style_link.remove();
} }
}); });
String::from(document.html()) String::from(document.html())
@ -92,7 +133,7 @@ where
// //
let image_urls = { let image_urls = {
let document = Document::from(&html); let document = Document::from(&html);
let imgs = document.select("img"); let imgs = document.select("img:not([src^=\"data:\"])");
imgs.iter() imgs.iter()
.map(|image| { .map(|image| {
@ -115,22 +156,35 @@ where
let html = { let html = {
let document = Document::from(&html); let document = Document::from(&html);
let imgs = document.select("img"); let imgs = document.select("img:not([src^=\"data:\"])");
imgs.iter() imgs.iter()
.zip(downloaded_images.iter()) .zip(downloaded_images.iter())
.for_each(|(mut img, data)| { .for_each(|(mut img, data)| {
if let Some((url, data)) = data { if let Some((url, Some(data))) = data {
let data = base64::encode(data); let data = base64::encode(data);
let extension = Path::new(url.path()).extension().unwrap().to_str().unwrap(); //TODO: use an extension hashmap
let extension =
Path::new(url.path()).extension().unwrap().to_str().unwrap();
img.set_attr("src", &format!("data:image/{};base64,{}", extension, data)); img.set_attr("src", &format!("data:image/{};base64,{}", extension, data));
} else {
img.remove()
} }
}); });
// ---- Remove unwanted html elements ----- // ---- Remove unwanted html elements -----
// //
for element in elements_to_remove { for element in self.elements_to_remove {
document.select(element.as_ref()).remove(); document.select(element.as_ref()).remove();
} }
// ---- Add additional styles ----
//
for style in self.styles_to_add {
document
.select("head")
.append_html(format!("\n<style>{}</style>\n", style.as_ref()));
}
String::from(document.html()) String::from(document.html())
}; };
@ -141,9 +195,11 @@ where
String::from_utf8(minifier.get_html().into()).unwrap() String::from_utf8(minifier.get_html().into()).unwrap()
} }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
// TODO: reduce boilerplate, DRY
use super::*; use super::*;
@ -168,8 +224,8 @@ mod tests {
#[async_trait] #[async_trait]
impl Download for DummyDownloader { impl Download for DummyDownloader {
type Error = errors::Error; type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> { async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
Ok(Bytes::from("")) Ok(Some(Bytes::from("")))
} }
} }
@ -178,9 +234,14 @@ mod tests {
let html = "<html><head><script>let id = id => id</script></head><body></body></html>"; let html = "<html><head><script>let id = id => id</script></head><body></body></html>";
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {}; let downloader = DummyDownloader {};
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
"<html><head></head><body></body></html>" "<html><head></head><body></body></html>"
); );
Ok(()) Ok(())
@ -204,10 +265,13 @@ mod tests {
}; };
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in EVENT_HANDLERS { for s in EVENT_HANDLERS {
assert_eq!( assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}.run(html(s)).await,
"<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>" "<html><head>\n</head>\n<body>\n<button class=\"activate\">button</button>\n</body></html>"
); );
} }
@ -232,10 +296,15 @@ mod tests {
}; };
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
for s in LINK_REL_EXTERNAL_RESOURCES { for s in LINK_REL_EXTERNAL_RESOURCES {
assert_eq!( assert_eq!(
self_contained_html(html(s), &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html(s))
.await,
"<html><head>\n</head>\n<body>\n</body></html>" "<html><head>\n</head>\n<body>\n</body></html>"
); );
} }
@ -246,12 +315,14 @@ mod tests {
#[async_trait] #[async_trait]
impl Download for CssDownloader { impl Download for CssDownloader {
type Error = errors::Error; type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> { async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
Ok(indoc! {" Ok(Some(
indoc! {"
section#warning { section#warning {
color: red; color: red;
}"} }"}
.into()) .into(),
))
} }
} }
@ -286,9 +357,57 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified
);
Ok(())
}
#[tokio::test]
async fn download_css_with_media_query() -> Result<()> {
let downloader = CssDownloader {};
let html = indoc! {"
<html>
<head>
<link rel=\"stylesheet\" href=\"main.css\" media=\"print\">
</head>
<body>
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head>
<style media=\"print\">
section#warning {
color: red;
}
</style>
</head>
<body>
</body></html>
"};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified minified
); );
Ok(()) Ok(())
@ -298,12 +417,12 @@ mod tests {
#[async_trait] #[async_trait]
impl Download for PngDownloader { impl Download for PngDownloader {
type Error = errors::Error; type Error = errors::Error;
async fn download(&self, _file_link: &Url) -> errors::Result<Bytes> { async fn download(&self, _file_link: &Url) -> errors::Result<Option<Bytes>> {
let image_path = Path::new("test_data/home.png"); let image_path = Path::new("test_data/home.png");
let mut image_file = File::open(&image_path).unwrap(); let mut image_file = File::open(&image_path).unwrap();
let mut image_buf: Vec<u8> = vec![]; let mut image_buf: Vec<u8> = vec![];
image_file.read_to_end(&mut image_buf).unwrap(); image_file.read_to_end(&mut image_buf).unwrap();
Ok(image_buf.into()) Ok(Some(image_buf.into()))
} }
} }
@ -333,9 +452,14 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
let base_url = Url::parse("http://example.com")?; let base_url = Url::parse("http://example.com")?;
let to_remove: &[&str] = &[];
assert_eq!( assert_eq!(
self_contained_html(html, &downloader, &base_url, to_remove).await, Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
..Default::default()
}
.run(html)
.await,
minified minified
); );
Ok(()) Ok(())
@ -368,12 +492,67 @@ mod tests {
let minified = String::from_utf8(minifier.get_html().into())?; let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!( assert_eq!(
self_contained_html( Config {
html, downloader: Some(&downloader),
&downloader, base_url: Some(&base_url),
&base_url, elements_to_remove: &["header", ".placeholder", "article > span.huge"],
&["header", ".placeholder", "article > span.huge"] ..Default::default()
) }
.run(html)
.await,
minified
);
Ok(())
}
#[tokio::test]
async fn add_style() -> Result<()> {
let html = indoc! {"
<html>
<head>
<meta charset=\"UTF-8\">
</head>
<body>
The body
</body>
</html>
"};
let wanted_html = indoc! {"
<html><head>
<meta charset=\"UTF-8\">
<style>
body {
margin: 3em;
}
</style>
</head>
<body>
The body
</body></html>
"};
let style_to_add = indoc! {"
body {
margin: 3em;
}
"};
let base_url = Url::parse("http://example.com")?;
let downloader = DummyDownloader {};
let mut minifier = HTMLMinifier::new();
minifier.digest(wanted_html)?;
let minified = String::from_utf8(minifier.get_html().into())?;
assert_eq!(
Config {
downloader: Some(&downloader),
base_url: Some(&base_url),
styles_to_add: &[style_to_add],
..Default::default()
}
.run(html)
.await, .await,
minified minified
); );

View File

@ -38,6 +38,7 @@ frame "backend" {
newspaper -> retrieval_tools: uses to implement newspaper -> retrieval_tools: uses to implement
article_location --> article_repr: uses article_location --> article_repr: uses
retrieval_tools -up-> article_repr: uses
auto_retrieve --> rss: watches auto_retrieve --> rss: watches
auto_retrieve --> article_location auto_retrieve --> article_location

View File

@ -8,3 +8,24 @@ The newspapers are configured using environment variables
MEDIAPART_COOKIE MEDIAPART_COOKIE
: sets the `MPRUUID` cookie, used to log in : sets the `MPRUUID` cookie, used to log in
# Le Monde Diplomatique
All cookies are mandatory to log in
MONDE_DIPLO_LMD_A_M
: sets the `lmd_a_m` cookie
MONDE_DIPLO_PHPSESSID
: sets the `PHPSESSID` cookie
MONDE_DIPLO_SPIP_SESSION
: sets the `spip_session` cookie
# Courrier international
All cookies are mandatory to log in
COURRIER_INTERNATIONAL_LMD_A_M
: sets the `lmd_a_m` cookie
COURRIER_INTERNATIONAL_SSESS
: sets the `SSESS862c7003d721c672d39f161b1456b890` cookie

View File

@ -2,38 +2,27 @@ use std::convert::TryInto;
use std::env; use std::env;
use anyhow::Result; use anyhow::Result;
use crieur_retrieve::{ use crieur_retrieve::{ArticleLocation, Url};
newspaper::Newspaper,
newspapers::mediapart::{self, Mediapart},
ArticleLocation, Url,
};
use dotenv::dotenv; use dotenv::dotenv;
use log::info; use log::info;
#[tokio::main] #[tokio::main]
async fn main() -> Result<()> { async fn main() -> Result<()> {
dotenv().ok(); dotenv().ok();
env_logger::init(); tracing_subscriber::fmt()
.with_writer(std::io::stderr)
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let url = match env::args().nth(1) { let url = match env::args().nth(1) {
Some(url) => Url::parse(&url)?, Some(url) => Url::parse(&url)?,
None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?, None => "https://www.mediapart.fr/journal/france/030421/l-hotel-dieu-patients-et-medecins-tentent-de-percer-les-mysteres-du-covid-long".try_into()?,
}; };
// TODO: remove this in favor of default newspapers
let mpruiid = env::var("MEDIAPART_COOKIE")?.into();
let mediapart = Mediapart::builder()
.login(mediapart::Login::MPRUUID(mpruiid))
.build()?;
info!("Trying to download article from {}", url); info!("Trying to download article from {}", url);
// TODO: shorten this, maybe an helper function ? // TODO: shorten this, maybe an helper function ?
let article_location = ArticleLocation::builder() let article_location = ArticleLocation::builder().url(url)?.build()?;
.url(url)?
.newspaper(mediapart)
.build()?;
let article_str = article_location.retrieve_html().await?; let article_str = article_location.retrieve_html().await?;

View File

@ -1,19 +1,29 @@
@build: @build:
cargo build cargo build
@build-container:
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot .
@clean: @clean:
cargo clean cargo clean
@run: @run:
cargo run cargo run
@container: @test:
podman build --file ./containers/chatbot.containerfile -t crieur-chatbot . cargo test --all
@clippy:
cargo clippy
@fmt:
cargo fmt
@simulate-ci: fmt clippy test
@audit: @audit:
cargo audit cargo audit
@crev: @crev:
cargo crev verify cargo crev verify
@verify: audit crev