Compare commits
7 Commits
Author | SHA1 | Date | |
---|---|---|---|
7890ac487a | |||
6b4a54a2c9 | |||
b38a7c1c4c | |||
d8632c9228 | |||
9426f6b855 | |||
55c7bba09d | |||
080a27f77d |
308
Cargo.lock
generated
308
Cargo.lock
generated
@ -17,6 +17,79 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "android-tzdata"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.6.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"anstyle-parse",
|
||||
"anstyle-query",
|
||||
"anstyle-wincon",
|
||||
"colorchoice",
|
||||
"is_terminal_polyfill",
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle"
|
||||
version = "1.0.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-parse"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb"
|
||||
dependencies = [
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-query"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a"
|
||||
dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-wincon"
|
||||
version = "3.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.86"
|
||||
@ -95,6 +168,75 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
|
||||
dependencies = [
|
||||
"android-tzdata",
|
||||
"iana-time-zone",
|
||||
"num-traits",
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.5.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019"
|
||||
dependencies = [
|
||||
"clap_builder",
|
||||
"clap_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_builder"
|
||||
version = "4.5.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.5.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
|
||||
|
||||
[[package]]
|
||||
name = "colored"
|
||||
version = "1.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a5f741c91823341bebf717d4c71bda820630ce065443b58bd1b7451af008355"
|
||||
dependencies = [
|
||||
"is-terminal",
|
||||
"lazy_static",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cookie"
|
||||
version = "0.16.2"
|
||||
@ -197,6 +339,17 @@ version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
|
||||
|
||||
[[package]]
|
||||
name = "fern"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9f0c14694cbd524c8720dd69b0e3179344f04ebb5f90f2e4a440c6ea3b2f1ee"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"colored",
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
@ -321,6 +474,12 @@ version = "0.14.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.3.9"
|
||||
@ -378,6 +537,12 @@ version = "1.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "1.4.1"
|
||||
@ -451,6 +616,29 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
|
||||
dependencies = [
|
||||
"android_system_properties",
|
||||
"core-foundation-sys",
|
||||
"iana-time-zone-haiku",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"windows-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone-haiku"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.5.0"
|
||||
@ -477,6 +665,23 @@ version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
|
||||
|
||||
[[package]]
|
||||
name = "is-terminal"
|
||||
version = "0.4.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.11"
|
||||
@ -492,6 +697,12 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.158"
|
||||
@ -576,6 +787,15 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.36.3"
|
||||
@ -735,6 +955,35 @@ dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.10.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.12.7"
|
||||
@ -898,18 +1147,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.208"
|
||||
version = "1.0.209"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2"
|
||||
checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.208"
|
||||
version = "1.0.209"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf"
|
||||
checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -991,11 +1240,25 @@ name = "stackscraper"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"fantoccini",
|
||||
"fern",
|
||||
"humantime",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "subtle"
|
||||
version = "2.6.1"
|
||||
@ -1281,6 +1544,12 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.15"
|
||||
@ -1405,6 +1674,37 @@ dependencies = [
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows-core"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-registry"
|
||||
version = "0.2.0"
|
||||
|
@ -5,6 +5,14 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.86"
|
||||
clap = { version = "4.5.16", features = ["derive"] }
|
||||
fantoccini = "0.21.1"
|
||||
fern = { version = "0.6.2", features = ["chrono", "colored"] }
|
||||
humantime = "2.1.0"
|
||||
lazy_static = "1.5.0"
|
||||
log = "0.4.22"
|
||||
regex = "1.10.6"
|
||||
reqwest = "0.12.7"
|
||||
serde = { version = "1.0.209", features = ["derive"] }
|
||||
serde_json = "1.0.127"
|
||||
tokio = { version = "1.39.3", features = ["full"] }
|
||||
|
26
Dockerfile
26
Dockerfile
@ -1,17 +1,9 @@
|
||||
FROM ubuntu:latest
|
||||
VOLUME .:/project
|
||||
WORKDIR /project
|
||||
RUN echo '\n\
|
||||
Package: *\n\
|
||||
Pin: origin packages.mozilla.org\n\
|
||||
Pin-Priority: 1000\n\
|
||||
' | tee /etc/apt/preferences.d/mozilla
|
||||
RUN cat /etc/apt/preferences.d/mozilla
|
||||
RUN apt -y update
|
||||
RUN apt -y install curl tar firefox
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -v --default-toolchain nightly --profile complete
|
||||
RUN install -d -m 0755 /etc/apt/keyrings
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://packages.mozilla.org/apt/repo-signing-key.gpg | tee /etc/apt/keyrings/packages.mozilla.org.asc > /dev/null
|
||||
RUN echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" | tee -a /etc/apt/sources.list.d/mozilla.list > /dev/null
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSfL https://github.com/mozilla/geckodriver/releases/download/v0.35.0/geckodriver-v0.35.0-linux64.tar.gz | tar xz
|
||||
CMD ["/bin/bash"]
|
||||
# Build and run StackScraper against a headless Firefox driven by geckodriver.
FROM archlinux:base-devel

# -Syu rather than -Sy: refreshing the package database without upgrading the
# installed packages is a partial upgrade, which Arch does not support.
RUN pacman -Syu --noconfirm geckodriver firefox xorg-server-xvfb

# Install the Rust toolchain via rustup and put cargo on PATH.
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Absolute paths so the layout does not depend on the base image's WORKDIR.
COPY . /StackScraper
WORKDIR /StackScraper
RUN cargo build --release

# Firefox needs an X display; Xvfb provides a virtual one on :99.
ENV DISPLAY=:99

# Start Xvfb and geckodriver in the background, then run the scraper.
ENTRYPOINT Xvfb :99 -ac &>/dev/null & export DISPLAY=:99 && geckodriver &>/dev/null & ./target/release/stackscraper
|
||||
|
1
answers.json
Normal file
1
answers.json
Normal file
File diff suppressed because one or more lines are too long
17
src/analyze.rs
Normal file
17
src/analyze.rs
Normal file
@ -0,0 +1,17 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::Answer;
|
||||
|
||||
pub fn analyze_frequencies(answers: Vec<Answer>) -> HashMap<String, u16> {
|
||||
let mut out: HashMap<String, u16> = HashMap::new();
|
||||
|
||||
for answer in answers {
|
||||
for word in answer.content.replace("\n", " ").split_whitespace() {
|
||||
out.entry(word.to_string())
|
||||
.and_modify(|count| *count += 1)
|
||||
.or_insert(1);
|
||||
}
|
||||
}
|
||||
|
||||
out
|
||||
}
|
100
src/collector.rs
Normal file
100
src/collector.rs
Normal file
@ -0,0 +1,100 @@
|
||||
use fantoccini::{Client, ClientBuilder, Locator};
|
||||
use log::info;
|
||||
use log::warn;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Unwraps a `Result` inside a loop body.
///
/// Evaluates to the `Ok` value; on `Err` it logs the error with `warn!` and
/// `continue`s the enclosing loop, so it may only be used inside a loop.
macro_rules! skip_fail {
    ($outcome:expr) => {
        match $outcome {
            Err(failure) => {
                warn!("An error: {}; skipped.", failure);
                continue;
            }
            Ok(value) => value,
        }
    };
}
|
||||
|
||||
/// Unwraps an `Option` inside a loop body.
///
/// Evaluates to the `Some` value; on `None` it logs a warning with `warn!`
/// and `continue`s the enclosing loop, so it may only be used inside a loop.
macro_rules! skip_fail_opt {
    ($maybe:expr) => {
        match $maybe {
            None => {
                warn!("Unexpected empty value; skipped.");
                continue;
            }
            Some(value) => value,
        }
    };
}
|
||||
|
||||
/// Holds data about stackoverflow answers
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct Answer {
|
||||
upvotes: u32,
|
||||
author: String,
|
||||
pub content: String,
|
||||
}
|
||||
|
||||
/// Get all answers from a stackoverflow domain. No error handling is done so get ready to either
|
||||
/// check your input or "note: run with `RUST_BACKTRACE=1` environment variable to display a
|
||||
/// backtrace"
|
||||
pub async fn get_answers(
|
||||
c: &Client,
|
||||
url: &str,
|
||||
i: usize,
|
||||
links_size: usize,
|
||||
) -> anyhow::Result<Vec<Answer>> {
|
||||
// first, go to the Wikipedia page for Foobar
|
||||
c.goto(url).await?;
|
||||
|
||||
let answer_loc = c.find_all(Locator::Css(".answer")).await?;
|
||||
let mut out_answers = vec![];
|
||||
for (j, answer) in answer_loc.iter().enumerate() {
|
||||
info!("Getting answer {} on link {} of {}", j, i, links_size);
|
||||
let text = skip_fail!(answer.text().await);
|
||||
|
||||
let score =
|
||||
skip_fail!(
|
||||
skip_fail_opt!(text.clone().split('\n').collect::<Vec<&str>>().get(0))
|
||||
.parse::<u32>()
|
||||
);
|
||||
let content = text
|
||||
.split("Share\nImprove this answer")
|
||||
.collect::<Vec<&str>>()[0]
|
||||
.to_string()
|
||||
.replace("\\n", "\n");
|
||||
|
||||
out_answers.push(Answer {
|
||||
upvotes: score,
|
||||
content: content,
|
||||
author: "unimplemented".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(out_answers)
|
||||
}
|
||||
|
||||
pub async fn get_top_links(c: &Client, pages: u16) -> anyhow::Result<Vec<String>> {
|
||||
let mut answers = vec![];
|
||||
for page in 1..=pages {
|
||||
skip_fail!(
|
||||
c.goto(
|
||||
format!(
|
||||
"https://stackoverflow.com/questions?tab=votes&page={}",
|
||||
page
|
||||
)
|
||||
.as_str(),
|
||||
)
|
||||
.await
|
||||
);
|
||||
|
||||
let finds = c.find_all(Locator::Css(".s-link")).await?;
|
||||
for find in finds {
|
||||
if skip_fail_opt!(skip_fail!(find.attr("href").await)).contains("/questions/") {
|
||||
answers.push(find.attr("href").await.unwrap().unwrap());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(answers)
|
||||
}
|
149
src/main.rs
149
src/main.rs
@ -1,49 +1,112 @@
|
||||
use fantoccini::{elements::Element, ClientBuilder, Locator};
|
||||
use fantoccini::{Client, ClientBuilder};
|
||||
pub mod analyze;
|
||||
pub mod collector;
|
||||
use analyze::*;
|
||||
use clap::Parser;
|
||||
use collector::*;
|
||||
use fern::{
|
||||
self,
|
||||
colors::{Color, ColoredLevelConfig},
|
||||
};
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct Answer {
|
||||
upvotes: u32,
|
||||
author: String,
|
||||
content: String,
|
||||
}
|
||||
|
||||
// let's set up the sequence of steps we want the browser to take
|
||||
async fn get_answers(url: &str) -> Vec<Answer> {
|
||||
let c = ClientBuilder::native()
|
||||
.connect("http://localhost:4444")
|
||||
.await
|
||||
.expect("failed to connect to WebDriver");
|
||||
|
||||
// first, go to the Wikipedia page for Foobar
|
||||
c.goto(url).await.unwrap();
|
||||
|
||||
let answer_loc = c.find_all(Locator::Css(".answer")).await.unwrap();
|
||||
let mut out_answers = vec![];
|
||||
for answer in answer_loc {
|
||||
let text = answer.text().await.unwrap();
|
||||
|
||||
let score = text
|
||||
.clone()
|
||||
.split('\n')
|
||||
.collect::<Vec<&str>>()
|
||||
.get(0)
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.unwrap();
|
||||
let content = text;
|
||||
|
||||
out_answers.push(Answer {
|
||||
upvotes: score,
|
||||
content,
|
||||
author: "unimplemented".to_string(),
|
||||
});
|
||||
}
|
||||
c.close().await.unwrap();
|
||||
|
||||
out_answers
|
||||
#[derive(Debug, Parser, Clone)]
|
||||
#[command(about = "Scrape stackoverflow for something idk")]
|
||||
pub struct Args {
|
||||
#[clap(
|
||||
short,
|
||||
long,
|
||||
default_value_t = 5,
|
||||
help = "Amount of pages to scrape for links. Sorted by top voted"
|
||||
)]
|
||||
pages: u16,
|
||||
#[clap(short, long, default_value_t = log::LevelFilter::Info)]
|
||||
log_level: log::LevelFilter,
|
||||
#[clap(short, long)]
|
||||
answers_file: Option<PathBuf>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
dbg!(get_answers("https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array").await);
|
||||
let args = Args::parse();
|
||||
init_fern(args.log_level);
|
||||
|
||||
if let Some(path) = args.answers_file {
|
||||
let answers = serde_json::from_str(&std::fs::read_to_string(path).unwrap()).unwrap();
|
||||
let freqs = analyze_frequencies(answers);
|
||||
let mut freqs = freqs.iter().collect::<Vec<(&String, &u16)>>();
|
||||
freqs.sort_by(|a, b| b.1.cmp(&a.1));
|
||||
for i in &freqs[0..] {
|
||||
println!("{} : {}", i.0, i.1);
|
||||
}
|
||||
} else {
|
||||
let start = std::time::Instant::now();
|
||||
info!("Spawning client");
|
||||
let c: Client = ClientBuilder::native()
|
||||
.connect("http://localhost:4444")
|
||||
.await
|
||||
.unwrap_or_else(|e| {
|
||||
error!("Error: {e}");
|
||||
panic!();
|
||||
});
|
||||
|
||||
info!("Getting links");
|
||||
let links = get_top_links(&c, args.pages)
|
||||
.await
|
||||
.expect("Failed to get links. Exiting");
|
||||
info!("Got {} links. Expected {}", links.len(), args.pages * 15);
|
||||
info!("Getting answers");
|
||||
let mut answers = vec![];
|
||||
for (i, link) in links.iter().enumerate() {
|
||||
answers.append(
|
||||
&mut get_answers(
|
||||
&c,
|
||||
format!("https://stackoverflow.com{}", link).as_str(),
|
||||
i,
|
||||
links.len(),
|
||||
)
|
||||
.await
|
||||
.unwrap_or_default(),
|
||||
);
|
||||
}
|
||||
info!(
|
||||
"Got {} answers in {} sec",
|
||||
answers.len(),
|
||||
start.elapsed().as_secs_f32()
|
||||
);
|
||||
c.close().await.unwrap();
|
||||
info!("Writing answers to answers.json");
|
||||
let _ = std::fs::write(
|
||||
"answers.json",
|
||||
serde_json::to_string(&answers).unwrap_or_else(|e| {
|
||||
error!("Error: {}", e);
|
||||
panic!();
|
||||
}),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn init_fern(level: log::LevelFilter) -> anyhow::Result<()> {
|
||||
let colors = ColoredLevelConfig::new()
|
||||
.trace(Color::White)
|
||||
.info(Color::Green)
|
||||
.debug(Color::Magenta)
|
||||
.warn(Color::Yellow)
|
||||
.error(Color::Red);
|
||||
|
||||
fern::Dispatch::new()
|
||||
.format(move |out, message, record| {
|
||||
out.finish(format_args!(
|
||||
"[{} {} {}] {}",
|
||||
humantime::format_rfc3339_seconds(std::time::SystemTime::now()),
|
||||
colors.color(record.level()),
|
||||
record.target(),
|
||||
message
|
||||
))
|
||||
})
|
||||
.level(level)
|
||||
.chain(std::io::stdout())
|
||||
.apply()?;
|
||||
Ok(())
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user