Compare commits

...

7 Commits

7 changed files with 545 additions and 64 deletions

308
Cargo.lock generated
View File

@ -17,6 +17,79 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]]
name = "anstream"
version = "0.6.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
[[package]]
name = "anstyle-parse"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a"
dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
dependencies = [
"anstyle",
"windows-sys 0.52.0",
]
[[package]]
name = "anyhow"
version = "1.0.86"
@ -95,6 +168,75 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
dependencies = [
"android-tzdata",
"iana-time-zone",
"num-traits",
"windows-targets",
]
[[package]]
name = "clap"
version = "4.5.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "colorchoice"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
[[package]]
name = "colored"
version = "1.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f741c91823341bebf717d4c71bda820630ce065443b58bd1b7451af008355"
dependencies = [
"is-terminal",
"lazy_static",
"winapi",
]
[[package]]
name = "cookie"
version = "0.16.2"
@ -197,6 +339,17 @@ version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
[[package]]
name = "fern"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9f0c14694cbd524c8720dd69b0e3179344f04ebb5f90f2e4a440c6ea3b2f1ee"
dependencies = [
"chrono",
"colored",
"log",
]
[[package]]
name = "fnv"
version = "1.0.7"
@ -321,6 +474,12 @@ version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.9"
@ -378,6 +537,12 @@ version = "1.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "hyper"
version = "1.4.1"
@ -451,6 +616,29 @@ dependencies = [
"tracing",
]
[[package]]
name = "iana-time-zone"
version = "0.1.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows-core",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
"cc",
]
[[package]]
name = "idna"
version = "0.5.0"
@ -477,6 +665,23 @@ version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
[[package]]
name = "is-terminal"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
dependencies = [
"hermit-abi",
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itoa"
version = "1.0.11"
@ -492,6 +697,12 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libc"
version = "0.2.158"
@ -576,6 +787,15 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "object"
version = "0.36.3"
@ -735,6 +955,35 @@ dependencies = [
"bitflags",
]
[[package]]
name = "regex"
version = "1.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
[[package]]
name = "reqwest"
version = "0.12.7"
@ -898,18 +1147,18 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.208"
version = "1.0.209"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2"
checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.208"
version = "1.0.209"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf"
checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
dependencies = [
"proc-macro2",
"quote",
@ -991,11 +1240,25 @@ name = "stackscraper"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"fantoccini",
"fern",
"humantime",
"lazy_static",
"log",
"regex",
"reqwest",
"serde",
"serde_json",
"tokio",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
@ -1281,6 +1544,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "vcpkg"
version = "0.2.15"
@ -1405,6 +1674,37 @@ dependencies = [
"url",
]
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-registry"
version = "0.2.0"

View File

@ -5,6 +5,14 @@ edition = "2021"
[dependencies]
anyhow = "1.0.86"
clap = { version = "4.5.16", features = ["derive"] }
fantoccini = "0.21.1"
fern = { version = "0.6.2", features = ["chrono", "colored"] }
humantime = "2.1.0"
lazy_static = "1.5.0"
log = "0.4.22"
regex = "1.10.6"
reqwest = "0.12.7"
serde = { version = "1.0.209", features = ["derive"] }
serde_json = "1.0.127"
tokio = { version = "1.39.3", features = ["full"] }

View File

@ -1,17 +1,9 @@
FROM ubuntu:latest
VOLUME .:/project
WORKDIR /project
RUN echo '\n\
Package: *\n\
Pin: origin packages.mozilla.org\n\
Pin-Priority: 1000\n\
' | tee /etc/apt/preferences.d/mozilla
RUN cat /etc/apt/preferences.d/mozilla
RUN apt -y update
RUN apt -y install curl tar firefox
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -v --default-toolchain nightly --profile complete
RUN install -d -m 0755 /etc/apt/keyrings
RUN curl --proto '=https' --tlsv1.2 -sSf https://packages.mozilla.org/apt/repo-signing-key.gpg | tee /etc/apt/keyrings/packages.mozilla.org.asc > /dev/null
RUN echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" | tee -a /etc/apt/sources.list.d/mozilla.list > /dev/null
RUN curl --proto '=https' --tlsv1.2 -sSfL https://github.com/mozilla/geckodriver/releases/download/v0.35.0/geckodriver-v0.35.0-linux64.tar.gz | tar xz
CMD ["/bin/bash"]
# Image that builds StackScraper and runs it against a headless Firefox driven by geckodriver.
FROM archlinux:base-devel
# Full system upgrade (-Syu): refreshing the package database without upgrading
# (-Sy) leaves the system in a partial-upgrade state, which Arch does not support.
RUN pacman -Syu --noconfirm geckodriver firefox xorg-server-xvfb
# Only accept the rustup installer over HTTPS with TLS 1.2 or newer.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
COPY . ./StackScraper
WORKDIR StackScraper
RUN cargo build --release
ENV DISPLAY=:99
# Exec form wrapping an explicit shell: the virtual display and geckodriver are
# started in the background, then the scraper replaces the shell. Unlike the
# shell form, arguments given to `docker run <image> ...` are forwarded to the
# scraper through "$@" (the trailing "--" fills $0).
ENTRYPOINT ["/bin/bash", "-c", "Xvfb :99 -ac &>/dev/null & geckodriver &>/dev/null & exec ./target/release/stackscraper \"$@\"", "--"]

1
answers.json Normal file

File diff suppressed because one or more lines are too long

17
src/analyze.rs Normal file
View File

@ -0,0 +1,17 @@
use std::collections::HashMap;
use crate::Answer;
/// Counts how often each whitespace-separated word occurs across all answers.
///
/// Words are compared verbatim (case-sensitive, punctuation included). Counts
/// saturate at `u16::MAX` instead of overflowing, so a very common word in a
/// large corpus can neither panic (debug builds) nor wrap around to a small
/// value (release builds).
pub fn analyze_frequencies(answers: Vec<Answer>) -> HashMap<String, u16> {
    let mut out: HashMap<String, u16> = HashMap::new();
    for answer in &answers {
        // `split_whitespace` already treats '\n' as a separator, so no
        // newline-to-space copy of the content is needed first.
        for word in answer.content.split_whitespace() {
            let count = out.entry(word.to_string()).or_insert(0);
            *count = count.saturating_add(1);
        }
    }
    out
}

100
src/collector.rs Normal file
View File

@ -0,0 +1,100 @@
use fantoccini::{Client, ClientBuilder, Locator};
use log::info;
use log::warn;
use serde::{Deserialize, Serialize};
/// Unwraps a `Result` inside a loop body.
///
/// Evaluates to the `Ok` value; on `Err` the error is logged with `warn!` and
/// the enclosing loop `continue`s with its next iteration.
macro_rules! skip_fail {
    ($res:expr) => {
        match $res {
            Err(err) => {
                warn!("An error: {}; skipped.", err);
                continue;
            }
            Ok(value) => value,
        }
    };
}
/// Unwraps an `Option` inside a loop body.
///
/// Evaluates to the `Some` value; on `None` a warning is logged with `warn!`
/// and the enclosing loop `continue`s with its next iteration.
macro_rules! skip_fail_opt {
    ($res:expr) => {
        match $res {
            None => {
                warn!("Unexpected empty value; skipped.");
                continue;
            }
            Some(value) => value,
        }
    };
}
/// A single answer scraped from a Stack Overflow question page.
///
/// Derives `Serialize`/`Deserialize` so a list of answers can be written to
/// JSON and read back later for analysis.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Answer {
/// Vote score; `get_answers` parses it from the first line of the answer's text.
upvotes: u32,
/// Author name. Not scraped yet: `get_answers` always stores "unimplemented".
author: String,
/// Answer text up to the "Share\nImprove this answer" marker.
pub content: String,
}
/// Scrapes every answer found on a single Stack Overflow question page.
///
/// Navigates `c` to `url`, looks up all `.answer` elements and converts each
/// one into an [`Answer`]. `i` (index of this link) and `links_size` (total
/// number of links) are used for progress logging only.
///
/// # Errors
///
/// Returns an error if navigating to `url` or looking up the `.answer`
/// elements fails. Problems with an individual answer do not abort the page:
/// if its text cannot be read, or its first line does not parse as a `u32`
/// (which includes negative scores), a warning is logged and that answer is
/// skipped.
pub async fn get_answers(
    c: &Client,
    url: &str,
    i: usize,
    links_size: usize,
) -> anyhow::Result<Vec<Answer>> {
    c.goto(url).await?;
    let answer_loc = c.find_all(Locator::Css(".answer")).await?;
    let mut out_answers = Vec::with_capacity(answer_loc.len());
    for (j, answer) in answer_loc.iter().enumerate() {
        info!("Getting answer {} on link {} of {}", j, i, links_size);
        let text = skip_fail!(answer.text().await);
        // The score is taken from the first line of the element's text.
        let score = skip_fail!(skip_fail_opt!(text.split('\n').next()).parse::<u32>());
        // Keep only what precedes the "Share / Improve this answer" marker.
        let body = text
            .split("Share\nImprove this answer")
            .next()
            .unwrap_or(text.as_str());
        // NOTE(review): this rewrites literal backslash-n sequences (two
        // characters) into real newlines, which also affects such sequences
        // inside quoted code. Kept as-is; confirm it is intended.
        let content = body.replace("\\n", "\n");
        out_answers.push(Answer {
            upvotes: score,
            content,
            author: "unimplemented".to_string(),
        });
    }
    Ok(out_answers)
}
/// Collects question links from the first `pages` pages of the Stack Overflow
/// question listing sorted by votes.
///
/// Every `.s-link` element whose `href` contains `/questions/` contributes
/// that `href` (exactly as found on the page) to the result.
///
/// # Errors
///
/// Returns an error if looking up the `.s-link` elements on a page fails.
/// A page that cannot be navigated to, or an element whose `href` is missing
/// or unreadable, is logged with a warning and skipped.
pub async fn get_top_links(c: &Client, pages: u16) -> anyhow::Result<Vec<String>> {
    let mut links = vec![];
    for page in 1..=pages {
        let listing_url = format!(
            "https://stackoverflow.com/questions?tab=votes&page={}",
            page
        );
        skip_fail!(c.goto(listing_url.as_str()).await);
        let finds = c.find_all(Locator::Css(".s-link")).await?;
        for find in finds {
            // Fetch the attribute once and reuse it: a second WebDriver round
            // trip is wasteful and unwrapping its result could panic.
            let href = skip_fail_opt!(skip_fail!(find.attr("href").await));
            if href.contains("/questions/") {
                links.push(href);
            }
        }
    }
    Ok(links)
}

View File

@ -1,49 +1,112 @@
use fantoccini::{elements::Element, ClientBuilder, Locator};
use fantoccini::{Client, ClientBuilder};
pub mod analyze;
pub mod collector;
use analyze::*;
use clap::Parser;
use collector::*;
use fern::{
self,
colors::{Color, ColoredLevelConfig},
};
use log::{debug, error, info, trace, warn};
use std::path::PathBuf;
#[derive(Debug, Clone)]
struct Answer {
upvotes: u32,
author: String,
content: String,
}
// let's set up the sequence of steps we want the browser to take
async fn get_answers(url: &str) -> Vec<Answer> {
let c = ClientBuilder::native()
.connect("http://localhost:4444")
.await
.expect("failed to connect to WebDriver");
// first, go to the Wikipedia page for Foobar
c.goto(url).await.unwrap();
let answer_loc = c.find_all(Locator::Css(".answer")).await.unwrap();
let mut out_answers = vec![];
for answer in answer_loc {
let text = answer.text().await.unwrap();
let score = text
.clone()
.split('\n')
.collect::<Vec<&str>>()
.get(0)
.unwrap()
.parse::<u32>()
.unwrap();
let content = text;
out_answers.push(Answer {
upvotes: score,
content,
author: "unimplemented".to_string(),
});
}
c.close().await.unwrap();
out_answers
// Command-line options, parsed by clap's derive API.
// Plain `//` comments are used on purpose: `///` doc comments on a clap-derive
// struct or its fields are picked up as help text and would change `--help`.
#[derive(Debug, Parser, Clone)]
#[command(about = "Scrape stackoverflow for something idk")]
pub struct Args {
// Number of listing pages passed to `get_top_links`; defaults to 5.
#[clap(
short,
long,
default_value_t = 5,
help = "Amount of pages to scrape for links. Sorted by top voted"
)]
pages: u16,
// Level handed to `init_fern`; defaults to `Info`.
#[clap(short, long, default_value_t = log::LevelFilter::Info)]
log_level: log::LevelFilter,
// When given, `main` skips scraping and prints word frequencies for the
// answers stored in this JSON file instead.
#[clap(short, long)]
answers_file: Option<PathBuf>,
}
#[tokio::main]
async fn main() {
dbg!(get_answers("https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array").await);
let args = Args::parse();
init_fern(args.log_level);
if let Some(path) = args.answers_file {
let answers = serde_json::from_str(&std::fs::read_to_string(path).unwrap()).unwrap();
let freqs = analyze_frequencies(answers);
let mut freqs = freqs.iter().collect::<Vec<(&String, &u16)>>();
freqs.sort_by(|a, b| b.1.cmp(&a.1));
for i in &freqs[0..] {
println!("{} : {}", i.0, i.1);
}
} else {
let start = std::time::Instant::now();
info!("Spawning client");
let c: Client = ClientBuilder::native()
.connect("http://localhost:4444")
.await
.unwrap_or_else(|e| {
error!("Error: {e}");
panic!();
});
info!("Getting links");
let links = get_top_links(&c, args.pages)
.await
.expect("Failed to get links. Exiting");
info!("Got {} links. Expected {}", links.len(), args.pages * 15);
info!("Getting answers");
let mut answers = vec![];
for (i, link) in links.iter().enumerate() {
answers.append(
&mut get_answers(
&c,
format!("https://stackoverflow.com{}", link).as_str(),
i,
links.len(),
)
.await
.unwrap_or_default(),
);
}
info!(
"Got {} answers in {} sec",
answers.len(),
start.elapsed().as_secs_f32()
);
c.close().await.unwrap();
info!("Writing answers to answers.json");
let _ = std::fs::write(
"answers.json",
serde_json::to_string(&answers).unwrap_or_else(|e| {
error!("Error: {}", e);
panic!();
}),
);
}
}
/// Installs a global `fern` logger that writes to stdout.
///
/// Each line is formatted as `[<RFC 3339 timestamp> <coloured level> <target>] <message>`,
/// and records below `level` are filtered out.
///
/// # Errors
///
/// Returns the error produced by `apply()` if the logger cannot be installed.
fn init_fern(level: log::LevelFilter) -> anyhow::Result<()> {
    let palette = ColoredLevelConfig::new()
        .trace(Color::White)
        .info(Color::Green)
        .debug(Color::Magenta)
        .warn(Color::Yellow)
        .error(Color::Red);

    let dispatch = fern::Dispatch::new()
        .format(move |out, message, record| {
            let timestamp = humantime::format_rfc3339_seconds(std::time::SystemTime::now());
            let colored_level = palette.color(record.level());
            out.finish(format_args!(
                "[{} {} {}] {}",
                timestamp,
                colored_level,
                record.target(),
                message
            ))
        })
        .level(level)
        .chain(std::io::stdout());

    dispatch.apply()?;
    Ok(())
}