some improvements

This commit is contained in:
spv 2024-08-26 17:31:40 +02:00
parent d8632c9228
commit b38a7c1c4c
No known key found for this signature in database
GPG Key ID: 7638A987CE28ADFA
5 changed files with 207 additions and 15 deletions

131
Cargo.lock generated
View File

@ -26,6 +26,21 @@ dependencies = [
"memchr",
]
[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]]
name = "anstream"
version = "0.6.15"
@ -153,6 +168,18 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
dependencies = [
"android-tzdata",
"iana-time-zone",
"num-traits",
"windows-targets",
]
[[package]]
name = "clap"
version = "4.5.16"
@ -199,6 +226,17 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
[[package]]
name = "colored"
version = "1.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f741c91823341bebf717d4c71bda820630ce065443b58bd1b7451af008355"
dependencies = [
"is-terminal",
"lazy_static",
"winapi",
]
[[package]]
name = "cookie"
version = "0.16.2"
@ -301,6 +339,17 @@ version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
[[package]]
name = "fern"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9f0c14694cbd524c8720dd69b0e3179344f04ebb5f90f2e4a440c6ea3b2f1ee"
dependencies = [
"chrono",
"colored",
"log",
]
[[package]]
name = "fnv"
version = "1.0.7"
@ -488,6 +537,12 @@ version = "1.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "hyper"
version = "1.4.1"
@ -561,6 +616,29 @@ dependencies = [
"tracing",
]
[[package]]
name = "iana-time-zone"
version = "0.1.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows-core",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
"cc",
]
[[package]]
name = "idna"
version = "0.5.0"
@ -587,6 +665,17 @@ version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
[[package]]
name = "is-terminal"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
dependencies = [
"hermit-abi",
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
@ -698,6 +787,15 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "object"
version = "0.36.3"
@ -1144,6 +1242,8 @@ dependencies = [
"anyhow",
"clap",
"fantoccini",
"fern",
"humantime",
"lazy_static",
"log",
"regex",
@ -1574,6 +1674,37 @@ dependencies = [
"url",
]
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-registry"
version = "0.2.0"

View File

@ -7,6 +7,8 @@ edition = "2021"
anyhow = "1.0.86"
clap = { version = "4.5.16", features = ["derive"] }
fantoccini = "0.21.1"
fern = { version = "0.6.2", features = ["chrono", "colored"] }
humantime = "2.1.0"
lazy_static = "1.5.0"
log = "0.4.22"
regex = "1.10.6"

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,5 @@
use fantoccini::{Client, ClientBuilder, Locator};
use log::info;
use log::warn;
use serde::{Deserialize, Serialize};
@ -37,14 +38,19 @@ pub struct Answer {
/// Get all answers from a stackoverflow domain. No error handling is done so get ready to either
/// check your input or "note: run with `RUST_BACKTRACE=1` environment variable to display a
/// backtrace"
pub async fn get_answers(c: &Client, url: &str, i: usize) -> anyhow::Result<Vec<Answer>> {
pub async fn get_answers(
c: &Client,
url: &str,
i: usize,
links_size: usize,
) -> anyhow::Result<Vec<Answer>> {
// Navigate to the page at `url` before looking for answers.
c.goto(url).await?;
let answer_loc = c.find_all(Locator::Css(".answer")).await?;
let mut out_answers = vec![];
for (j, answer) in answer_loc.iter().enumerate() {
println!("Getting answer {} on link {}", j, i);
info!("Getting answer {} on link {} of {}", j, i, links_size);
let text = skip_fail!(answer.text().await);
let score =

View File

@ -2,44 +2,97 @@ use fantoccini::{Client, ClientBuilder};
pub mod collector;
use clap::Parser;
use collector::*;
use fern::{
self,
colors::{Color, ColoredLevelConfig},
};
use log::{debug, error, info, trace, warn};
use std::process::exit;
// Command-line arguments, parsed with clap's derive API (`Args::parse()` in `main`).
// Plain `//` comments are used here on purpose: clap's derive turns `///` doc comments
// into help text, which would change the program's `--help` output.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers, so the two
// `#[clap(...)]` attributes on `pages` are presumably the before/after of one attribute —
// confirm against the actual file.
#[derive(Debug, Parser, Clone)]
#[command(about = "Scrape stackoverflow for something idk")]
pub struct Args {
// Number of pages handed to `get_top_links`; defaults to 5.
#[clap(default_value_t = 5)]
#[clap(
short,
long,
default_value_t = 5,
help = "Amount of pages to scrape for links. Sorted by top voted"
)]
pages: u16,
// Level filter passed to `init_fern`; defaults to `Info`.
#[clap(short, long, default_value_t = log::LevelFilter::Info)]
log_level: log::LevelFilter,
}
/// Entry point: parses the CLI arguments, initialises logging, connects to a WebDriver
/// endpoint, collects links with `get_top_links`, gathers the answers for every link with
/// `get_answers`, and writes the collected answers to `answers.json` as JSON.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers, so several
// statements appear twice (a `println!`/`expect` form directly followed by an
// `info!`/`unwrap_or_else` form). Presumably the first of each pair is the removed line and
// the second the added one — confirm against the actual file.
#[tokio::main]
async fn main() {
let args = Args::parse();
// NOTE(review): `init_fern` returns `anyhow::Result<()>`, but the result is discarded here,
// so a failure to install the logger goes unnoticed.
init_fern(args.log_level);
// Reference point for the elapsed-time message emitted after all answers are collected.
let start = std::time::Instant::now();
println!("Spawning client");
info!("Spawning client");
// Connect to the WebDriver server at a hard-coded local address; a connection failure
// ends the program with a panic.
let c: Client = ClientBuilder::native()
.connect("http://localhost:4444")
.await
.expect("failed to connect to WebDriver");
.unwrap_or_else(|e| {
error!("Error: {e}");
panic!();
});
println!("Getting links");
info!("Getting links");
// Failing to fetch the links is fatal as well.
let links = get_top_links(&c, args.pages)
.await
.expect("Failed to get links. Exiting");
// The factor 15 is presumably the number of links per results page — TODO confirm.
// `args.pages` is a `u16`, so this product overflows for values above 4369.
println!("Got {} links. Expected {}", links.len(), args.pages * 15);
println!("Getting answers");
info!("Got {} links. Expected {}", links.len(), args.pages * 15);
info!("Getting answers");
let mut answers = vec![];
// Links are visited one after another; `i` is passed along so `get_answers` can report
// progress.
for (i, link) in links.iter().enumerate() {
answers.append(
&mut get_answers(&c, format!("https://stackoverflow.com{}", link).as_str(), i)
&mut get_answers(
&c,
format!("https://stackoverflow.com{}", link).as_str(),
i,
links.len(),
)
.await
// An error for a single link is replaced by an empty list, so that link contributes no
// answers and the error itself is not reported.
.unwrap_or_default(),
);
}
println!(
info!(
"Got {} answers in {} sec",
answers.len(),
start.elapsed().as_secs_f32()
);
// Panics if closing the WebDriver session fails.
c.close().await.unwrap();
println!("Writing answers to answers.json");
std::fs::write("answers.json", serde_json::to_string(&answers).unwrap());
info!("Writing answers to answers.json");
// A serialisation failure is logged and then panics.
// NOTE(review): the `io::Result` returned by `std::fs::write` is not checked, so a failed
// write is silent.
std::fs::write(
"answers.json",
serde_json::to_string(&answers).unwrap_or_else(|e| {
error!("Error: {}", e);
panic!();
}),
);
}
fn init_fern(level: log::LevelFilter) -> anyhow::Result<()> {
let colors = ColoredLevelConfig::new()
.trace(Color::White)
.info(Color::Green)
.debug(Color::Magenta)
.warn(Color::Yellow)
.error(Color::Red);
fern::Dispatch::new()
.format(move |out, message, record| {
out.finish(format_args!(
"[{} {} {}] {}",
humantime::format_rfc3339_seconds(std::time::SystemTime::now()),
colors.color(record.level()),
record.target(),
message
))
})
.level(level)
.chain(std::io::stdout())
.apply()?;
Ok(())
}