some improvements
This commit is contained in:
parent
d8632c9228
commit
b38a7c1c4c
131
Cargo.lock
generated
131
Cargo.lock
generated
@ -26,6 +26,21 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "android-tzdata"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.6.15"
|
||||
@ -153,6 +168,18 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
|
||||
dependencies = [
|
||||
"android-tzdata",
|
||||
"iana-time-zone",
|
||||
"num-traits",
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.5.16"
|
||||
@ -199,6 +226,17 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
|
||||
|
||||
[[package]]
|
||||
name = "colored"
|
||||
version = "1.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a5f741c91823341bebf717d4c71bda820630ce065443b58bd1b7451af008355"
|
||||
dependencies = [
|
||||
"is-terminal",
|
||||
"lazy_static",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cookie"
|
||||
version = "0.16.2"
|
||||
@ -301,6 +339,17 @@ version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
|
||||
|
||||
[[package]]
|
||||
name = "fern"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9f0c14694cbd524c8720dd69b0e3179344f04ebb5f90f2e4a440c6ea3b2f1ee"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"colored",
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
@ -488,6 +537,12 @@ version = "1.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "1.4.1"
|
||||
@ -561,6 +616,29 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.60"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
|
||||
dependencies = [
|
||||
"android_system_properties",
|
||||
"core-foundation-sys",
|
||||
"iana-time-zone-haiku",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"windows-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone-haiku"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.5.0"
|
||||
@ -587,6 +665,17 @@ version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
|
||||
|
||||
[[package]]
|
||||
name = "is-terminal"
|
||||
version = "0.4.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.1"
|
||||
@ -698,6 +787,15 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.36.3"
|
||||
@ -1144,6 +1242,8 @@ dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"fantoccini",
|
||||
"fern",
|
||||
"humantime",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"regex",
|
||||
@ -1574,6 +1674,37 @@ dependencies = [
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows-core"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-registry"
|
||||
version = "0.2.0"
|
||||
|
@ -7,6 +7,8 @@ edition = "2021"
|
||||
anyhow = "1.0.86"
|
||||
clap = { version = "4.5.16", features = ["derive"] }
|
||||
fantoccini = "0.21.1"
|
||||
fern = { version = "0.6.2", features = ["chrono", "colored"] }
|
||||
humantime = "2.1.0"
|
||||
lazy_static = "1.5.0"
|
||||
log = "0.4.22"
|
||||
regex = "1.10.6"
|
||||
|
File diff suppressed because one or more lines are too long
@ -1,4 +1,5 @@
|
||||
use fantoccini::{Client, ClientBuilder, Locator};
|
||||
use log::info;
|
||||
use log::warn;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@ -37,14 +38,19 @@ pub struct Answer {
|
||||
/// Get all answers from a stackoverflow domain. No error handling is done so get ready to either
|
||||
/// check your input or "note: run with `RUST_BACKTRACE=1` environment variable to display a
|
||||
/// backtrace"
|
||||
pub async fn get_answers(c: &Client, url: &str, i: usize) -> anyhow::Result<Vec<Answer>> {
|
||||
pub async fn get_answers(
|
||||
c: &Client,
|
||||
url: &str,
|
||||
i: usize,
|
||||
links_size: usize,
|
||||
) -> anyhow::Result<Vec<Answer>> {
|
||||
// first, go to the Wikipedia page for Foobar
|
||||
c.goto(url).await?;
|
||||
|
||||
let answer_loc = c.find_all(Locator::Css(".answer")).await?;
|
||||
let mut out_answers = vec![];
|
||||
for (j, answer) in answer_loc.iter().enumerate() {
|
||||
println!("Getting answer {} on link {}", j, i);
|
||||
info!("Getting answer {} on link {} of {}", j, i, links_size);
|
||||
let text = skip_fail!(answer.text().await);
|
||||
|
||||
let score =
|
||||
|
77
src/main.rs
77
src/main.rs
@ -2,44 +2,97 @@ use fantoccini::{Client, ClientBuilder};
|
||||
pub mod collector;
|
||||
use clap::Parser;
|
||||
use collector::*;
|
||||
use fern::{
|
||||
self,
|
||||
colors::{Color, ColoredLevelConfig},
|
||||
};
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use std::process::exit;
|
||||
|
||||
#[derive(Debug, Parser, Clone)]
|
||||
#[command(about = "Scrape stackoverflow for something idk")]
|
||||
pub struct Args {
|
||||
#[clap(default_value_t = 5)]
|
||||
#[clap(
|
||||
short,
|
||||
long,
|
||||
default_value_t = 5,
|
||||
help = "Amount of pages to scrape for links. Sorted by top voted"
|
||||
)]
|
||||
pages: u16,
|
||||
#[clap(short, long, default_value_t = log::LevelFilter::Info)]
|
||||
log_level: log::LevelFilter,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let args = Args::parse();
|
||||
init_fern(args.log_level);
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
println!("Spawning client");
|
||||
info!("Spawning client");
|
||||
let c: Client = ClientBuilder::native()
|
||||
.connect("http://localhost:4444")
|
||||
.await
|
||||
.expect("failed to connect to WebDriver");
|
||||
.unwrap_or_else(|e| {
|
||||
error!("Error: {e}");
|
||||
panic!();
|
||||
});
|
||||
|
||||
println!("Getting links");
|
||||
info!("Getting links");
|
||||
let links = get_top_links(&c, args.pages)
|
||||
.await
|
||||
.expect("Failed to get links. Exiting");
|
||||
println!("Got {} links. Expected {}", links.len(), args.pages * 15);
|
||||
println!("Getting answers");
|
||||
info!("Got {} links. Expected {}", links.len(), args.pages * 15);
|
||||
info!("Getting answers");
|
||||
let mut answers = vec![];
|
||||
for (i, link) in links.iter().enumerate() {
|
||||
answers.append(
|
||||
&mut get_answers(&c, format!("https://stackoverflow.com{}", link).as_str(), i)
|
||||
.await
|
||||
.unwrap_or_default(),
|
||||
&mut get_answers(
|
||||
&c,
|
||||
format!("https://stackoverflow.com{}", link).as_str(),
|
||||
i,
|
||||
links.len(),
|
||||
)
|
||||
.await
|
||||
.unwrap_or_default(),
|
||||
);
|
||||
}
|
||||
println!(
|
||||
info!(
|
||||
"Got {} answers in {} sec",
|
||||
answers.len(),
|
||||
start.elapsed().as_secs_f32()
|
||||
);
|
||||
c.close().await.unwrap();
|
||||
println!("Writing answers to answers.json");
|
||||
std::fs::write("answers.json", serde_json::to_string(&answers).unwrap());
|
||||
info!("Writing answers to answers.json");
|
||||
std::fs::write(
|
||||
"answers.json",
|
||||
serde_json::to_string(&answers).unwrap_or_else(|e| {
|
||||
error!("Error: {}", e);
|
||||
panic!();
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
fn init_fern(level: log::LevelFilter) -> anyhow::Result<()> {
|
||||
let colors = ColoredLevelConfig::new()
|
||||
.trace(Color::White)
|
||||
.info(Color::Green)
|
||||
.debug(Color::Magenta)
|
||||
.warn(Color::Yellow)
|
||||
.error(Color::Red);
|
||||
|
||||
fern::Dispatch::new()
|
||||
.format(move |out, message, record| {
|
||||
out.finish(format_args!(
|
||||
"[{} {} {}] {}",
|
||||
humantime::format_rfc3339_seconds(std::time::SystemTime::now()),
|
||||
colors.color(record.level()),
|
||||
record.target(),
|
||||
message
|
||||
))
|
||||
})
|
||||
.level(level)
|
||||
.chain(std::io::stdout())
|
||||
.apply()?;
|
||||
Ok(())
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user