From 9426f6b855ed673be7bd3bb2248752e379729a1a Mon Sep 17 00:00:00 2001 From: spv Date: Mon, 26 Aug 2024 16:27:07 +0200 Subject: [PATCH] added some logs. Not using a real logger. fuck you --- Cargo.lock | 1 + Cargo.toml | 1 + src/collector.rs | 89 ++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 96 ++++++++++-------------------------------------- 4 files changed, 111 insertions(+), 76 deletions(-) create mode 100644 src/collector.rs diff --git a/Cargo.lock b/Cargo.lock index ee9a31c..c294f99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1037,6 +1037,7 @@ dependencies = [ "anyhow", "fantoccini", "lazy_static", + "log", "regex", "reqwest", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 0d78e83..f688d2a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" anyhow = "1.0.86" fantoccini = "0.21.1" lazy_static = "1.5.0" +log = "0.4.22" regex = "1.10.6" reqwest = "0.12.7" tokio = { version = "1.39.3", features = ["full"] } diff --git a/src/collector.rs b/src/collector.rs new file mode 100644 index 0000000..1596e56 --- /dev/null +++ b/src/collector.rs @@ -0,0 +1,89 @@ +use fantoccini::{Client, ClientBuilder, Locator}; +use log::warn; + +macro_rules! skip_fail { + ($res:expr) => { + match $res { + Ok(val) => val, + Err(e) => { + warn!("An error: {}; skipped.", e); + continue; + } + } + }; +} + +macro_rules! skip_fail_opt { + ($res:expr) => { + match $res { + Some(val) => val, + None => { + warn!("Unexpected empty value; skipped."); + continue; + } + } + }; +} + +/// Holds data about stackoverflow answers +#[derive(Debug, Clone)] +pub struct Answer { + upvotes: u32, + author: String, + content: String, +} + +/// Get all answers from a stackoverflow domain. No error handling is done so get ready to either +/// check your input or "note: run with `RUST_BACKTRACE=1` environment variable to display a +/// backtrace" +pub async fn get_answers(c: &Client, url: &str, i: usize) -> anyhow::Result> { + // first, go to the Wikipedia page for Foobar + c.goto(url).await?; + + let answer_loc = c.find_all(Locator::Css(".answer")).await?; + let mut out_answers = vec![]; + for (j, answer) in answer_loc.iter().enumerate() { + println!("Getting answer {} on page {}", j, i); + let text = skip_fail!(answer.text().await); + + let score = + skip_fail!( + skip_fail_opt!(text.clone().split('\n').collect::>().get(0)) + .parse::() + ); + let content = text; + + out_answers.push(Answer { + upvotes: score, + content, + author: "unimplemented".to_string(), + }); + } + + Ok(out_answers) +} + +pub async fn get_top_links(c: &Client, pages: u16) -> anyhow::Result> { + let mut answers = vec![]; + for page in 1..=pages { + skip_fail!( + c.goto( + format!( + "https://stackoverflow.com/questions?tab=votes&page={}", + page + ) + .as_str(), + ) + .await + ); + + let finds = c.find_all(Locator::Css(".s-link")).await?; + for find in finds { + if skip_fail_opt!(skip_fail!(find.attr("href").await)).contains("/questions/") { + answers.push(find.attr("href").await.unwrap().unwrap()); + } + } + } + + Ok(answers) +} diff --git a/src/main.rs b/src/main.rs index 0e94414..002efed 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,89 +1,33 @@ -use fantoccini::{elements::Element, Client, ClientBuilder, Locator}; -use lazy_static::lazy_static; -use regex; - -/// Holds data about stackoverflow answers -#[derive(Debug, Clone)] -struct Answer { - upvotes: u32, - author: String, - content: String, -} - -/// Get all answers from a stackoverflow domain. No error handling is done so get ready to either -/// check your input or "note: run with `RUST_BACKTRACE=1` environment variable to display a -/// backtrace" -async fn get_answers(c: &Client, url: &str) -> Vec { - // first, go to the Wikipedia page for Foobar - c.goto(url).await.unwrap(); - - let answer_loc = c.find_all(Locator::Css(".answer")).await.unwrap(); - let mut out_answers = vec![]; - for answer in answer_loc { - let text = answer.text().await.unwrap(); - - let score = text - .clone() - .split('\n') - .collect::>() - .get(0) - .unwrap() - .parse::() - .unwrap(); - let content = text; - - out_answers.push(Answer { - upvotes: score, - content, - author: "unimplemented".to_string(), - }); - } - - out_answers -} - -async fn get_top_links(c: &Client, pages: u16) -> anyhow::Result> { - let mut answers = vec![]; - for page in 1..=pages { - c.goto( - format!( - "https://stackoverflow.com/questions?tab=votes&page={}", - page - ) - .as_str(), - ) - .await - .unwrap(); - - let finds = c.find_all(Locator::Css(".s-link")).await.unwrap(); - for find in finds { - if find - .attr("href") - .await - .unwrap() - .unwrap() - .contains("/questions/") - { - answers.push(find.attr("href").await.unwrap().unwrap()); - } - } - } - - Ok(answers) -} +use fantoccini::{Client, ClientBuilder}; +pub mod collector; +use collector::*; #[tokio::main] async fn main() { + let start = std::time::Instant::now(); + println!("Spawning client"); let c: Client = ClientBuilder::native() .connect("http://localhost:4444") .await .expect("failed to connect to WebDriver"); - let links = get_top_links(&c, 5).await.unwrap(); + println!("Getting links"); + let links = get_top_links(&c, 1) + .await + .expect("Failed to get links. Exiting"); + println!("Got {} links. Expected {}", links.len(), 5 * 15); + println!("Getting answers"); let mut answers = vec![]; - for link in links { + for (i, link) in links.iter().enumerate() { answers.append( - &mut get_answers(&c, format!("https://stackoverflow.com{}", link).as_str()).await, + &mut get_answers(&c, format!("https://stackoverflow.com{}", link).as_str(), i) + .await + .unwrap_or_default(), ); } + println!( + "Got {} answers in {} sec", + answers.len(), + start.elapsed().as_secs_f32() + ); c.close().await.unwrap(); }