From 55c7bba09dbfb87dcad3d135b45246c74b45363f Mon Sep 17 00:00:00 2001 From: spv Date: Mon, 26 Aug 2024 15:58:37 +0200 Subject: [PATCH] works now. we collect links and scrape. Now we just have to scrape through it --- Cargo.lock | 46 ++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 ++ src/main.rs | 55 ++++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 94 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d20c959..ee9a31c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,15 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "anyhow" version = "1.0.86" @@ -492,6 +501,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.158" @@ -735,6 +750,35 @@ dependencies = [ "bitflags", ] +[[package]] +name = "regex" +version = "1.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + [[package]] name = "reqwest" version = "0.12.7" @@ -992,6 +1036,8 @@ version = "0.1.0" dependencies = [ "anyhow", "fantoccini", + "lazy_static", + "regex", "reqwest", "tokio", ] diff --git a/Cargo.toml b/Cargo.toml index 8f32aa1..0d78e83 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,5 +6,7 @@ edition = "2021" [dependencies] anyhow = "1.0.86" fantoccini = "0.21.1" +lazy_static = "1.5.0" +regex = "1.10.6" reqwest = "0.12.7" tokio = { version = "1.39.3", features = ["full"] } diff --git a/src/main.rs b/src/main.rs index 15ee673..0e94414 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,6 @@ -use fantoccini::{elements::Element, ClientBuilder, Locator}; +use fantoccini::{elements::Element, Client, ClientBuilder, Locator}; +use lazy_static::lazy_static; +use regex; /// Holds data about stackoverflow answers #[derive(Debug, Clone)] @@ -11,12 +13,7 @@ struct Answer { /// Get all answers from a stackoverflow domain. 
No error handling is done so get ready to either /// check your input or "note: run with `RUST_BACKTRACE=1` environment variable to display a /// backtrace" -async fn get_answers(url: &str) -> Vec<Answer> { - let c = ClientBuilder::native() - .connect("http://localhost:4444") - .await - .expect("failed to connect to WebDriver"); - +async fn get_answers(c: &Client, url: &str) -> Vec<Answer> { // first, go to the Wikipedia page for Foobar c.goto(url).await.unwrap(); @@ -41,12 +38,52 @@ async fn get_answers(url: &str) -> Vec<Answer> { author: "unimplemented".to_string(), }); } - c.close().await.unwrap(); out_answers } +async fn get_top_links(c: &Client, pages: u16) -> anyhow::Result<Vec<String>> { + let mut answers = vec![]; + for page in 1..=pages { + c.goto( + format!( + "https://stackoverflow.com/questions?tab=votes&page={}", + page + ) + .as_str(), + ) + .await + .unwrap(); + + let finds = c.find_all(Locator::Css(".s-link")).await.unwrap(); + for find in finds { + if find + .attr("href") + .await + .unwrap() + .unwrap() + .contains("/questions/") + { + answers.push(find.attr("href").await.unwrap().unwrap()); + } + } + } + + Ok(answers) +} + #[tokio::main] async fn main() { - dbg!(get_answers("https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array").await); + let c: Client = ClientBuilder::native() + .connect("http://localhost:4444") + .await + .expect("failed to connect to WebDriver"); + let links = get_top_links(&c, 5).await.unwrap(); + let mut answers = vec![]; + for link in links { + answers.append( + &mut get_answers(&c, format!("https://stackoverflow.com{}", link).as_str()).await, + ); + } + c.close().await.unwrap(); }