Works now: we collect links and scrape. Now we just have to scrape through them.
This commit is contained in:
parent
080a27f77d
commit
55c7bba09d
46
Cargo.lock
generated
46
Cargo.lock
generated
@ -17,6 +17,15 @@ version = "1.0.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anyhow"
|
name = "anyhow"
|
||||||
version = "1.0.86"
|
version = "1.0.86"
|
||||||
@ -492,6 +501,12 @@ dependencies = [
|
|||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lazy_static"
|
||||||
|
version = "1.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.158"
|
version = "0.2.158"
|
||||||
@ -735,6 +750,35 @@ dependencies = [
|
|||||||
"bitflags",
|
"bitflags",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.10.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.4.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.8.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "reqwest"
|
name = "reqwest"
|
||||||
version = "0.12.7"
|
version = "0.12.7"
|
||||||
@ -992,6 +1036,8 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"fantoccini",
|
"fantoccini",
|
||||||
|
"lazy_static",
|
||||||
|
"regex",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
@ -6,5 +6,7 @@ edition = "2021"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow = "1.0.86"
|
anyhow = "1.0.86"
|
||||||
fantoccini = "0.21.1"
|
fantoccini = "0.21.1"
|
||||||
|
lazy_static = "1.5.0"
|
||||||
|
regex = "1.10.6"
|
||||||
reqwest = "0.12.7"
|
reqwest = "0.12.7"
|
||||||
tokio = { version = "1.39.3", features = ["full"] }
|
tokio = { version = "1.39.3", features = ["full"] }
|
||||||
|
55
src/main.rs
55
src/main.rs
@ -1,4 +1,6 @@
|
|||||||
use fantoccini::{elements::Element, ClientBuilder, Locator};
|
use fantoccini::{elements::Element, Client, ClientBuilder, Locator};
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use regex;
|
||||||
|
|
||||||
/// Holds data about stackoverflow answers
|
/// Holds data about stackoverflow answers
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
@ -11,12 +13,7 @@ struct Answer {
|
|||||||
/// Get all answers from a stackoverflow domain. No error handling is done so get ready to either
|
/// Get all answers from a stackoverflow domain. No error handling is done so get ready to either
|
||||||
/// check your input or "note: run with `RUST_BACKTRACE=1` environment variable to display a
|
/// check your input or "note: run with `RUST_BACKTRACE=1` environment variable to display a
|
||||||
/// backtrace"
|
/// backtrace"
|
||||||
async fn get_answers(url: &str) -> Vec<Answer> {
|
async fn get_answers(c: &Client, url: &str) -> Vec<Answer> {
|
||||||
let c = ClientBuilder::native()
|
|
||||||
.connect("http://localhost:4444")
|
|
||||||
.await
|
|
||||||
.expect("failed to connect to WebDriver");
|
|
||||||
|
|
||||||
// first, go to the Wikipedia page for Foobar
|
// first, go to the Wikipedia page for Foobar
|
||||||
c.goto(url).await.unwrap();
|
c.goto(url).await.unwrap();
|
||||||
|
|
||||||
@ -41,12 +38,52 @@ async fn get_answers(url: &str) -> Vec<Answer> {
|
|||||||
author: "unimplemented".to_string(),
|
author: "unimplemented".to_string(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
c.close().await.unwrap();
|
|
||||||
|
|
||||||
out_answers
|
out_answers
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn get_top_links(c: &Client, pages: u16) -> anyhow::Result<Vec<String>> {
|
||||||
|
let mut answers = vec![];
|
||||||
|
for page in 1..=pages {
|
||||||
|
c.goto(
|
||||||
|
format!(
|
||||||
|
"https://stackoverflow.com/questions?tab=votes&page={}",
|
||||||
|
page
|
||||||
|
)
|
||||||
|
.as_str(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let finds = c.find_all(Locator::Css(".s-link")).await.unwrap();
|
||||||
|
for find in finds {
|
||||||
|
if find
|
||||||
|
.attr("href")
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap()
|
||||||
|
.contains("/questions/")
|
||||||
|
{
|
||||||
|
answers.push(find.attr("href").await.unwrap().unwrap());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(answers)
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
dbg!(get_answers("https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array").await);
|
let c: Client = ClientBuilder::native()
|
||||||
|
.connect("http://localhost:4444")
|
||||||
|
.await
|
||||||
|
.expect("failed to connect to WebDriver");
|
||||||
|
let links = get_top_links(&c, 5).await.unwrap();
|
||||||
|
let mut answers = vec![];
|
||||||
|
for link in links {
|
||||||
|
answers.append(
|
||||||
|
&mut get_answers(&c, format!("https://stackoverflow.com{}", link).as_str()).await,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
c.close().await.unwrap();
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user