StackScraper/src/collector.rs

101 lines
2.6 KiB
Rust

use fantoccini::{Client, ClientBuilder, Locator};
use log::info;
use log::warn;
use serde::{Deserialize, Serialize};
/// Unwraps a `Result` from inside a loop body.
///
/// Evaluates to the `Ok` payload. On `Err`, the error is logged through
/// `warn!` and the enclosing loop moves on to its next iteration via
/// `continue`, so this macro may only be used inside a loop.
macro_rules! skip_fail {
    ($res:expr) => {
        match $res {
            Err(error) => {
                warn!("An error: {}; skipped.", error);
                continue;
            }
            Ok(value) => value,
        }
    };
}
/// Unwraps an `Option` from inside a loop body.
///
/// Evaluates to the `Some` payload. On `None`, a warning is logged through
/// `warn!` and the enclosing loop moves on to its next iteration via
/// `continue`, so this macro may only be used inside a loop.
macro_rules! skip_fail_opt {
    ($res:expr) => {
        match $res {
            None => {
                warn!("Unexpected empty value; skipped.");
                continue;
            }
            Some(value) => value,
        }
    };
}
/// A single answer scraped from a Stack Overflow question page.
///
/// Instances are built by `get_answers` from the rendered text of each
/// `.answer` element. The type derives `Serialize`/`Deserialize`, so it can be
/// written out and read back with serde.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Answer {
/// Score of the answer, parsed by `get_answers` from the first line of the
/// element's text. Unsigned, so a first line that is not a valid `u32`
/// cannot be stored here.
upvotes: u32,
/// Author of the answer. `get_answers` currently always stores the
/// placeholder string "unimplemented".
author: String,
/// Text of the answer: everything in the element's text that precedes the
/// "Share\nImprove this answer" marker.
pub content: String,
}
/// Scrapes the answers from a single Stack Overflow question page.
///
/// Navigates `c` to `url`, collects every element matching the `.answer` CSS
/// selector and turns the rendered text of each one into an [`Answer`].
///
/// * `c` - WebDriver client used for navigation and element lookup.
/// * `url` - address of the question page to load.
/// * `i`, `links_size` - index of this link and the total number of links;
///   used only in the progress log line.
///
/// # Errors
///
/// Returns an error when navigating to `url` or looking up the answer
/// elements fails. Problems with an individual answer (its text cannot be
/// read, or its first line does not parse as a `u32` score, e.g. a negative
/// score) are logged with `warn!` and that answer is skipped instead of
/// failing the whole call.
pub async fn get_answers(
    c: &Client,
    url: &str,
    i: usize,
    links_size: usize,
) -> anyhow::Result<Vec<Answer>> {
    // Load the question page before querying it for answers.
    c.goto(url).await?;
    let answer_loc = c.find_all(Locator::Css(".answer")).await?;
    let mut out_answers = Vec::with_capacity(answer_loc.len());
    for (j, answer) in answer_loc.iter().enumerate() {
        info!("Getting answer {} on link {} of {}", j, i, links_size);
        let text = skip_fail!(answer.text().await);
        // The first line of the element's text is taken as the score.
        let score = skip_fail!(skip_fail_opt!(text.split('\n').next()).parse::<u32>());
        // Keep only what precedes the action links below the answer body;
        // when the marker is absent the whole text is kept.
        // NOTE(review): the replace below turns a literal backslash followed
        // by `n` into a real newline, which also rewrites such sequences
        // inside code snippets - confirm this is intended.
        let content = text
            .split_once("Share\nImprove this answer")
            .map_or(text.as_str(), |(before, _)| before)
            .replace("\\n", "\n");
        out_answers.push(Answer {
            upvotes: score,
            content,
            author: "unimplemented".to_string(),
        });
    }
    Ok(out_answers)
}
/// Collects question links from the top-voted Stack Overflow listing.
///
/// Visits listing pages `1..=pages` of
/// `https://stackoverflow.com/questions?tab=votes`, looks up every element
/// matching the `.s-link` CSS selector and returns the `href` attribute
/// values that contain `/questions/`, in the order they were found. The
/// values are returned exactly as the attribute lookup yields them. With
/// `pages == 0` no page is visited and the result is empty.
///
/// # Errors
///
/// Returns an error when the element lookup on a loaded listing page fails.
/// A page that cannot be navigated to, and an element whose `href` cannot be
/// read or is missing, are logged with `warn!` and skipped.
pub async fn get_top_links(c: &Client, pages: u16) -> anyhow::Result<Vec<String>> {
    let mut links = Vec::new();
    for page in 1..=pages {
        let page_url = format!(
            "https://stackoverflow.com/questions?tab=votes&page={}",
            page
        );
        skip_fail!(c.goto(&page_url).await);
        let anchors = c.find_all(Locator::Css(".s-link")).await?;
        for anchor in anchors {
            // Fetch the attribute once and reuse it: a second lookup costs
            // another WebDriver round trip and, if unwrapped, could panic
            // when that repeated lookup fails.
            let href = skip_fail_opt!(skip_fail!(anchor.attr("href").await));
            if href.contains("/questions/") {
                links.push(href);
            }
        }
    }
    Ok(links)
}