Add a very simple analysis printing the top 100 most common words in all answers

This commit is contained in:
spv 2024-08-26 21:38:23 +02:00
parent b38a7c1c4c
commit 6b4a54a2c9
No known key found for this signature in database
GPG Key ID: 7638A987CE28ADFA
3 changed files with 73 additions and 42 deletions

17
src/analyze.rs Normal file
View File

@ -0,0 +1,17 @@
use std::collections::HashMap;
use crate::Answer;

/// Count how often each whitespace-separated word occurs across all answers.
///
/// Consumes the answers and returns a map from word to its occurrence count.
/// Counts saturate at `u16::MAX` rather than overflowing (the old `+= 1`
/// would panic in debug builds / wrap in release on very large corpora).
pub fn analyze_frequencies(answers: Vec<Answer>) -> HashMap<String, u16> {
    let mut out: HashMap<String, u16> = HashMap::new();
    for answer in answers {
        // `split_whitespace` already splits on '\n' (and every other Unicode
        // whitespace run), so the previous `replace("\n", " ")` only cost an
        // extra String allocation per answer and is dropped here.
        for word in answer.content.split_whitespace() {
            let count = out.entry(word.to_string()).or_insert(0);
            *count = count.saturating_add(1);
        }
    }
    out
}

View File

@ -32,7 +32,7 @@ macro_rules! skip_fail_opt {
pub struct Answer {
upvotes: u32,
author: String,
content: String,
pub content: String,
}
/// Get all answers from a stackoverflow domain. No error handling is done so get ready to either

View File

@ -1,5 +1,7 @@
use fantoccini::{Client, ClientBuilder};
pub mod analyze;
pub mod collector;
use analyze::*;
use clap::Parser;
use collector::*;
use fern::{
@ -7,7 +9,7 @@ use fern::{
colors::{Color, ColoredLevelConfig},
};
use log::{debug, error, info, trace, warn};
use std::process::exit;
use std::{path::PathBuf, process::exit};
#[derive(Debug, Parser, Clone)]
#[command(about = "Scrape stackoverflow for something idk")]
@ -21,6 +23,8 @@ pub struct Args {
pages: u16,
#[clap(short, long, default_value_t = log::LevelFilter::Info)]
log_level: log::LevelFilter,
#[clap(short, long)]
answers_file: Option<PathBuf>,
}
#[tokio::main]
@ -28,49 +32,59 @@ async fn main() {
let args = Args::parse();
init_fern(args.log_level);
let start = std::time::Instant::now();
info!("Spawning client");
let c: Client = ClientBuilder::native()
.connect("http://localhost:4444")
.await
.unwrap_or_else(|e| {
error!("Error: {e}");
panic!();
});
info!("Getting links");
let links = get_top_links(&c, args.pages)
.await
.expect("Failed to get links. Exiting");
info!("Got {} links. Expected {}", links.len(), args.pages * 15);
info!("Getting answers");
let mut answers = vec![];
for (i, link) in links.iter().enumerate() {
answers.append(
&mut get_answers(
&c,
format!("https://stackoverflow.com{}", link).as_str(),
i,
links.len(),
)
if let Some(path) = args.answers_file {
let answers = serde_json::from_str(&std::fs::read_to_string(path).unwrap()).unwrap();
let freqs = analyze_frequencies(answers);
let mut freqs = freqs.iter().collect::<Vec<(&String, &u16)>>();
freqs.sort_by(|a, b| b.1.cmp(&a.1));
for i in &freqs[0..100] {
println!("{} : {}", i.0, i.1);
}
} else {
let start = std::time::Instant::now();
info!("Spawning client");
let c: Client = ClientBuilder::native()
.connect("http://localhost:4444")
.await
.unwrap_or_default(),
.unwrap_or_else(|e| {
error!("Error: {e}");
panic!();
});
info!("Getting links");
let links = get_top_links(&c, args.pages)
.await
.expect("Failed to get links. Exiting");
info!("Got {} links. Expected {}", links.len(), args.pages * 15);
info!("Getting answers");
let mut answers = vec![];
for (i, link) in links.iter().enumerate() {
answers.append(
&mut get_answers(
&c,
format!("https://stackoverflow.com{}", link).as_str(),
i,
links.len(),
)
.await
.unwrap_or_default(),
);
}
info!(
"Got {} answers in {} sec",
answers.len(),
start.elapsed().as_secs_f32()
);
c.close().await.unwrap();
info!("Writing answers to answers.json");
let _ = std::fs::write(
"answers.json",
serde_json::to_string(&answers).unwrap_or_else(|e| {
error!("Error: {}", e);
panic!();
}),
);
}
info!(
"Got {} answers in {} sec",
answers.len(),
start.elapsed().as_secs_f32()
);
c.close().await.unwrap();
info!("Writing answers to answers.json");
std::fs::write(
"answers.json",
serde_json::to_string(&answers).unwrap_or_else(|e| {
error!("Error: {}", e);
panic!();
}),
);
}
fn init_fern(level: log::LevelFilter) -> anyhow::Result<()> {