diff --git a/src/analyze.rs b/src/analyze.rs
new file mode 100644
index 0000000..c1cc06c
--- /dev/null
+++ b/src/analyze.rs
@@ -0,0 +1,18 @@
+use std::collections::HashMap;
+
+use crate::Answer;
+
+/// Count how often each whitespace-separated word appears across all answers.
+pub fn analyze_frequencies(answers: Vec<Answer>) -> HashMap<String, u32> {
+    let mut out: HashMap<String, u32> = HashMap::new();
+
+    for answer in answers {
+        for word in answer.content.replace("\n", " ").split_whitespace() {
+            out.entry(word.to_string())
+                .and_modify(|count| *count += 1)
+                .or_insert(1);
+        }
+    }
+
+    out
+}
diff --git a/src/collector.rs b/src/collector.rs
index fbae32a..475f5c1 100644
--- a/src/collector.rs
+++ b/src/collector.rs
@@ -32,7 +32,7 @@ macro_rules! skip_fail_opt {
 pub struct Answer {
     upvotes: u32,
     author: String,
-    content: String,
+    pub content: String,
 }
 
 /// Get all answers from a stackoverflow domain. No error handling is done so get ready to either
diff --git a/src/main.rs b/src/main.rs
index 54b9d53..efd5cb6 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,7 @@
 use fantoccini::{Client, ClientBuilder};
+pub mod analyze;
 pub mod collector;
+use analyze::*;
 use clap::Parser;
 use collector::*;
 use fern::{
@@ -7,7 +9,7 @@ use fern::{
     colors::{Color, ColoredLevelConfig},
 };
 use log::{debug, error, info, trace, warn};
-use std::process::exit;
+use std::{path::PathBuf, process::exit};
 
 #[derive(Debug, Parser, Clone)]
 #[command(about = "Scrape stackoverflow for something idk")]
@@ -21,6 +23,8 @@ pub struct Args {
     pages: u16,
     #[clap(short, long, default_value_t = log::LevelFilter::Info)]
     log_level: log::LevelFilter,
+    #[clap(short, long)]
+    answers_file: Option<PathBuf>,
 }
 
 #[tokio::main]
@@ -28,49 +32,60 @@ async fn main() {
     let args = Args::parse();
     init_fern(args.log_level);
 
-    let start = std::time::Instant::now();
-    info!("Spawning client");
-    let c: Client = ClientBuilder::native()
-        .connect("http://localhost:4444")
-        .await
-        .unwrap_or_else(|e| {
-            error!("Error: {e}");
-            panic!();
-        });
-
-    info!("Getting links");
-    let links = get_top_links(&c, args.pages)
-        .await
-        .expect("Failed to get links. Exiting");
-    info!("Got {} links. Expected {}", links.len(), args.pages * 15);
-    info!("Getting answers");
-    let mut answers = vec![];
-    for (i, link) in links.iter().enumerate() {
-        answers.append(
-            &mut get_answers(
-                &c,
-                format!("https://stackoverflow.com{}", link).as_str(),
-                i,
-                links.len(),
-            )
+    // Offline mode: when --answers-file is given, analyze an existing dump instead of scraping.
+    if let Some(path) = args.answers_file {
+        let answers = serde_json::from_str(&std::fs::read_to_string(path).unwrap()).unwrap();
+        let freqs = analyze_frequencies(answers);
+        let mut freqs = freqs.iter().collect::<Vec<_>>();
+        freqs.sort_by(|a, b| b.1.cmp(&a.1));
+        for (word, count) in freqs.iter().take(100) {
+            println!("{} : {}", word, count);
+        }
+    } else {
+        let start = std::time::Instant::now();
+        info!("Spawning client");
+        let c: Client = ClientBuilder::native()
+            .connect("http://localhost:4444")
             .await
-            .unwrap_or_default(),
+            .unwrap_or_else(|e| {
+                error!("Error: {e}");
+                panic!();
+            });
+
+        info!("Getting links");
+        let links = get_top_links(&c, args.pages)
+            .await
+            .expect("Failed to get links. Exiting");
+        info!("Got {} links. Expected {}", links.len(), args.pages * 15);
+        info!("Getting answers");
+        let mut answers = vec![];
+        for (i, link) in links.iter().enumerate() {
+            answers.append(
+                &mut get_answers(
+                    &c,
+                    format!("https://stackoverflow.com{}", link).as_str(),
+                    i,
+                    links.len(),
+                )
+                .await
+                .unwrap_or_default(),
+            );
+        }
+        info!(
+            "Got {} answers in {} sec",
+            answers.len(),
+            start.elapsed().as_secs_f32()
+        );
+        c.close().await.unwrap();
+        info!("Writing answers to answers.json");
+        let _ = std::fs::write(
+            "answers.json",
+            serde_json::to_string(&answers).unwrap_or_else(|e| {
+                error!("Error: {}", e);
+                panic!();
+            }),
         );
     }
-    info!(
-        "Got {} answers in {} sec",
-        answers.len(),
-        start.elapsed().as_secs_f32()
-    );
-    c.close().await.unwrap();
-    info!("Writing answers to answers.json");
-    std::fs::write(
-        "answers.json",
-        serde_json::to_string(&answers).unwrap_or_else(|e| {
-            error!("Error: {}", e);
-            panic!();
-        }),
-    );
 }
 
 fn init_fern(level: log::LevelFilter) -> anyhow::Result<()> {