first commit

This commit is contained in:
sim1222 2024-07-26 22:56:39 +09:00
commit f66c86d74c
Signed by: sim1222
GPG Key ID: D1AE30E316E44E5D
7 changed files with 2294 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/target
/dics

2094
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

18
Cargo.toml Normal file
View File

@ -0,0 +1,18 @@
[package]
name = "pictsense-dict-crawler"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0.86"
better-panic = "0.3.0"
chrono = { version = "0.4.38", features = ["serde"] }
clap = { version = "4.5.11", features = ["derive"] }
quick-xml = "0.36.1"
regex = "1.10.5"
reqwest = { version = "0.12.5", features = ["json"] }
scraper = "0.19.1"
serde = { version = "1.0.204", features = ["derive"] }
serde-xml-rs = "0.6.0"
serde_json = "1.0.120"
tokio = { version = "1.39.1", features = ["full"] }

101
src/client.rs Normal file
View File

@ -0,0 +1,101 @@
use std::collections::HashMap;
use crate::types::response::{
DictionaryResponse, DictionaryResponseData,
DictionaryResponseDataDicValueDic, DictionaryResponseDataDicValueUl,
};
/// One page of results returned by `Client::get_dictionary_list`.
#[derive(Debug)]
pub struct DictionaryList {
/// Value of the response's `next` field, if any. `Client::get_all_dictionary_list`
/// passes it as `start` for the following request and stops paging when it is `None`.
pub next: Option<u64>,
/// Dictionary entries parsed from this page.
pub dics: Vec<DictionaryResponseDataDicValueDic>,
}
/// Client for the pictsense.com dictionary endpoints used by this crawler.
pub struct Client {
/// Underlying HTTP client; every request made by this type goes through it.
client: reqwest::Client,
}
impl Client {
    /// Creates a client backed by a fresh `reqwest::Client`.
    pub fn new() -> Self {
        Self {
            client: reqwest::Client::new(),
        }
    }

    /// Fetches one page of the dictionary listing.
    ///
    /// * `query` - sent as the `q` form field.
    /// * `start` - sent as the `s` form field; callers pass `0` first and then the `next`
    ///   value of the previous page.
    ///
    /// # Errors
    ///
    /// Fails when the request cannot be sent, the body cannot be read, or either of the two
    /// XML parsing passes fails. When the outer parse fails, the raw response body is attached
    /// to the error as context.
    pub async fn get_dictionary_list(
        &self,
        query: &str,
        start: u64,
    ) -> anyhow::Result<DictionaryList> {
        // Compiled once per process instead of once per page request.
        static DELETE_SPAN_REGEX: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();

        let url = "https://pictsense.com/get/dic/";
        let start = start.to_string();
        // `at` carries the current Unix time in seconds.
        let now_unixtime = chrono::Utc::now().timestamp().to_string();

        let mut params = HashMap::new();
        params.insert("q", query);
        params.insert("s", start.as_str());
        // NOTE(review): meaning of `t=0` is not visible here — confirm against the site.
        params.insert("t", "0");
        params.insert("at", now_unixtime.as_str());

        let res = self.client.post(url).form(&params).send().await?;
        let body = res.text().await?;

        let response = match serde_xml_rs::from_str::<DictionaryResponse>(&body) {
            Ok(parsed) => parsed,
            // Keep the raw body in the error chain so unexpected replies can be diagnosed.
            Err(e) => return Err(anyhow::Error::new(e).context(body)),
        };
        // Both enums have a single variant, so this pattern is irrefutable; adding a variant
        // later turns this line into a compile error instead of a silent fallthrough.
        let DictionaryResponse::Data(DictionaryResponseData::Dics(dic)) = response;

        // The payload is a bare run of `<li>` items; wrap it so it parses as one document.
        let dic_value = format!("<ul>{}</ul>", dic.value);

        // Workaround for a library bug: strip every `<span>(N)</span>` fragment before the
        // second parse.
        // SEE: https://github.com/RReverser/serde-xml-rs/issues/66
        let delete_span_regex = DELETE_SPAN_REGEX.get_or_init(|| {
            regex::Regex::new(r"<span>\([0-9]*\)<\/span>").expect("hard-coded regex is valid")
        });
        let dic_value = delete_span_regex.replace_all(&dic_value, "");
        let dic_value: DictionaryResponseDataDicValueUl = serde_xml_rs::from_str(&dic_value)?;

        Ok(DictionaryList {
            next: dic.next,
            dics: dic_value.li,
        })
    }

    /// Fetches every page of the dictionary listing for `query` and concatenates the entries.
    ///
    /// Starts at offset `0`, follows each page's `next` value, and stops at the first page
    /// without one. Prints the current offset before each request and sleeps 100 ms between
    /// requests.
    ///
    /// # Errors
    ///
    /// Returns the first error produced by [`Client::get_dictionary_list`]; entries collected
    /// so far are discarded.
    pub async fn get_all_dictionary_list(
        &self,
        query: &str,
    ) -> anyhow::Result<Vec<DictionaryResponseDataDicValueDic>> {
        let mut start = 0;
        let mut dics = Vec::new();
        loop {
            println!("start: {}", start);
            let page = self.get_dictionary_list(query, start).await?;
            dics.extend(page.dics);
            let Some(next) = page.next else {
                break;
            };
            start = next;
            tokio::time::sleep(std::time::Duration::from_millis(100)).await;
        }
        Ok(dics)
    }

    /// Downloads the page of dictionary `id` and returns the inner HTML of every `<li>` found
    /// under its `<ul id="words">` element.
    ///
    /// # Errors
    ///
    /// Fails when the request cannot be sent, the body cannot be read, or the page contains no
    /// `ul#words` element (previously this last case panicked).
    pub async fn get_dictionary_values(&self, id: u64) -> anyhow::Result<Vec<String>> {
        let url = format!("https://pictsense.com/dic/n{}", id);
        let res = self.client.get(&url).send().await?;
        let body = res.text().await?;
        let document = scraper::Html::parse_document(&body);

        // <ul id="words">
        let list_selector =
            scraper::Selector::parse("ul#words").expect("hard-coded selector is valid");
        let item_selector = scraper::Selector::parse("li").expect("hard-coded selector is valid");

        let words_list = document
            .select(&list_selector)
            .next()
            .ok_or_else(|| anyhow::anyhow!("`ul#words` not found in {}", url))?;

        Ok(words_list
            .select(&item_selector)
            .map(|item| item.inner_html())
            .collect())
    }
}

impl Default for Client {
    /// Equivalent to [`Client::new`].
    fn default() -> Self {
        Self::new()
    }
}

40
src/main.rs Normal file
View File

@ -0,0 +1,40 @@
mod client;
mod types;
#[tokio::main]
async fn main() {
better_panic::install();
let client = client::Client::new();
let dic = client.get_all_dictionary_list("").await.unwrap();
std::fs::create_dir_all("dics").unwrap();
let file = std::fs::File::create("./dics/dics.json").unwrap();
serde_json::to_writer_pretty(file, &dic).unwrap();
for d in dic {
let vec = client.get_dictionary_values(d.data_no).await.unwrap();
let file = std::fs::File::create(format!("./dics/{}.json", d.data_no)).unwrap();
#[derive(serde::Serialize)]
struct Dic {
title: String,
words: Vec<String>,
}
serde_json::to_writer_pretty(
file,
&Dic {
title: d.title,
words: vec,
},
)
.unwrap();
println!("saved: {}", d.data_no);
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
}
// println!("{:#?}", dic);
}

1
src/types/mod.rs Normal file
View File

@ -0,0 +1 @@
pub mod response;

38
src/types/response.rs Normal file
View File

@ -0,0 +1,38 @@
use serde::{Deserialize, Serialize};
/// Outermost layer of the XML reply parsed by `Client::get_dictionary_list`.
///
/// With `rename_all = "kebab-case"` the `Data` variant uses the serde name `data`.
#[derive(Debug, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub enum DictionaryResponse {
/// The only reply shape this crate models.
Data(DictionaryResponseData),
}
/// Second layer of the XML reply, nested inside `DictionaryResponse::Data`.
///
/// With `rename_all = "kebab-case"` the `Dics` variant uses the serde name `dics`.
#[derive(Debug, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub enum DictionaryResponseData {
/// Holds the listing payload and the optional paging value.
Dics(DictionaryResponseDataDic),
}
/// Payload of the `dics` layer of the reply.
#[derive(Debug, Deserialize, Serialize)]
pub struct DictionaryResponseDataDic {
/// Optional paging value; the client forwards it as the `start` of the next request.
pub next: Option<u64>,
/// Content of the node (serde name `$value`) kept as a raw string; the client wraps it in
/// `<ul>…</ul>` and parses it a second time.
#[serde(rename = "$value")]
pub value: String,
}
/// Shape of the synthetic `<ul>…</ul>` document the client builds from the listing payload.
#[derive(Debug, Deserialize, Serialize)]
pub struct DictionaryResponseDataDicValueUl {
// NOTE(review): disabled attribute left over from the original author — presumably an
// earlier attempt at mapping the children; confirm before deleting.
// #[serde(rename = "$value")]
/// Entries collected under the serde name `li`.
pub li: Vec<DictionaryResponseDataDicValueDic>,
}
/// A single dictionary entry taken from the listing.
///
/// The container-level `kebab-case` rule gives `data_no` and `data_count` the serde names
/// `data-no` and `data-count`; `title` keeps its explicit `$value` name.
#[derive(Debug, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub struct DictionaryResponseDataDicValueDic {
    /// Dictionary number; used to build the dictionary page URL and the output file name.
    pub data_no: u64,
    /// Numeric `data-count` value (presumably the number of words — confirm against the site).
    pub data_count: u64,
    /// Content of the entry, used as the dictionary title.
    #[serde(rename = "$value")]
    pub title: String,
}