first commit
This commit is contained in:
commit
f66c86d74c
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
/target
|
||||
/dics
|
2094
Cargo.lock
generated
Normal file
2094
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
18
Cargo.toml
Normal file
18
Cargo.toml
Normal file
@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "pictsense-dict-crawler"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.86"
|
||||
better-panic = "0.3.0"
|
||||
chrono = { version = "0.4.38", features = ["serde"] }
|
||||
clap = { version = "4.5.11", features = ["derive"] }
|
||||
quick-xml = "0.36.1"
|
||||
regex = "1.10.5"
|
||||
reqwest = { version = "0.12.5", features = ["json"] }
|
||||
scraper = "0.19.1"
|
||||
serde = { version = "1.0.204", features = ["derive"] }
|
||||
serde-xml-rs = "0.6.0"
|
||||
serde_json = "1.0.120"
|
||||
tokio = { version = "1.39.1", features = ["full"] }
|
101
src/client.rs
Normal file
101
src/client.rs
Normal file
@ -0,0 +1,101 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::types::response::{
|
||||
DictionaryResponse, DictionaryResponseData,
|
||||
DictionaryResponseDataDicValueDic, DictionaryResponseDataDicValueUl,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DictionaryList {
|
||||
pub next: Option<u64>,
|
||||
pub dics: Vec<DictionaryResponseDataDicValueDic>,
|
||||
}
|
||||
|
||||
pub struct Client {
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn new() -> Self {
|
||||
let client = reqwest::Client::new();
|
||||
Self { client }
|
||||
}
|
||||
pub async fn get_dictionary_list(
|
||||
&self,
|
||||
query: &str,
|
||||
start: u64,
|
||||
) -> anyhow::Result<DictionaryList> {
|
||||
let url = "https://pictsense.com/get/dic/";
|
||||
let mut params = HashMap::new();
|
||||
let start = start.to_string();
|
||||
let now_unixtime = chrono::Utc::now().timestamp().to_string();
|
||||
params.insert("q", query);
|
||||
params.insert("s", &start);
|
||||
params.insert("t", "0");
|
||||
params.insert("at", &now_unixtime);
|
||||
|
||||
let res = self.client.post(url).form(¶ms).send().await?;
|
||||
let body = res.text().await?;
|
||||
let dics = match serde_xml_rs::from_str::<DictionaryResponse>(&body) {
|
||||
Ok(dic) => dic,
|
||||
Err(e) => return Err(anyhow::Error::new(e).context(body)),
|
||||
};
|
||||
|
||||
let dic = match dics {
|
||||
DictionaryResponse::Data(data) => match data {
|
||||
DictionaryResponseData::Dics(dic) => dic,
|
||||
},
|
||||
};
|
||||
|
||||
let dic_value = "<ul>".to_string() + dic.value.as_str() + "</ul>";
|
||||
|
||||
// ライブラリのバグを運用でカバー!!!!✨
|
||||
// SEE: https://github.com/RReverser/serde-xml-rs/issues/66
|
||||
let delete_span_regex = regex::Regex::new(r"<span>\([0-9]*\)<\/span>").unwrap();
|
||||
let dic_value = delete_span_regex.replace_all(&dic_value, "").to_string();
|
||||
|
||||
let dic_value: DictionaryResponseDataDicValueUl = serde_xml_rs::from_str(&dic_value)?;
|
||||
|
||||
Ok(DictionaryList {
|
||||
next: dic.next,
|
||||
dics: dic_value.li,
|
||||
})
|
||||
}
|
||||
pub async fn get_all_dictionary_list(
|
||||
&self,
|
||||
query: &str,
|
||||
) -> anyhow::Result<Vec<DictionaryResponseDataDicValueDic>> {
|
||||
let mut start = 0;
|
||||
let mut dics = Vec::new();
|
||||
loop {
|
||||
println!("start: {}", start);
|
||||
let dic = self.get_dictionary_list(query, start).await?;
|
||||
dics.extend(dic.dics);
|
||||
if dic.next.is_none() {
|
||||
break;
|
||||
};
|
||||
start = dic.next.unwrap();
|
||||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||
}
|
||||
Ok(dics)
|
||||
}
|
||||
pub async fn get_dictionary_values(&self, id: u64) -> anyhow::Result<Vec<String>> {
|
||||
let url = format!("https://pictsense.com/dic/n{}", id);
|
||||
|
||||
let res = self.client.get(&url).send().await?;
|
||||
let body = res.text().await?;
|
||||
let document = scraper::Html::parse_document(&body);
|
||||
|
||||
// <ul id="words">
|
||||
let selector = scraper::Selector::parse("ul#words").unwrap();
|
||||
let words_list = document.select(&selector).next().unwrap();
|
||||
|
||||
let mut words = Vec::new();
|
||||
|
||||
for element in words_list.select(&scraper::Selector::parse("li").unwrap()) {
|
||||
words.push(element.inner_html());
|
||||
}
|
||||
|
||||
Ok(words)
|
||||
}
|
||||
}
|
40
src/main.rs
Normal file
40
src/main.rs
Normal file
@ -0,0 +1,40 @@
|
||||
mod client;
|
||||
mod types;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
better_panic::install();
|
||||
let client = client::Client::new();
|
||||
|
||||
let dic = client.get_all_dictionary_list("").await.unwrap();
|
||||
|
||||
std::fs::create_dir_all("dics").unwrap();
|
||||
|
||||
let file = std::fs::File::create("./dics/dics.json").unwrap();
|
||||
serde_json::to_writer_pretty(file, &dic).unwrap();
|
||||
|
||||
for d in dic {
|
||||
let vec = client.get_dictionary_values(d.data_no).await.unwrap();
|
||||
|
||||
let file = std::fs::File::create(format!("./dics/{}.json", d.data_no)).unwrap();
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Dic {
|
||||
title: String,
|
||||
words: Vec<String>,
|
||||
}
|
||||
serde_json::to_writer_pretty(
|
||||
file,
|
||||
&Dic {
|
||||
title: d.title,
|
||||
words: vec,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
println!("saved: {}", d.data_no);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||
}
|
||||
|
||||
// println!("{:#?}", dic);
|
||||
}
|
1
src/types/mod.rs
Normal file
1
src/types/mod.rs
Normal file
@ -0,0 +1 @@
|
||||
pub mod response;
|
38
src/types/response.rs
Normal file
38
src/types/response.rs
Normal file
@ -0,0 +1,38 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum DictionaryResponse {
|
||||
Data(DictionaryResponseData),
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum DictionaryResponseData {
|
||||
Dics(DictionaryResponseDataDic),
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct DictionaryResponseDataDic {
|
||||
pub next: Option<u64>,
|
||||
#[serde(rename = "$value")]
|
||||
pub value: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct DictionaryResponseDataDicValueUl {
|
||||
// #[serde(rename = "$value")]
|
||||
pub li: Vec<DictionaryResponseDataDicValueDic>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct DictionaryResponseDataDicValueDic {
|
||||
#[serde(rename = "data-no")]
|
||||
pub data_no: u64,
|
||||
|
||||
#[serde(rename = "data-count")]
|
||||
pub data_count: u64,
|
||||
|
||||
#[serde(rename = "$value")]
|
||||
pub title: String,
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user