first commit
This commit is contained in:
commit
f66c86d74c
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
/target
|
||||||
|
/dics
|
2094
Cargo.lock
generated
Normal file
2094
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
18
Cargo.toml
Normal file
18
Cargo.toml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
[package]
|
||||||
|
name = "pictsense-dict-crawler"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
anyhow = "1.0.86"
|
||||||
|
better-panic = "0.3.0"
|
||||||
|
chrono = { version = "0.4.38", features = ["serde"] }
|
||||||
|
clap = { version = "4.5.11", features = ["derive"] }
|
||||||
|
quick-xml = "0.36.1"
|
||||||
|
regex = "1.10.5"
|
||||||
|
reqwest = { version = "0.12.5", features = ["json"] }
|
||||||
|
scraper = "0.19.1"
|
||||||
|
serde = { version = "1.0.204", features = ["derive"] }
|
||||||
|
serde-xml-rs = "0.6.0"
|
||||||
|
serde_json = "1.0.120"
|
||||||
|
tokio = { version = "1.39.1", features = ["full"] }
|
101
src/client.rs
Normal file
101
src/client.rs
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use crate::types::response::{
|
||||||
|
DictionaryResponse, DictionaryResponseData,
|
||||||
|
DictionaryResponseDataDicValueDic, DictionaryResponseDataDicValueUl,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct DictionaryList {
|
||||||
|
pub next: Option<u64>,
|
||||||
|
pub dics: Vec<DictionaryResponseDataDicValueDic>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct Client {
|
||||||
|
client: reqwest::Client,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Client {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
let client = reqwest::Client::new();
|
||||||
|
Self { client }
|
||||||
|
}
|
||||||
|
pub async fn get_dictionary_list(
|
||||||
|
&self,
|
||||||
|
query: &str,
|
||||||
|
start: u64,
|
||||||
|
) -> anyhow::Result<DictionaryList> {
|
||||||
|
let url = "https://pictsense.com/get/dic/";
|
||||||
|
let mut params = HashMap::new();
|
||||||
|
let start = start.to_string();
|
||||||
|
let now_unixtime = chrono::Utc::now().timestamp().to_string();
|
||||||
|
params.insert("q", query);
|
||||||
|
params.insert("s", &start);
|
||||||
|
params.insert("t", "0");
|
||||||
|
params.insert("at", &now_unixtime);
|
||||||
|
|
||||||
|
let res = self.client.post(url).form(¶ms).send().await?;
|
||||||
|
let body = res.text().await?;
|
||||||
|
let dics = match serde_xml_rs::from_str::<DictionaryResponse>(&body) {
|
||||||
|
Ok(dic) => dic,
|
||||||
|
Err(e) => return Err(anyhow::Error::new(e).context(body)),
|
||||||
|
};
|
||||||
|
|
||||||
|
let dic = match dics {
|
||||||
|
DictionaryResponse::Data(data) => match data {
|
||||||
|
DictionaryResponseData::Dics(dic) => dic,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
let dic_value = "<ul>".to_string() + dic.value.as_str() + "</ul>";
|
||||||
|
|
||||||
|
// ライブラリのバグを運用でカバー!!!!✨
|
||||||
|
// SEE: https://github.com/RReverser/serde-xml-rs/issues/66
|
||||||
|
let delete_span_regex = regex::Regex::new(r"<span>\([0-9]*\)<\/span>").unwrap();
|
||||||
|
let dic_value = delete_span_regex.replace_all(&dic_value, "").to_string();
|
||||||
|
|
||||||
|
let dic_value: DictionaryResponseDataDicValueUl = serde_xml_rs::from_str(&dic_value)?;
|
||||||
|
|
||||||
|
Ok(DictionaryList {
|
||||||
|
next: dic.next,
|
||||||
|
dics: dic_value.li,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
pub async fn get_all_dictionary_list(
|
||||||
|
&self,
|
||||||
|
query: &str,
|
||||||
|
) -> anyhow::Result<Vec<DictionaryResponseDataDicValueDic>> {
|
||||||
|
let mut start = 0;
|
||||||
|
let mut dics = Vec::new();
|
||||||
|
loop {
|
||||||
|
println!("start: {}", start);
|
||||||
|
let dic = self.get_dictionary_list(query, start).await?;
|
||||||
|
dics.extend(dic.dics);
|
||||||
|
if dic.next.is_none() {
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
start = dic.next.unwrap();
|
||||||
|
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||||
|
}
|
||||||
|
Ok(dics)
|
||||||
|
}
|
||||||
|
pub async fn get_dictionary_values(&self, id: u64) -> anyhow::Result<Vec<String>> {
|
||||||
|
let url = format!("https://pictsense.com/dic/n{}", id);
|
||||||
|
|
||||||
|
let res = self.client.get(&url).send().await?;
|
||||||
|
let body = res.text().await?;
|
||||||
|
let document = scraper::Html::parse_document(&body);
|
||||||
|
|
||||||
|
// <ul id="words">
|
||||||
|
let selector = scraper::Selector::parse("ul#words").unwrap();
|
||||||
|
let words_list = document.select(&selector).next().unwrap();
|
||||||
|
|
||||||
|
let mut words = Vec::new();
|
||||||
|
|
||||||
|
for element in words_list.select(&scraper::Selector::parse("li").unwrap()) {
|
||||||
|
words.push(element.inner_html());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(words)
|
||||||
|
}
|
||||||
|
}
|
40
src/main.rs
Normal file
40
src/main.rs
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
mod client;
|
||||||
|
mod types;
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
better_panic::install();
|
||||||
|
let client = client::Client::new();
|
||||||
|
|
||||||
|
let dic = client.get_all_dictionary_list("").await.unwrap();
|
||||||
|
|
||||||
|
std::fs::create_dir_all("dics").unwrap();
|
||||||
|
|
||||||
|
let file = std::fs::File::create("./dics/dics.json").unwrap();
|
||||||
|
serde_json::to_writer_pretty(file, &dic).unwrap();
|
||||||
|
|
||||||
|
for d in dic {
|
||||||
|
let vec = client.get_dictionary_values(d.data_no).await.unwrap();
|
||||||
|
|
||||||
|
let file = std::fs::File::create(format!("./dics/{}.json", d.data_no)).unwrap();
|
||||||
|
|
||||||
|
#[derive(serde::Serialize)]
|
||||||
|
struct Dic {
|
||||||
|
title: String,
|
||||||
|
words: Vec<String>,
|
||||||
|
}
|
||||||
|
serde_json::to_writer_pretty(
|
||||||
|
file,
|
||||||
|
&Dic {
|
||||||
|
title: d.title,
|
||||||
|
words: vec,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
println!("saved: {}", d.data_no);
|
||||||
|
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// println!("{:#?}", dic);
|
||||||
|
}
|
1
src/types/mod.rs
Normal file
1
src/types/mod.rs
Normal file
@ -0,0 +1 @@
|
|||||||
|
pub mod response;
|
38
src/types/response.rs
Normal file
38
src/types/response.rs
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
#[serde(rename_all = "kebab-case")]
|
||||||
|
pub enum DictionaryResponse {
|
||||||
|
Data(DictionaryResponseData),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
#[serde(rename_all = "kebab-case")]
|
||||||
|
pub enum DictionaryResponseData {
|
||||||
|
Dics(DictionaryResponseDataDic),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
pub struct DictionaryResponseDataDic {
|
||||||
|
pub next: Option<u64>,
|
||||||
|
#[serde(rename = "$value")]
|
||||||
|
pub value: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
pub struct DictionaryResponseDataDicValueUl {
|
||||||
|
// #[serde(rename = "$value")]
|
||||||
|
pub li: Vec<DictionaryResponseDataDicValueDic>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
pub struct DictionaryResponseDataDicValueDic {
|
||||||
|
#[serde(rename = "data-no")]
|
||||||
|
pub data_no: u64,
|
||||||
|
|
||||||
|
#[serde(rename = "data-count")]
|
||||||
|
pub data_count: u64,
|
||||||
|
|
||||||
|
#[serde(rename = "$value")]
|
||||||
|
pub title: String,
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user