Add gitignore
This commit is contained in:
parent
73648e0ce0
commit
ce25603c94
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
upstream/
|
6
LICENSE
6
LICENSE
@ -631,8 +631,8 @@ to attach them to the start of each source file to most effectively
|
|||||||
state the exclusion of warranty; and each file should have at least
|
state the exclusion of warranty; and each file should have at least
|
||||||
the "copyright" line and a pointer to where the full notice is found.
|
the "copyright" line and a pointer to where the full notice is found.
|
||||||
|
|
||||||
<one line to give the program's name and a brief idea of what it does.>
|
mozcdict-ext - Convert external words into Mozc system dictionary
|
||||||
Copyright (C) <year> <name of author>
|
Copyright (C) 2023 Masaki Haruka
|
||||||
|
|
||||||
This program is free software: you can redistribute it and/or modify
|
This program is free software: you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -652,7 +652,7 @@ Also add information on how to contact you by electronic and paper mail.
|
|||||||
If the program does terminal interaction, make it output a short
|
If the program does terminal interaction, make it output a short
|
||||||
notice like this when it starts in an interactive mode:
|
notice like this when it starts in an interactive mode:
|
||||||
|
|
||||||
<program> Copyright (C) <year> <name of author>
|
mozcdict-ext Copyright (C) 2023 Masaki Haruka
|
||||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||||
This is free software, and you are welcome to redistribute it
|
This is free software, and you are welcome to redistribute it
|
||||||
under certain conditions; type `show c' for details.
|
under certain conditions; type `show c' for details.
|
||||||
|
18
neologd/mkdict.zsh
Normal file
18
neologd/mkdict.zsh
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/zsh

# Fetch or refresh the mecab-ipadic-neologd checkout in ./upstream.
# Must be run from the directory that contains neologd.rb.

if [[ ! -e ./neologd.rb ]]
then
  print "Run this script on same directory as mkdict.zsh" >&2
  exit 1
fi

if [[ -e upstream ]]
then
  # Existing checkout: update it inside a subshell so the working directory
  # of the rest of the script is left untouched. Only pull when the cd
  # succeeded, otherwise git would operate on the wrong directory.
  (
    cd upstream && git pull
  )
else
  git clone 'https://github.com/neologd/mecab-ipadic-neologd.git' upstream
fi
|
||||||
|
|
121
neologd/neologd.rb
Normal file
121
neologd/neologd.rb
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
#! /usr/bin/env ruby
|
||||||
|
require 'nkf'
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# convert_neologd_to_mozcdic
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# Converts the NEologd seed CSV named by $filename into a Mozc dictionary
# file named by $dicname.
#
# Reads:  $filename (UTF-8 CSV) and ../mozc/id.def (relative to the current
#         working directory).
# Writes: $dicname, one "reading id id cost surface" line per unique
#         [reading, surface] pair.
# Raises: RuntimeError when id.def has no line for the
#         "名詞,固有名詞,一般,*,*,*,*" part of speech; Errno::ENOENT when an
#         input file is missing.
def convert_neologd_to_mozcdic
  entries = []

  # Seed columns:
  #   surface,left-id,right-id,cost,pos1,pos2,pos3,pos4,pos5,pos6,
  #   base form,reading,pronunciation
  # Example:
  #   新型コロナウイルス,1288,1288,4404,名詞,固有名詞,一般,*,*,*,
  #   新型コロナウイルス,シンガタコロナウイルス,シンガタコロナウイルス
  File.read($filename, encoding: "UTF-8").split("\n").each do |line|
    fields = line.split(",")

    # Keep nouns only. Skip "地域" (regions; the original author generates
    # place names from the postal code file) and "名" (given names).
    next if fields[4] != "名詞" || fields[6] == "地域" || fields[7] == "名"

    # The reading is column 12; the base form (column 11) becomes the surface.
    yomi = fields[11]
    hyouki = fields[10]

    # Rows with too few columns have no reading; skip them instead of
    # handing nil to NKF.
    next unless yomi

    # Convert the katakana reading to hiragana and normalise ゐ/ゑ.
    yomi = NKF.nkf("--hiragana -w -W", yomi).tr("ゐゑ", "いえ")

    # Skip readings that contain anything other than hiragana.
    next unless yomi.match?(/\A[ぁ-ゔー]*\z/)

    # Store as [reading, surface, cost].
    entries << [yomi, hyouki, fields[3].to_i]
  end

  # Sort, then keep the first entry of every [reading, surface] pair (the
  # one with the lowest raw cost). Comparing each entry only against entries
  # that precede it avoids wrapping around to the last element at index 0.
  entries = entries.sort.uniq { |entry| entry.first(2) }

  # Look up the Mozc part-of-speech id.
  # "名詞,固有名詞,人名,一般,*,*" is not used (low priority according to the
  # original author); "名詞,固有名詞,一般,*,*,*" is used instead.
  id_line = File.read("../mozc/id.def", encoding: "UTF-8").split("\n")
                .grep(/\ 名詞,固有名詞,一般,\*,\*,\*,\*/).first
  raise "../mozc/id.def has no entry for 名詞,固有名詞,一般,*,*,*,*" unless id_line

  id = id_line.split(" ")[0]

  # Write the dictionary; the block form closes the file even on errors.
  File.open($dicname, "w", encoding: "UTF-8") do |dicfile|
    entries.each do |yomi, hyouki, cost|
      # Negative costs are replaced by 8000, costs above 10000 are capped.
      cost = 8000 if cost < 0
      cost = 10000 if cost > 10000

      # Map the cost into the 6000..7000 range.
      cost = 6000 + (cost / 10)

      # Field order: reading, id, id, cost, surface.
      # NOTE(review): Mozc dictionary files are normally tab-separated;
      # confirm that a single space is the intended separator here.
      dicfile.puts [yomi, id, id, cost.to_s, hyouki].join(" ")
    end
  end
end
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# main
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
require 'open-uri'

# Find the seed version by scraping the file listing: take the text between
# the first "mecab-user-dict-seed." marker and the following ".csv.xz".
url = "https://github.com/neologd/mecab-ipadic-neologd/tree/master/seed"
neologdver = URI.open(url).read.split("mecab-user-dict-seed.")[1]
abort "Could not find a mecab-user-dict-seed file name on #{url}" unless neologdver

neologdver = neologdver.split(".csv.xz")[0].to_s

# The value comes from a web page and ends up in file names and command
# arguments, so accept only plain file-name characters.
unless neologdver.match?(/\A[\w.\-]+\z/)
  abort "Unexpected seed version scraped from #{url}: #{neologdver[0, 40].inspect}"
end

archive = "mecab-user-dict-seed.#{neologdver}.csv.xz"

# Argument-list form: no shell is involved, so the scraped text cannot be
# interpreted as shell syntax. -nc keeps an already downloaded archive.
system("wget", "-nc", "https://github.com/neologd/mecab-ipadic-neologd/raw/master/seed/#{archive}")
abort "Download failed: #{archive} not found" unless File.exist?(archive)

# -aos skips files that already exist. 7z's stdout is discarded, as it was
# when the command ran inside backticks.
system("7z", "x", "-aos", archive, out: File::NULL)

$filename = "mecab-user-dict-seed.#{neologdver}.csv"
abort "Extraction failed: #{$filename} not found" unless File.exist?($filename)

$dicname = "mozcdic-ut-neologd.txt"

convert_neologd_to_mozcdic
|
Loading…
x
Reference in New Issue
Block a user