【snownlp相关文件】上传自己的模型,调用utils/mynlp

This commit is contained in:
redhongx
2024-07-04 11:54:06 +08:00
parent f09fcb3000
commit 93b72ea2e0
33 changed files with 135956 additions and 0 deletions
+61
View File
@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import re
import codecs
from . import zh
from . import pinyin
# The data files ship next to this module, so resolve them relative to
# __file__ rather than the current working directory.
stop_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'stopwords.txt')
pinyin_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'pinyin.txt')

# Stop-word set: one entry per line of stopwords.txt, with surrounding
# whitespace (including the trailing newline) removed.
stop = set()
# The context manager guarantees the handle is closed even when reading or
# UTF-8 decoding raises part-way through the file.
with codecs.open(stop_path, 'r', 'utf-8') as fr:
    for word in fr:
        stop.add(word.strip())

# Pinyin lookup object built from pinyin.txt at import time.
pin = pinyin.PinYin(pinyin_path)

# Matches a maximal run of characters in the range U+4E00..U+9FA5.  The
# capturing group makes re_zh.split() keep the matched runs in its result.
re_zh = re.compile('([\u4E00-\u9FA5]+)')
def filter_stop(words):
    """Return a new list holding the items of *words* that are not stop words.

    Membership is tested against the module-level ``stop`` set; the input
    order is preserved and *words* itself is not modified.
    """
    return [token for token in words if token not in stop]
def zh2hans(sent):
    """Return ``zh.transfer(sent)``.

    NOTE(review): judging by the name this presumably converts text to
    Simplified Chinese -- confirm against the ``zh`` module.
    """
    converted = zh.transfer(sent)
    return converted
def get_sentences(doc):
line_break = re.compile('[\r\n]')
delimiter = re.compile('[,。?!;]')
sentences = []
for line in line_break.split(doc):
line = line.strip()
if not line:
continue
for sent in delimiter.split(line):
sent = sent.strip()
if not sent:
continue
sentences.append(sent)
return sentences
def get_pinyin(sentence):
    """Return a flat list of tokens for *sentence*.

    The sentence is split with the module-level ``re_zh`` pattern, which
    keeps the matched runs in the result.  Each matched run is passed to
    ``pin.get`` and the returned items are appended; every other chunk is
    split on whitespace and its words are appended unchanged.
    """
    tokens = []
    for chunk in re_zh.split(sentence):
        chunk = chunk.strip()
        if not chunk:
            continue
        if re_zh.match(chunk) is None:
            # str.split() with no argument never yields empty or
            # whitespace-padded items, so the words can be added directly.
            tokens.extend(chunk.split())
        else:
            tokens.extend(pin.get(chunk))
    return tokens
+26
View File
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import codecs
from ..utils.trie import Trie
class PinYin(object):
    """Lookup table loaded from a whitespace-separated text file into a Trie.

    For every non-blank line of the file, the first whitespace-separated
    token is inserted as the key and the remaining tokens (as a list) as its
    value.
    """

    def __init__(self, fname):
        """Build the trie from the UTF-8 encoded file *fname*.

        Raises whatever ``codecs.open`` raises when the file cannot be
        opened or decoded.
        """
        self.handle = Trie()
        # The context manager closes the file even if an insert fails.
        with codecs.open(fname, 'r', 'utf-8') as fr:
            for line in fr:
                words = line.split()
                if not words:
                    # A blank line has no key; skip it instead of failing
                    # with IndexError on words[0].
                    continue
                self.handle.insert(words[0], words[1:])

    def get(self, text):
        """Return the flattened result of ``self.handle.translate(text)``.

        Items that are lists or tuples contribute their elements; any other
        item is appended as-is.
        """
        ret = []
        for item in self.handle.translate(text):
            if isinstance(item, (list, tuple)):
                # extend() accepts tuples as well as lists, whereas
                # ``list + tuple`` raises TypeError.
                ret.extend(item)
            else:
                ret.append(item)
        return ret
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff