【snownlp相关文件】上传自己的模型,调用utils/mynlp
This commit is contained in:
@@ -0,0 +1,85 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from . import normal
|
||||
from . import seg
|
||||
from . import tag
|
||||
from . import sentiment
|
||||
from .sim import bm25
|
||||
from .summary import textrank
|
||||
from .summary import words_merge
|
||||
|
||||
|
||||
class SnowNLP(object):
    """Facade exposing the package's text helpers for a single document.

    The document is stored as ``doc`` and is also handed unchanged to
    ``bm25.BM25`` so that ``tf``, ``idf`` and ``sim`` can be served from it.
    """

    def __init__(self, doc):
        self.doc = doc
        self.bm25 = bm25.BM25(doc)

    @staticmethod
    def _tokenize_sentences(sentences):
        """Segment every sentence and drop stop words from each result."""
        return [normal.filter_stop(seg.seg(sentence))
                for sentence in sentences]

    @property
    def words(self):
        """Result of ``seg.seg`` applied to the document."""
        return seg.seg(self.doc)

    @property
    def sentences(self):
        """Result of ``normal.get_sentences`` applied to the document."""
        return normal.get_sentences(self.doc)

    @property
    def han(self):
        """Result of ``normal.zh2hans`` applied to the document."""
        return normal.zh2hans(self.doc)

    @property
    def pinyin(self):
        """Result of ``normal.get_pinyin`` applied to the document."""
        return normal.get_pinyin(self.doc)

    @property
    def sentiments(self):
        """Result of ``sentiment.classify`` applied to the document."""
        return sentiment.classify(self.doc)

    @property
    def tags(self):
        """Pair every segmented word with the tag from ``tag.tag``."""
        tokens = self.words
        return zip(tokens, tag.tag(tokens))

    @property
    def tf(self):
        """Per-document term counts held by the BM25 helper (``bm25.f``)."""
        return self.bm25.f

    @property
    def idf(self):
        """Inverse document frequencies held by the BM25 helper."""
        return self.bm25.idf

    def sim(self, doc):
        """Return ``bm25.simall(doc)``: one score per indexed document."""
        return self.bm25.simall(doc)

    def summary(self, limit=5):
        """Return the sentences TextRank ranks highest.

        Args:
            limit: passed to ``TextRank.top_index``.

        Returns:
            A list of sentences taken from ``self.sentences``.
        """
        sentences = self.sentences
        ranker = textrank.TextRank(self._tokenize_sentences(sentences))
        ranker.solve()
        return [sentences[index] for index in ranker.top_index(limit)]

    def keywords(self, limit=5, merge=False):
        """Return the words KeywordTextRank ranks highest.

        Args:
            limit: passed to ``KeywordTextRank.top_index``.
            merge: when true, post-process the words with
                ``words_merge.SimpleMerge`` and return its result instead.
        """
        ranker = textrank.KeywordTextRank(
            self._tokenize_sentences(self.sentences))
        ranker.solve()
        top_words = list(ranker.top_index(limit))
        if not merge:
            return top_words
        return words_merge.SimpleMerge(self.doc, top_words).merge()
|
||||
@@ -0,0 +1,78 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
import gzip
|
||||
import marshal
|
||||
from math import log, exp
|
||||
|
||||
from ..utils.frequency import AddOneProb
|
||||
|
||||
|
||||
class Bayes(object):
    """Naive Bayes classifier over bags of words.

    Attributes:
        d: maps each class label to an ``AddOneProb`` table of the words
            seen for that label.
        total: sum of ``getsum()`` over all per-label tables.
    """

    def __init__(self):
        self.d = {}
        self.total = 0

    def save(self, fname, iszip=True):
        """Serialise the model with ``marshal``.

        Args:
            fname: target path; ``'.3'`` is appended under Python 3.
            iszip: write through ``gzip`` when true, plain bytes otherwise.
        """
        payload = {
            'total': self.total,
            'd': dict((label, table.__dict__)
                      for label, table in self.d.items()),
        }
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        opener = gzip.open if iszip else open
        # The context manager closes the handle even if writing fails.
        with opener(fname, 'wb') as f:
            f.write(marshal.dumps(payload))

    def load(self, fname, iszip=True):
        """Restore a model written by ``save``.

        Args:
            fname: source path; ``'.3'`` is appended under Python 3.
            iszip: try ``gzip`` first when true; if that raises ``IOError``
                the file is re-read as plain bytes.
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                raw = f.read()
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    raw = f.read()
            except IOError:
                # Not gzip data: fall back to reading the raw file.
                with open(fname, 'rb') as f:
                    raw = f.read()
        d = marshal.loads(raw)
        self.total = d['total']
        self.d = {}
        for k, v in d['d'].items():
            self.d[k] = AddOneProb()
            self.d[k].__dict__ = v

    def train(self, data):
        """Accumulate word counts from labelled samples.

        Args:
            data: iterable of samples where ``sample[0]`` is an iterable of
                words and ``sample[1]`` is the class label.
        """
        for sample in data:
            label = sample[1]
            if label not in self.d:
                self.d[label] = AddOneProb()
            for word in sample[0]:
                self.d[label].add(word, 1)
        self.total = sum(table.getsum() for table in self.d.values())

    def classify(self, x):
        """Return ``(label, probability)`` for the most likely class.

        Args:
            x: sequence of words; it is iterated once per known class.

        Returns:
            The best label and its normalised posterior, or ``(0, 0)``
            when no class has a positive probability.
        """
        scores = {}
        for k in self.d:
            scores[k] = log(self.d[k].getsum()) - log(self.total)
            for word in x:
                scores[k] += log(self.d[k].freq(word))
        ret, prob = 0, 0
        for k in self.d:
            now = 0
            try:
                # 1 / sum(exp(other - k)) normalises in log space.
                for otherk in self.d:
                    now += exp(scores[otherk]-scores[k])
                now = 1/now
            except OverflowError:
                now = 0
            if now > prob:
                ret, prob = k, now
        return (ret, prob)
|
||||
@@ -0,0 +1,61 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import re
|
||||
import codecs
|
||||
|
||||
from . import zh
|
||||
from . import pinyin
|
||||
|
||||
# Data files are looked up next to this module.
stop_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'stopwords.txt')
pinyin_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'pinyin.txt')

# Stop-word set: one stripped entry per line of the stop-word file.
stop = set()
# ``with`` guarantees the file is closed even if decoding a line fails.
with codecs.open(stop_path, 'r', 'utf-8') as fr:
    for word in fr:
        stop.add(word.strip())

pin = pinyin.PinYin(pinyin_path)
# Captures runs of characters in the U+4E00..U+9FA5 range.
re_zh = re.compile('([\u4E00-\u9FA5]+)')
|
||||
|
||||
|
||||
def filter_stop(words):
    """Return a list of the entries of ``words`` that are not in ``stop``."""
    return [word for word in words if word not in stop]
|
||||
|
||||
|
||||
def zh2hans(sent):
    """Return ``zh.transfer(sent)``.

    NOTE(review): presumably converts traditional to simplified Chinese;
    confirm against the ``zh`` module.
    """
    converted = zh.transfer(sent)
    return converted
|
||||
|
||||
|
||||
def get_sentences(doc):
    """Split ``doc`` into stripped, non-empty sentence fragments.

    The text is first split on carriage returns / newlines, then each
    non-empty line is split on the punctuation in the delimiter class.

    Returns:
        A list of fragments in document order.
    """
    line_break = re.compile('[\r\n]')
    delimiter = re.compile('[,。?!;]')
    stripped_lines = (line.strip() for line in line_break.split(doc))
    fragments = (
        fragment.strip()
        for line in stripped_lines if line
        for fragment in delimiter.split(line)
    )
    return [fragment for fragment in fragments if fragment]
|
||||
|
||||
|
||||
def get_pinyin(sentence):
    """Return a flat list of pinyin and pass-through tokens.

    Chunks matched by ``re_zh`` are translated with ``pin.get``; any other
    text is split on whitespace and its tokens are kept as they are.
    """
    result = []
    for chunk in re_zh.split(sentence):
        chunk = chunk.strip()
        if not chunk:
            continue
        if re_zh.match(chunk):
            result += pin.get(chunk)
            continue
        # str.split() never yields empty or padded tokens.
        result.extend(chunk.split())
    return result
|
||||
@@ -0,0 +1,26 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import codecs
|
||||
|
||||
from ..utils.trie import Trie
|
||||
|
||||
|
||||
class PinYin(object):
    """Pinyin lookup backed by a ``Trie`` loaded from a text file."""

    def __init__(self, fname):
        """Load the table from ``fname`` (UTF-8).

        Each non-blank line is split on whitespace: the first field is the
        key, the remaining fields are stored as its value.
        """
        self.handle = Trie()
        with codecs.open(fname, 'r', 'utf-8') as fr:
            for line in fr:
                words = line.split()
                if not words:
                    # A blank line would otherwise raise IndexError.
                    continue
                self.handle.insert(words[0], words[1:])

    def get(self, text):
        """Translate ``text`` and return the results as one flat list.

        List or tuple entries returned by the trie are flattened; any
        other entry is appended as a single item.
        """
        ret = []
        for i in self.handle.translate(text):
            if isinstance(i, (list, tuple)):
                # extend() accepts tuples, unlike ``list + tuple``.
                ret.extend(i)
            else:
                ret.append(i)
        return ret
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,47 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from . import seg as TnTseg
|
||||
|
||||
# Bundled segmentation model, stored next to this module.
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'seg.marshal')

# Module-wide segmenter, loaded when the module is imported.
segger = TnTseg.Seg()
segger.load(data_path, iszip=True)

# Captures runs of characters in the U+4E00..U+9FA5 range.
re_zh = re.compile('([\u4E00-\u9FA5]+)')
|
||||
|
||||
|
||||
def seg(sent):
    """Segment ``sent`` into a list of words.

    Chunks matched by ``re_zh`` go through ``single_seg``; other text is
    split on whitespace and its tokens are kept unchanged.
    """
    words = []
    for chunk in re_zh.split(sent):
        chunk = chunk.strip()
        if not chunk:
            continue
        if re_zh.match(chunk):
            words += single_seg(chunk)
        else:
            # str.split() never yields empty or padded tokens.
            words.extend(chunk.split())
    return words
|
||||
|
||||
|
||||
def train(fname):
    """Replace the module-level segmenter with one trained on ``fname``."""
    global segger
    # Rebind first, as before: a failed training leaves the new instance.
    segger = TnTseg.Seg()
    segger.train(fname)
|
||||
|
||||
|
||||
def save(fname, iszip=True):
    """Write the module-level segmenter to ``fname`` via ``segger.save``."""
    segger.save(fname, iszip)
|
||||
|
||||
|
||||
def load(fname, iszip=True):
    """Load ``fname`` into the module-level segmenter via ``segger.load``."""
    segger.load(fname, iszip)
|
||||
|
||||
|
||||
def single_seg(sent):
    """Return the words ``segger.seg`` produces for ``sent`` as a list."""
    pieces = segger.seg(sent)
    return list(pieces)
|
||||
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,57 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import codecs
|
||||
|
||||
from ..utils.tnt import TnT
|
||||
from .y09_2047 import CharacterBasedGenerativeModel
|
||||
|
||||
|
||||
class Seg(object):
    """Word segmenter wrapping a character tagger.

    The wrapped tagger labels every character; ``seg`` then joins the
    characters into words based on the labels ``'b'``, ``'s'`` and ``'e'``.
    """

    def __init__(self, name='other'):
        """Use ``TnT`` when ``name == 'tnt'``, else the generative model."""
        if name == 'tnt':
            self.segger = TnT()
        else:
            self.segger = CharacterBasedGenerativeModel()

    def save(self, fname, iszip=True):
        """Delegate to the wrapped tagger's ``save``."""
        self.segger.save(fname, iszip)

    def load(self, fname, iszip=True):
        """Delegate to the wrapped tagger's ``load``."""
        self.segger.load(fname, iszip)

    def train(self, fname):
        """Train the wrapped tagger from a UTF-8 corpus file.

        Each non-blank line holds whitespace-separated tokens, and every
        token is split on ``'/'``.
        """
        data = []
        with codecs.open(fname, 'r', 'utf-8') as fr:
            for raw in fr:
                line = raw.strip()
                if not line:
                    continue
                # Build a real list: on Python 3 ``map`` is a one-shot
                # iterator and would be empty on a second pass.
                data.append([token.split('/') for token in line.split()])
        self.segger.train(data)

    def seg(self, sentence):
        """Yield the words of ``sentence``.

        A label of ``'e'`` closes the current word; ``'b'`` or ``'s'``
        flushes any pending word and starts a new one; any other label
        extends the current word.
        """
        ret = self.segger.tag(sentence)
        tmp = ''
        for i in ret:
            if i[1] == 'e':
                yield tmp+i[0]
                tmp = ''
            elif i[1] == 'b' or i[1] == 's':
                if tmp:
                    yield tmp
                tmp = i[0]
            else:
                tmp += i[0]
        if tmp:
            yield tmp
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc demo: train on ./data.txt and segment one sample sentence.
    demo = Seg()
    demo.train('data.txt')
    sample_words = demo.seg('主要是用来放置一些简单快速的中文分词和词性标注的程序')
    print(' '.join(sample_words))
|
||||
@@ -0,0 +1,125 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
import gzip
|
||||
import marshal
|
||||
from math import log
|
||||
|
||||
from ..utils import frequency
|
||||
|
||||
|
||||
class CharacterBasedGenerativeModel(object):
    """Character tagger built on unigram, bigram and trigram counts.

    Attributes:
        l1, l2, l3: interpolation weights for the unigram, bigram and
            trigram estimates, set by ``train``.
        status: the tag set ``('b', 'm', 'e', 's')``.
        uni, bi, tri: ``NormalProb`` tables keyed by ``(char, tag)`` items
            and tuples of such items.
    """

    def __init__(self):
        self.l1 = 0.0
        self.l2 = 0.0
        self.l3 = 0.0
        self.status = ('b', 'm', 'e', 's')
        self.uni = frequency.NormalProb()
        self.bi = frequency.NormalProb()
        self.tri = frequency.NormalProb()

    def save(self, fname, iszip=True):
        """Serialise all attributes with ``marshal``.

        Attributes that have a ``__dict__`` are stored as that dict.

        Args:
            fname: target path; ``'.3'`` is appended under Python 3.
            iszip: write through ``gzip`` when true.
        """
        d = {}
        for k, v in self.__dict__.items():
            if hasattr(v, '__dict__'):
                d[k] = v.__dict__
            else:
                d[k] = v
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        opener = gzip.open if iszip else open
        # The context manager closes the handle even if writing fails.
        with opener(fname, 'wb') as f:
            f.write(marshal.dumps(d))

    def load(self, fname, iszip=True):
        """Restore attributes written by ``save``.

        Args:
            fname: source path; ``'.3'`` is appended under Python 3.
            iszip: try ``gzip`` first when true; if that raises ``IOError``
                the file is re-read as plain bytes.
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                raw = f.read()
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    raw = f.read()
            except IOError:
                # Not gzip data: fall back to reading the raw file.
                with open(fname, 'rb') as f:
                    raw = f.read()
        d = marshal.loads(raw)
        for k, v in d.items():
            if hasattr(self.__dict__[k], '__dict__'):
                self.__dict__[k].__dict__ = v
            else:
                self.__dict__[k] = v

    def div(self, v1, v2):
        """Return ``v1 / v2`` as a float, or ``0`` when ``v2`` is zero."""
        if v2 == 0:
            return 0
        return float(v1)/v2

    def train(self, data):
        """Count n-grams and derive the interpolation weights.

        Args:
            data: iterable of sentences, each an iterable of
                ``(char, tag)`` pairs.
        """
        for sentence in data:
            # Two sentence-start markers pad the trigram window.
            now = [('', 'BOS'), ('', 'BOS')]
            self.bi.add((('', 'BOS'), ('', 'BOS')), 1)
            self.uni.add(('', 'BOS'), 2)
            for word, tag in sentence:
                now.append((word, tag))
                self.uni.add((word, tag), 1)
                self.bi.add(tuple(now[1:]), 1)
                self.tri.add(tuple(now), 1)
                now.pop(0)
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0
        samples = sorted(self.tri.samples(), key=lambda x: self.tri.get(x)[1])
        for now in samples:
            # Each trigram credits its count to the order whose
            # leave-one-out estimate is largest.
            c3 = self.div(self.tri.get(now)[1]-1, self.bi.get(now[:2])[1]-1)
            c2 = self.div(self.bi.get(now[1:])[1]-1, self.uni.get(now[1])[1]-1)
            c1 = self.div(self.uni.get(now[2])[1]-1, self.uni.getsum()-1)
            if c3 >= c1 and c3 >= c2:
                tl3 += self.tri.get(now)[1]
            elif c2 >= c1 and c2 >= c3:
                tl2 += self.tri.get(now)[1]
            elif c1 >= c2 and c1 >= c3:
                tl1 += self.tri.get(now)[1]
        self.l1 = self.div(tl1, tl1+tl2+tl3)
        self.l2 = self.div(tl2, tl1+tl2+tl3)
        self.l3 = self.div(tl3, tl1+tl2+tl3)

    def log_prob(self, s1, s2, s3):
        """Return the log of the interpolated probability of ``s3``.

        ``s1`` and ``s2`` are the two preceding items. Returns ``-inf``
        when the interpolated probability is zero.
        """
        uni = self.l1*self.uni.freq(s3)
        bi = self.div(self.l2*self.bi.get((s2, s3))[1], self.uni.get(s2)[1])
        tri = self.div(self.l3*self.tri.get((s1, s2, s3))[1],
                       self.bi.get((s1, s2))[1])
        if uni+bi+tri == 0:
            return float('-inf')
        return log(uni+bi+tri)

    def tag(self, data):
        """Tag every element of ``data`` with the best-scoring tag path.

        Returns:
            ``zip(data, tags)``.
        """
        # Each entry is ((previous item, current item), log score, tags).
        now = [((('', 'BOS'), ('', 'BOS')), 0.0, [])]
        for w in data:
            stage = {}
            known = any(self.uni.freq((w, s)) != 0 for s in self.status)
            if not known:
                # Unseen character: extend every path with every tag and
                # leave the scores unchanged.
                for s in self.status:
                    for pre in now:
                        stage[(pre[0][1], (w, s))] = (pre[1], pre[2]+[s])
            else:
                for s in self.status:
                    for pre in now:
                        key = (pre[0][1], (w, s))
                        p = pre[1]+self.log_prob(pre[0][0], pre[0][1], (w, s))
                        if key not in stage or p > stage[key][0]:
                            stage[key] = (p, pre[2]+[s])
            now = [(k, v[0], v[1]) for k, v in stage.items()]
        return zip(data, max(now, key=lambda x: x[1])[2])
|
||||
@@ -0,0 +1,73 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import codecs
|
||||
|
||||
from .. import normal
|
||||
from .. import seg
|
||||
from ..classification.bayes import Bayes
|
||||
|
||||
# Bundled sentiment model, stored next to this module.
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'sentiment.marshal')


class Sentiment(object):
    """Positive/negative classifier wrapping a ``Bayes`` model."""

    def __init__(self):
        self.classifier = Bayes()

    def save(self, fname, iszip=True):
        """Delegate to the wrapped classifier's ``save``."""
        self.classifier.save(fname, iszip)

    def load(self, fname=data_path, iszip=True):
        """Delegate to the wrapped classifier's ``load``.

        ``fname`` defaults to the bundled model path.
        """
        self.classifier.load(fname, iszip)

    def handle(self, doc):
        """Segment ``doc`` and drop stop words; returns the word list."""
        return normal.filter_stop(seg.seg(doc))

    def train(self, neg_docs, pos_docs):
        """Train on documents labelled ``'neg'`` and ``'pos'``.

        Negative samples are passed to the classifier before positive ones.
        """
        data = [[self.handle(sent), 'neg'] for sent in neg_docs]
        data.extend([self.handle(sent), 'pos'] for sent in pos_docs)
        self.classifier.train(data)

    def classify(self, sent):
        """Return the probability that ``sent`` is positive.

        If the classifier picks any label other than ``'pos'``, the
        complement of its probability is returned.
        """
        label, prob = self.classifier.classify(self.handle(sent))
        return prob if label == 'pos' else 1-prob
|
||||
|
||||
|
||||
# Module-wide classifier, loaded from the bundled model at import time.
classifier = Sentiment()
classifier.load(fname=data_path)
|
||||
|
||||
|
||||
def train(neg_file, pos_file):
    """Replace the module-level classifier with a freshly trained one.

    Args:
        neg_file: UTF-8 file with one negative document per line.
        pos_file: UTF-8 file with one positive document per line.

    Trailing carriage returns and newlines are removed from every line.
    """
    global classifier
    # ``with`` closes both corpus files; they used to be left open.
    with codecs.open(neg_file, 'r', 'utf-8') as neg:
        neg_docs = [line.rstrip("\r\n") for line in neg.readlines()]
    with codecs.open(pos_file, 'r', 'utf-8') as pos:
        pos_docs = [line.rstrip("\r\n") for line in pos.readlines()]
    classifier = Sentiment()
    classifier.train(neg_docs, pos_docs)
|
||||
|
||||
|
||||
def save(fname, iszip=True):
    """Write the module-level classifier to ``fname``."""
    classifier.save(fname, iszip)
|
||||
|
||||
|
||||
def load(fname, iszip=True):
    """Load ``fname`` into the module-level classifier."""
    classifier.load(fname, iszip)
|
||||
|
||||
|
||||
def classify(sent):
    """Return the module-level classifier's positive score for ``sent``."""
    score = classifier.classify(sent)
    return score
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,51 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import math
|
||||
|
||||
|
||||
class BM25(object):
    """BM25 relevance scoring over a fixed collection of documents.

    Attributes:
        D: number of documents.
        avgdl: average document length (``0.0`` for an empty collection).
        f: per-document term-count dicts.
        df: number of documents containing each term.
        idf: inverse document frequency of each term.
        k1, b: BM25 tuning constants.
    """

    def __init__(self, docs):
        """Index ``docs``, a sized collection of token sequences."""
        self.D = len(docs)
        total_len = sum([len(doc)+0.0 for doc in docs])
        # An empty collection used to raise ZeroDivisionError here.
        self.avgdl = total_len / self.D if self.D else 0.0
        self.docs = docs
        self.f = []
        self.df = {}
        self.idf = {}
        self.k1 = 1.5
        self.b = 0.75
        self.init()

    def init(self):
        """Build the term-count, document-frequency and idf tables."""
        for doc in self.docs:
            counts = {}
            for word in doc:
                counts[word] = counts.get(word, 0) + 1
            self.f.append(counts)
            for word in counts:
                self.df[word] = self.df.get(word, 0) + 1
        for word, freq in self.df.items():
            self.idf[word] = math.log(self.D-freq+0.5)-math.log(freq+0.5)

    def sim(self, doc, index):
        """Return the BM25 score of query ``doc`` against one document.

        Query terms absent from the indexed document contribute nothing.
        """
        score = 0
        counts = self.f[index]
        d = len(self.docs[index])
        for word in doc:
            if word not in counts:
                continue
            score += (self.idf[word]*counts[word]*(self.k1+1)
                      / (counts[word]+self.k1*(1-self.b+self.b*d
                                               / self.avgdl)))
        return score

    def simall(self, doc):
        """Return the scores of ``doc`` against every indexed document."""
        return [self.sim(doc, index) for index in range(self.D)]
|
||||
@@ -0,0 +1,106 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..sim.bm25 import BM25
|
||||
|
||||
|
||||
class TextRank(object):
    """Rank documents by iterating over a BM25 similarity graph.

    Attributes:
        docs: the token sequences being ranked.
        bm25: ``BM25`` index over ``docs``, used for the edge weights.
        d: damping factor.
        weight, weight_sum, vertex: per-document score rows, outgoing
            weight totals and current rank values.
        max_iter, min_diff: iteration cap and convergence threshold.
        top: list of ``(index, rank)`` pairs, best first, set by ``solve``.
    """

    def __init__(self, docs):
        self.docs = docs
        self.bm25 = BM25(docs)
        self.D = len(docs)
        self.d = 0.85
        self.weight = []
        self.weight_sum = []
        self.vertex = []
        self.max_iter = 200
        self.min_diff = 0.001
        # NOTE: this instance attribute shadows the ``top`` method below.
        self.top = []

    def solve(self):
        """Compute the ranks and store them, sorted, in ``self.top``."""
        for cnt, doc in enumerate(self.docs):
            scores = self.bm25.simall(doc)
            self.weight.append(scores)
            # A document's similarity to itself is excluded.
            self.weight_sum.append(sum(scores)-scores[cnt])
            self.vertex.append(1.0)
        for _ in range(self.max_iter):
            m = []
            max_diff = 0
            for i in range(self.D):
                rank = 1-self.d
                for j in range(self.D):
                    if j == i or self.weight_sum[j] == 0:
                        continue
                    rank += (self.d*self.weight[j][i]
                             / self.weight_sum[j]*self.vertex[j])
                m.append(rank)
                max_diff = max(max_diff, abs(rank - self.vertex[i]))
            self.vertex = m
            if max_diff <= self.min_diff:
                break
        self.top = sorted(enumerate(self.vertex),
                          key=lambda x: x[1], reverse=True)

    def top_index(self, limit):
        """Return the indexes of the ``limit`` best-ranked documents."""
        return [index for index, _ in self.top][:limit]

    def top(self, limit):
        """Return the ``limit`` best-ranked documents.

        ``limit`` used to be ignored. Because ``__init__`` assigns a list
        to ``self.top``, this method is only reachable through the class:
        ``TextRank.top(instance, limit)``.
        """
        return [self.docs[index] for index, _ in self.top][:limit]
|
||||
|
||||
|
||||
class KeywordTextRank(object):
    """Rank words by iterating over a co-occurrence graph.

    Attributes:
        docs: token sequences supplying the words.
        words: maps each word to the set of its graph neighbours.
        vertex: maps each word to its current rank.
        d: damping factor.
        max_iter, min_diff: iteration cap and convergence threshold.
        top: list of ``(word, rank)`` pairs, best first, set by ``solve``.
    """

    def __init__(self, docs):
        self.docs = docs
        self.words = {}
        self.vertex = {}
        self.d = 0.85
        self.max_iter = 200
        self.min_diff = 0.001
        # NOTE: this instance attribute shadows the ``top`` method below.
        self.top = []

    def solve(self):
        """Build the graph, iterate the ranks and fill ``self.top``."""
        for doc in self.docs:
            que = []
            for word in doc:
                if word not in self.words:
                    self.words[word] = set()
                    self.vertex[word] = 1.0
                que.append(word)
                # Sliding window of at most five consecutive words.
                if len(que) > 5:
                    que.pop(0)
                for w1 in que:
                    for w2 in que:
                        if w1 == w2:
                            continue
                        self.words[w1].add(w2)
                        self.words[w2].add(w1)
        for _ in range(self.max_iter):
            m = {}
            max_diff = 0
            linked = [item for item in self.vertex.items()
                      if len(self.words[item[0]]) > 0]
            linked = sorted(linked,
                            key=lambda x: x[1] / len(self.words[x[0]]))
            for k, v in linked:
                for j in self.words[k]:
                    if k == j:
                        continue
                    if j not in m:
                        m[j] = 1 - self.d
                    m[j] += (self.d / len(self.words[k]) * self.vertex[k])
            for k in self.vertex:
                if k in m:
                    max_diff = max(max_diff, abs(m[k] - self.vertex[k]))
            self.vertex = m
            if max_diff <= self.min_diff:
                break
        self.top = sorted(self.vertex.items(),
                          key=lambda x: x[1], reverse=True)

    def top_index(self, limit):
        """Return the ``limit`` best-ranked words."""
        return [word for word, _ in self.top][:limit]

    def top(self, limit):
        """Return the ``limit`` best-ranked words.

        The previous body indexed ``self.docs`` with a word and always
        raised TypeError. Because ``__init__`` assigns a list to
        ``self.top``, this method is only reachable through the class:
        ``KeywordTextRank.top(instance, limit)``.
        """
        return [word for word, _ in self.top][:limit]
|
||||
@@ -0,0 +1,47 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
class SimpleMerge(object):
    """Join keywords that usually appear back to back in the document."""

    def __init__(self, doc, words):
        self.doc = doc
        self.words = words

    def _count(self, piece):
        """Count occurrences of ``piece`` in ``doc``, overlaps included."""
        size = len(piece)
        return sum(1 for start in range(len(self.doc)-size+1)
                   if self.doc[start: start+size] == piece)

    def merge(self):
        """Return the keywords with frequent successors appended.

        A word ``w1`` is linked to the first ``w2`` in ``words`` for which
        ``w1 + w2`` occurs more than half as often as ``w1`` alone. Chains
        of links collapse into one string, and words absorbed into a chain
        are dropped from the result.
        """
        trans = dict((word, '') for word in self.words)
        for w1 in self.words:
            cw = self._count(w1)
            for w2 in self.words:
                if cw < self._count(w1+w2)*2:
                    trans[w1] = w2
                    break
        for word in self.words:
            if word not in trans:
                continue
            suffix = ''
            now = trans[word]
            while now:
                suffix += now
                if now not in trans:
                    break
                # Take the next link and remove the absorbed word.
                now = trans.pop(now)
            trans[word] = suffix
        return [word+trans[word] for word in self.words if word in trans]
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,43 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import codecs
|
||||
|
||||
from ..utils.tnt import TnT
|
||||
|
||||
# Bundled tagging model, stored next to this module.
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'tag.marshal')

# Module-wide tagger, loaded when the module is imported.
tagger = TnT()
tagger.load(fname=data_path)
|
||||
|
||||
|
||||
def train(fname):
    """Replace the module-level tagger with one trained on ``fname``.

    Each non-blank line of the UTF-8 file holds whitespace-separated
    tokens, and every token is split on ``'/'``.
    """
    global tagger
    data = []
    with codecs.open(fname, 'r', 'utf-8') as fr:
        for raw in fr:
            line = raw.strip()
            if not line:
                continue
            # Build a real list: on Python 3 ``map`` is a one-shot
            # iterator and would be empty on a second pass.
            data.append([token.split('/') for token in line.split()])
    tagger = TnT()
    tagger.train(data)
|
||||
|
||||
|
||||
def save(fname, iszip=True):
    """Write the module-level tagger to ``fname``."""
    tagger.save(fname, iszip)
|
||||
|
||||
|
||||
def load(fname, iszip=True):
    """Load ``fname`` into the module-level tagger."""
    tagger.load(fname, iszip)
|
||||
|
||||
|
||||
def tag_all(words):
    """Return the module-level tagger's ``(word, tag)`` pairs."""
    pairs = tagger.tag(words)
    return pairs
|
||||
|
||||
|
||||
def tag(words):
    """Return only the tags for ``words``, in order."""
    def second(pair):
        return pair[1]
    return map(second, tag_all(words))
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,74 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from . import good_turing
|
||||
|
||||
class BaseProb(object):
    """Base frequency table.

    Attributes:
        d: maps each key to its stored count.
        total: running total returned by ``getsum``.
        none: count reported for keys that are absent.
    """

    def __init__(self):
        self.d = {}
        self.total = 0.0
        self.none = 0

    def exists(self, key):
        """Return whether ``key`` has an entry."""
        return key in self.d

    def getsum(self):
        """Return the running total."""
        return self.total

    def get(self, key):
        """Return ``(found, count)``; missing keys report ``none``."""
        if self.exists(key):
            return True, self.d[key]
        return False, self.none

    def freq(self, key):
        """Return the count of ``key`` divided by the running total."""
        _, count = self.get(key)
        return float(count)/self.total

    def samples(self):
        """Return the stored keys."""
        return self.d.keys()
|
||||
|
||||
|
||||
class NormalProb(BaseProb):
    """Frequency table with plain, unsmoothed counts."""

    def add(self, key, value):
        """Add ``value`` to the count of ``key`` and to the total."""
        previous = self.d[key] if self.exists(key) else 0
        self.d[key] = previous + value
        self.total += value
|
||||
|
||||
|
||||
class AddOneProb(BaseProb):
    """Frequency table with add-one smoothing.

    New keys start at one, and missing keys report a count of one.
    """

    def __init__(self):
        BaseProb.__init__(self)
        self.none = 1

    def add(self, key, value):
        """Add ``value`` to ``key``; a new key also adds one to the total."""
        self.total += value
        if self.exists(key):
            self.d[key] += value
        else:
            self.total += 1
            self.d[key] = 1 + value
|
||||
|
||||
|
||||
class GoodTuringProb(BaseProb):
    """Frequency table smoothed by ``good_turing.main`` on first lookup.

    ``add`` only stores counts. The first call to ``get`` replaces the
    table with the smoothed values and sets ``none`` and ``total``.
    """

    def __init__(self):
        self.d = {}
        self.total = 0.0
        self.handled = False

    def add(self, key, value):
        """Add ``value`` to the stored count of ``key``."""
        self.d[key] = (self.d[key] if self.exists(key) else 0) + value

    def _smooth(self):
        """Run the one-off smoothing pass over the stored counts."""
        self.handled = True
        self.none, self.d = good_turing.main(self.d)
        self.total = sum(self.d.values())+0.0

    def get(self, key):
        """Return ``(found, value)``, smoothing the table on first use."""
        if not self.handled:
            self._smooth()
        if self.exists(key):
            return True, self.d[key]
        return False, self.none
|
||||
@@ -0,0 +1,51 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function
|
||||
from __future__ import division
|
||||
from math import log, exp
|
||||
|
||||
def getz(r, nr):
    """Return the averaged frequency-of-frequency values.

    Args:
        r: distinct counts in ascending order (at least two).
        nr: number of keys having each count in ``r``.

    Each ``nr[i]`` is divided by half the gap between the neighbouring
    counts. The first entry uses ``r[1]`` as its gap, the last uses the
    full gap to its predecessor.
    """
    z = [2*nr[0]/r[1]]
    # ``range`` replaces the Python-2-only ``xrange``.
    for i in range(len(nr)-2):
        z.append(2*nr[i+1]/(r[i+2]-r[i]))
    z.append(nr[-1]/(r[-1]-r[-2]))
    return z


def least_square(x, y):  # y=a+bx
    """Fit ``y = a + b*x`` by least squares and return ``(a, b)``.

    ``x`` and ``y`` may be any iterables; they are materialised so that
    lazy ``map`` objects work too.
    """
    x = list(x)
    y = list(y)
    meanx = sum(x)/len(x)
    meany = sum(y)/len(y)
    xy = sum((xi-meanx)*(yi-meany) for xi, yi in zip(x, y))
    square = sum((xi-meanx)**2 for xi in x)
    b = xy/square
    return (meany-b*meanx, b)


def main(dic):
    """Smooth the counts in ``dic``.

    Args:
        dic: maps keys to positive counts; at least two distinct count
            values are needed.

    Returns:
        ``(none, probs)``: the value assigned to unseen keys, and a dict
        mapping every key of ``dic`` to its smoothed value.
    """
    r, nr, prob = [], [], []
    for v in sorted(dic.values()):
        if not r or r[-1] != v:
            r.append(v)
            nr.append(1)
        else:
            nr[-1] += 1
    # Map each distinct count to its position in ``r``.
    rr = dict((count, index) for index, count in enumerate(r))
    # Plain sums replace ``reduce``, which is not a builtin on Python 3.
    total = sum(n*count for n, count in zip(nr, r))
    z = getz(r, nr)
    a, b = least_square([log(x) for x in r], [log(x) for x in z])
    use_good_turing = False
    nr.append(exp(a+b*log(r[-1]+1)))
    for i in range(len(r)):
        good_turing = (r[i]+1)*(exp(b*(log(r[i]+1)-log(r[i]))))
        turing = (r[i]+1)*nr[i+1]/nr[i] if i+1 < len(r) else good_turing
        diff = ((((r[i]+1)**2)/nr[i]*nr[i+1]/nr[i]*(1+nr[i+1]/nr[i]))**0.5)*1.65
        # After the first switch to the fitted estimate, keep using it.
        if not use_good_turing and abs(good_turing-turing) > diff:
            prob.append(turing)
        else:
            use_good_turing = True
            prob.append(good_turing)
    sump = sum(n*p for n, p in zip(nr, prob))
    for cnt, i in enumerate(prob):
        prob[cnt] = (1-nr[0]/total)*i/sump
    return nr[0]/total/total, dict((k, prob[rr[v]]) for k, v in dic.items())
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc demo on a small table of counts.
    sample_counts = {1:1,2:1,3:1,4:2,5:2,6:3,7:1,8:2,9:3}
    print(main(sample_counts))
|
||||
@@ -0,0 +1,148 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
'''
|
||||
Implementation of 'TnT - A Statisical Part of Speech Tagger'
|
||||
'''
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
import gzip
|
||||
import heapq
|
||||
import marshal
|
||||
from math import log
|
||||
|
||||
from . import frequency
|
||||
|
||||
|
||||
class TnT(object):
    """Trigram tagger after 'TnT - A Statistical Part of Speech Tagger'.

    Attributes:
        N: number of best paths kept after each word in ``tag``.
        l1, l2, l3: interpolation weights for the unigram, bigram and
            trigram tag estimates.
        status: set of tags seen in training.
        wd: counts of ``(tag, word)`` pairs.
        eos, eosd: tag-bigram counts (including ``(tag, 'EOS')``) and tag
            counts, used for the sentence-end score.
        uni, bi, tri: tag n-gram counts.
        word: maps each word to the set of tags seen with it.
        trans: precomputed log transition score per tag triple.
    """

    def __init__(self, N=1000):
        self.N = N
        self.l1 = 0.0
        self.l2 = 0.0
        self.l3 = 0.0
        self.status = set()
        self.wd = frequency.AddOneProb()
        self.eos = frequency.AddOneProb()
        self.eosd = frequency.AddOneProb()
        self.uni = frequency.NormalProb()
        self.bi = frequency.NormalProb()
        self.tri = frequency.NormalProb()
        self.word = {}
        self.trans = {}

    def save(self, fname, iszip=True):
        """Serialise all attributes with ``marshal``.

        Sets are stored as lists, and attributes that have a ``__dict__``
        are stored as that dict.

        Args:
            fname: target path; ``'.3'`` is appended under Python 3.
            iszip: write through ``gzip`` when true.
        """
        d = {}
        for k, v in self.__dict__.items():
            if isinstance(v, set):
                d[k] = list(v)
            elif hasattr(v, '__dict__'):
                d[k] = v.__dict__
            else:
                d[k] = v
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        opener = gzip.open if iszip else open
        # The context manager closes the handle even if writing fails.
        with opener(fname, 'wb') as f:
            f.write(marshal.dumps(d))

    def load(self, fname, iszip=True):
        """Restore attributes written by ``save``.

        Args:
            fname: source path; ``'.3'`` is appended under Python 3.
            iszip: try ``gzip`` first when true; if that raises ``IOError``
                the file is re-read as plain bytes.
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                raw = f.read()
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    raw = f.read()
            except IOError:
                # Not gzip data: fall back to reading the raw file.
                with open(fname, 'rb') as f:
                    raw = f.read()
        d = marshal.loads(raw)
        for k, v in d.items():
            if isinstance(self.__dict__[k], set):
                self.__dict__[k] = set(v)
            elif hasattr(self.__dict__[k], '__dict__'):
                self.__dict__[k].__dict__ = v
            else:
                self.__dict__[k] = v

    def tnt_div(self, v1, v2):
        """Return ``v1 / v2`` as a float, or ``0`` when ``v2`` is zero."""
        if v2 == 0:
            return 0
        return float(v1)/v2

    def geteos(self, tag):
        """Return the log score of a sentence ending after ``tag``.

        Tags missing from ``eosd`` get a uniform score over the tag set.
        """
        tmp = self.eosd.get(tag)
        if not tmp[0]:
            return log(1.0/len(self.status))
        return log(self.eos.get((tag, 'EOS'))[1])-log(self.eosd.get(tag)[1])

    def train(self, data):
        """Count n-grams, derive the weights and fill ``trans``.

        Args:
            data: iterable of sentences, each an iterable of
                ``(word, tag)`` pairs.
        """
        for sentence in data:
            # Two sentence-start markers pad the trigram window.
            now = ['BOS', 'BOS']
            self.bi.add(('BOS', 'BOS'), 1)
            self.uni.add('BOS', 2)
            for word, tag in sentence:
                now.append(tag)
                self.status.add(tag)
                self.wd.add((tag, word), 1)
                self.eos.add(tuple(now[1:]), 1)
                self.eosd.add(tag, 1)
                self.uni.add(tag, 1)
                self.bi.add(tuple(now[1:]), 1)
                self.tri.add(tuple(now), 1)
                if word not in self.word:
                    self.word[word] = set()
                self.word[word].add(tag)
                now.pop(0)
            self.eos.add((now[-1], 'EOS'), 1)
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0
        for now in self.tri.samples():
            # Each trigram credits its count to the order whose
            # leave-one-out estimate is largest.
            c3 = self.tnt_div(self.tri.get(now)[1]-1,
                              self.bi.get(now[:2])[1]-1)
            c2 = self.tnt_div(self.bi.get(now[1:])[1]-1,
                              self.uni.get(now[1])[1]-1)
            c1 = self.tnt_div(self.uni.get(now[2])[1]-1, self.uni.getsum()-1)
            if c3 >= c1 and c3 >= c2:
                tl3 += self.tri.get(now)[1]
            elif c2 >= c1 and c2 >= c3:
                tl2 += self.tri.get(now)[1]
            elif c1 >= c2 and c1 >= c3:
                tl1 += self.tri.get(now)[1]
        # tnt_div yields 0 instead of raising when nothing was counted.
        self.l1 = self.tnt_div(tl1, tl1+tl2+tl3)
        self.l2 = self.tnt_div(tl2, tl1+tl2+tl3)
        self.l3 = self.tnt_div(tl3, tl1+tl2+tl3)
        contexts = self.status | set(('BOS',))
        for s1 in contexts:
            for s2 in contexts:
                for s3 in self.status:
                    uni = self.l1*self.uni.freq(s3)
                    bi = self.tnt_div(self.l2*self.bi.get((s2, s3))[1],
                                      self.uni.get(s2)[1])
                    tri = self.tnt_div(self.l3*self.tri.get((s1, s2, s3))[1],
                                       self.bi.get((s1, s2))[1])
                    self.trans[(s1, s2, s3)] = log(uni+bi+tri)

    def tag(self, data):
        """Tag every element of ``data`` with the best-scoring tag path.

        Returns:
            ``zip(data, tags)``; an empty zip when ``data`` is empty.
        """
        # Each entry is ((previous tag, current tag), log score, tags).
        now = [(('BOS', 'BOS'), 0.0, [])]
        stage = None
        for w in data:
            stage = {}
            samples = self.status
            if w in self.word:
                samples = self.word[w]
            for s in samples:
                wd = log(self.wd.get((s, w))[1])-log(self.uni.get(s)[1])
                for pre in now:
                    key = (pre[0][1], s)
                    p = pre[1]+wd+self.trans[(pre[0][0], pre[0][1], s)]
                    if key not in stage or p > stage[key][0]:
                        stage[key] = (p, pre[2]+[s])
            stage = [(k, v[0], v[1]) for k, v in stage.items()]
            now = heapq.nlargest(self.N, stage, key=lambda x: x[1])
        if stage is None:
            # Empty input: ``stage`` was never bound, which used to raise
            # UnboundLocalError on the line below.
            return zip(data, [])
        now = heapq.nlargest(1, stage, key=lambda x: x[1]+self.geteos(x[0][1]))
        return zip(data, now[0][2])
|
||||
@@ -0,0 +1,48 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
class Trie(object):
    """Prefix tree built from nested dicts.

    Each node is a dict keyed by the next element of the key; a node that
    ends a stored key also carries its payload under ``'value'``.
    """

    def __init__(self):
        self.d = {}

    def insert(self, key, value):
        """Store ``value`` under ``key``, creating nodes as needed."""
        node = self.d
        for element in key:
            node = node.setdefault(element, {})
        node['value'] = value

    def find(self, text, start=0):
        """Return the longest stored key that begins at ``text[start]``.

        Returns:
            ``(matched_text, value)``, or ``None`` when nothing matches.
        """
        node = self.d
        best = None
        for pos in range(start, len(text)):
            element = text[pos]
            if element not in node:
                break
            node = node[element]
            if 'value' in node:
                best = (text[start:pos+1], node['value'])
        return best

    def translate(self, text, with_not_found=True):
        """Replace stored keys in ``text`` by their values, left to right.

        Args:
            text: the sequence to scan.
            with_not_found: when true, elements that start no stored key
                are copied to the output; otherwise they are dropped.

        Returns:
            A list of the values and, optionally, the unmatched elements.
        """
        out = []
        pos = 0
        size = len(text)
        while pos < size:
            match = self.find(text, pos) if text[pos] in self.d else None
            if match:
                out.append(match[1])
                pos += len(match[0])
                continue
            if with_not_found:
                out.append(text[pos])
            pos += 1
        return out
|
||||
Reference in New Issue
Block a user