【snownlp相关文件】上传自己的模型，调用utils/mynlp

2024-07-04 11:54:06 +08:00
parent f09fcb3000
commit 93b72ea2e0
33 changed files with 135956 additions and 0 deletions
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from . import normal
+from . import seg
+from . import tag
+from . import sentiment
+from .sim import bm25
+from .summary import textrank
+from .summary import words_merge
+
+
class SnowNLP(object):
    """Convenience facade over the package's text-processing helpers.

    ``doc`` is stored unchanged and handed to the segmentation,
    normalisation, sentiment, tagging, BM25 and TextRank helpers imported
    at the top of this module.
    """

    def __init__(self, doc):
        self.doc = doc
        # BM25 index built directly over ``doc``; backs tf / idf / sim().
        self.bm25 = bm25.BM25(doc)

    @property
    def words(self):
        """Tokens produced by ``seg.seg`` for the document."""
        return seg.seg(self.doc)

    @property
    def sentences(self):
        """Sentences produced by ``normal.get_sentences``."""
        return normal.get_sentences(self.doc)

    @property
    def han(self):
        """Result of ``normal.zh2hans`` applied to the document."""
        return normal.zh2hans(self.doc)

    @property
    def pinyin(self):
        """Result of ``normal.get_pinyin`` applied to the document."""
        return normal.get_pinyin(self.doc)

    @property
    def sentiments(self):
        """Score returned by ``sentiment.classify`` for the document."""
        return sentiment.classify(self.doc)

    @property
    def tags(self):
        """List of ``(word, tag)`` pairs for the segmented document."""
        words = self.words
        # Materialised on purpose: on Python 3 a bare ``zip`` is a one-shot
        # iterator that cannot be indexed, measured or traversed twice.
        return list(zip(words, tag.tag(words)))

    @property
    def tf(self):
        """Per-document term-frequency dicts held by the BM25 index."""
        return self.bm25.f

    @property
    def idf(self):
        """Inverse-document-frequency dict held by the BM25 index."""
        return self.bm25.idf

    def sim(self, doc):
        """Return the BM25 score of ``doc`` against every indexed document."""
        return self.bm25.simall(doc)

    def _tokenized_sentences(self):
        """Return ``(sentences, tokens)`` with stop words filtered out.

        ``tokens[i]`` is the filtered segmentation of ``sentences[i]``.
        """
        sents = self.sentences
        tokens = [normal.filter_stop(seg.seg(sent)) for sent in sents]
        return sents, tokens

    def summary(self, limit=5):
        """Return up to ``limit`` sentences ranked highest by TextRank."""
        sents, doc = self._tokenized_sentences()
        if not sents:
            # Nothing to rank; building a BM25 index over zero documents
            # would divide by zero.
            return []
        rank = textrank.TextRank(doc)
        rank.solve()
        return [sents[index] for index in rank.top_index(limit)]

    def keywords(self, limit=5, merge=False):
        """Return up to ``limit`` keywords ranked by ``KeywordTextRank``.

        When ``merge`` is true the keywords are post-processed by
        ``words_merge.SimpleMerge`` against the original document.
        """
        _, doc = self._tokenized_sentences()
        rank = textrank.KeywordTextRank(doc)
        rank.solve()
        ret = list(rank.top_index(limit))
        if merge:
            return words_merge.SimpleMerge(self.doc, ret).merge()
        return ret
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import sys
+import gzip
+import marshal
+from math import log, exp
+
+from ..utils.frequency import AddOneProb
+
+
class Bayes(object):
    """Naive Bayes classifier over bags of words.

    ``d`` maps each class label to an ``AddOneProb`` count table and
    ``total`` caches the sum of ``getsum()`` over all tables.
    """

    def __init__(self):
        self.d = {}
        self.total = 0

    def save(self, fname, iszip=True):
        """Serialise the model with ``marshal``.

        On Python 3 the suffix ``'.3'`` is appended to ``fname``. The payload
        is gzip-compressed unless ``iszip`` is false.
        """
        d = {}
        d['total'] = self.total
        d['d'] = {}
        for k, v in self.d.items():
            # Count tables are stored as their plain attribute dicts.
            d['d'][k] = v.__dict__
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        # Context managers guarantee the handle is flushed and closed even
        # if marshalling fails part-way.
        if not iszip:
            with open(fname, 'wb') as f:
                marshal.dump(d, f)
        else:
            with gzip.open(fname, 'wb') as f:
                f.write(marshal.dumps(d))

    def load(self, fname, iszip=True):
        """Restore a model written by ``save``.

        With ``iszip`` true the file is read as gzip first; if that raises
        ``IOError`` it is re-read as an uncompressed marshal file.
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                d = marshal.load(f)
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
            except IOError:
                with open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
        self.total = d['total']
        self.d = {}
        for k, v in d['d'].items():
            self.d[k] = AddOneProb()
            self.d[k].__dict__ = v

    def train(self, data):
        """Accumulate counts from ``data``, an iterable of ``(words, label)``."""
        for d in data:
            c = d[1]
            if c not in self.d:
                self.d[c] = AddOneProb()
            for word in d[0]:
                self.d[c].add(word, 1)
        self.total = sum(table.getsum() for table in self.d.values())

    def classify(self, x):
        """Return ``(label, probability)`` for the word sequence ``x``.

        Returns ``(0, 0)`` when no class scores a probability above zero
        (for example when the model has no classes).
        """
        # Log-score of every class: log prior plus summed log likelihoods.
        tmp = {}
        for k in self.d:
            tmp[k] = log(self.d[k].getsum()) - log(self.total)
            for word in x:
                tmp[k] += log(self.d[k].freq(word))
        ret, prob = 0, 0
        for k in self.d:
            now = 0
            try:
                # Normalise in log space: 1 / sum(exp(score_j - score_k)).
                for otherk in self.d:
                    now += exp(tmp[otherk]-tmp[k])
                now = 1/now
            except OverflowError:
                # Another class dominates so strongly that exp() overflows.
                now = 0
            if now > prob:
                ret, prob = k, now
        return (ret, prob)
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import os
+import re
+import codecs
+
+from . import zh
+from . import pinyin
+
# Resource files that live next to this module.
stop_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'stopwords.txt')
pinyin_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'pinyin.txt')
# Stop-word set: one entry per line, surrounding whitespace removed.
stop = set()
# ``with`` closes the file even if decoding a line fails.
with codecs.open(stop_path, 'r', 'utf-8') as fr:
    for word in fr:
        stop.add(word.strip())
pin = pinyin.PinYin(pinyin_path)
# Captures maximal runs of characters in the U+4E00..U+9FA5 range.
re_zh = re.compile('([\u4E00-\u9FA5]+)')
+
+
def filter_stop(words):
    """Return a list of the items in ``words`` that are not stop words."""
    return [token for token in words if token not in stop]
+
+
def zh2hans(sent):
    """Return ``sent`` after passing it through ``zh.transfer``."""
    converted = zh.transfer(sent)
    return converted
+
+
def get_sentences(doc):
    """Split ``doc`` into trimmed, non-empty sentence fragments.

    The text is first cut at carriage returns / newlines, then each line is
    cut at the full-width punctuation marks listed in the pattern below.
    """
    newline = re.compile('[\r\n]')
    punctuation = re.compile('[，。？！；]')
    found = []
    for raw_line in newline.split(doc):
        stripped = raw_line.strip()
        if stripped:
            pieces = (part.strip() for part in punctuation.split(stripped))
            found.extend(part for part in pieces if part)
    return found
+
+
def get_pinyin(sentence):
    """Return a flat list of pinyin / passthrough tokens for ``sentence``.

    Runs matched by ``re_zh`` are looked up through ``pin.get``; any other
    text is split on whitespace and kept verbatim.
    """
    tokens = []
    for chunk in re_zh.split(sentence):
        chunk = chunk.strip()
        if not chunk:
            continue
        if re_zh.match(chunk):
            tokens.extend(pin.get(chunk))
        else:
            # str.split() with no separator never yields empty or padded items.
            tokens.extend(chunk.split())
    return tokens
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import codecs
+
+from ..utils.trie import Trie
+
+
class PinYin(object):
    """Lookup table from text to pinyin syllables, backed by a ``Trie``."""

    def __init__(self, fname):
        """Load the table from ``fname`` (UTF-8).

        Each non-blank line is split on whitespace: the first field is the
        key and the remaining fields are stored as its value.
        """
        self.handle = Trie()
        with codecs.open(fname, 'r', 'utf-8') as fr:
            for line in fr:
                words = line.split()
                if not words:
                    # Blank lines carry no entry; words[0] would fail.
                    continue
                self.handle.insert(words[0], words[1:])

    def get(self, text):
        """Return the flattened translation of ``text`` as a list.

        List or tuple values from the trie are spliced in; anything else
        (such as an untranslated character) is appended as a single item.
        """
        ret = []
        for i in self.handle.translate(text):
            if isinstance(i, (list, tuple)):
                # extend() accepts both; ``list + tuple`` raises TypeError.
                ret.extend(i)
            else:
                ret.append(i)
        return ret
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import os
+import re
+
+from . import seg as TnTseg
+
# Bundled segmentation model stored next to this module.
data_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'seg.marshal')
# Module-wide segmenter, loaded (gzip first) at import time.
segger = TnTseg.Seg()
segger.load(data_path, iszip=True)
# Captures maximal runs of characters in the U+4E00..U+9FA5 range.
re_zh = re.compile('([\u4E00-\u9FA5]+)')
+
+
def seg(sent):
    """Segment ``sent`` into a list of words.

    Runs matched by ``re_zh`` go through ``single_seg``; all other text is
    simply split on whitespace.
    """
    tokens = []
    for chunk in re_zh.split(sent):
        chunk = chunk.strip()
        if not chunk:
            continue
        if re_zh.match(chunk):
            tokens.extend(single_seg(chunk))
        else:
            # str.split() with no separator never yields empty or padded items.
            tokens.extend(chunk.split())
    return tokens
+
+
def train(fname):
    """Train a new segmenter from ``fname`` and install it module-wide.

    The replacement is published only after training succeeds, so a failed
    run leaves the previously loaded segmenter in place.
    """
    global segger
    fresh = TnTseg.Seg()
    fresh.train(fname)
    segger = fresh
+
+
def save(fname, iszip=True):
    """Write the module-wide segmenter to ``fname`` via its ``save`` method."""
    segger.save(fname, iszip=iszip)
+
+
def load(fname, iszip=True):
    """Load ``fname`` into the module-wide segmenter via its ``load`` method."""
    segger.load(fname, iszip=iszip)
+
+
def single_seg(sent):
    """Return the module-wide segmenter's output for ``sent`` as a list."""
    pieces = segger.seg(sent)
    return list(pieces)
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import codecs
+
+from ..utils.tnt import TnT
+from .y09_2047 import CharacterBasedGenerativeModel
+
+
class Seg(object):
    """Word segmenter that delegates labelling to an underlying model."""

    def __init__(self, name='other'):
        """Use ``TnT`` when ``name == 'tnt'``, otherwise the character model."""
        if name == 'tnt':
            self.segger = TnT()
        else:
            self.segger = CharacterBasedGenerativeModel()

    def save(self, fname, iszip=True):
        """Persist the underlying model to ``fname``."""
        self.segger.save(fname, iszip)

    def load(self, fname, iszip=True):
        """Restore the underlying model from ``fname``."""
        self.segger.load(fname, iszip)

    def train(self, fname):
        """Train from a UTF-8 corpus file.

        Each non-blank line is whitespace-separated tokens; every token is
        split on ``'/'``. The model receives a list of sentences, each a
        list of those split tokens.
        """
        data = []
        with codecs.open(fname, 'r', 'utf-8') as fr:
            for i in fr:
                line = i.strip()
                if not line:
                    continue
                # Real lists, not lazy ``map`` objects: on Python 3 a map can
                # only be traversed once.
                data.append([x.split('/') for x in line.split()])
        self.segger.train(data)

    def seg(self, sentence):
        """Yield the words of ``sentence`` according to the model's labels.

        ``'e'`` closes the current word, ``'b'`` and ``'s'`` flush any pending
        word and start a new one, and any other label extends the current
        word.
        """
        ret = self.segger.tag(sentence)
        tmp = ''
        for i in ret:
            if i[1] == 'e':
                yield tmp+i[0]
                tmp = ''
            elif i[1] == 'b' or i[1] == 's':
                if tmp:
                    yield tmp
                tmp = i[0]
            else:
                tmp += i[0]
        if tmp:
            yield tmp
+
+
if __name__ == '__main__':
    # Small manual demo: train on a local corpus and print one segmentation.
    demo = Seg()
    demo.train('data.txt')
    pieces = demo.seg('主要是用来放置一些简单快速的中文分词和词性标注的程序')
    print(' '.join(pieces))
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import sys
+import gzip
+import marshal
+from math import log
+
+from ..utils import frequency
+
+
class CharacterBasedGenerativeModel(object):
    """Trigram model over ``(character, label)`` pairs.

    ``uni``/``bi``/``tri`` hold n-gram counts, ``l1``/``l2``/``l3`` are the
    interpolation weights estimated by ``train``, and ``status`` lists the
    four labels a character may receive.
    """

    def __init__(self):
        self.l1 = 0.0
        self.l2 = 0.0
        self.l3 = 0.0
        self.status = ('b', 'm', 'e', 's')
        self.uni = frequency.NormalProb()
        self.bi = frequency.NormalProb()
        self.tri = frequency.NormalProb()

    def save(self, fname, iszip=True):
        """Serialise every attribute with ``marshal``.

        Attributes that are objects are stored as their ``__dict__``. On
        Python 3 the suffix ``'.3'`` is appended to ``fname``. The payload is
        gzip-compressed unless ``iszip`` is false.
        """
        d = {}
        for k, v in self.__dict__.items():
            if hasattr(v, '__dict__'):
                d[k] = v.__dict__
            else:
                d[k] = v
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        # Context managers close the handle even if marshalling fails.
        if not iszip:
            with open(fname, 'wb') as f:
                marshal.dump(d, f)
        else:
            with gzip.open(fname, 'wb') as f:
                f.write(marshal.dumps(d))

    def load(self, fname, iszip=True):
        """Restore attributes written by ``save``.

        With ``iszip`` true the file is read as gzip first; if that raises
        ``IOError`` it is re-read as an uncompressed marshal file.
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                d = marshal.load(f)
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
            except IOError:
                with open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
        for k, v in d.items():
            # Object-valued attributes get their state swapped in place.
            if hasattr(self.__dict__[k], '__dict__'):
                self.__dict__[k].__dict__ = v
            else:
                self.__dict__[k] = v

    def div(self, v1, v2):
        """Return ``v1 / v2`` as a float, or ``0`` when ``v2`` is zero."""
        if v2 == 0:
            return 0
        return float(v1)/v2

    def train(self, data):
        """Count n-grams in ``data`` and estimate the interpolation weights.

        ``data`` is an iterable of sentences, each an iterable of
        ``(character, label)`` pairs.
        """
        for sentence in data:
            # Two begin-of-sentence markers seed the trigram window.
            now = [('', 'BOS'), ('', 'BOS')]
            self.bi.add((('', 'BOS'), ('', 'BOS')), 1)
            self.uni.add(('', 'BOS'), 2)
            for word, tag in sentence:
                now.append((word, tag))
                self.uni.add((word, tag), 1)
                self.bi.add(tuple(now[1:]), 1)
                self.tri.add(tuple(now), 1)
                now.pop(0)
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0
        samples = sorted(self.tri.samples(), key=lambda x: self.tri.get(x)[1])
        for now in samples:
            # Compare the count-minus-one estimates of each order and credit
            # the trigram's count to whichever order scores highest.
            c3 = self.div(self.tri.get(now)[1]-1, self.bi.get(now[:2])[1]-1)
            c2 = self.div(self.bi.get(now[1:])[1]-1, self.uni.get(now[1])[1]-1)
            c1 = self.div(self.uni.get(now[2])[1]-1, self.uni.getsum()-1)
            if c3 >= c1 and c3 >= c2:
                tl3 += self.tri.get(now)[1]
            elif c2 >= c1 and c2 >= c3:
                tl2 += self.tri.get(now)[1]
            elif c1 >= c2 and c1 >= c3:
                tl1 += self.tri.get(now)[1]
        self.l1 = self.div(tl1, tl1+tl2+tl3)
        self.l2 = self.div(tl2, tl1+tl2+tl3)
        self.l3 = self.div(tl3, tl1+tl2+tl3)

    def log_prob(self, s1, s2, s3):
        """Return the log of the interpolated probability of ``s3``.

        ``s1`` and ``s2`` are the two preceding states. Returns ``-inf`` when
        the interpolated probability is zero.
        """
        uni = self.l1*self.uni.freq(s3)
        bi = self.div(self.l2*self.bi.get((s2, s3))[1], self.uni.get(s2)[1])
        tri = self.div(self.l3*self.tri.get((s1, s2, s3))[1],
                       self.bi.get((s1, s2))[1])
        if uni+bi+tri == 0:
            return float('-inf')
        return log(uni+bi+tri)

    def tag(self, data):
        """Label every item of ``data``; return ``zip(data, labels)``.

        A dynamic-programming search keeps, for each pair of
        (previous state, current state), the best-scoring label path.
        """
        # Each entry: ((state two back, previous state), log score, labels).
        now = [((('', 'BOS'), ('', 'BOS')), 0.0, [])]
        for w in data:
            stage = {}
            not_found = True
            for s in self.status:
                if self.uni.freq((w, s)) != 0:
                    not_found = False
                    break
            if not_found:
                # Character never seen with any label: carry every path
                # forward under every label without changing its score.
                for s in self.status:
                    for pre in now:
                        stage[(pre[0][1], (w, s))] = (pre[1], pre[2]+[s])
                now = [(k, v[0], v[1]) for k, v in stage.items()]
                continue
            for s in self.status:
                for pre in now:
                    p = pre[1]+self.log_prob(pre[0][0], pre[0][1], (w, s))
                    key = (pre[0][1], (w, s))
                    if key not in stage or p > stage[key][0]:
                        stage[key] = (p, pre[2]+[s])
            now = [(k, v[0], v[1]) for k, v in stage.items()]
        return zip(data, max(now, key=lambda x: x[1])[2])
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import os
+import codecs
+
+from .. import normal
+from .. import seg
+from ..classification.bayes import Bayes
+
# Bundled sentiment model stored next to this module.
data_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'sentiment.marshal')
+
+
class Sentiment(object):
    """Two-class (``'neg'`` / ``'pos'``) sentiment model on top of ``Bayes``."""

    def __init__(self):
        self.classifier = Bayes()

    def save(self, fname, iszip=True):
        """Persist the underlying classifier to ``fname``."""
        self.classifier.save(fname, iszip)

    def load(self, fname=data_path, iszip=True):
        """Restore the underlying classifier; defaults to the bundled file."""
        self.classifier.load(fname, iszip)

    def handle(self, doc):
        """Segment ``doc`` and drop stop words; returns the remaining tokens."""
        return normal.filter_stop(seg.seg(doc))

    def train(self, neg_docs, pos_docs):
        """Train on raw documents labelled ``'neg'`` and ``'pos'``."""
        samples = [[self.handle(text), 'neg'] for text in neg_docs]
        samples.extend([self.handle(text), 'pos'] for text in pos_docs)
        self.classifier.train(samples)

    def classify(self, sent):
        """Return the probability that ``sent`` belongs to the ``'pos'`` class."""
        label, confidence = self.classifier.classify(self.handle(sent))
        # The classifier reports the winning label's probability; flip it
        # when the winner is not 'pos'.
        return confidence if label == 'pos' else 1-confidence
+
+
# Module-wide model, loaded from the bundled data file at import time.
classifier = Sentiment()
classifier.load(data_path)
+
+
def train(neg_file, pos_file):
    """Train a new module-wide classifier from two UTF-8 corpus files.

    Every line of ``neg_file`` is a negative document and every line of
    ``pos_file`` a positive one, with trailing line breaks removed. The new
    classifier is installed only after training succeeds.
    """
    global classifier
    # ``with`` closes each corpus file; they used to stay open.
    with codecs.open(neg_file, 'r', 'utf-8') as fr:
        neg_docs = [line.rstrip("\r\n") for line in fr.readlines()]
    with codecs.open(pos_file, 'r', 'utf-8') as fr:
        pos_docs = [line.rstrip("\r\n") for line in fr.readlines()]
    fresh = Sentiment()
    fresh.train(neg_docs, pos_docs)
    classifier = fresh
+
+
def save(fname, iszip=True):
    """Write the module-wide classifier to ``fname`` via its ``save`` method."""
    classifier.save(fname, iszip=iszip)
+
+
def load(fname, iszip=True):
    """Load ``fname`` into the module-wide classifier via its ``load`` method."""
    classifier.load(fname, iszip=iszip)
+
+
def classify(sent):
    """Return the module-wide classifier's ``'pos'`` probability for ``sent``."""
    score = classifier.classify(sent)
    return score
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import math
+
+
class BM25(object):
    """BM25 scorer over a fixed collection of tokenised documents.

    Attributes:
        D: number of documents.
        avgdl: average document length (``0.0`` for an empty collection).
        f: per-document ``{word: count}`` dicts.
        df: ``{word: number of documents containing it}``.
        idf: ``{word: log(D - df + 0.5) - log(df + 0.5)}``.
        k1, b: BM25 tuning constants.
    """

    def __init__(self, docs):
        self.D = len(docs)
        total_len = sum(len(doc) for doc in docs)
        # An empty collection has no average length; avoid dividing by zero.
        self.avgdl = float(total_len) / self.D if self.D else 0.0
        self.docs = docs
        self.f = []
        self.df = {}
        self.idf = {}
        self.k1 = 1.5
        self.b = 0.75
        self.init()

    def init(self):
        """Build the term-frequency, document-frequency and idf tables."""
        for doc in self.docs:
            tmp = {}
            for word in doc:
                tmp[word] = tmp.get(word, 0) + 1
            self.f.append(tmp)
            for k in tmp:
                self.df[k] = self.df.get(k, 0) + 1
        for k, v in self.df.items():
            self.idf[k] = math.log(self.D-v+0.5)-math.log(v+0.5)

    def sim(self, doc, index):
        """Return the BM25 score of query ``doc`` against document ``index``."""
        score = 0
        freqs = self.f[index]
        d = len(self.docs[index])
        for word in doc:
            if word not in freqs:
                continue
            score += (self.idf[word]*freqs[word]*(self.k1+1)
                      / (freqs[word]+self.k1*(1-self.b+self.b*d
                                              / self.avgdl)))
        return score

    def simall(self, doc):
        """Return the score of query ``doc`` against every document, in order."""
        return [self.sim(doc, index) for index in range(self.D)]
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from ..sim.bm25 import BM25
+
+
class TextRank(object):
    """Ranks documents by iterating scores over a BM25 similarity graph.

    After ``solve()`` the attribute ``top`` holds ``(index, score)`` pairs
    sorted by descending score.
    """

    def __init__(self, docs):
        self.docs = docs
        self.bm25 = BM25(docs)
        self.D = len(docs)
        self.d = 0.85          # damping factor
        self.weight = []       # weight[j][i]: BM25 score of doc j against doc i
        self.weight_sum = []   # row sums of weight, excluding the diagonal
        self.vertex = []       # current score of every document
        self.max_iter = 200
        self.min_diff = 0.001
        self.top = []

    def solve(self):
        """Build the graph, iterate until scores settle, then fill ``top``.

        Iteration stops after ``max_iter`` rounds or once no score moves by
        more than ``min_diff``.
        """
        for cnt, doc in enumerate(self.docs):
            scores = self.bm25.simall(doc)
            self.weight.append(scores)
            self.weight_sum.append(sum(scores)-scores[cnt])
            self.vertex.append(1.0)
        for _ in range(self.max_iter):
            m = []
            max_diff = 0
            for i in range(self.D):
                m.append(1-self.d)
                for j in range(self.D):
                    # Skip self-links and rows with no outgoing weight.
                    if j == i or self.weight_sum[j] == 0:
                        continue
                    m[-1] += (self.d*self.weight[j][i]
                              / self.weight_sum[j]*self.vertex[j])
                if abs(m[-1] - self.vertex[i]) > max_diff:
                    max_diff = abs(m[-1] - self.vertex[i])
            self.vertex = m
            if max_diff <= self.min_diff:
                break
        self.top = list(enumerate(self.vertex))
        self.top = sorted(self.top, key=lambda x: x[1], reverse=True)

    def top_index(self, limit):
        """Return the indices of the ``limit`` best-ranked documents."""
        return [index for index, _ in self.top[:limit]]

    def top(self, limit):
        """Return the ``limit`` best-ranked documents themselves.

        NOTE: ``__init__`` stores the ranking in an instance attribute also
        named ``top``, so ``instance.top`` resolves to that list. This
        function is reachable only as ``TextRank.top(instance, limit)``.
        """
        return [self.docs[index] for index, _ in self.top[:limit]]
+
+
class KeywordTextRank(object):
    """Ranks words by iterating scores over a co-occurrence graph.

    After ``solve()`` the attribute ``top`` holds ``(word, score)`` pairs
    sorted by descending score.
    """

    def __init__(self, docs):
        self.docs = docs
        self.words = {}     # word -> set of neighbouring words
        self.vertex = {}    # word -> current score
        self.d = 0.85       # damping factor
        self.max_iter = 200
        self.min_diff = 0.001
        self.top = []

    def solve(self):
        """Build the graph, iterate until scores settle, then fill ``top``.

        Two words are linked when they occur within a sliding window of five
        consecutive tokens of the same document. Words that never receive a
        score contribution drop out of ``vertex`` after the first round.
        """
        for doc in self.docs:
            que = []
            for word in doc:
                if word not in self.words:
                    self.words[word] = set()
                    self.vertex[word] = 1.0
                que.append(word)
                if len(que) > 5:
                    que.pop(0)
                for w1 in que:
                    for w2 in que:
                        if w1 == w2:
                            continue
                        self.words[w1].add(w2)
                        self.words[w2].add(w1)
        for _ in range(self.max_iter):
            m = {}
            max_diff = 0
            tmp = filter(lambda x: len(self.words[x[0]]) > 0,
                         self.vertex.items())
            tmp = sorted(tmp, key=lambda x: x[1] / len(self.words[x[0]]))
            for k, v in tmp:
                for j in self.words[k]:
                    if k == j:
                        continue
                    if j not in m:
                        m[j] = 1 - self.d
                    # Each word shares its score equally among its neighbours.
                    m[j] += (self.d / len(self.words[k]) * self.vertex[k])
            for k in self.vertex:
                if k in m:
                    if abs(m[k] - self.vertex[k]) > max_diff:
                        max_diff = abs(m[k] - self.vertex[k])
            self.vertex = m
            if max_diff <= self.min_diff:
                break
        self.top = list(self.vertex.items())
        self.top = sorted(self.top, key=lambda x: x[1], reverse=True)

    def top_index(self, limit):
        """Return the ``limit`` best-ranked words."""
        return [word for word, _ in self.top[:limit]]

    def top(self, limit):
        """Return the ``limit`` best-ranked words.

        The ranked items of this graph are the words themselves, so this
        gives the same list as ``top_index``. It used to index ``docs`` with
        a word, which always raised ``TypeError``.

        NOTE: ``__init__`` stores the ranking in an instance attribute also
        named ``top``, so this function is reachable only as
        ``KeywordTextRank.top(instance, limit)``.
        """
        return [word for word, _ in self.top[:limit]]
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+
class SimpleMerge(object):
    """Joins keywords that usually appear back-to-back in the document."""

    def __init__(self, doc, words):
        self.doc = doc
        self.words = words

    def merge(self):
        """Return the keyword list with frequent neighbours concatenated.

        A word ``w1`` is followed by ``w2`` when ``w1 + w2`` occurs in the
        document more than half as often as ``w1`` alone; the first such
        ``w2`` in keyword order wins. Chains of followers are collapsed into
        their head word, and words absorbed into a chain are left out.
        """
        text = self.doc
        vocab = self.words

        def occurrences(piece):
            # Overlapping occurrences of ``piece`` in the document.
            width = len(piece)
            return sum(1 for start in range(len(text) - width + 1)
                       if piece == text[start:start + width])

        follower = dict((w, '') for w in vocab)
        for head in vocab:
            head_count = occurrences(head)
            for tail in vocab:
                if head_count < occurrences(head + tail) * 2:
                    follower[head] = tail
                    break
        for w in vocab:
            if w not in follower:
                continue
            chain = ''
            nxt = follower[w]
            while nxt:
                chain += nxt
                if nxt not in follower:
                    break
                # Consume the link so an absorbed word is not output again.
                nxt = follower.pop(nxt)
            follower[w] = chain
        return [w + follower[w] for w in vocab if w in follower]
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import os
+import codecs
+
+from ..utils.tnt import TnT
+
# Bundled tagging model stored next to this module.
data_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'tag.marshal')
# Module-wide tagger, loaded at import time.
tagger = TnT()
tagger.load(data_path)
+
+
def train(fname):
    """Train a new module-wide tagger from a UTF-8 corpus file.

    Each non-blank line is whitespace-separated tokens; every token is split
    on ``'/'``. The new tagger is installed only after training succeeds.
    """
    global tagger
    data = []
    with codecs.open(fname, 'r', 'utf-8') as fr:
        for i in fr:
            line = i.strip()
            if not line:
                continue
            # Real lists, not lazy ``map`` objects: on Python 3 a map can
            # only be traversed once.
            data.append([x.split('/') for x in line.split()])
    fresh = TnT()
    fresh.train(data)
    tagger = fresh
+
+
def save(fname, iszip=True):
    """Write the module-wide tagger to ``fname`` via its ``save`` method."""
    tagger.save(fname, iszip=iszip)
+
+
def load(fname, iszip=True):
    """Load ``fname`` into the module-wide tagger via its ``load`` method."""
    tagger.load(fname, iszip=iszip)
+
+
def tag_all(words):
    """Return the module-wide tagger's ``(word, tag)`` pairs for ``words``."""
    pairs = tagger.tag(words)
    return pairs
+
+
def tag(words):
    """Return the list of tags for ``words``, one per word.

    A real list is returned on every Python version; a bare ``map`` would be
    a one-shot iterator on Python 3.
    """
    return [pair[1] for pair in tag_all(words)]
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+
+from . import good_turing
+
class BaseProb(object):
    """Base count table: ``d`` maps keys to counts, ``total`` is their sum.

    ``none`` is the count reported for keys that are not in the table.
    """

    def __init__(self):
        self.d = {}
        self.total = 0.0
        self.none = 0

    def exists(self, key):
        """Return whether ``key`` has an entry in the table."""
        return key in self.d

    def getsum(self):
        """Return the running total."""
        return self.total

    def get(self, key):
        """Return ``(found, count)``; ``count`` is ``none`` for unknown keys."""
        if self.exists(key):
            return True, self.d[key]
        return False, self.none

    def freq(self, key):
        """Return the count of ``key`` divided by the total, as a float."""
        _, count = self.get(key)
        return float(count)/self.total

    def samples(self):
        """Return the keys currently stored in the table."""
        return self.d.keys()
+
+
class NormalProb(BaseProb):
    """Plain count table without smoothing."""

    def add(self, key, value):
        """Add ``value`` to the count of ``key`` and to the total."""
        previous = self.d[key] if self.exists(key) else 0
        self.d[key] = previous + value
        self.total += value
+
+
class AddOneProb(BaseProb):
    """Count table with add-one smoothing.

    Every key starts at a count of 1 and unknown keys report a count of 1.
    """

    def __init__(self):
        super(AddOneProb, self).__init__()
        self.none = 1

    def add(self, key, value):
        """Add ``value`` to ``key``; a new key also adds its initial 1."""
        self.total += value
        if self.exists(key):
            self.d[key] += value
        else:
            self.total += 1
            self.d[key] = 1 + value
+
+
class GoodTuringProb(BaseProb):
    """Count table whose counts are replaced by ``good_turing.main`` output.

    The conversion happens once, on the first call to ``get``.
    """

    def __init__(self):
        self.d = {}
        self.total = 0.0
        self.handled = False

    def add(self, key, value):
        """Add ``value`` to the raw count of ``key``."""
        previous = self.d[key] if self.exists(key) else 0
        self.d[key] = previous + value

    def get(self, key):
        """Return ``(found, value)``, converting the raw counts on first use."""
        if not self.handled:
            self.handled = True
            # main() returns (value for unseen keys, new key -> value table).
            self.none, self.d = good_turing.main(self.d)
            self.total = sum(self.d.values())+0.0
        if self.exists(key):
            return True, self.d[key]
        return False, self.none
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+from __future__ import division
+from math import log, exp
+
def getz(r, nr):
    """Return the averaged counts ``z`` for sorted frequencies ``r``.

    ``nr[i]`` is how many items have frequency ``r[i]``. Each ``nr[i]`` is
    divided by half the gap between the neighbouring frequencies; the first
    entry uses 0 as its lower neighbour and the last uses the gap to its
    predecessor. Requires at least two distinct frequencies.
    """
    z = [2*nr[0]/r[1]]
    # range() rather than xrange(): the latter does not exist on Python 3.
    for i in range(len(nr)-2):
        z.append(2*nr[i+1]/(r[i+2]-r[i]))
    z.append(nr[-1]/(r[-1]-r[-2]))
    return z
+
def least_square(x, y): # y=a+bx
    """Fit ``y = a + b*x`` by least squares and return ``(a, b)``.

    ``x`` and ``y`` may be any iterables of numbers. They are materialised
    first so that lazy ``map`` objects (Python 3) are accepted.
    """
    xs = list(x)
    ys = list(y)
    meanx = sum(xs)/len(xs)
    meany = sum(ys)/len(ys)
    xy = sum((xi-meanx)*(ys[i]-meany) for i, xi in enumerate(xs))
    square = sum((xi-meanx)**2 for xi in xs)
    b = xy/square
    return (meany-b*meanx, b)
+
def main(dic):
    """Smooth the counts in ``dic`` (``key -> count``).

    Returns ``(unseen, table)``: ``unseen`` is the value assigned to keys
    that are not in ``dic`` and ``table`` maps every key of ``dic`` to its
    smoothed value. Appears to follow the Simple Good-Turing procedure -
    confirm against the reference before relying on that.
    """
    values = sorted(dic.values())
    # r: distinct counts in ascending order; nr[i]: how many keys have r[i].
    r, nr, prob = [], [], []
    for v in values:
        if not r or r[-1] != v:
            r.append(v)
            nr.append(1)
        else:
            nr[-1] += 1
    rr = {count: index for index, count in enumerate(r)}
    # Plain sums replace reduce(), which is not a builtin on Python 3.
    total = sum(n*count for n, count in zip(nr, r))
    z = getz(r, nr)
    # Log-log linear fit; lists are passed because map() is lazy on Python 3.
    a, b = least_square([log(x) for x in r], [log(x) for x in z])
    use_good_turing = False
    # Extrapolated class size one past the largest observed count.
    nr.append(exp(a+b*log(r[-1]+1)))
    for i in range(len(r)):
        smoothed = (r[i]+1)*(exp(b*(log(r[i]+1)-log(r[i]))))
        turing = (r[i]+1)*nr[i+1]/nr[i] if i+1<len(r) else smoothed
        diff = ((((r[i]+1)**2)/nr[i]*nr[i+1]/nr[i]*(1+nr[i+1]/nr[i]))**0.5)*1.65
        # Use the raw estimate while it differs clearly from the fitted one;
        # once the fitted estimate is chosen it is kept for all later counts.
        if not use_good_turing and abs(smoothed-turing)>diff:
            prob.append(turing)
        else:
            use_good_turing = True
            prob.append(smoothed)
    sump = sum(n*p for n, p in zip(nr, prob))
    for cnt, i in enumerate(prob):
        prob[cnt] = (1-nr[0]/total)*i/sump
    return nr[0]/total/total, {k: prob[rr[v]] for k, v in dic.items()}
+
if __name__ == '__main__':
    # Small manual demo on a fixed key -> count table.
    sample = {1:1,2:1,3:1,4:2,5:2,6:3,7:1,8:2,9:3}
    print(main(sample))
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+
+'''
+Implementation of 'TnT - A Statisical Part of Speech Tagger'
+'''
+from __future__ import unicode_literals
+
+import sys
+import gzip
+import heapq
+import marshal
+from math import log
+
+from . import frequency
+
+
class TnT(object):
    """Trigram tagger after 'TnT - A Statistical Part of Speech Tagger'.

    ``uni``/``bi``/``tri`` count tag n-grams, ``wd`` counts ``(tag, word)``
    pairs, ``eos``/``eosd`` count sentence endings, ``word`` maps each known
    word to the tags seen with it, and ``trans`` caches log transition
    scores. ``N`` is the beam width used while tagging.
    """

    def __init__(self, N=1000):
        self.N = N
        self.l1 = 0.0
        self.l2 = 0.0
        self.l3 = 0.0
        self.status = set()
        self.wd = frequency.AddOneProb()
        self.eos = frequency.AddOneProb()
        self.eosd = frequency.AddOneProb()
        self.uni = frequency.NormalProb()
        self.bi = frequency.NormalProb()
        self.tri = frequency.NormalProb()
        self.word = {}
        self.trans = {}

    def save(self, fname, iszip=True):
        """Serialise every attribute with ``marshal``.

        Sets are stored as lists and object attributes as their ``__dict__``.
        On Python 3 the suffix ``'.3'`` is appended to ``fname``. The payload
        is gzip-compressed unless ``iszip`` is false.
        """
        d = {}
        for k, v in self.__dict__.items():
            if isinstance(v, set):
                d[k] = list(v)
            elif hasattr(v, '__dict__'):
                d[k] = v.__dict__
            else:
                d[k] = v
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        # Context managers close the handle even if marshalling fails.
        if not iszip:
            with open(fname, 'wb') as f:
                marshal.dump(d, f)
        else:
            with gzip.open(fname, 'wb') as f:
                f.write(marshal.dumps(d))

    def load(self, fname, iszip=True):
        """Restore attributes written by ``save``.

        With ``iszip`` true the file is read as gzip first; if that raises
        ``IOError`` it is re-read as an uncompressed marshal file.
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                d = marshal.load(f)
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
            except IOError:
                with open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
        for k, v in d.items():
            if isinstance(self.__dict__[k], set):
                self.__dict__[k] = set(v)
            elif hasattr(self.__dict__[k], '__dict__'):
                self.__dict__[k].__dict__ = v
            else:
                self.__dict__[k] = v

    def tnt_div(self, v1, v2):
        """Return ``v1 / v2`` as a float, or ``0`` when ``v2`` is zero."""
        if v2 == 0:
            return 0
        return float(v1)/v2

    def geteos(self, tag):
        """Return the log score of a sentence ending after ``tag``.

        Tags never seen in training get a uniform score over all tags.
        """
        tmp = self.eosd.get(tag)
        if not tmp[0]:
            return log(1.0/len(self.status))
        return log(self.eos.get((tag, 'EOS'))[1])-log(self.eosd.get(tag)[1])

    def train(self, data):
        """Count n-grams in ``data`` and precompute the transition table.

        ``data`` is an iterable of sentences, each an iterable of
        ``(word, tag)`` pairs.
        """
        for sentence in data:
            # Two begin-of-sentence markers seed the trigram window.
            now = ['BOS', 'BOS']
            self.bi.add(('BOS', 'BOS'), 1)
            self.uni.add('BOS', 2)
            for word, tag in sentence:
                now.append(tag)
                self.status.add(tag)
                self.wd.add((tag, word), 1)
                self.eos.add(tuple(now[1:]), 1)
                self.eosd.add(tag, 1)
                self.uni.add(tag, 1)
                self.bi.add(tuple(now[1:]), 1)
                self.tri.add(tuple(now), 1)
                if word not in self.word:
                    self.word[word] = set()
                self.word[word].add(tag)
                now.pop(0)
            self.eos.add((now[-1], 'EOS'), 1)
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0
        for now in self.tri.samples():
            # Compare the count-minus-one estimates of each order and credit
            # the trigram's count to whichever order scores highest.
            c3 = self.tnt_div(self.tri.get(now)[1]-1,
                              self.bi.get(now[:2])[1]-1)
            c2 = self.tnt_div(self.bi.get(now[1:])[1]-1,
                              self.uni.get(now[1])[1]-1)
            c1 = self.tnt_div(self.uni.get(now[2])[1]-1, self.uni.getsum()-1)
            if c3 >= c1 and c3 >= c2:
                tl3 += self.tri.get(now)[1]
            elif c2 >= c1 and c2 >= c3:
                tl2 += self.tri.get(now)[1]
            elif c1 >= c2 and c1 >= c3:
                tl1 += self.tri.get(now)[1]
        # tnt_div keeps an empty training set from dividing by zero.
        self.l1 = self.tnt_div(tl1, tl1+tl2+tl3)
        self.l2 = self.tnt_div(tl2, tl1+tl2+tl3)
        self.l3 = self.tnt_div(tl3, tl1+tl2+tl3)
        for s1 in self.status | set(('BOS',)):
            for s2 in self.status | set(('BOS',)):
                for s3 in self.status:
                    uni = self.l1*self.uni.freq(s3)
                    bi = self.tnt_div(self.l2*self.bi.get((s2, s3))[1],
                                      self.uni.get(s2)[1])
                    tri = self.tnt_div(self.l3*self.tri.get((s1, s2, s3))[1],
                                       self.bi.get((s1, s2))[1])
                    score = uni+bi+tri
                    # log(0) would raise; score an impossible transition
                    # as -inf instead.
                    self.trans[(s1, s2, s3)] = (log(score) if score > 0
                                                else float('-inf'))

    def tag(self, data):
        """Tag the words in ``data``; return ``zip(data, tags)``.

        A beam of at most ``N`` partial paths is kept after every word.
        Empty input yields an empty result.
        """
        # Each entry: ((tag two back, previous tag), log score, tags so far).
        now = [(('BOS', 'BOS'), 0.0, [])]
        stage = None
        for w in data:
            stage = {}
            # Known words are restricted to the tags seen with them.
            samples = self.status
            if w in self.word:
                samples = self.word[w]
            for s in samples:
                wd = log(self.wd.get((s, w))[1])-log(self.uni.get(s)[1])
                for pre in now:
                    p = pre[1]+wd+self.trans[(pre[0][0], pre[0][1], s)]
                    key = (pre[0][1], s)
                    if key not in stage or p > stage[key][0]:
                        stage[key] = (p, pre[2]+[s])
            stage = [(k, v[0], v[1]) for k, v in stage.items()]
            now = heapq.nlargest(self.N, stage, key=lambda x: x[1])
        if stage is None:
            # No words were processed, so there is no path to pick from.
            return zip(data, [])
        now = heapq.nlargest(1, stage, key=lambda x: x[1]+self.geteos(x[0][1]))
        return zip(data, now[0][2])
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+
class Trie(object):
    """Prefix tree built from nested dicts.

    Each node is a dict keyed by the next element of a key; a stored value
    lives under the reserved entry ``'value'`` of the node ending that key.
    """

    def __init__(self):
        self.d = {}

    def insert(self, key, value):
        """Store ``value`` under ``key``, creating nodes as needed."""
        node = self.d
        for element in key:
            node = node.setdefault(element, {})
        node['value'] = value

    def find(self, text, start=0):
        """Return ``(matched_slice, value)`` for the longest key at ``start``.

        Returns ``None`` when no stored key begins at that position.
        """
        node = self.d
        best = None
        for pos in range(start, len(text)):
            element = text[pos]
            if element not in node:
                break
            node = node[element]
            if 'value' in node:
                best = (text[start:pos+1], node['value'])
        return best

    def translate(self, text, with_not_found=True):
        """Replace every longest stored key in ``text`` by its value.

        Elements that start no stored key are copied through unchanged when
        ``with_not_found`` is true and dropped otherwise. Returns a list.
        """
        out = []
        size = len(text)
        pos = 0
        while pos < size:
            hit = self.find(text, pos) if text[pos] in self.d else None
            if hit:
                out.append(hit[1])
                pos += len(hit[0])
            else:
                if with_not_found:
                    out.append(text[pos])
                pos += 1
        return out