This commit is contained in:
juanboy
2024-07-04 12:00:44 +08:00
40 changed files with 135972 additions and 4 deletions
+4 -1
View File
@@ -1,2 +1,5 @@
.conda
*__pycache__/
*__pycache__/
model2/*
!model2/readme.md
!model2/话题识别模型
+9
View File
@@ -0,0 +1,9 @@
### 模型说明
该文件太大了,就不上传到github上了,可自行下载,并解压到当前目录下:
阉割版下载链接(训好了,只能直接使用):链接:https://pan.baidu.com/s/1Ao5S7LVd5Nfw_qmJq2IVoQ?pwd=42jh 提取码:42jh
完整版下载链接(包含了完整数据,可以重新训练):链接:https://pan.baidu.com/s/1pPTeNpp6dg3ZSd_RcNLABA?pwd=96bz 提取码:96bz
**直接运行model_use.py即可使用,不要改动该文件分割线上方的代码。**
View File
+1 -1
View File
@@ -2,7 +2,7 @@ import os
from sqlalchemy import create_engine
import pandas as pd
engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@47.92.235.6/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
def save_to_sql():
try:
+85
View File
@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from . import normal
from . import seg
from . import tag
from . import sentiment
from .sim import bm25
from .summary import textrank
from .summary import words_merge
class SnowNLP(object):
    """Convenience wrapper exposing the package's helpers for one document."""

    def __init__(self, doc):
        self.doc = doc
        # BM25 index built directly over ``doc``.
        self.bm25 = bm25.BM25(doc)

    @property
    def words(self):
        """Result of ``seg.seg`` applied to the document."""
        return seg.seg(self.doc)

    @property
    def sentences(self):
        """Result of ``normal.get_sentences`` applied to the document."""
        return normal.get_sentences(self.doc)

    @property
    def han(self):
        """Result of ``normal.zh2hans`` applied to the document."""
        return normal.zh2hans(self.doc)

    @property
    def pinyin(self):
        """Result of ``normal.get_pinyin`` applied to the document."""
        return normal.get_pinyin(self.doc)

    @property
    def sentiments(self):
        """Result of ``sentiment.classify`` applied to the document."""
        return sentiment.classify(self.doc)

    @property
    def tags(self):
        """Pair each segmented word with the tag from ``tag.tag``.

        Returns whatever ``zip`` returns on the running interpreter.
        """
        tokens = self.words
        return zip(tokens, tag.tag(tokens))

    @property
    def tf(self):
        """Per-document term counts held by the BM25 index."""
        return self.bm25.f

    @property
    def idf(self):
        """IDF table held by the BM25 index."""
        return self.bm25.idf

    def sim(self, doc):
        """Return ``self.bm25.simall(doc)``."""
        return self.bm25.simall(doc)

    def summary(self, limit=5):
        """Return up to ``limit`` sentences ranked highest by ``TextRank``."""
        sents = self.sentences
        tokenized = [normal.filter_stop(seg.seg(sent)) for sent in sents]
        rank = textrank.TextRank(tokenized)
        rank.solve()
        return [sents[index] for index in rank.top_index(limit)]

    def keywords(self, limit=5, merge=False):
        """Return up to ``limit`` words ranked by ``KeywordTextRank``.

        With ``merge`` true the ranked words are passed through
        ``words_merge.SimpleMerge`` together with the raw document.
        """
        tokenized = [normal.filter_stop(seg.seg(sent))
                     for sent in self.sentences]
        rank = textrank.KeywordTextRank(tokenized)
        rank.solve()
        top_words = list(rank.top_index(limit))
        if not merge:
            return top_words
        return words_merge.SimpleMerge(self.doc, top_words).merge()
+78
View File
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
import gzip
import marshal
from math import log, exp
from ..utils.frequency import AddOneProb
class Bayes(object):
    """Naive Bayes classifier keeping one ``AddOneProb`` counter per class."""

    def __init__(self):
        self.d = {}      # class label -> AddOneProb with that class's counts
        self.total = 0   # sum of getsum() over all classes, set by train()

    def save(self, fname, iszip=True):
        """Serialize the model with ``marshal``.

        On Python 3 the suffix ``.3`` is appended to ``fname``. With
        ``iszip`` true the marshalled bytes are written through gzip.
        """
        d = {'total': self.total, 'd': {}}
        for k, v in self.d.items():
            d['d'][k] = v.__dict__
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        # Context managers close the handle even when dumping fails.
        if not iszip:
            with open(fname, 'wb') as f:
                marshal.dump(d, f)
        else:
            with gzip.open(fname, 'wb') as f:
                f.write(marshal.dumps(d))

    def load(self, fname, iszip=True):
        """Load a model written by ``save``.

        On Python 3 the suffix ``.3`` is appended to ``fname``. With
        ``iszip`` true, a file that cannot be read as gzip (IOError) is
        re-read as a plain marshal file.
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                d = marshal.load(f)
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
            except IOError:
                # Not gzip data: fall back to an uncompressed marshal file.
                with open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
        self.total = d['total']
        self.d = {}
        for k, v in d['d'].items():
            self.d[k] = AddOneProb()
            self.d[k].__dict__ = v

    def train(self, data):
        """Accumulate counts from ``data``.

        Each item is indexed as ``item[0]`` (iterable of words) and
        ``item[1]`` (class label).
        """
        for d in data:
            c = d[1]
            if c not in self.d:
                self.d[c] = AddOneProb()
            for word in d[0]:
                self.d[c].add(word, 1)
        self.total = sum(prob.getsum() for prob in self.d.values())

    def classify(self, x):
        """Return ``(label, probability)`` of the most probable class.

        Returns ``(0, 0)`` when no class scores above zero (for example
        when the model holds no classes).
        """
        # Log-score of each class: log prior plus summed log word frequency.
        tmp = {}
        for k in self.d:
            tmp[k] = log(self.d[k].getsum()) - log(self.total)
            for word in x:
                tmp[k] += log(self.d[k].freq(word))
        ret, prob = 0, 0
        for k in self.d:
            now = 0
            try:
                # Normalise in log space: 1 / sum(exp(score_other - score_k)).
                for otherk in self.d:
                    now += exp(tmp[otherk]-tmp[k])
                now = 1/now
            except OverflowError:
                # exp() overflowed, so this class is treated as probability 0.
                now = 0
            if now > prob:
                ret, prob = k, now
        return (ret, prob)
+61
View File
@@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import re
import codecs
from . import zh
from . import pinyin
# Data files are looked up next to this module.
stop_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'stopwords.txt')
pinyin_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'pinyin.txt')
# Stop-word set: one entry per line of stopwords.txt, whitespace-stripped.
stop = set()
# ``with`` closes the handle even if decoding a line raises.
with codecs.open(stop_path, 'r', 'utf-8') as fr:
    for word in fr:
        stop.add(word.strip())
pin = pinyin.PinYin(pinyin_path)
# Runs of characters in U+4E00..U+9FA5; the capture group makes split()
# keep the matched runs in its result.
re_zh = re.compile('([\u4E00-\u9FA5]+)')
def filter_stop(words):
    """Return a list of the items of ``words`` that are not in ``stop``."""
    return [word for word in words if word not in stop]
def zh2hans(sent):
    """Return ``zh.transfer(sent)``.

    NOTE(review): presumably Traditional-to-Simplified conversion; confirm
    against the ``zh`` module.
    """
    converted = zh.transfer(sent)
    return converted
def get_sentences(doc):
line_break = re.compile('[\r\n]')
delimiter = re.compile('[,。?!;]')
sentences = []
for line in line_break.split(doc):
line = line.strip()
if not line:
continue
for sent in delimiter.split(line):
sent = sent.strip()
if not sent:
continue
sentences.append(sent)
return sentences
def get_pinyin(sentence):
    """Return a flat list mixing ``pin.get`` output and other tokens.

    ``sentence`` is split with ``re_zh``; chunks matching ``re_zh`` are
    converted through ``pin.get``, every other chunk contributes its
    whitespace-separated tokens unchanged.
    """
    ret = []
    for chunk in re_zh.split(sentence):
        chunk = chunk.strip()
        if not chunk:
            continue
        if re_zh.match(chunk):
            ret.extend(pin.get(chunk))
        else:
            # str.split() already drops empty and whitespace-only pieces.
            ret.extend(chunk.split())
    return ret
+26
View File
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import codecs
from ..utils.trie import Trie
class PinYin(object):
    """Lookup table backed by a ``Trie`` and loaded from a text file."""

    def __init__(self, fname):
        """Load the table from ``fname`` (UTF-8).

        Each non-blank line is split on whitespace: the first field is the
        trie key, the remaining fields are stored as its value.
        """
        self.handle = Trie()
        # ``with`` closes the file even if a line fails to decode.
        with codecs.open(fname, 'r', 'utf-8') as fr:
            for line in fr:
                words = line.split()
                if not words:
                    # A blank line has no key; skip it rather than crash.
                    continue
                self.handle.insert(words[0], words[1:])

    def get(self, text):
        """Translate ``text`` through the trie and flatten the result.

        List or tuple items returned by ``translate`` are spliced into the
        output; any other item is appended as-is.
        """
        ret = []
        for i in self.handle.translate(text):
            if isinstance(i, (list, tuple)):
                # extend() accepts tuples too; ``list + tuple`` would raise.
                ret.extend(i)
            else:
                ret.append(i)
        return ret
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+47
View File
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import re
from . import seg as TnTseg
# Path of the segmentation model file, looked up next to this module.
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'seg.marshal')
# Module-wide segmenter, loaded at import time (second argument is `iszip`).
segger = TnTseg.Seg()
segger.load(data_path, True)
# Runs of characters in U+4E00..U+9FA5; the capture group makes split()
# keep the matched runs in its result.
re_zh = re.compile('([\u4E00-\u9FA5]+)')
def seg(sent):
    """Segment ``sent`` into a list of words.

    ``sent`` is split with ``re_zh``; chunks matching ``re_zh`` go through
    ``single_seg``, every other chunk contributes its whitespace-separated
    tokens unchanged.
    """
    words = []
    for chunk in re_zh.split(sent):
        chunk = chunk.strip()
        if not chunk:
            continue
        if re_zh.match(chunk):
            words.extend(single_seg(chunk))
        else:
            # str.split() already drops empty and whitespace-only pieces.
            words.extend(chunk.split())
    return words
def train(fname):
    """Replace the module-wide ``segger`` with a new one trained on ``fname``."""
    global segger
    # The global is rebound before training starts, as in the original.
    segger = fresh = TnTseg.Seg()
    fresh.train(fname)
def save(fname, iszip=True):
    """Save the module-wide ``segger``; arguments are passed through."""
    segger.save(fname, iszip)
def load(fname, iszip=True):
    """Load a model into the module-wide ``segger``; arguments pass through."""
    segger.load(fname, iszip)
def single_seg(sent):
    """Return ``segger.seg(sent)`` materialised as a list."""
    return list(segger.seg(sent))
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
+57
View File
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import unicode_literals
import codecs
from ..utils.tnt import TnT
from .y09_2047 import CharacterBasedGenerativeModel
class Seg(object):
    """Word segmenter delegating to a character tagger.

    ``name == 'tnt'`` selects ``TnT``; any other value selects
    ``CharacterBasedGenerativeModel``.
    """

    def __init__(self, name='other'):
        if name == 'tnt':
            self.segger = TnT()
        else:
            self.segger = CharacterBasedGenerativeModel()

    def save(self, fname, iszip=True):
        """Save the underlying tagger; arguments are passed through."""
        self.segger.save(fname, iszip)

    def load(self, fname, iszip=True):
        """Load the underlying tagger; arguments are passed through."""
        self.segger.load(fname, iszip)

    def train(self, fname):
        """Train the underlying tagger from the UTF-8 file ``fname``.

        Each non-blank line is split on whitespace and every token is split
        on ``'/'``; the resulting list of lists is one training sentence.
        """
        data = []
        # ``with`` closes the file even if reading raises.
        with codecs.open(fname, 'r', 'utf-8') as fr:
            for i in fr:
                line = i.strip()
                if not line:
                    continue
                # A real list, not a lazy ``map``: on Python 3 a map object
                # can be iterated only once.
                data.append([x.split('/') for x in line.split()])
        self.segger.train(data)

    def seg(self, sentence):
        """Yield words assembled from the tagger's per-character tags.

        Tag ``'e'`` closes the current word; ``'b'`` or ``'s'`` flushes any
        pending word and starts a new one; any other tag extends the
        pending word.
        """
        ret = self.segger.tag(sentence)
        tmp = ''
        for i in ret:
            if i[1] == 'e':
                yield tmp+i[0]
                tmp = ''
            elif i[1] == 'b' or i[1] == 's':
                if tmp:
                    yield tmp
                tmp = i[0]
            else:
                tmp += i[0]
        if tmp:
            yield tmp
if __name__ == '__main__':
    # Manual demo: train on a local 'data.txt' and print one segmented sample.
    seg = Seg()
    seg.train('data.txt')
    print(' '.join(seg.seg('主要是用来放置一些简单快速的中文分词和词性标注的程序')))
+125
View File
@@ -0,0 +1,125 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
import gzip
import marshal
from math import log
from ..utils import frequency
class CharacterBasedGenerativeModel(object):
    """Trigram tagger over (character, state) pairs.

    States are ``'b'``, ``'m'``, ``'e'`` and ``'s'``. Unigram, bigram and
    trigram counts are combined with the weights ``l1``, ``l2`` and ``l3``
    computed by ``train``.
    """

    def __init__(self):
        self.l1 = 0.0
        self.l2 = 0.0
        self.l3 = 0.0
        self.status = ('b', 'm', 'e', 's')
        self.uni = frequency.NormalProb()
        self.bi = frequency.NormalProb()
        self.tri = frequency.NormalProb()

    def save(self, fname, iszip=True):
        """Serialize every instance attribute with ``marshal``.

        Attributes that have a ``__dict__`` are stored as that dict. On
        Python 3 the suffix ``.3`` is appended to ``fname``. With ``iszip``
        true the marshalled bytes are written through gzip.
        """
        d = {}
        for k, v in self.__dict__.items():
            if hasattr(v, '__dict__'):
                d[k] = v.__dict__
            else:
                d[k] = v
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        # Context managers close the handle even when dumping fails.
        if not iszip:
            with open(fname, 'wb') as f:
                marshal.dump(d, f)
        else:
            with gzip.open(fname, 'wb') as f:
                f.write(marshal.dumps(d))

    def load(self, fname, iszip=True):
        """Restore attributes from a file written by ``save``.

        On Python 3 the suffix ``.3`` is appended to ``fname``. With
        ``iszip`` true, a file that cannot be read as gzip (IOError) is
        re-read as a plain marshal file. Every stored key must already
        exist as an attribute of this instance.
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                d = marshal.load(f)
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
            except IOError:
                # Not gzip data: fall back to an uncompressed marshal file.
                with open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
        for k, v in d.items():
            if hasattr(self.__dict__[k], '__dict__'):
                self.__dict__[k].__dict__ = v
            else:
                self.__dict__[k] = v

    def div(self, v1, v2):
        """Return ``v1 / v2`` as a float, or 0 when ``v2`` is zero."""
        if v2 == 0:
            return 0
        return float(v1)/v2

    def train(self, data):
        """Count n-grams in ``data`` and derive ``l1``, ``l2``, ``l3``.

        ``data`` is an iterable of sentences, each an iterable of
        ``(character, state)`` pairs.
        """
        bos = ('', 'BOS')
        for sentence in data:
            # Two begin-of-sentence markers pad the trigram window.
            now = [bos, bos]
            self.bi.add((bos, bos), 1)
            self.uni.add(bos, 2)
            for word, tag in sentence:
                now.append((word, tag))
                self.uni.add((word, tag), 1)
                self.bi.add(tuple(now[1:]), 1)
                self.tri.add(tuple(now), 1)
                now.pop(0)
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0
        samples = sorted(self.tri.samples(), key=lambda x: self.tri.get(x)[1])
        for now in samples:
            count = self.tri.get(now)[1]
            # Relative frequencies with this occurrence removed (the -1s).
            c3 = self.div(count-1, self.bi.get(now[:2])[1]-1)
            c2 = self.div(self.bi.get(now[1:])[1]-1, self.uni.get(now[1])[1]-1)
            c1 = self.div(self.uni.get(now[2])[1]-1, self.uni.getsum()-1)
            # Credit the trigram's count to whichever order scored highest.
            if c3 >= c1 and c3 >= c2:
                tl3 += count
            elif c2 >= c1 and c2 >= c3:
                tl2 += count
            elif c1 >= c2 and c1 >= c3:
                tl1 += count
        norm = tl1+tl2+tl3
        self.l1 = self.div(tl1, norm)
        self.l2 = self.div(tl2, norm)
        self.l3 = self.div(tl3, norm)

    def log_prob(self, s1, s2, s3):
        """Return the log of the weighted probability of ``s3`` after ``s1, s2``.

        Returns ``-inf`` when all three components are zero.
        """
        uni = self.l1*self.uni.freq(s3)
        bi = self.div(self.l2*self.bi.get((s2, s3))[1], self.uni.get(s2)[1])
        tri = self.div(self.l3*self.tri.get((s1, s2, s3))[1],
                       self.bi.get((s1, s2))[1])
        if uni+bi+tri == 0:
            return float('-inf')
        return log(uni+bi+tri)

    def tag(self, data):
        """Return ``zip(data, states)`` for the best-scoring state sequence.

        Each candidate in ``now`` is ``(last_two_pairs, log_score, path)``.
        """
        now = [((('', 'BOS'), ('', 'BOS')), 0.0, [])]
        for w in data:
            stage = {}
            # A character counts as unseen when no state has it in ``uni``.
            not_found = True
            for s in self.status:
                if self.uni.freq((w, s)) != 0:
                    not_found = False
                    break
            if not_found:
                # Unseen character: extend every path with every state and
                # carry the score over unchanged.
                for s in self.status:
                    for pre in now:
                        stage[(pre[0][1], (w, s))] = (pre[1], pre[2]+[s])
                now = [(k, v[0], v[1]) for k, v in stage.items()]
                continue
            for s in self.status:
                for pre in now:
                    key = (pre[0][1], (w, s))
                    p = pre[1]+self.log_prob(pre[0][0], pre[0][1], (w, s))
                    # Keep only the best path per (previous, current) pair.
                    if key not in stage or p > stage[key][0]:
                        stage[key] = (p, pre[2]+[s])
            now = [(k, v[0], v[1]) for k, v in stage.items()]
        return zip(data, max(now, key=lambda x: x[1])[2])
+73
View File
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import codecs
from .. import normal
from .. import seg
from ..classification.bayes import Bayes
# Default sentiment model path, looked up next to this module.
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'sentiment.marshal')
class Sentiment(object):
    """Two-class ('neg' / 'pos') text classifier built on ``Bayes``."""

    def __init__(self):
        self.classifier = Bayes()

    def save(self, fname, iszip=True):
        """Save the underlying classifier; arguments are passed through."""
        self.classifier.save(fname, iszip)

    def load(self, fname=data_path, iszip=True):
        """Load the underlying classifier, by default from ``data_path``."""
        self.classifier.load(fname, iszip)

    def handle(self, doc):
        """Segment ``doc`` and drop stop words."""
        return normal.filter_stop(seg.seg(doc))

    def train(self, neg_docs, pos_docs):
        """Train on documents labelled 'neg' and 'pos' respectively."""
        data = [[self.handle(sent), 'neg'] for sent in neg_docs]
        data.extend([self.handle(sent), 'pos'] for sent in pos_docs)
        self.classifier.train(data)

    def classify(self, sent):
        """Return the probability that ``sent`` belongs to class 'pos'."""
        ret, prob = self.classifier.classify(self.handle(sent))
        # For any other winning label, report the complement.
        return prob if ret == 'pos' else 1-prob
# Module-wide classifier, loaded from the default model path at import time.
classifier = Sentiment()
classifier.load()
def train(neg_file, pos_file):
    """Replace the module-wide ``classifier`` with one trained from two files.

    Both files are read as UTF-8, one document per line; trailing CR/LF
    characters are removed from each line.
    """
    global classifier
    # ``with`` closes each corpus file; the original left both open.
    with codecs.open(neg_file, 'r', 'utf-8') as fr:
        neg_docs = [line.rstrip("\r\n") for line in fr.readlines()]
    with codecs.open(pos_file, 'r', 'utf-8') as fr:
        pos_docs = [line.rstrip("\r\n") for line in fr.readlines()]
    classifier = Sentiment()
    classifier.train(neg_docs, pos_docs)
def save(fname, iszip=True):
    """Save the module-wide ``classifier``; arguments are passed through."""
    classifier.save(fname, iszip)
def load(fname, iszip=True):
    """Load a model into the module-wide ``classifier``; arguments pass through."""
    classifier.load(fname, iszip)
def classify(sent):
    """Return ``classifier.classify(sent)`` from the module-wide classifier."""
    return classifier.classify(sent)
File diff suppressed because it is too large Load Diff
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
View File
+51
View File
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import math
class BM25(object):
    """BM25 scorer over a fixed collection of documents.

    Each document is any sized iterable of hashable terms.
    """

    def __init__(self, docs):
        self.D = len(docs)
        # An empty collection has no average length; avoid dividing by zero.
        if self.D:
            self.avgdl = sum([len(doc)+0.0 for doc in docs]) / self.D
        else:
            self.avgdl = 0.0
        self.docs = docs
        self.f = []      # per-document {term: count}
        self.df = {}     # {term: number of documents containing it}
        self.idf = {}    # {term: inverse document frequency}
        self.k1 = 1.5
        self.b = 0.75
        self.init()

    def init(self):
        """Fill ``f``, ``df`` and ``idf`` from ``docs``."""
        for doc in self.docs:
            tmp = {}
            for word in doc:
                tmp[word] = tmp.get(word, 0) + 1
            self.f.append(tmp)
            for k in tmp:
                self.df[k] = self.df.get(k, 0) + 1
        for k, v in self.df.items():
            self.idf[k] = math.log(self.D-v+0.5)-math.log(v+0.5)

    def sim(self, doc, index):
        """Return the BM25 score of query ``doc`` against document ``index``.

        Query terms absent from the indexed document contribute nothing.
        """
        score = 0
        freqs = self.f[index]
        d = len(self.docs[index])  # loop-invariant document length
        for word in doc:
            if word not in freqs:
                continue
            score += (self.idf[word]*freqs[word]*(self.k1+1)
                      / (freqs[word]+self.k1*(1-self.b+self.b*d
                                              / self.avgdl)))
        return score

    def simall(self, doc):
        """Return the list of scores of ``doc`` against every document."""
        return [self.sim(doc, index) for index in range(self.D)]
View File
+106
View File
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from ..sim.bm25 import BM25
class TextRank(object):
    """Rank documents by iterating scores over a BM25 similarity graph."""

    def __init__(self, docs):
        self.docs = docs
        self.bm25 = BM25(docs)
        self.D = len(docs)
        self.d = 0.85            # damping factor
        self.weight = []         # weight[j][i]: BM25 score of doc j vs doc i
        self.weight_sum = []     # row sums of ``weight`` minus the diagonal
        self.vertex = []         # current score of each document
        self.max_iter = 200
        self.min_diff = 0.001    # stop once no score moves more than this
        self.top = []            # (index, score) pairs, best first

    def solve(self):
        """Build the weights, iterate the scores and fill ``self.top``."""
        for cnt, doc in enumerate(self.docs):
            scores = self.bm25.simall(doc)
            self.weight.append(scores)
            self.weight_sum.append(sum(scores)-scores[cnt])
            self.vertex.append(1.0)
        for _ in range(self.max_iter):
            m = []
            max_diff = 0
            for i in range(self.D):
                m.append(1-self.d)
                for j in range(self.D):
                    # Skip self-links and documents with no outgoing weight.
                    if j == i or self.weight_sum[j] == 0:
                        continue
                    m[-1] += (self.d*self.weight[j][i]
                              / self.weight_sum[j]*self.vertex[j])
                if abs(m[-1] - self.vertex[i]) > max_diff:
                    max_diff = abs(m[-1] - self.vertex[i])
            self.vertex = m
            if max_diff <= self.min_diff:
                break
        self.top = list(enumerate(self.vertex))
        self.top = sorted(self.top, key=lambda x: x[1], reverse=True)

    def top_index(self, limit):
        """Return the indices of the ``limit`` best-ranked documents."""
        return [index for index, _ in self.top][:limit]

    def top(self, limit):
        """Return the ``limit`` best-ranked documents themselves.

        NOTE(review): the instance attribute ``top`` assigned in
        ``__init__`` and ``solve`` shadows this method on instances, so it
        is only reachable as ``TextRank.top(instance, limit)``. The
        attribute is kept because ``top_index`` reads it.
        """
        # The original ignored ``limit`` and returned every document.
        return [self.docs[index] for index, _ in self.top][:limit]
class KeywordTextRank(object):
    """Rank words by iterating scores over a co-occurrence graph."""

    def __init__(self, docs):
        self.docs = docs
        self.words = {}          # word -> set of neighbouring words
        self.vertex = {}         # word -> current score
        self.d = 0.85            # damping factor
        self.max_iter = 200
        self.min_diff = 0.001    # stop once no score moves more than this
        self.top = []            # (word, score) pairs, best first

    def solve(self):
        """Build the graph, iterate the scores and fill ``self.top``."""
        for doc in self.docs:
            que = []
            for word in doc:
                if word not in self.words:
                    self.words[word] = set()
                    self.vertex[word] = 1.0
                que.append(word)
                # Sliding window of at most five consecutive words.
                if len(que) > 5:
                    que.pop(0)
                for w1 in que:
                    for w2 in que:
                        if w1 == w2:
                            continue
                        self.words[w1].add(w2)
                        self.words[w2].add(w1)
        for _ in range(self.max_iter):
            m = {}
            max_diff = 0
            # Only words with at least one neighbour pass score on.
            tmp = filter(lambda x: len(self.words[x[0]]) > 0,
                         self.vertex.items())
            tmp = sorted(tmp, key=lambda x: x[1] / len(self.words[x[0]]))
            for k, v in tmp:
                for j in self.words[k]:
                    if k == j:
                        continue
                    if j not in m:
                        m[j] = 1 - self.d
                    m[j] += (self.d / len(self.words[k]) * self.vertex[k])
            for k in self.vertex:
                if k in m and k in self.vertex:
                    if abs(m[k] - self.vertex[k]) > max_diff:
                        max_diff = abs(m[k] - self.vertex[k])
            # Words that received no score are dropped here.
            self.vertex = m
            if max_diff <= self.min_diff:
                break
        self.top = list(self.vertex.items())
        self.top = sorted(self.top, key=lambda x: x[1], reverse=True)

    def top_index(self, limit):
        """Return the ``limit`` best-ranked words."""
        return [word for word, _ in self.top][:limit]

    def top(self, limit):
        """Return the ``limit`` best-ranked words.

        The original indexed ``self.docs`` (a list) with a word, which
        raises TypeError, and ignored ``limit``. The ranked items here are
        the words themselves, so they are returned directly.

        NOTE(review): the instance attribute ``top`` assigned in
        ``__init__`` and ``solve`` shadows this method on instances, so it
        is only reachable as ``KeywordTextRank.top(instance, limit)``.
        """
        return [word for word, _ in self.top][:limit]
+47
View File
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
class SimpleMerge(object):
    """Join keywords that usually appear back to back in the document."""

    def __init__(self, doc, words):
        self.doc = doc
        self.words = words

    def _count(self, piece):
        """Count occurrences of ``piece`` in ``self.doc``, overlaps included."""
        size = len(piece)
        return sum(1 for i in range(len(self.doc)-size+1)
                   if piece == self.doc[i: i+size])

    def merge(self):
        """Return the keywords with chained followers appended.

        A word ``w2`` follows ``w1`` when ``w1+w2`` occurs more than half
        as often as ``w1``; the first such ``w2`` in list order wins.
        Words absorbed into a chain are left out of the result.
        """
        follower = dict((w, '') for w in self.words)
        for w1 in self.words:
            alone = self._count(w1)
            for w2 in self.words:
                if alone < self._count(w1+w2)*2:
                    follower[w1] = w2
                    break
        for w in self.words:
            if w not in follower:
                continue
            chain = ''
            nxt = follower[w]
            while nxt:
                chain += nxt
                if nxt not in follower:
                    break
                # Consume the absorbed word so it is not emitted on its own.
                after = follower[nxt]
                del follower[nxt]
                nxt = after
            follower[w] = chain
        return [w+follower[w] for w in self.words if w in follower]
File diff suppressed because one or more lines are too long
+43
View File
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import codecs
from ..utils.tnt import TnT
# Path of the tagging model file, looked up next to this module.
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'tag.marshal')
# Module-wide tagger, loaded at import time.
tagger = TnT()
tagger.load(data_path)
def train(fname):
    """Replace the module-wide ``tagger`` with one trained from ``fname``.

    The file is read as UTF-8. Each non-blank line is split on whitespace
    and every token is split on ``'/'``; the resulting list of lists is
    one training sentence.
    """
    global tagger
    data = []
    # ``with`` closes the file even if reading raises.
    with codecs.open(fname, 'r', 'utf-8') as fr:
        for i in fr:
            line = i.strip()
            if not line:
                continue
            # A real list, not a lazy ``map``: on Python 3 a map object can
            # be iterated only once.
            data.append([x.split('/') for x in line.split()])
    tagger = TnT()
    tagger.train(data)
def save(fname, iszip=True):
    """Save the module-wide ``tagger``; arguments are passed through."""
    tagger.save(fname, iszip)
def load(fname, iszip=True):
    """Load a model into the module-wide ``tagger``; arguments pass through."""
    tagger.load(fname, iszip)
def tag_all(words):
    """Return ``tagger.tag(words)`` from the module-wide tagger."""
    return tagger.tag(words)
def tag(words):
    """Return only the second element of each item from ``tag_all(words)``.

    The result is whatever ``map`` returns on the running interpreter
    (a lazy iterator on Python 3).
    """
    return map(lambda x: x[1], tag_all(words))
Binary file not shown.
Binary file not shown.
View File
+74
View File
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
from . import good_turing
class BaseProb(object):
    """Base class for count tables addressed by hashable keys."""

    def __init__(self):
        self.d = {}        # key -> stored count
        self.total = 0.0   # running total used as the frequency denominator
        self.none = 0      # value reported for unknown keys

    def exists(self, key):
        """Return True when ``key`` has a stored count."""
        return key in self.d

    def getsum(self):
        """Return the running total."""
        return self.total

    def get(self, key):
        """Return ``(found, value)``; ``value`` is ``self.none`` if unknown."""
        if not self.exists(key):
            return False, self.none
        return True, self.d[key]

    def freq(self, key):
        """Return the value for ``key`` divided by the total.

        Returns 0.0 when the total is zero instead of raising
        ZeroDivisionError.
        """
        # Call get() first: subclasses may update ``total`` inside it.
        count = self.get(key)[1]
        if not self.total:
            return 0.0
        return float(count)/self.total

    def samples(self):
        """Return the stored keys."""
        return self.d.keys()
class NormalProb(BaseProb):
    """Plain count table: counts and total grow by exactly what is added."""

    def add(self, key, value):
        """Add ``value`` to the count of ``key`` and to the total."""
        self.d[key] = self.d.get(key, 0) + value
        self.total += value
class AddOneProb(BaseProb):
    """Count table where every key starts from one and unknown keys report 1."""

    def __init__(self):
        super(AddOneProb, self).__init__()
        self.none = 1

    def add(self, key, value):
        """Add ``value`` for ``key``; a new key also adds one extra count."""
        self.total += value
        if key not in self.d:
            # First sighting: seed the key with 1 and count it in the total.
            self.d[key] = 1
            self.total += 1
        self.d[key] += value
class GoodTuringProb(BaseProb):
    """Count table re-estimated by ``good_turing.main`` on first lookup."""

    def __init__(self):
        self.d = {}
        self.total = 0.0
        # Becomes True once the raw counts have been re-estimated.
        self.handled = False

    def add(self, key, value):
        """Add ``value`` to the raw count of ``key``."""
        self.d[key] = self.d.get(key, 0) + value

    def get(self, key):
        """Return ``(found, value)``, re-estimating the table on first call."""
        if not self.handled:
            self.handled = True
            # Replace raw counts with the estimates; also sets ``none``.
            self.none, self.d = good_turing.main(self.d)
            self.total = sum(self.d.values())+0.0
        if key in self.d:
            return True, self.d[key]
        return False, self.none
+51
View File
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import division
from math import log, exp
def getz(r, nr):
    """Return the list ``z`` derived from counts ``r`` and their frequencies ``nr``.

    ``r`` must hold at least two values (``r[1]`` is read).
    """
    z = [2*nr[0]/r[1]]
    # range() instead of xrange(): works on Python 2 and 3.
    for i in range(len(nr)-2):
        z.append(2*nr[i+1]/(r[i+2]-r[i]))
    z.append(nr[-1]/(r[-1]-r[-2]))
    return z


def least_square(x, y):  # y=a+bx
    """Fit ``y = a + b*x`` by least squares and return ``(a, b)``.

    ``x`` and ``y`` must be sequences supporting ``len`` and indexing.
    """
    meanx = sum(x)/len(x)
    meany = sum(y)/len(y)
    xy = sum((x[i]-meanx)*(y[i]-meany) for i in range(len(x)))
    square = sum((x[i]-meanx)**2 for i in range(len(x)))
    b = xy/square
    return (meany-b*meanx, b)


def main(dic):
    """Re-estimate the counts in ``dic``.

    Returns ``(none, estimates)`` where ``none`` is the value for unseen
    keys and ``estimates`` maps every key of ``dic`` to its new value.
    ``dic`` must contain at least two distinct count values, otherwise
    ``getz`` raises IndexError.
    """
    values = sorted(dic.values())
    # r: distinct counts in ascending order; nr: how many keys have each.
    r, nr, prob = [], [], []
    for v in values:
        if not r or r[-1] != v:
            r.append(v)
            nr.append(1)
        else:
            nr[-1] += 1
    rr = dict((count, index) for index, count in enumerate(r))
    # sum() replaces reduce(), which is not a builtin on Python 3.
    total = sum(n*count for n, count in zip(nr, r))
    z = getz(r, nr)
    # Real lists: least_square needs len() and indexing, which the lazy
    # map objects of Python 3 do not provide.
    a, b = least_square([log(x) for x in r], [log(x) for x in z])
    use_good_turing = False
    nr.append(exp(a+b*log(r[-1]+1)))
    for i in range(len(r)):
        good_turing = (r[i]+1)*(exp(b*(log(r[i]+1)-log(r[i]))))
        turing = (r[i]+1)*nr[i+1]/nr[i] if i+1<len(r) else good_turing
        diff = ((((r[i]+1)**2)/nr[i]*nr[i+1]/nr[i]*(1+nr[i+1]/nr[i]))**0.5)*1.65
        # Once the smoothed estimate is chosen it is used for all later counts.
        if not use_good_turing and abs(good_turing-turing)>diff:
            prob.append(turing)
        else:
            use_good_turing = True
            prob.append(good_turing)
    sump = sum(n*p for n, p in zip(nr, prob))
    for cnt, i in enumerate(prob):
        prob[cnt] = (1-nr[0]/total)*i/sump
    return nr[0]/total/total, dict(zip(dic.keys(), [prob[rr[x]] for x in dic.values()]))


if __name__ == '__main__':
    print(main({1:1,2:1,3:1,4:2,5:2,6:3,7:1,8:2,9:3}))
+148
View File
@@ -0,0 +1,148 @@
# -*- coding: utf-8 -*-
'''
Implementation of 'TnT - A Statistical Part of Speech Tagger'
'''
from __future__ import unicode_literals
import sys
import gzip
import heapq
import marshal
from math import log
from . import frequency
class TnT(object):
    """Trigram tagger following 'TnT - A Statistical Part of Speech Tagger'.

    ``N`` is the number of candidate paths kept after each word in ``tag``.
    """

    def __init__(self, N=1000):
        self.N = N
        self.l1 = 0.0
        self.l2 = 0.0
        self.l3 = 0.0
        self.status = set()                 # every tag seen in training
        self.wd = frequency.AddOneProb()    # (tag, word) counts
        self.eos = frequency.AddOneProb()   # tag-pair counts incl. (tag, 'EOS')
        self.eosd = frequency.AddOneProb()  # tag counts used by geteos()
        self.uni = frequency.NormalProb()
        self.bi = frequency.NormalProb()
        self.tri = frequency.NormalProb()
        self.word = {}                      # word -> set of tags seen with it
        self.trans = {}                     # (s1, s2, s3) -> log probability

    def save(self, fname, iszip=True):
        """Serialize every instance attribute with ``marshal``.

        Sets are stored as lists and attributes with a ``__dict__`` as that
        dict. On Python 3 the suffix ``.3`` is appended to ``fname``. With
        ``iszip`` true the marshalled bytes are written through gzip.
        """
        d = {}
        for k, v in self.__dict__.items():
            if isinstance(v, set):
                d[k] = list(v)
            elif hasattr(v, '__dict__'):
                d[k] = v.__dict__
            else:
                d[k] = v
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        # Context managers close the handle even when dumping fails.
        if not iszip:
            with open(fname, 'wb') as f:
                marshal.dump(d, f)
        else:
            with gzip.open(fname, 'wb') as f:
                f.write(marshal.dumps(d))

    def load(self, fname, iszip=True):
        """Restore attributes from a file written by ``save``.

        On Python 3 the suffix ``.3`` is appended to ``fname``. With
        ``iszip`` true, a file that cannot be read as gzip (IOError) is
        re-read as a plain marshal file. Every stored key must already
        exist as an attribute of this instance.
        """
        if sys.version_info[0] == 3:
            fname = fname + '.3'
        if not iszip:
            with open(fname, 'rb') as f:
                d = marshal.load(f)
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
            except IOError:
                # Not gzip data: fall back to an uncompressed marshal file.
                with open(fname, 'rb') as f:
                    d = marshal.loads(f.read())
        for k, v in d.items():
            if isinstance(self.__dict__[k], set):
                self.__dict__[k] = set(v)
            elif hasattr(self.__dict__[k], '__dict__'):
                self.__dict__[k].__dict__ = v
            else:
                self.__dict__[k] = v

    def tnt_div(self, v1, v2):
        """Return ``v1 / v2`` as a float, or 0 when ``v2`` is zero."""
        if v2 == 0:
            return 0
        return float(v1)/v2

    def geteos(self, tag):
        """Return the log probability that a sentence ends after ``tag``.

        Tags unknown to ``eosd`` get ``log(1 / number_of_tags)``.
        """
        tmp = self.eosd.get(tag)
        if not tmp[0]:
            return log(1.0/len(self.status))
        return log(self.eos.get((tag, 'EOS'))[1])-log(self.eosd.get(tag)[1])

    def train(self, data):
        """Count n-grams in ``data`` and precompute ``self.trans``.

        ``data`` is an iterable of sentences, each an iterable of
        ``(word, tag)`` pairs.
        """
        for sentence in data:
            # Two begin-of-sentence markers pad the trigram window.
            now = ['BOS', 'BOS']
            self.bi.add(('BOS', 'BOS'), 1)
            self.uni.add('BOS', 2)
            for word, tag in sentence:
                now.append(tag)
                self.status.add(tag)
                self.wd.add((tag, word), 1)
                self.eos.add(tuple(now[1:]), 1)
                self.eosd.add(tag, 1)
                self.uni.add(tag, 1)
                self.bi.add(tuple(now[1:]), 1)
                self.tri.add(tuple(now), 1)
                if word not in self.word:
                    self.word[word] = set()
                self.word[word].add(tag)
                now.pop(0)
            self.eos.add((now[-1], 'EOS'), 1)
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0
        for now in self.tri.samples():
            count = self.tri.get(now)[1]
            # Relative frequencies with this occurrence removed (the -1s).
            c3 = self.tnt_div(count-1, self.bi.get(now[:2])[1]-1)
            c2 = self.tnt_div(self.bi.get(now[1:])[1]-1,
                              self.uni.get(now[1])[1]-1)
            c1 = self.tnt_div(self.uni.get(now[2])[1]-1, self.uni.getsum()-1)
            # Credit the trigram's count to whichever order scored highest.
            if c3 >= c1 and c3 >= c2:
                tl3 += count
            elif c2 >= c1 and c2 >= c3:
                tl2 += count
            elif c1 >= c2 and c1 >= c3:
                tl1 += count
        # tnt_div() yields 0 instead of ZeroDivisionError on empty data.
        norm = tl1+tl2+tl3
        self.l1 = self.tnt_div(tl1, norm)
        self.l2 = self.tnt_div(tl2, norm)
        self.l3 = self.tnt_div(tl3, norm)
        for s1 in self.status | set(('BOS',)):
            for s2 in self.status | set(('BOS',)):
                for s3 in self.status:
                    uni = self.l1*self.uni.freq(s3)
                    bi = self.tnt_div(self.l2*self.bi.get((s2, s3))[1],
                                      self.uni.get(s2)[1])
                    tri = self.tnt_div(self.l3*self.tri.get((s1, s2, s3))[1],
                                       self.bi.get((s1, s2))[1])
                    self.trans[(s1, s2, s3)] = log(uni+bi+tri)

    def tag(self, data):
        """Return ``zip(data, tags)`` for the best-scoring tag sequence.

        Empty input yields an empty result; the original raised
        UnboundLocalError because ``stage`` was never assigned.
        """
        now = [(('BOS', 'BOS'), 0.0, [])]
        stage = None
        for w in data:
            stage = {}
            # Known words are restricted to the tags seen with them.
            samples = self.status
            if w in self.word:
                samples = self.word[w]
            for s in samples:
                wd = log(self.wd.get((s, w))[1])-log(self.uni.get(s)[1])
                for pre in now:
                    key = (pre[0][1], s)
                    p = pre[1]+wd+self.trans[(pre[0][0], pre[0][1], s)]
                    # Keep only the best path per (previous, current) pair.
                    if key not in stage or p > stage[key][0]:
                        stage[key] = (p, pre[2]+[s])
            stage = [(k, v[0], v[1]) for k, v in stage.items()]
            now = heapq.nlargest(self.N, stage, key=lambda x: x[1])
        if stage is None:
            return zip(data, [])
        # The final choice also weighs the end-of-sentence probability.
        now = heapq.nlargest(1, stage, key=lambda x: x[1]+self.geteos(x[0][1]))
        return zip(data, now[0][2])
+48
View File
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
class Trie(object):
    """Prefix tree stored as nested dicts.

    Each node is a dict keyed by one element of the key sequence; a node
    that ends a key holds its payload under the literal key ``'value'``.
    """

    def __init__(self):
        self.d = {}

    def insert(self, key, value):
        """Store ``value`` for ``key``, creating nodes along the way."""
        node = self.d
        for part in key:
            if part not in node:
                node[part] = {}
            node = node[part]
        node['value'] = value

    def find(self, text, start=0):
        """Return ``(matched_text, value)`` for the longest key at ``start``.

        Returns None when no stored key begins at that position.
        """
        node = self.d
        best = None
        for end in range(start, len(text)):
            part = text[end]
            if part not in node:
                break
            node = node[part]
            if 'value' in node:
                # Keep overwriting so the longest match wins.
                best = (text[start:end+1], node['value'])
        return best

    def translate(self, text, with_not_found=True):
        """Replace every longest-match key in ``text`` with its value.

        Unmatched elements are copied through when ``with_not_found`` is
        true and dropped otherwise. Returns a list.
        """
        out = []
        pos = 0
        size = len(text)
        while pos < size:
            hit = self.find(text, pos) if text[pos] in self.d else None
            if hit:
                out.append(hit[1])
                pos += len(hit[0])
                continue
            if with_not_found:
                out.append(text[pos])
            pos += 1
        return out
+1 -1
View File
@@ -1,5 +1,5 @@
from pymysql import *
conn = connect(host='10.92.35.13',port=3306,user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem')
conn = connect(host='47.92.235.6',port=3306,user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem')
cursor = conn.cursor()
def query(sql,params,type="no_select"):
params = tuple(params)
+1 -1
View File
@@ -9,7 +9,7 @@ def stopWordList():
return [line.strip() for line in open('./model/stopWords.txt',encoding='utf8').readlines()]
def get_img(field,tableName,targetImgSrc,resImgSrc):
con = connect(host='localhost',user='root',password='root',database='weiboarticles',port=3306,charset='utf8mb4')
con = connect(host='47.92.235.6',user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem',port=3306,charset='utf8mb4')
cuser = con.cursor()
sql = f'select {field} from {tableName}'
cuser.execute(sql)