【snownlp相关文件】上传自己的模型,调用utils/mynlp
This commit is contained in:
@@ -0,0 +1,106 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..sim.bm25 import BM25
|
||||
|
||||
|
||||
class TextRank(object):
    """Rank documents with a damped, weighted PageRank-style iteration.

    Edge weights are taken from ``self.bm25.simall(doc)``.  ``BM25`` is
    imported from ``..sim.bm25`` and is not visible here; it is assumed to
    return one similarity score per document, in document order
    (TODO confirm against that module).

    Typical use: build the object, call :meth:`solve`, then read the
    ranking through :meth:`top_index`.
    """

    def __init__(self, docs):
        """Store ``docs`` and prepare the (still empty) ranking state.

        :param docs: sequence of documents; it is handed to ``BM25`` as-is
            and indexed by position when results are returned.
        """
        self.docs = docs
        self.bm25 = BM25(docs)
        self.D = len(docs)        # number of documents / graph vertices
        self.d = 0.85             # damping factor
        self.weight = []          # weight[j][i]: score of doc i seen from doc j
        self.weight_sum = []      # weight_sum[j]: row sum without weight[j][j]
        self.vertex = []          # current score of every document
        self.max_iter = 200       # hard cap on the number of iterations
        self.min_diff = 0.001     # stop once no score moves more than this
        # NOTE: this attribute has the same name as the ``top`` method below
        # and therefore hides it on instances.  It is kept because existing
        # code may read the ranking list from it.
        self.top = []

    def solve(self):
        """Compute the document scores and store the ranking in ``self.top``.

        ``self.top`` becomes a list of ``(index, score)`` pairs sorted by
        descending score.  The method may be called more than once; every
        call starts again from clean state.
        """
        # Start from scratch so a repeated call does not append a second
        # copy of every row to the lists below.
        self.weight = []
        self.weight_sum = []
        self.vertex = []
        for cnt, doc in enumerate(self.docs):
            scores = self.bm25.simall(doc)
            self.weight.append(scores)
            # A document's similarity to itself is not an outgoing edge.
            self.weight_sum.append(sum(scores) - scores[cnt])
            self.vertex.append(1.0)

        for _ in range(self.max_iter):
            updated = []
            max_diff = 0
            for i in range(self.D):
                score = 1 - self.d
                for j in range(self.D):
                    # Skip self-links and documents without outgoing weight
                    # (the latter would divide by zero).
                    if j == i or self.weight_sum[j] == 0:
                        continue
                    score += (self.d * self.weight[j][i]
                              / self.weight_sum[j] * self.vertex[j])
                updated.append(score)
                max_diff = max(max_diff, abs(score - self.vertex[i]))
            self.vertex = updated
            if max_diff <= self.min_diff:
                break

        # sorted() is stable, so equal scores keep their document order.
        self.top = sorted(enumerate(self.vertex),
                          key=lambda item: item[1], reverse=True)

    def top_index(self, limit):
        """Return the indices of the ``limit`` best-ranked documents."""
        return [index for index, _ in self.top[:limit]]

    def top(self, limit):
        """Return the ``limit`` best-ranked documents themselves.

        NOTE: on an instance the name ``top`` resolves to the ranking list
        assigned in ``__init__``/``solve``, so this method is only reachable
        through the class, e.g. ``TextRank.top(rank, 3)``.
        """
        return [self.docs[index] for index, _ in self.top[:limit]]
|
||||
|
||||
|
||||
class KeywordTextRank(object):
    """Rank words by a damped PageRank-style iteration on a co-occurrence graph.

    Two words are linked when they appear together inside a sliding window
    of five consecutive words of the same document.
    """

    def __init__(self, docs):
        """Store ``docs`` and prepare the (still empty) graph.

        :param docs: iterable of documents, each an iterable of hashable
            words.
        """
        self.docs = docs
        self.words = {}           # word -> set of neighbouring words
        self.vertex = {}          # word -> current score
        self.d = 0.85             # damping factor
        self.max_iter = 200       # hard cap on the number of iterations
        self.min_diff = 0.001     # stop once no score moves more than this
        # NOTE: this attribute has the same name as the ``top`` method below
        # and therefore hides it on instances.  It is kept because existing
        # code may read the ranking list from it.
        self.top = []

    def solve(self):
        """Build the word graph, iterate the scores and fill ``self.top``.

        ``self.top`` becomes a list of ``(word, score)`` pairs sorted by
        descending score.  Words without any neighbour receive no score and
        do not appear in the result.
        """
        for doc in self.docs:
            window = []
            for word in doc:
                if word not in self.words:
                    self.words[word] = set()
                    self.vertex[word] = 1.0
                window.append(word)
                if len(window) > 5:
                    window.pop(0)
                # Every pair that does not involve ``word`` was already
                # linked while its later member entered the window, so only
                # the newcomer needs linking.
                for other in window:
                    if other != word:
                        self.words[word].add(other)
                        self.words[other].add(word)

        for _ in range(self.max_iter):
            m = {}
            max_diff = 0
            # Only words that have neighbours can hand score on.  They are
            # visited in ascending order of the share they give away, which
            # fixes the order of the float additions below.
            givers = [item for item in self.vertex.items()
                      if len(self.words[item[0]]) > 0]
            givers.sort(key=lambda item: item[1] / len(self.words[item[0]]))
            for k, score in givers:
                share = self.d / len(self.words[k]) * score
                for j in self.words[k]:
                    if k == j:
                        continue
                    if j not in m:
                        m[j] = 1 - self.d
                    m[j] += share
            for word, score in m.items():
                if word in self.vertex:
                    max_diff = max(max_diff, abs(score - self.vertex[word]))
            self.vertex = m
            if max_diff <= self.min_diff:
                break

        # sorted() is stable, so equal scores keep their dictionary order.
        self.top = sorted(self.vertex.items(),
                          key=lambda item: item[1], reverse=True)

    def top_index(self, limit):
        """Return the ``limit`` best-ranked words."""
        return [word for word, _ in self.top[:limit]]

    def top(self, limit):
        """Return the ``limit`` best-ranked words.

        The ranking is keyed by the words themselves, so this returns the
        same values as :meth:`top_index`.  The previous body used each word
        as an index into ``self.docs``, which raises ``TypeError`` for a
        list of documents, and it ignored ``limit``.

        NOTE: on an instance the name ``top`` resolves to the ranking list
        assigned in ``__init__``/``solve``, so this method is only reachable
        through the class, e.g. ``KeywordTextRank.top(rank, 3)``.
        """
        return [word for word, _ in self.top[:limit]]
|
||||
@@ -0,0 +1,47 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
class SimpleMerge(object):
    """Join words that usually appear back to back in a text into phrases."""

    def __init__(self, doc, words):
        """Remember the text and the candidate words.

        :param doc: the text that is searched by slicing.
        :param words: the candidate words; iterated several times, so it
            must be a re-iterable collection.
        """
        self.doc = doc
        self.words = words

    def _occurrences(self, piece):
        """Count the start positions in ``self.doc`` where ``piece`` matches.

        Overlapping matches all count.
        """
        size = len(piece)
        hits = 0
        for start in range(len(self.doc) - size + 1):
            if piece == self.doc[start: start + size]:
                hits += 1
        return hits

    def merge(self):
        """Return the merged phrases, in the order of ``self.words``.

        For every word the first candidate ``w2`` is picked for which the
        concatenation ``word + w2`` occurs in more than half of the places
        where ``word`` occurs.  These successor links are then followed to
        build phrases; a word swallowed by an earlier phrase is not
        reported on its own.
        """
        # Step 1: successor of each word ('' means "nothing follows").
        follower = dict.fromkeys(self.words, '')
        for head in self.words:
            head_count = self._occurrences(head)
            for tail in self.words:
                if head_count < self._occurrences(head + tail) * 2:
                    follower[head] = tail
                    break

        # Step 2: walk each chain, removing the words it consumes.
        for word in self.words:
            if word not in follower:
                continue
            suffix = ''
            current = follower[word]
            while current:
                suffix += current
                if current not in follower:
                    # Already consumed by another chain: stop after it.
                    break
                current = follower.pop(current)
            follower[word] = suffix

        # Step 3: emit the surviving words with their collected suffix.
        return [word + follower[word]
                for word in self.words if word in follower]
|
||||
Reference in New Issue
Block a user