【snownlp相关文件】上传自己的模型,调用utils/mynlp

This commit is contained in:
redhongx
2024-07-04 11:54:06 +08:00
parent f09fcb3000
commit 93b72ea2e0
33 changed files with 135956 additions and 0 deletions
View File
+106
View File
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from ..sim.bm25 import BM25
class _RankedDocs(list):
    """Ranked ``(doc_index, score)`` pairs that can also be called.

    ``TextRank`` has always exposed its ranking as the list attribute
    ``top``, and at the same time declared a ``top(limit)`` method that the
    attribute made unreachable (``rank.top(limit)`` raised ``TypeError``).
    Keeping the ranking in this ``list`` subclass preserves the attribute
    for existing readers while making the call form work as intended.
    """

    def __init__(self, pairs=(), docs=()):
        super(_RankedDocs, self).__init__(pairs)
        # Documents the indices refer to, captured when the ranking is built.
        self._docs = docs

    def __call__(self, limit=None):
        """Return the documents behind the ``limit`` best-ranked entries.

        ``limit`` is used as a slice bound, so ``None`` returns every
        ranked document.
        """
        return [self._docs[index] for index, _ in self[:limit]]


class TextRank(object):
    """Rank documents against each other with a TextRank-style iteration.

    Edge weights come from ``BM25.simall``, which is expected to return one
    similarity score per document (it is indexed by document position
    below).  Each document's score is repeatedly recomputed as
    ``(1 - d) + d * sum(weight[j][i] / weight_sum[j] * score[j])`` over all
    other documents ``j`` until no score moves by more than ``min_diff`` or
    ``max_iter`` rounds have run.

    Attributes set by ``solve()``:
        weight:     one row of similarity scores per document.
        weight_sum: per-document sum of outgoing weights, excluding the
                    document's similarity to itself.
        vertex:     current score of each document.
        top:        list of ``(doc_index, score)`` pairs, best first.  It is
                    also callable: ``top(limit)`` returns the corresponding
                    documents.
    """

    def __init__(self, docs):
        self.docs = docs
        self.bm25 = BM25(docs)
        self.D = len(docs)
        self.d = 0.85  # damping factor of the update formula
        self.weight = []
        self.weight_sum = []
        self.vertex = []
        self.max_iter = 200
        self.min_diff = 0.001
        self.top = _RankedDocs([], docs)

    def solve(self):
        """Compute the document scores and fill in ``self.top``."""
        # Rebuild the graph from scratch: appending to the previous state
        # made every repeated call stack a duplicate set of weight rows.
        self.weight = []
        self.weight_sum = []
        self.vertex = []
        for cnt, doc in enumerate(self.docs):
            scores = self.bm25.simall(doc)
            self.weight.append(scores)
            # A document's similarity to itself is not an edge.
            self.weight_sum.append(sum(scores) - scores[cnt])
            self.vertex.append(1.0)

        for _ in range(self.max_iter):
            updated = []
            max_diff = 0
            for i in range(self.D):
                score = 1 - self.d
                for j in range(self.D):
                    # Skip self-links and documents with no outgoing
                    # weight (which would divide by zero).
                    if j == i or self.weight_sum[j] == 0:
                        continue
                    score += (self.d * self.weight[j][i]
                              / self.weight_sum[j] * self.vertex[j])
                updated.append(score)
                max_diff = max(max_diff, abs(score - self.vertex[i]))
            # Scores are swapped in only after the full pass, so every
            # update in a round reads the previous round's values.
            self.vertex = updated
            if max_diff <= self.min_diff:
                break

        ranked = sorted(enumerate(self.vertex),
                        key=lambda pair: pair[1], reverse=True)
        self.top = _RankedDocs(ranked, self.docs)

    def top_index(self, limit):
        """Return the indices of the ``limit`` best-ranked documents."""
        return [index for index, _ in self.top[:limit]]
class _RankedKeywords(list):
    """Ranked ``(word, score)`` pairs that can also be called.

    ``KeywordTextRank`` exposes its ranking as the list attribute ``top``
    and also declared a ``top(limit)`` method, which the attribute made
    unreachable.  Keeping the ranking in this ``list`` subclass preserves
    the attribute while making ``rank.top(limit)`` usable.
    """

    def __call__(self, limit=None):
        """Return the ``limit`` best-ranked words.

        ``limit`` is used as a slice bound, so ``None`` returns every
        ranked word.
        """
        return [word for word, _ in self[:limit]]


class KeywordTextRank(object):
    """Rank words by TextRank over a sliding-window co-occurrence graph.

    ``docs`` is iterated as a sequence of word sequences; words are used as
    dictionary keys, so they must be hashable.  Two different words are
    linked when they appear within the same window of five consecutive
    words of one document.

    Attributes:
        words:  maps each word to the set of words it is linked to.
        vertex: maps each word to its current score.  Words without any
                link receive no score from the update and are therefore
                absent once an iteration has run.
        top:    list of ``(word, score)`` pairs, best first, filled in by
                ``solve()``.  It is also callable: ``top(limit)`` returns
                the words themselves, like ``top_index(limit)``.
    """

    def __init__(self, docs):
        self.docs = docs
        self.words = {}
        self.vertex = {}
        self.d = 0.85  # damping factor of the update formula
        self.max_iter = 200
        self.min_diff = 0.001
        self.top = _RankedKeywords()

    def solve(self):
        """Build the co-occurrence graph, score it and fill in ``self.top``."""
        window_size = 5
        for doc in self.docs:
            window = []
            for word in doc:
                if word not in self.words:
                    self.words[word] = set()
                    self.vertex[word] = 1.0
                window.append(word)
                if len(window) > window_size:
                    window.pop(0)
                # Every pair that does not involve the new word was already
                # in the previous window and linked then, so only the new
                # word needs connecting.
                for other in window:
                    if other == word:
                        continue
                    self.words[word].add(other)
                    self.words[other].add(word)

        for _ in range(self.max_iter):
            updated = {}
            connected = [(word, score) for word, score in self.vertex.items()
                         if self.words[word]]
            # Hand out the smallest shares first, as the original code did,
            # so the floating-point sums are accumulated in the same order.
            connected.sort(key=lambda item: item[1] / len(self.words[item[0]]))
            for word, score in connected:
                neighbours = self.words[word]
                # Each word splits its damped score evenly among its links.
                share = self.d / len(neighbours) * score
                for other in neighbours:
                    if other == word:
                        continue
                    if other not in updated:
                        updated[other] = 1 - self.d
                    updated[other] += share

            max_diff = 0
            for word, score in updated.items():
                if word in self.vertex:
                    max_diff = max(max_diff, abs(score - self.vertex[word]))
            self.vertex = updated
            if max_diff <= self.min_diff:
                break

        ranked = sorted(self.vertex.items(),
                        key=lambda item: item[1], reverse=True)
        self.top = _RankedKeywords(ranked)

    def top_index(self, limit):
        """Return the ``limit`` best-ranked words."""
        return [word for word, _ in self.top[:limit]]
+47
View File
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
class SimpleMerge(object):
    """Join keywords that usually appear back to back in a text.

    ``doc`` is the text that is searched and ``words`` the keywords to
    merge; both are compared through slicing and ``==``.
    """

    def __init__(self, doc, words):
        self.doc = doc
        self.words = words

    def merge(self):
        """Return the keywords with adjacent ones concatenated.

        A word is followed by the first candidate (in ``words`` order) for
        which the concatenation occurs in ``doc`` more than half as often
        as the word itself.  Chains of followers are appended in turn, and
        a word swallowed into a chain is not emitted on its own.
        """
        text = self.doc

        def occurrences(piece):
            # Counts every start position, so overlapping matches count too.
            width = len(piece)
            return sum(1 for start in range(len(text) - width + 1)
                       if piece == text[start: start + width])

        follower = dict((word, '') for word in self.words)
        for head in self.words:
            head_count = occurrences(head)
            for tail in self.words:
                if head_count < occurrences(head + tail) * 2:
                    follower[head] = tail
                    break

        for word in self.words:
            if word not in follower:
                # Already absorbed into an earlier word's chain.
                continue
            suffix = ''
            link = follower[word]
            while link:
                suffix += link
                if link not in follower:
                    break
                # Removing the entry both marks the word as absorbed and
                # guarantees the walk terminates on cyclic links.
                link = follower.pop(link)
            follower[word] = suffix

        return [word + follower[word]
                for word in self.words if word in follower]