Files
bettafish-company/utils/mynlp/summary/words_merge.py
T

48 lines
1.3 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
class SimpleMerge(object):
def __init__(self, doc, words):
self.doc = doc
self.words = words
def merge(self):
trans = {}
for w in self.words:
trans[w] = ''
for w1 in self.words:
cw = 0
lw = len(w1)
for i in range(len(self.doc)-lw+1):
if w1 == self.doc[i: i+lw]:
cw += 1
for w2 in self.words:
cnt = 0
l2 = len(w1)+len(w2)
for i in range(len(self.doc)-l2+1):
if w1+w2 == self.doc[i: i+l2]:
cnt += 1
if cw < cnt*2:
trans[w1] = w2
break
ret = []
for w in self.words:
if w not in trans:
continue
s = ''
now = trans[w]
while now:
s += now
if now not in trans:
break
tmp = trans[now]
del trans[now]
now = tmp
trans[w] = s
for w in self.words:
if w in trans:
ret.append(w+trans[w])
return ret