116 lines
4.2 KiB
Python
116 lines
4.2 KiB
Python
from typing import List
|
|
from sklearn.feature_extraction.text import TfidfTransformer
|
|
from sklearn.preprocessing import normalize
|
|
from sklearn.utils import check_array
|
|
import numpy as np
|
|
import scipy.sparse as sp
|
|
|
|
|
|
class ClassTfidfTransformer(TfidfTransformer):
    """A class-based TF-IDF procedure using scikit-learn's TfidfTransformer as a base.

    c-TF-IDF can best be explained as a TF-IDF formula adapted for multiple classes
    by joining all documents per class. Thus, each class is converted to a single
    document instead of a set of documents. The frequency of each word **x** is
    extracted for each class **c** and is **l1** normalized. This constitutes the
    term frequency.

    Then, the term frequency is multiplied with IDF which is the logarithm of 1 plus
    the average number of words per class **A** divided by the frequency of word **x**
    across all classes.

    Arguments:
        bm25_weighting: Uses BM25-inspired idf-weighting procedure instead of the procedure
                        as defined in the c-TF-IDF formula. It uses the following weighting scheme:
                        `log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))`
        reduce_frequent_words: Takes the square root of the bag-of-words after normalizing the matrix.
                               Helps to reduce the impact of words that appear too frequently.
        seed_words: Specific words that will have their idf value increased by
                    the value of `seed_multiplier`.
                    NOTE: This will only increase the value of words that have an exact match.
        seed_multiplier: The value with which the idf values of the words in `seed_words`
                         are multiplied.

    NOTE: `seed_words` and `seed_multiplier` are only stored on the instance by this
    class; nothing here applies them. Presumably the caller turns them into the
    `multiplier` vector passed to `fit` -- verify against the calling code.

    Examples:
    ```python
    transformer = ClassTfidfTransformer()
    ```
    """

    def __init__(
        self,
        bm25_weighting: bool = False,
        reduce_frequent_words: bool = False,
        seed_words: List[str] = None,
        seed_multiplier: float = 2,
    ):
        self.bm25_weighting = bm25_weighting
        self.reduce_frequent_words = reduce_frequent_words
        self.seed_words = seed_words
        self.seed_multiplier = seed_multiplier
        super().__init__()

    def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
        """Learn the idf vector (global term weights).

        Arguments:
            X: A matrix of term/token counts with one row per class and one
               column per term. Dense input is converted to CSR.
            multiplier: A multiplier for increasing/decreasing certain IDF scores.
                        It is multiplied element-wise with the idf vector, so it
                        must broadcast against one value per term.

        Returns:
            self: The fitted transformer. When `use_idf` is false nothing is
                  learned and `_idf_diag` is not set.
        """
        X = check_array(X, accept_sparse=("csr", "csc"))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)

        if self.use_idf:
            _, n_features = X.shape

            # Calculate the frequency of words across all classes.
            # `ravel` (rather than `squeeze`) keeps this a 1-D vector even when
            # the vocabulary holds a single term, which `sp.diags` requires.
            df = np.asarray(X.sum(axis=0)).ravel()

            # Calculate the average number of samples as regularization
            # (truncated to an integer).
            avg_nr_samples = int(X.sum(axis=1).mean())

            # BM25-inspired weighting procedure
            if self.bm25_weighting:
                idf = np.log(1 + ((avg_nr_samples - df + 0.5) / (df + 0.5)))

            # Divide the average number of samples by the word frequency
            # +1 is added to force values to be positive
            # NOTE: a term whose total count is zero makes this division
            # non-finite; callers are expected to pass columns that occur.
            else:
                idf = np.log((avg_nr_samples / df) + 1)

            # Multiplier to increase/decrease certain idf scores
            if multiplier is not None:
                idf = idf * multiplier

            # Store the idf weights on the diagonal so that `transform` can
            # apply them to every row with a single sparse matrix product.
            self._idf_diag = sp.diags(
                idf,
                offsets=0,
                shape=(n_features, n_features),
                format="csr",
                dtype=np.float64,
            )

        return self

    def transform(self, X: sp.csr_matrix):
        """Transform a count-based matrix to c-TF-IDF.

        Arguments:
            X (sparse matrix): A matrix of term/token counts.

        Returns:
            X (sparse matrix): A c-TF-IDF matrix. When `use_idf` is false the
                               input is returned unchanged.

        NOTE: normalization is requested with `copy=False`, so the matrix passed
        in may be modified in place; pass a copy if the raw counts are still needed.
        """
        if self.use_idf:
            # Row-wise l1 normalization turns counts into term frequencies.
            X = normalize(X, axis=1, norm="l1", copy=False)

            if self.reduce_frequent_words:
                # Only the stored (non-zero) entries need the square root.
                X.data = np.sqrt(X.data)

            # `@` is always a matrix product, whereas `*` is element-wise for
            # SciPy sparse arrays (it is a matrix product only for spmatrix).
            X = X @ self._idf_diag

        return X
|