Add BERTopic.
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
from ._ctfidf import ClassTfidfTransformer
|
||||
from ._online_cv import OnlineCountVectorizer
|
||||
|
||||
# Names exported by `from <this package> import *`; both are imported above.
__all__ = ["ClassTfidfTransformer", "OnlineCountVectorizer"]
|
||||
@@ -0,0 +1,115 @@
|
||||
from typing import List
|
||||
from sklearn.feature_extraction.text import TfidfTransformer
|
||||
from sklearn.preprocessing import normalize
|
||||
from sklearn.utils import check_array
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
|
||||
class ClassTfidfTransformer(TfidfTransformer):
    """A class-based TF-IDF procedure using scikit-learn's TfidfTransformer as a base.

    c-TF-IDF can best be explained as a TF-IDF formula adapted for multiple classes
    by joining all documents per class. Thus, each class is converted to a single
    document instead of a set of documents. The frequency of each word **x** is
    extracted for each class **c** and is **l1** normalized. This constitutes the
    term frequency.

    Then, the term frequency is multiplied with IDF, which is the logarithm of 1 plus
    the average number of words per class **A** divided by the frequency of word **x**
    across all classes.

    Arguments:
        bm25_weighting: Uses a BM25-inspired idf-weighting procedure instead of the
                        procedure as defined in the c-TF-IDF formula. It uses the
                        following weighting scheme:
                        `log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))`
        reduce_frequent_words: Takes the square root of the bag-of-words after
                               normalizing the matrix. Helps to reduce the impact of
                               words that appear too frequently.
        seed_words: Specific words that will have their idf value increased by
                    the value of `seed_multiplier`.
                    NOTE: This will only increase the value of words that have an
                    exact match.
                    NOTE: This class only stores `seed_words`; nothing in it reads
                    them. The per-term factors have to be handed to `fit` through
                    its `multiplier` argument.
        seed_multiplier: The value with which the idf values of the words in
                         `seed_words` are multiplied. Like `seed_words`, it is only
                         stored on the instance.

    Examples:
    ```python
    transformer = ClassTfidfTransformer()
    ```
    """

    def __init__(
        self,
        bm25_weighting: bool = False,
        reduce_frequent_words: bool = False,
        seed_words: List[str] = None,
        seed_multiplier: float = 2,
    ):
        self.bm25_weighting = bm25_weighting
        self.reduce_frequent_words = reduce_frequent_words
        self.seed_words = seed_words
        self.seed_multiplier = seed_multiplier
        super().__init__()

    def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
        """Learn the idf vector (global term weights).

        Arguments:
            X: A matrix of term/token counts with one row per class and one
               column per term. Dense input is converted to CSR.
            multiplier: Optional per-term factors that are multiplied element-wise
                        with the idf values to increase/decrease certain IDF scores.

        Returns:
            self: The transformer. When `use_idf` is true, the idf weights are
                  stored as a diagonal matrix in `_idf_diag`.
        """
        X = check_array(X, accept_sparse=("csr", "csc"))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        dtype = np.float64

        if self.use_idf:
            _, n_features = X.shape

            # Frequency of every word across all classes. `ravel` is used instead
            # of `squeeze` so that a single-term vocabulary still yields a 1-D
            # vector; a 0-d array cannot be used as a diagonal below.
            df = np.asarray(X.sum(axis=0)).ravel()

            # Average number of words per class, used as regularization
            avg_nr_samples = int(X.sum(axis=1).mean())

            if self.bm25_weighting:
                # BM25-inspired weighting procedure
                idf = np.log(1 + ((avg_nr_samples - df + 0.5) / (df + 0.5)))
            else:
                # Divide the average number of samples by the word frequency;
                # +1 is added to force values to be positive.
                # NOTE: a term whose column sums to zero divides by zero here
                # and gets a non-finite weight.
                idf = np.log((avg_nr_samples / df) + 1)

            # Multiplier to increase/decrease certain idf scores
            if multiplier is not None:
                idf = idf * multiplier

            self._idf_diag = sp.diags(
                idf,
                offsets=0,
                shape=(n_features, n_features),
                format="csr",
                dtype=dtype,
            )

        return self

    def transform(self, X: sp.csr_matrix):
        """Transform a count-based matrix to c-TF-IDF.

        NOTE: normalization is requested with `copy=False`, so the matrix passed
        in may be modified in place. Pass a copy if the raw counts are still needed.

        Arguments:
            X (sparse matrix): A matrix of term/token counts.

        Returns:
            X (sparse matrix): A c-TF-IDF matrix. When `use_idf` is false the
                               input is returned untouched.
        """
        if self.use_idf:
            # Term frequency: l1-normalize every class (row)
            X = normalize(X, axis=1, norm="l1", copy=False)

            if self.reduce_frequent_words:
                # Square root of the stored (non-zero) entries dampens dominant words
                X.data = np.sqrt(X.data)

            # Scale every term column by its idf weight
            X = X * self._idf_diag

        return X
|
||||
@@ -0,0 +1,158 @@
|
||||
import numpy as np
|
||||
from itertools import chain
|
||||
from typing import List
|
||||
|
||||
from scipy import sparse
|
||||
from scipy.sparse import csr_matrix
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
|
||||
|
||||
class OnlineCountVectorizer(CountVectorizer):
    """An online variant of the CountVectorizer with updating vocabulary.

    At each `.partial_fit`, its vocabulary is updated based on any OOV words
    it might find. Then, `.update_bow` can be used to track and update
    the Bag-of-Words representation. These functions are separated such that
    the vectorizer can be used in iteration without updating the Bag-of-Words
    representation, which might speed up the fitting process. However, the
    `.update_bow` function is used in BERTopic to track changes in the
    topic representations and allow for decay.

    This class inherits its parameters and attributes from:
        `sklearn.feature_extraction.text.CountVectorizer`

    Arguments:
        decay: A value between [0, 1] to weight the percentage of frequencies
               the previous bag-of-words should be decreased. For example,
               a value of `.1` will decrease the frequencies in the bag-of-words
               matrix with 10% at each iteration.
        delete_min_df: Delete words at each iteration from its vocabulary
                       that are below a minimum frequency.
                       This will keep the resulting bag-of-words matrix small
                       such that it does not explode in size with increasing
                       vocabulary. If `decay` is None then this equals `min_df`.
        **kwargs: Set of parameters inherited from:
                  `sklearn.feature_extraction.text.CountVectorizer`
                  In practice, this means that you can still use parameters
                  from the original CountVectorizer, like `stop_words` and
                  `ngram_range`.

    Attributes:
        X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation

    Examples:
    ```python
    from bertopic.vectorizers import OnlineCountVectorizer
    vectorizer = OnlineCountVectorizer(stop_words="english")

    # `doc_batches` is an iterable of lists of documents
    for index, docs in enumerate(doc_batches):
        vectorizer.partial_fit(docs)

        # Update and clean the bow every 100 iterations:
        if index % 100 == 0:
            X = vectorizer.update_bow(docs)
    ```

    To use the model in BERTopic:

    ```python
    from bertopic import BERTopic
    from bertopic.vectorizers import OnlineCountVectorizer

    vectorizer_model = OnlineCountVectorizer(stop_words="english")
    topic_model = BERTopic(vectorizer_model=vectorizer_model)
    ```

    References:
        Adapted from: https://github.com/idoshlomo/online_vectorizers
    """

    def __init__(self, decay: float = None, delete_min_df: float = None, **kwargs):
        self.decay = decay
        self.delete_min_df = delete_min_df
        super().__init__(**kwargs)

    def partial_fit(self, raw_documents: List[str]) -> "OnlineCountVectorizer":
        """Perform a partial fit and update vocabulary with OOV tokens.

        The first call (no `vocabulary_` yet) is a regular `fit`. Later calls
        leave existing token indices untouched and append unseen tokens after
        the highest index in use.

        Arguments:
            raw_documents: A list of documents

        Returns:
            self: The vectorizer, so calls can be chained.
        """
        if not hasattr(self, "vocabulary_"):
            return self.fit(raw_documents)

        analyzer = self.build_analyzer()
        seen_tokens = set(chain.from_iterable(analyzer(doc) for doc in raw_documents))

        # Sorted so that the indices handed out do not depend on set ordering
        oov_tokens = sorted(token for token in seen_tokens if token not in self.vocabulary_)

        if oov_tokens:
            # `default=-1` makes numbering restart at 0 when the vocabulary is
            # empty, e.g. after `_clean_bow` removed every word.
            first_free_index = max(self.vocabulary_.values(), default=-1) + 1
            self.vocabulary_.update(
                {token: first_free_index + offset for offset, token in enumerate(oov_tokens)}
            )

        return self

    def update_bow(self, raw_documents: List[str]) -> csr_matrix:
        """Create or update the bag-of-words matrix.

        Update the bag-of-words matrix by adding the newly transformed
        documents. This may add empty columns if new words are found and/or
        add empty rows if new topics are found.

        During this process, the previous bag-of-words matrix might be
        decayed if `self.decay` has been set during init. Similarly, words
        that do not exceed `self.delete_min_df` are removed from its
        vocabulary and bag-of-words matrix.

        Arguments:
            raw_documents: A list of documents. The padding below only grows the
                           stored matrix, so this must not contain fewer entries
                           than `X_` already has rows.

        Returns:
            X_: Bag-of-words matrix
        """
        if hasattr(self, "X_"):
            X = self.transform(raw_documents)

            # Add empty columns if new words are found
            columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
            self.X_ = sparse.hstack([self.X_, columns])

            # Add empty rows if new topics are found
            rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
            self.X_ = sparse.vstack([self.X_, rows])

            # Decay of BoW matrix: shrink the old counts before adding the new ones
            if self.decay is not None:
                self.X_ = self.X_ * (1 - self.decay)

            self.X_ += X
        else:
            self.X_ = self.transform(raw_documents)

        if self.delete_min_df is not None:
            self._clean_bow()

        return self.X_

    def _clean_bow(self) -> None:
        """Remove words whose total count in `X_` is below `self.delete_min_df`.

        The surviving columns keep their relative order and are renumbered
        from 0; `vocabulary_` is rebuilt to match the new column positions.
        """
        # Only keep words with a minimum frequency
        column_totals = np.asarray(self.X_.sum(axis=0)).ravel()
        kept_columns = np.flatnonzero(column_totals >= self.delete_min_df)
        self.X_ = self.X_[:, kept_columns]

        # Map every surviving token from its old column to its new position
        token_by_column = {column: token for token, column in self.vocabulary_.items()}
        self.vocabulary_ = {
            token_by_column[old_column]: new_column
            for new_column, old_column in enumerate(kept_columns)
        }
|
||||
Reference in New Issue
Block a user