Add BERTopic.

2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,4 @@
+from ._ctfidf import ClassTfidfTransformer
+from ._online_cv import OnlineCountVectorizer
+
+__all__ = ["ClassTfidfTransformer", "OnlineCountVectorizer"]
@@ -0,0 +1,115 @@
+from typing import List
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.preprocessing import normalize
+from sklearn.utils import check_array
+import numpy as np
+import scipy.sparse as sp
+
+
+class ClassTfidfTransformer(TfidfTransformer):
+    """A Class-based TF-IDF procedure using scikit-learn's TfidfTransformer as a base.
+
+    ![](../algorithm/c-TF-IDF.svg)
+
+    c-TF-IDF can best be explained as a TF-IDF formula adapted for multiple classes
+    by joining all documents per class. Thus, each class is converted to a single document
+    instead of a set of documents. The frequency of each word **x** is extracted
+    for each class **c** and is **l1** normalized. This constitutes the term frequency.
+
+    Then, the term frequency is multiplied with IDF which is the logarithm of 1 plus
+    the average number of words per class **A** divided by the frequency of word **x**
+    across all classes.
+
+    Arguments:
+        bm25_weighting: Uses BM25-inspired idf-weighting procedure instead of the procedure
+                        as defined in the c-TF-IDF formula. It uses the following weighting scheme:
+                        `log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))`
+        reduce_frequent_words: Takes the square root of the bag-of-words after normalizing the matrix.
+                               Helps to reduce the impact of words that appear too frequently.
+        seed_words: Specific words that will have their idf value increased by
+                    the value of `seed_multiplier`.
+                    NOTE: This will only increase the value of words that have an exact match.
+        seed_multiplier: The value with which the idf values of the words in `seed_words`
+                         are multiplied.
+
+    Examples:
+    ```python
+    transformer = ClassTfidfTransformer()
+    ```
+    """
+
+    def __init__(
+        self,
+        bm25_weighting: bool = False,
+        reduce_frequent_words: bool = False,
+        seed_words: List[str] = None,
+        seed_multiplier: float = 2,
+    ):
+        self.bm25_weighting = bm25_weighting
+        self.reduce_frequent_words = reduce_frequent_words
+        self.seed_words = seed_words
+        self.seed_multiplier = seed_multiplier
+        super(ClassTfidfTransformer, self).__init__()
+
+    def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
+        """Learn the idf vector (global term weights).
+
+        Arguments:
+            X: A matrix of term/token counts.
+            multiplier: A multiplier for increasing/decreasing certain IDF scores
+        """
+        X = check_array(X, accept_sparse=("csr", "csc"))
+        if not sp.issparse(X):
+            X = sp.csr_matrix(X)
+        dtype = np.float64
+
+        if self.use_idf:
+            _, n_features = X.shape
+
+            # Calculate the frequency of words across all classes
+            df = np.squeeze(np.asarray(X.sum(axis=0)))
+
+            # Calculate the average number of words per class (truncated to an int) as regularization
+            avg_nr_samples = int(X.sum(axis=1).mean())
+
+            # BM25-inspired weighting procedure
+            if self.bm25_weighting:
+                idf = np.log(1 + ((avg_nr_samples - df + 0.5) / (df + 0.5)))
+
+            # Divide the average number of samples by the word frequency
+            # +1 is added to force values to be positive
+            else:
+                idf = np.log((avg_nr_samples / df) + 1)
+
+            # Multiplier to increase/decrease certain idf scores
+            if multiplier is not None:
+                idf = idf * multiplier
+
+            self._idf_diag = sp.diags(
+                idf,
+                offsets=0,
+                shape=(n_features, n_features),
+                format="csr",
+                dtype=dtype,
+            )
+
+        return self
+
+    def transform(self, X: sp.csr_matrix):
+        """Transform a count-based matrix to c-TF-IDF.
+
+        Arguments:
+            X (sparse matrix): A matrix of term/token counts.
+
+        Returns:
+            X (sparse matrix): A c-TF-IDF matrix
+        """
+        if self.use_idf:
+            X = normalize(X, axis=1, norm="l1", copy=False)
+
+            if self.reduce_frequent_words:
+                X.data = np.sqrt(X.data)
+
+            X = X * self._idf_diag
+
+        return X
@@ -0,0 +1,158 @@
+import numpy as np
+from itertools import chain
+from typing import List
+
+from scipy import sparse
+from scipy.sparse import csr_matrix
+
+from sklearn.feature_extraction.text import CountVectorizer
+
+
+class OnlineCountVectorizer(CountVectorizer):
+    """An online variant of the CountVectorizer with updating vocabulary.
+
+    At each `.partial_fit`, its vocabulary is updated based on any OOV words
+    it might find. Then, `.update_bow` can be used to track and update
+    the Bag-of-Words representation. These functions are separated such that
+    the vectorizer can be used in iteration without updating the Bag-of-Words
+    representation, which might speed up the fitting process. However, the
+    `.update_bow` function is used in BERTopic to track changes in the
+    topic representations and allow for decay.
+
+    This class inherits its parameters and attributes from:
+        `sklearn.feature_extraction.text.CountVectorizer`
+
+    Arguments:
+        decay: A value in the range [0, 1] that sets the fraction by which the
+               frequencies in the previous bag-of-words matrix are decreased.
+               For example, a value of `.1` will decrease the frequencies in
+               the bag-of-words matrix by 10% at each iteration.
+        delete_min_df: Delete words at each iteration from its vocabulary
+                       that are below a minimum frequency.
+                       This will keep the resulting bag-of-words matrix small
+                       such that it does not explode in size with increasing
+                       vocabulary. If `decay` is None then this equals `min_df`.
+        **kwargs: Set of parameters inherited from:
+                  `sklearn.feature_extraction.text.CountVectorizer`
+                  In practice, this means that you can still use parameters
+                  from the original CountVectorizer, like `stop_words` and
+                  `ngram_range`.
+
+    Attributes:
+        X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation
+
+    Examples:
+    ```python
+    from bertopic.vectorizers import OnlineCountVectorizer
+    vectorizer = OnlineCountVectorizer(stop_words="english")
+
+    for index, doc in enumerate(my_docs):
+        vectorizer.partial_fit(doc)
+
+        # Update and clean the bow every 100 iterations:
+        if index % 100 == 0:
+            X = vectorizer.update_bow(doc)
+    ```
+
+    To use the model in BERTopic:
+
+    ```python
+    from bertopic import BERTopic
+    from bertopic.vectorizers import OnlineCountVectorizer
+
+    vectorizer_model = OnlineCountVectorizer(stop_words="english")
+    topic_model = BERTopic(vectorizer_model=vectorizer_model)
+    ```
+
+    References:
+        Adapted from: https://github.com/idoshlomo/online_vectorizers
+    """
+
+    def __init__(self, decay: float = None, delete_min_df: float = None, **kwargs):
+        self.decay = decay
+        self.delete_min_df = delete_min_df
+        super(OnlineCountVectorizer, self).__init__(**kwargs)
+
+    def partial_fit(self, raw_documents: List[str]) -> None:
+        """Perform a partial fit and update vocabulary with OOV tokens.
+
+        Arguments:
+            raw_documents: A list of documents
+        """
+        if not hasattr(self, "vocabulary_"):
+            return self.fit(raw_documents)
+
+        analyzer = self.build_analyzer()
+        analyzed_documents = [analyzer(doc) for doc in raw_documents]
+        new_tokens = set(chain.from_iterable(analyzed_documents))
+        oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))
+
+        if oov_tokens:
+            max_index = max(self.vocabulary_.values())
+            oov_vocabulary = dict(
+                zip(
+                    oov_tokens,
+                    list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1)),
+                )
+            )
+            self.vocabulary_.update(oov_vocabulary)
+
+        return self
+
+    def update_bow(self, raw_documents: List[str]) -> csr_matrix:
+        """Create or update the bag-of-words matrix.
+
+        Update the bag-of-words matrix by adding the newly transformed
+        documents. This may add empty columns if new words are found and/or
+        add empty rows if new topics are found.
+
+        During this process, the previous bag-of-words matrix might be
+        decayed if `self.decay` has been set during init. Similarly, words
+        whose total frequency is below `self.delete_min_df` are removed
+        from its vocabulary and bag-of-words matrix.
+
+        Arguments:
+            raw_documents: A list of documents
+
+        Returns:
+            X_: Bag-of-words matrix
+        """
+        if hasattr(self, "X_"):
+            X = self.transform(raw_documents)
+
+            # Add empty columns if new words are found
+            columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
+            self.X_ = sparse.hstack([self.X_, columns])
+
+            # Add empty rows if new topics are found
+            rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
+            self.X_ = sparse.vstack([self.X_, rows])
+
+            # Decay of BoW matrix
+            if self.decay is not None:
+                self.X_ = self.X_ * (1 - self.decay)
+
+            self.X_ += X
+        else:
+            self.X_ = self.transform(raw_documents)
+
+        if self.delete_min_df is not None:
+            self._clean_bow()
+
+        return self.X_
+
+    def _clean_bow(self) -> None:
+        """Remove words whose total frequency is below `self.delete_min_df`."""
+        # Only keep words with a minimum frequency
+        indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1]
+        indices_dict = {index: index for index in indices}
+        self.X_ = self.X_[:, indices]
+
+        # Rebuild the vocabulary, re-indexing the words that were kept
+        new_vocab = {}
+        vocabulary_dict = {v: k for k, v in self.vocabulary_.items()}
+        for i, index in enumerate(indices):
+            if indices_dict.get(index) is not None:
+                new_vocab[vocabulary_dict[index]] = i
+
+        self.vocabulary_ = new_vocab