Add BERTopic.

2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,222 @@
+import numpy as np
+import pandas as pd
+
+from packaging import version
+from scipy.sparse import csr_matrix
+from typing import Mapping, List, Tuple, Union
+from sklearn.metrics.pairwise import cosine_similarity
+from bertopic.representation._base import BaseRepresentation
+from sklearn import __version__ as sklearn_version
+
+
+class KeyBERTInspired(BaseRepresentation):
+    def __init__(
+        self,
+        top_n_words: int = 10,
+        nr_repr_docs: int = 5,
+        nr_samples: int = 500,
+        nr_candidate_words: int = 100,
+        random_state: int = 42,
+    ):
+        """Use a KeyBERT-like model to fine-tune the topic representations.
+
+        The algorithm follows KeyBERT but does some optimization in
+        order to speed up inference.
+
+        The steps are as follows. First, we extract the top n representative
+        documents per topic. To extract the representative documents, we
+        randomly sample a number of candidate documents per cluster
+        which is controlled by the `nr_samples` parameter. Then,
+        the top n representative documents  are extracted by calculating
+        the c-TF-IDF representation for the  candidate documents and finding,
+        through cosine similarity, which are closest to the topic c-TF-IDF representation.
+        Next, the top n words per topic are extracted based on their
+        c-TF-IDF representation, which is controlled by the `nr_repr_docs`
+        parameter.
+
+        Then, we extract the embeddings for words and representative documents
+        and create topic embeddings by averaging the representative documents.
+        Finally, the most similar words to each topic are extracted by
+        calculating the cosine similarity between word and topic embeddings.
+
+        Arguments:
+            top_n_words: The top n words to extract per topic.
+            nr_repr_docs: The number of representative documents to extract per cluster.
+            nr_samples: The number of candidate documents to extract per cluster.
+            nr_candidate_words: The number of candidate words per cluster.
+            random_state: The random state for randomly sampling candidate documents.
+
+        Usage:
+
+        ```python
+        from bertopic.representation import KeyBERTInspired
+        from bertopic import BERTopic
+
+        # Create your representation model
+        representation_model = KeyBERTInspired()
+
+        # Use the representation model in BERTopic on top of the default pipeline
+        topic_model = BERTopic(representation_model=representation_model)
+        ```
+        """
+        self.top_n_words = top_n_words
+        self.nr_repr_docs = nr_repr_docs
+        self.nr_samples = nr_samples
+        self.nr_candidate_words = nr_candidate_words
+        self.random_state = random_state
+
+    def extract_topics(
+        self,
+        topic_model,
+        documents: pd.DataFrame,
+        c_tf_idf: csr_matrix,
+        topics: Mapping[str, List[Tuple[str, float]]],
+        embeddings: np.ndarray = None,
+    ) -> Mapping[str, List[Tuple[str, float]]]:
+        """Extract topics.
+
+        Arguments:
+            topic_model: A BERTopic model
+            documents: All input documents
+            c_tf_idf: The topic c-TF-IDF representation
+            topics: The candidate topics as calculated with c-TF-IDF
+            embeddings: Pre-trained document embeddings. These can be used
+                        instead of an embedding model
+
+        Returns:
+            updated_topics: Updated topic representations
+        """
+        # We extract the top n representative documents per class
+        _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs(
+            c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs
+        )
+
+        # If document embeddings are precomputed, extract the embeddings of the representative documents based on repr_doc_indices
+        repr_embeddings = None
+        if embeddings is not None:
+            repr_embeddings = [embeddings[index] for index in np.concatenate(repr_doc_indices)]
+
+        # We extract the top n words per class
+        topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)
+
+        # We calculate the similarity between word and document embeddings and create
+        # topic embeddings from the representative document embeddings
+        sim_matrix, words = self._extract_embeddings(
+            topic_model, topics, representative_docs, repr_doc_indices, repr_embeddings
+        )
+        # Find the best matching words based on the similarity matrix for each topic
+        updated_topics = self._extract_top_words(words, topics, sim_matrix)
+
+        return updated_topics
+
+    def _extract_candidate_words(
+        self,
+        topic_model,
+        c_tf_idf: csr_matrix,
+        topics: Mapping[str, List[Tuple[str, float]]],
+    ) -> Mapping[str, List[Tuple[str, float]]]:
+        """For each topic, extract candidate words based on the c-TF-IDF
+        representation.
+
+        Arguments:
+            topic_model: A BERTopic model
+            c_tf_idf: The topic c-TF-IDF representation
+            topics: The top words per topic
+
+        Returns:
+            topics: The `self.top_n_words` per topic
+        """
+        labels = [int(label) for label in sorted(list(topics.keys()))]
+
+        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
+        # and will be removed in 1.2. Please use get_feature_names_out instead.
+        if version.parse(sklearn_version) >= version.parse("1.0.0"):
+            words = topic_model.vectorizer_model.get_feature_names_out()
+        else:
+            words = topic_model.vectorizer_model.get_feature_names()
+
+        indices = topic_model._top_n_idx_sparse(c_tf_idf, self.nr_candidate_words)
+        scores = topic_model._top_n_values_sparse(c_tf_idf, indices)
+        sorted_indices = np.argsort(scores, 1)
+        indices = np.take_along_axis(indices, sorted_indices, axis=1)
+        scores = np.take_along_axis(scores, sorted_indices, axis=1)
+
+        # Get top 30 words per topic based on c-TF-IDF score
+        topics = {
+            label: [
+                (words[word_index], score) if word_index is not None and score > 0 else ("", 0.00001)
+                for word_index, score in zip(indices[index][::-1], scores[index][::-1])
+            ]
+            for index, label in enumerate(labels)
+        }
+        topics = {label: list(zip(*values[: self.nr_candidate_words]))[0] for label, values in topics.items()}
+
+        return topics
+
+    def _extract_embeddings(
+        self,
+        topic_model,
+        topics: Mapping[str, List[Tuple[str, float]]],
+        representative_docs: List[str],
+        repr_doc_indices: List[List[int]],
+        repr_embeddings: np.ndarray = None,
+    ) -> Union[np.ndarray, List[str]]:
+        """Extract the representative document embeddings and create topic embeddings.
+        Then extract word embeddings and calculate the cosine similarity between topic
+        embeddings and the word embeddings. Topic embeddings are the average of
+        representative document embeddings.
+
+        Arguments:
+            topic_model: A BERTopic model
+            topics: The top words per topic
+            representative_docs: A flat list of representative documents
+            repr_doc_indices: The indices of representative documents
+                              that belong to each topic
+            repr_embeddings: Embeddings of respective representative_docs
+
+        Returns:
+            sim: The similarity matrix between word and topic embeddings
+            vocab: The complete vocabulary of input documents
+        """
+        # Calculate representative document embeddings if there are no precomputed embeddings.
+        if repr_embeddings is None:
+            repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
+
+        topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]
+
+        # Calculate word embeddings and extract best matching with updated topic_embeddings
+        vocab = list(set([word for words in topics.values() for word in words]))
+        word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)
+        sim = cosine_similarity(topic_embeddings, word_embeddings)
+
+        return sim, vocab
+
+    def _extract_top_words(
+        self,
+        vocab: List[str],
+        topics: Mapping[str, List[Tuple[str, float]]],
+        sim: np.ndarray,
+    ) -> Mapping[str, List[Tuple[str, float]]]:
+        """Extract the top n words per topic based on the
+        similarity matrix between topics and words.
+
+        Arguments:
+            vocab: The complete vocabulary of input documents
+            labels: All topic labels
+            topics: The top words per topic
+            sim: The similarity matrix between word and topic embeddings
+
+        Returns:
+            updated_topics: The updated topic representations
+        """
+        labels = [int(label) for label in sorted(list(topics.keys()))]
+        updated_topics = {}
+        for i, topic in enumerate(labels):
+            indices = [vocab.index(word) for word in topics[topic]]
+            values = sim[:, indices][i]
+            word_indices = [indices[index] for index in np.argsort(values)[-self.top_n_words :]]
+            updated_topics[topic] = [
+                (vocab[index], val) for val, index in zip(np.sort(values)[-self.top_n_words :], word_indices)
+            ][::-1]
+
+        return updated_topics