Add BERTopic.

2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,128 @@
+import warnings
+import numpy as np
+import pandas as pd
+from typing import List, Mapping, Tuple
+from scipy.sparse import csr_matrix
+from sklearn.metrics.pairwise import cosine_similarity
+from bertopic.representation._base import BaseRepresentation
+
+
+class MaximalMarginalRelevance(BaseRepresentation):
+    """Calculate Maximal Marginal Relevance (MMR)
+    between candidate keywords and the document.
+
+    MMR considers the similarity of keywords/keyphrases with the
+    document, along with the similarity of already selected
+    keywords and keyphrases. This results in a selection of keywords
+    that maximize their within diversity with respect to the document.
+
+    Arguments:
+        diversity: How diverse the select keywords/keyphrases are.
+                    Values range between 0 and 1 with 0 being not diverse at all
+                    and 1 being most diverse.
+        top_n_words: The number of keywords/keyhprases to return
+
+    Usage:
+
+    ```python
+    from bertopic.representation import MaximalMarginalRelevance
+    from bertopic import BERTopic
+
+    # Create your representation model
+    representation_model = MaximalMarginalRelevance(diversity=0.3)
+
+    # Use the representation model in BERTopic on top of the default pipeline
+    topic_model = BERTopic(representation_model=representation_model)
+    ```
+    """
+
+    def __init__(self, diversity: float = 0.1, top_n_words: int = 10):
+        self.diversity = diversity
+        self.top_n_words = top_n_words
+
+    def extract_topics(
+        self,
+        topic_model,
+        documents: pd.DataFrame,
+        c_tf_idf: csr_matrix,
+        topics: Mapping[str, List[Tuple[str, float]]],
+    ) -> Mapping[str, List[Tuple[str, float]]]:
+        """Extract topic representations.
+
+        Arguments:
+            topic_model: The BERTopic model
+            documents: Not used
+            c_tf_idf: Not used
+            topics: The candidate topics as calculated with c-TF-IDF
+
+        Returns:
+            updated_topics: Updated topic representations
+        """
+        if topic_model.embedding_model is None:
+            warnings.warn(
+                "MaximalMarginalRelevance can only be used BERTopic was instantiated"
+                "with the `embedding_model` parameter."
+            )
+            return topics
+
+        updated_topics = {}
+        for topic, topic_words in topics.items():
+            words = [word[0] for word in topic_words]
+            word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False)
+            topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape(
+                1, -1
+            )
+            topic_words = mmr(
+                topic_embedding,
+                word_embeddings,
+                words,
+                self.diversity,
+                self.top_n_words,
+            )
+            updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words]
+        return updated_topics
+
+
+def mmr(
+    doc_embedding: np.ndarray,
+    word_embeddings: np.ndarray,
+    words: List[str],
+    diversity: float = 0.1,
+    top_n: int = 10,
+) -> List[str]:
+    """Maximal Marginal Relevance.
+
+    Arguments:
+        doc_embedding: The document embeddings
+        word_embeddings: The embeddings of the selected candidate keywords/phrases
+        words: The selected candidate keywords/keyphrases
+        diversity: The diversity of the selected embeddings.
+                   Values between 0 and 1.
+        top_n: The top n items to return
+
+    Returns:
+            List[str]: The selected keywords/keyphrases
+    """
+    # Extract similarity within words, and between words and the document
+    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
+    word_similarity = cosine_similarity(word_embeddings)
+
+    # Initialize candidates and already choose best keyword/keyphras
+    keywords_idx = [np.argmax(word_doc_similarity)]
+    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
+
+    for _ in range(top_n - 1):
+        # Extract similarities within candidates and
+        # between candidates and selected keywords/phrases
+        candidate_similarities = word_doc_similarity[candidates_idx, :]
+        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
+
+        # Calculate MMR
+        mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
+        mmr_idx = candidates_idx[np.argmax(mmr)]
+
+        # Update keywords & candidates
+        keywords_idx.append(mmr_idx)
+        candidates_idx.remove(mmr_idx)
+
+    return [words[idx] for idx in keywords_idx]