Add BERTopic.
This commit is contained in:
@@ -0,0 +1,222 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from packaging import version
|
||||
from scipy.sparse import csr_matrix
|
||||
from typing import Mapping, List, Tuple, Union
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from bertopic.representation._base import BaseRepresentation
|
||||
from sklearn import __version__ as sklearn_version
|
||||
|
||||
|
||||
class KeyBERTInspired(BaseRepresentation):
    def __init__(
        self,
        top_n_words: int = 10,
        nr_repr_docs: int = 5,
        nr_samples: int = 500,
        nr_candidate_words: int = 100,
        random_state: int = 42,
    ):
        """Use a KeyBERT-like model to fine-tune the topic representations.

        The algorithm follows KeyBERT but does some optimization in
        order to speed up inference.

        The steps are as follows. First, we extract the top n representative
        documents per topic, which is controlled by the `nr_repr_docs`
        parameter. To extract the representative documents, we randomly
        sample a number of candidate documents per cluster, which is
        controlled by the `nr_samples` parameter. Then, the top n
        representative documents are extracted by calculating the c-TF-IDF
        representation for the candidate documents and finding, through
        cosine similarity, which are closest to the topic c-TF-IDF
        representation.

        Next, the candidate words per topic are extracted based on their
        c-TF-IDF representation, which is controlled by the
        `nr_candidate_words` parameter.

        Then, we extract the embeddings for words and representative documents
        and create topic embeddings by averaging the representative documents.
        Finally, the most similar words to each topic are extracted by
        calculating the cosine similarity between word and topic embeddings;
        the `top_n_words` best matching words are kept per topic.

        Arguments:
            top_n_words: The top n words to extract per topic.
            nr_repr_docs: The number of representative documents to extract per cluster.
            nr_samples: The number of candidate documents to extract per cluster.
            nr_candidate_words: The number of candidate words per cluster.
            random_state: The random state for randomly sampling candidate documents.

        Usage:

        ```python
        from bertopic.representation import KeyBERTInspired
        from bertopic import BERTopic

        # Create your representation model
        representation_model = KeyBERTInspired()

        # Use the representation model in BERTopic on top of the default pipeline
        topic_model = BERTopic(representation_model=representation_model)
        ```
        """
        self.top_n_words = top_n_words
        self.nr_repr_docs = nr_repr_docs
        self.nr_samples = nr_samples
        self.nr_candidate_words = nr_candidate_words
        # NOTE: stored for callers, but no method of this class reads it.
        self.random_state = random_state

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
        embeddings: np.ndarray = None,
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topics.

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF
            embeddings: Pre-trained document embeddings. These can be used
                        instead of an embedding model

        Returns:
            updated_topics: Updated topic representations
        """
        # We extract the top n representative documents per class.
        # `repr_doc_indices` are positions in the flat `representative_docs`
        # list (that is how `_extract_embeddings` slices with them), so they
        # must not be used to index the full `embeddings` array.
        # NOTE(review): relies on the fourth return value being the per-topic
        # ids of the representative documents, and on those ids being row
        # positions in `embeddings` - confirm against
        # `_extract_representative_docs`.
        _, representative_docs, repr_doc_indices, repr_doc_ids = topic_model._extract_representative_docs(
            c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs
        )

        # If document embeddings are precomputed, gather the embeddings of the
        # representative documents by their document ids, in the same
        # topic-by-topic order as `representative_docs`.
        repr_embeddings = None
        if embeddings is not None:
            repr_embeddings = [embeddings[index] for index in np.concatenate(repr_doc_ids)]

        # We extract the candidate words per class
        topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)

        # We calculate the similarity between word and document embeddings and create
        # topic embeddings from the representative document embeddings
        sim_matrix, words = self._extract_embeddings(
            topic_model, topics, representative_docs, repr_doc_indices, repr_embeddings
        )

        # Find the best matching words based on the similarity matrix for each topic
        updated_topics = self._extract_top_words(words, topics, sim_matrix)

        return updated_topics

    def _extract_candidate_words(
        self,
        topic_model,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[int, Tuple[str, ...]]:
        """For each topic, extract candidate words based on the c-TF-IDF
        representation.

        Arguments:
            topic_model: A BERTopic model
            c_tf_idf: The topic c-TF-IDF representation
            topics: The top words per topic

        Returns:
            topics: For each topic label, a tuple of at most
                    `self.nr_candidate_words` candidate words ordered from
                    highest to lowest c-TF-IDF score. Slots without a word or
                    with a non-positive score hold an empty string.
        """
        labels = [int(label) for label in sorted(topics)]

        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
        # and will be removed in 1.2. Please use get_feature_names_out instead.
        if version.parse(sklearn_version) >= version.parse("1.0.0"):
            words = topic_model.vectorizer_model.get_feature_names_out()
        else:
            words = topic_model.vectorizer_model.get_feature_names()

        # Sort the selected entries of every row by ascending score; the rows
        # are read back to front below so the best word comes first.
        indices = topic_model._top_n_idx_sparse(c_tf_idf, self.nr_candidate_words)
        scores = topic_model._top_n_values_sparse(c_tf_idf, indices)
        ascending = np.argsort(scores, axis=1)
        indices = np.take_along_axis(indices, ascending, axis=1)
        scores = np.take_along_axis(scores, ascending, axis=1)

        # Get the top `nr_candidate_words` words per topic based on c-TF-IDF score
        candidates = {}
        for row, label in enumerate(labels):
            ranked = zip(indices[row][::-1], scores[row][::-1])
            candidates[label] = tuple(
                words[word_index] if word_index is not None and score > 0 else ""
                for word_index, score in ranked
            )[: self.nr_candidate_words]

        return candidates

    def _extract_embeddings(
        self,
        topic_model,
        topics: Mapping[int, Tuple[str, ...]],
        representative_docs: List[str],
        repr_doc_indices: List[List[int]],
        repr_embeddings: np.ndarray = None,
    ) -> Tuple[np.ndarray, List[str]]:
        """Extract the representative document embeddings and create topic embeddings.
        Then extract word embeddings and calculate the cosine similarity between topic
        embeddings and the word embeddings. Topic embeddings are the average of
        representative document embeddings.

        Arguments:
            topic_model: A BERTopic model
            topics: The candidate words per topic
            representative_docs: A flat list of representative documents
            repr_doc_indices: For each topic, the positions of its
                              representative documents in the flat list
            repr_embeddings: Embeddings of respective representative_docs.
                             Computed with the topic model when not given.

        Returns:
            sim: The similarity matrix between topic embeddings (rows) and
                 word embeddings (columns)
            vocab: The unique candidate words over all topics, in the column
                   order of `sim`
        """
        # Calculate representative document embeddings if there are no precomputed embeddings.
        if repr_embeddings is None:
            repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

        # Each topic owns a contiguous run of the flat embeddings, delimited by
        # the first and last position listed for that topic.
        topic_embeddings = [
            np.mean(repr_embeddings[positions[0] : positions[-1] + 1], axis=0) for positions in repr_doc_indices
        ]

        # Calculate word embeddings and extract best matching with updated topic_embeddings
        vocab = list({word for words in topics.values() for word in words})
        word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)
        sim = cosine_similarity(topic_embeddings, word_embeddings)

        return sim, vocab

    def _extract_top_words(
        self,
        vocab: List[str],
        topics: Mapping[int, Tuple[str, ...]],
        sim: np.ndarray,
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract the top n words per topic based on the
        similarity matrix between topics and words.

        Arguments:
            vocab: The unique candidate words, in the column order of `sim`
            topics: The candidate words per topic
            sim: The similarity matrix between topic embeddings (rows, in
                 sorted label order) and word embeddings (columns)

        Returns:
            updated_topics: The updated topic representations; per topic the
                            `self.top_n_words` most similar candidate words
                            with their similarity, best match first
        """
        labels = [int(label) for label in sorted(topics)]

        # One dictionary lookup per word instead of a linear `vocab.index` scan.
        word_to_index = {word: index for index, word in enumerate(vocab)}

        updated_topics = {}
        for row, topic in enumerate(labels):
            indices = [word_to_index[word] for word in topics[topic]]
            values = sim[row, indices]

            # Positions of the highest similarities, in ascending order.
            best = np.argsort(values)[-self.top_n_words :]
            updated_topics[topic] = [(vocab[indices[position]], values[position]) for position in best][::-1]

        return updated_topics
|
||||
Reference in New Issue
Block a user