Add BERTopic.
This commit is contained in:
@@ -0,0 +1,192 @@
|
||||
import copy
|
||||
import pytest
|
||||
from umap import UMAP
|
||||
from hdbscan import HDBSCAN
|
||||
from bertopic import BERTopic
|
||||
from sklearn.datasets import fetch_20newsgroups
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.cluster import KMeans, MiniBatchKMeans
|
||||
from sklearn.decomposition import PCA
|
||||
from bertopic.vectorizers import OnlineCountVectorizer
|
||||
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
|
||||
from bertopic.dimensionality import BaseDimensionalityReduction
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def embedding_model():
    """Provide the ``all-MiniLM-L6-v2`` sentence-transformers model, built once per session."""
    return SentenceTransformer("all-MiniLM-L6-v2")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def document_embeddings(documents, embedding_model):
    """Encode the session documents with the shared embedding model and return the result."""
    return embedding_model.encode(documents)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def reduced_embeddings(document_embeddings):
    """Project the document embeddings down to two components with UMAP."""
    projector = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine")
    return projector.fit_transform(document_embeddings)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def documents():
    """Return the first 1000 texts of 20 Newsgroups, fetched without headers, footers and quotes."""
    dataset = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))
    return dataset["data"][:1000]
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def targets():
    """Return the first 1000 target labels of 20 Newsgroups (same fetch settings as the documents)."""
    stripped_parts = ("headers", "footers", "quotes")
    return fetch_20newsgroups(subset="all", remove=stripped_parts)["target"][:1000]
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def base_topic_model(documents, document_embeddings, embedding_model):
    """Fit a BERTopic model that keeps its default sub-models, with two of their settings overridden."""
    topic_model = BERTopic(embedding_model=embedding_model, calculate_probabilities=True)

    # Adjust the default sub-models in place before fitting.
    topic_model.hdbscan_model.min_cluster_size = 3
    topic_model.umap_model.random_state = 42

    topic_model.fit(documents, document_embeddings)
    return topic_model
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def zeroshot_topic_model(documents, document_embeddings, embedding_model):
    """Fit a BERTopic model configured with three zero-shot candidate topics."""
    settings = {
        "embedding_model": embedding_model,
        "calculate_probabilities": True,
        "zeroshot_topic_list": ["religion", "cars", "electronics"],
        "zeroshot_min_similarity": 0.3,
    }
    topic_model = BERTopic(**settings)

    # Adjust the default sub-models in place before fitting.
    topic_model.umap_model.random_state = 42
    topic_model.hdbscan_model.min_cluster_size = 2

    topic_model.fit(documents, document_embeddings)
    return topic_model
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def custom_topic_model(documents, document_embeddings, embedding_model):
    """Fit a BERTopic model with explicitly constructed UMAP and HDBSCAN sub-models."""
    reducer = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42)
    clusterer = HDBSCAN(min_cluster_size=3, metric="euclidean", cluster_selection_method="eom", prediction_data=True)

    topic_model = BERTopic(
        umap_model=reducer,
        hdbscan_model=clusterer,
        embedding_model=embedding_model,
        calculate_probabilities=True,
    )
    # The fixture value is whatever ``fit`` returns.
    return topic_model.fit(documents, document_embeddings)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def representation_topic_model(documents, document_embeddings, embedding_model):
    """Fit a BERTopic model with a "Main" and an "MMR" representation model."""
    reducer = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42)
    clusterer = HDBSCAN(min_cluster_size=3, metric="euclidean", cluster_selection_method="eom", prediction_data=True)

    # "Main" is a single representation; "MMR" is a list of two representations.
    main_representation = KeyBERTInspired()
    mmr_chain = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance()]
    representations = {"Main": main_representation, "MMR": mmr_chain}

    topic_model = BERTopic(
        umap_model=reducer,
        hdbscan_model=clusterer,
        embedding_model=embedding_model,
        representation_model=representations,
        calculate_probabilities=True,
    )
    # The fixture value is whatever ``fit`` returns.
    return topic_model.fit(documents, document_embeddings)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def reduced_topic_model(custom_topic_model, documents):
    """Return a deep copy of the custom model after ``reduce_topics(..., nr_topics="auto")``."""
    reduced = copy.deepcopy(custom_topic_model)
    reduced.reduce_topics(documents, nr_topics="auto")
    return reduced
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def merged_topic_model(custom_topic_model, documents):
    """Return a deep copy of the custom model after two consecutive ``merge_topics`` calls."""
    merged = copy.deepcopy(custom_topic_model)

    # First call merges two pairs, the second call merges one triple.
    merge_rounds = (
        [[1, 2], [3, 4]],
        [[5, 6, 7]],
    )
    for topics_to_merge in merge_rounds:
        merged.merge_topics(documents, topics_to_merge)
    return merged
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def kmeans_pca_topic_model(documents, document_embeddings, embedding_model):
    """Fit a BERTopic model that clusters with KMeans on PCA-reduced embeddings.

    ``embedding_model`` is requested as a fixture argument so that BERTopic
    receives the value that fixture produces. Without it in the signature, the
    name used inside this function resolves to the module-level fixture
    function of the same name instead of the object the fixture returns.
    """
    cluster_model = KMeans(n_clusters=15, random_state=42)
    dim_model = PCA(n_components=5)
    model = BERTopic(
        hdbscan_model=cluster_model,
        umap_model=dim_model,
        embedding_model=embedding_model,
    ).fit(documents, document_embeddings)
    return model
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def supervised_topic_model(documents, document_embeddings, embedding_model, targets):
    """Fit a BERTopic model with a logistic-regression "cluster" model, passing ``targets`` as ``y``."""
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=BaseDimensionalityReduction(),
        hdbscan_model=LogisticRegression(),
    )
    # The fixture value is whatever ``fit`` returns.
    return topic_model.fit(documents, embeddings=document_embeddings, y=targets)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def online_topic_model(documents, document_embeddings, embedding_model):
    """Train a BERTopic model incrementally with ``partial_fit`` on batches of 50 documents."""
    topic_model = BERTopic(
        umap_model=PCA(n_components=5),
        hdbscan_model=MiniBatchKMeans(n_clusters=50, random_state=0),
        vectorizer_model=OnlineCountVectorizer(stop_words="english", decay=0.01),
        embedding_model=embedding_model,
    )

    batch_size = 50
    collected_topics = []
    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        topic_model.partial_fit(documents[start:stop], document_embeddings[start:stop])
        # Accumulate ``topics_`` as it stands after every batch.
        collected_topics.extend(topic_model.topics_)

    # Replace the model's ``topics_`` with the accumulated list.
    topic_model.topics_ = collected_topics
    return topic_model
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def cuml_base_topic_model(documents, document_embeddings, embedding_model):
    """Fit a BERTopic model whose UMAP and HDBSCAN sub-models come from cuML."""
    # Imported inside the fixture body, so cuML is only required when this fixture runs.
    from cuml.cluster import HDBSCAN as cuml_hdbscan
    from cuml.manifold import UMAP as cuml_umap

    reducer = cuml_umap(n_components=5, n_neighbors=5, random_state=42)
    clusterer = cuml_hdbscan(min_cluster_size=3, prediction_data=True)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        calculate_probabilities=True,
        umap_model=reducer,
        hdbscan_model=clusterer,
    )
    topic_model.fit(documents, document_embeddings)
    return topic_model
|
||||
@@ -0,0 +1,155 @@
|
||||
import copy
|
||||
import pytest
|
||||
from bertopic import BERTopic
|
||||
import importlib.util
|
||||
|
||||
|
||||
def cuml_available():
    """Report whether a module spec for ``cuml`` can be found.

    Returns:
        ``True`` if ``importlib.util.find_spec("cuml")`` yields a spec,
        ``False`` if it yields ``None`` or raises ``ImportError``.
    """
    try:
        spec = importlib.util.find_spec("cuml")
    except ImportError:
        return False
    return spec is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        ("base_topic_model"),
        ("kmeans_pca_topic_model"),
        ("custom_topic_model"),
        ("merged_topic_model"),
        ("reduced_topic_model"),
        ("online_topic_model"),
        ("supervised_topic_model"),
        ("representation_topic_model"),
        ("zeroshot_topic_model"),
        # The cuML variant is skipped when the cuml package cannot be found.
        pytest.param(
            "cuml_base_topic_model",
            marks=pytest.mark.skipif(not cuml_available(), reason="cuML not available"),
        ),
    ],
)
def test_full_model(model, documents, request):
    """Tests the entire pipeline in one go. This serves as a sanity check to see if the default
    settings result in a good separation of topics.

    Each parametrization works on a deep copy of the named fixture and walks through, in order:
    topic representations, document info, transform, zero-shot labels, topics over time,
    hierarchical topics, find topics, topic reduction, update topics, topic labels, merging,
    outlier reduction and model merging. Later steps operate on the model as mutated by
    earlier steps.

    NOTE: This does not cover all cases but merely combines it all together
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    # For the base model only: round-trip through save/load and continue with the loaded model.
    if model == "base_topic_model":
        topic_model.save(
            "model_dir",
            serialization="pytorch",
            save_ctfidf=True,
            save_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
        )
        topic_model = BERTopic.load("model_dir")

    # The cuML variant must actually carry cuML sub-models.
    if model == "cuml_base_topic_model":
        assert "cuml" in str(type(topic_model.umap_model)).lower()
        assert "cuml" in str(type(topic_model.hdbscan_model)).lower()

    # Snapshot of the topic assignments; reused by the checks further down.
    topics = topic_model.topics_

    # Every assigned topic has at least 10 words in its representation.
    for topic in set(topics):
        words = topic_model.get_topic(topic)[:10]
        assert len(words) == 10

    # Same check, driven by the topics listed in the frequency table.
    for topic in topic_model.get_topic_freq().Topic:
        words = topic_model.get_topic(topic)[:10]
        assert len(words) == 10

    assert len(topic_model.get_topic_freq()) > 2
    assert len(topic_model.get_topics()) == len(topic_model.get_topic_freq())

    # Test extraction of document info
    document_info = topic_model.get_document_info(documents)
    assert len(document_info) == len(documents)

    # Test transform
    doc = "This is a new document to predict."
    topics_test, probs_test = topic_model.transform([doc, doc])

    assert len(topics_test) == 2

    # Test zero-shot topic modeling
    if topic_model._is_zeroshot():
        # Label keys must be consecutive integers, starting at -1 when _outliers is truthy, else at 0.
        if topic_model._outliers:
            assert set(topic_model.topic_labels_.keys()) == set(range(-1, len(topic_model.topic_labels_) - 1))
        else:
            assert set(topic_model.topic_labels_.keys()) == set(range(len(topic_model.topic_labels_)))

    # Test topics over time
    # Synthetic timestamps: ten distinct values cycling over the documents.
    timestamps = [i % 10 for i in range(len(documents))]
    topics_over_time = topic_model.topics_over_time(documents, timestamps)

    assert topics_over_time.Frequency.sum() == len(documents)
    assert len(topics_over_time.Topic.unique()) == len(set(topics))

    # Test hierarchical topics
    hier_topics = topic_model.hierarchical_topics(documents)

    assert len(hier_topics) > 0
    # Every parent ID must be larger than the largest assigned topic ID.
    assert hier_topics.Parent_ID.astype(int).min() > max(topics)

    # Test creation of topic tree
    tree = topic_model.get_topic_tree(hier_topics, tight_layout=False)
    assert isinstance(tree, str)
    assert len(tree) > 10

    # Test find topic
    similar_topics, similarity = topic_model.find_topics("query", top_n=2)
    assert len(similar_topics) == 2
    assert len(similarity) == 2
    assert max(similarity) <= 1

    # Test topic reduction
    # Target is one fewer than the current number of distinct topics, or 2 if fewer than 2 exist.
    nr_topics = len(set(topics))
    nr_topics = 2 if nr_topics < 2 else nr_topics - 1
    topic_model.reduce_topics(documents, nr_topics=nr_topics)

    assert len(topic_model.get_topic_freq()) == nr_topics
    assert len(topic_model.topics_) == len(topics)

    # Test update topics
    # Keep topic 1's representation and the current vectorizer so they can be compared/restored below.
    topic = topic_model.get_topic(1)[:10]
    vectorizer_model = topic_model.vectorizer_model
    topic_model.update_topics(documents, n_gram_range=(2, 2))

    updated_topic = topic_model.get_topic(1)[:10]

    # Update again, this time passing the previously saved vectorizer.
    topic_model.update_topics(documents, vectorizer_model=vectorizer_model)
    original_topic = topic_model.get_topic(1)[:10]

    assert topic != updated_topic
    if topic_model.representation_model is not None:
        assert topic != original_topic

    # Test updating topic labels
    topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=10, separator=", ")
    assert len(topic_labels) == len(set(topic_model.topics_))

    # Test setting topic labels
    topic_model.set_topic_labels(topic_labels)
    assert topic_model.custom_labels_ == topic_labels

    # Test merging topics
    # After merging topics 0 and 1, the frequency reported for topic 0 must have grown.
    freq = topic_model.get_topic_freq(0)
    topics_to_merge = [0, 1]
    topic_model.merge_topics(documents, topics_to_merge)
    assert freq < topic_model.get_topic_freq(0)

    # Test reduction of outliers
    if -1 in topics:
        # NOTE: `topics` is the snapshot taken before the reduce/merge steps above.
        new_topics = topic_model.reduce_outliers(documents, topics, threshold=0.0)
        nr_outliers_topic_model = sum([1 for topic in topic_model.topics_ if topic == -1])
        nr_outliers_new_topics = sum([1 for topic in new_topics if topic == -1])

        if topic_model._outliers == 1:
            assert nr_outliers_topic_model > nr_outliers_new_topics

    # Combine models
    # NOTE: "model_dir" is only written in the base_topic_model branch at the top of this test;
    # every other parametrization relies on that directory already being present on disk.
    topic_model1 = BERTopic.load("model_dir")
    merged_model = BERTopic.merge_models([topic_model, topic_model1])

    assert len(merged_model.get_topic_info()) > len(topic_model.get_topic_info())
|
||||
@@ -0,0 +1,22 @@
|
||||
from bertopic import BERTopic
|
||||
|
||||
|
||||
def test_load_save_model(tmp_path):
    """Round-trip an unfitted model through pickle serialization and compare key attributes.

    The pickle is written under pytest's per-test ``tmp_path`` directory rather
    than to a file named ``test`` in the current working directory, so the test
    leaves nothing behind and cannot collide with an existing path.
    """
    model = BERTopic(language="Dutch", embedding_model=None)

    # Passed as a string, matching the type of the path the test used before.
    save_path = str(tmp_path / "test")
    model.save(save_path, serialization="pickle")
    loaded_model = BERTopic.load(save_path)

    assert type(model) is type(loaded_model)
    assert model.language == loaded_model.language
    assert model.embedding_model == loaded_model.embedding_model
    assert model.top_n_words == loaded_model.top_n_words
|
||||
|
||||
|
||||
def test_get_params():
    """Check a handful of values reported by ``get_params`` on a default-constructed model."""
    params = BERTopic().get_params()

    # These entries must be falsy.
    for falsy_key in ("embedding_model", "low_memory", "nr_topics"):
        assert not params[falsy_key]

    # These entries must match exactly.
    expected_values = {
        "n_gram_range": (1, 1),
        "min_topic_size": 10,
        "language": "english",
    }
    for key, expected in expected_values.items():
        assert params[key] == expected
|
||||
@@ -0,0 +1,34 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_size", [50, None])
@pytest.mark.parametrize("padding", [True, False])
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
    ],
)
def test_approximate_distribution(batch_size, padding, model, documents, request):
    """Check the column count of document-level and token-level approximate distributions."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    def expected_columns():
        # Number of topic labels minus the model's ``_outliers`` value.
        return len(topic_model.topic_labels_) - topic_model._outliers

    # Document-level distribution over the full corpus.
    doc_distr, _ = topic_model.approximate_distribution(documents, padding=padding, batch_size=batch_size)
    assert doc_distr.shape[1] == expected_columns()

    # The first three rows must be accepted by the distribution visualization.
    for row_index in (0, 1, 2):
        topic_model.visualize_distribution(doc_distr[row_index])

    # Token-level distribution on the first 100 documents.
    sample = documents[:100]
    doc_distr, token_distrs = topic_model.approximate_distribution(sample, calculate_tokens=True)
    assert doc_distr.shape[1] == expected_columns()
    assert len(token_distrs) == len(sample)

    for per_document in token_distrs:
        assert per_document.shape[1] == expected_columns()
|
||||
@@ -0,0 +1,55 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_barchart(model, request):
    """Check the annotation count of the barchart and that no annotation ends in ``-1``."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # (keyword arguments, expected number of annotations)
    cases = (
        ({}, 8),
        ({"top_n_topics": 5}, 5),
    )
    for kwargs, expected_count in cases:
        figure = topic_model.visualize_barchart(**kwargs)
        annotations = figure.to_dict()["layout"]["annotations"]

        assert len(annotations) == expected_count
        for annotation in annotations:
            # The last space-separated token of the annotation text is parsed as an int.
            assert int(annotation["text"].split(" ")[-1]) != -1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_barchart_outlier(model, request):
    """Check the barchart annotations after forcing a ``-1`` entry into ``topic_sizes_``."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topic_model.topic_sizes_[-1] = 4

    # (keyword arguments, expected number of annotations)
    cases = (
        ({}, 8),
        ({"top_n_topics": 5}, 5),
    )
    for kwargs, expected_count in cases:
        figure = topic_model.visualize_barchart(**kwargs)
        annotations = figure.to_dict()["layout"]["annotations"]

        assert len(annotations) == expected_count
        for annotation in annotations:
            # The last space-separated token of the annotation text is parsed as an int.
            assert int(annotation["text"].split(" ")[-1]) != -1
|
||||
@@ -0,0 +1,22 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
    ],
)
def test_documents(model, reduced_embeddings, documents, request):
    """Check that the document plot's traces (after the first) cover exactly the non ``-1`` topics."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    expected_topics = set(topic_model.topics_)
    expected_topics.discard(-1)

    figure = topic_model.visualize_documents(documents, embeddings=reduced_embeddings, hide_document_hover=True)
    # The first trace is excluded; the rest are named "<topic id>_...".
    traces = figure.to_dict()["data"][1:]
    plotted_topics = {int(trace["name"].split("_")[0]) for trace in traces}

    assert plotted_topics == expected_topics
|
||||
@@ -0,0 +1,22 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_dynamic(model, documents, request):
    """Check the number of traces in the topics-over-time figure."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # Synthetic timestamps: ten distinct values cycling over the documents.
    timestamps = [position % 10 for position, _ in enumerate(documents)]
    over_time = topic_model.topics_over_time(documents, timestamps)
    traces = topic_model.visualize_topics_over_time(over_time).to_dict()["data"]

    expected_traces = len(set(topic_model.topics_)) - topic_model._outliers
    assert len(traces) == expected_traces
|
||||
@@ -0,0 +1,23 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
    ],
)
def test_heatmap(model, request):
    """Check that the heatmap's x-axis labels cover exactly the non ``-1`` topics."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    expected_topics = set(topic_model.topics_)
    expected_topics.discard(-1)

    # Labels on the x axis are of the form "<topic id>_...".
    axis_labels = topic_model.visualize_heatmap().to_dict()["data"][0]["x"]
    plotted_topics = {int(label.split("_")[0]) for label in axis_labels}

    assert plotted_topics == expected_topics
|
||||
@@ -0,0 +1,8 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    ["kmeans_pca_topic_model", "base_topic_model", "custom_topic_model"],
)
def test_term_rank(model, request):
    """Smoke test: ``visualize_term_rank`` runs on a copy of each model without raising."""
    fitted_model = request.getfixturevalue(model)
    copy.deepcopy(fitted_model).visualize_term_rank()
|
||||
@@ -0,0 +1,52 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_topics(model, request):
    """Check that no slider step label of the topic visualization ends in ``-1``."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # Once with default arguments, once restricted to five topics.
    for kwargs in ({}, {"top_n_topics": 5}):
        sliders = topic_model.visualize_topics(**kwargs).to_dict()["layout"]["sliders"]
        step_labels = [step["label"] for slider in sliders for step in slider["steps"]]
        for label in step_labels:
            assert int(label.split(" ")[-1]) != -1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_topics_outlier(model, request):
    """Check the slider step labels after forcing a ``-1`` entry into ``topic_sizes_``."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topic_model.topic_sizes_[-1] = 4

    # Once with default arguments, once restricted to five topics.
    for kwargs in ({}, {"top_n_topics": 5}):
        sliders = topic_model.visualize_topics(**kwargs).to_dict()["layout"]["sliders"]
        step_labels = [step["label"] for slider in sliders for step in slider["steps"]]
        for label in step_labels:
            assert int(label.split(" ")[-1]) != -1
|
||||
@@ -0,0 +1,59 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_delete(model, request):
    """Delete topics twice and check topic counts, document counts and label mappings each time."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    nr_topics = len(set(topic_model.topics_))
    length_documents = len(topic_model.topics_)

    # These two models are expected to lose one distinct topic fewer than the others.
    fewer_removed = model in ("online_topic_model", "kmeans_pca_topic_model")

    def check_after_deletion(removed_if_fewer, removed_otherwise):
        cluster_labels = topic_model.hdbscan_model.labels_
        mappings = topic_model.topic_mapper_.get_mappings(list(cluster_labels))
        mapped_labels = [mappings[label] for label in cluster_labels]

        removed = removed_if_fewer if fewer_removed else removed_otherwise
        assert nr_topics == len(set(topic_model.topics_)) + removed
        assert topic_model.get_topic_info().Count.sum() == length_documents

        # The online model is compared against ``topics_`` from index 950 onwards only.
        if model == "online_topic_model":
            assert mapped_labels == topic_model.topics_[950:]
        else:
            assert mapped_labels == topic_model.topics_

    # First deletion
    topic_model.delete_topics([1, 2])
    check_after_deletion(1, 2)

    # Second deletion: the two smallest remaining topic ids, never -1
    still_present = [topic for topic in sorted(set(topic_model.topics_)) if topic != -1]
    topic_model.delete_topics(still_present[:2])
    check_after_deletion(3, 4)
|
||||
@@ -0,0 +1,42 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_merge(model, documents, request):
    """Merge topics 1 and 2 twice in a row and check counts and label mappings after each merge."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    nr_topics = len(set(topic_model.topics_))

    # Each merge is expected to reduce the number of distinct topics by one.
    for merges_done in (1, 2):
        topic_model.merge_topics(documents, [1, 2])

        cluster_labels = topic_model.hdbscan_model.labels_
        mappings = topic_model.topic_mapper_.get_mappings(list(cluster_labels))
        mapped_labels = [mappings[label] for label in cluster_labels]

        assert nr_topics == len(set(topic_model.topics_)) + merges_done
        assert topic_model.get_topic_info().Count.sum() == len(documents)

        # The online model is compared against ``topics_`` from index 950 onwards only.
        if model == "online_topic_model":
            assert mapped_labels == topic_model.topics_[950:]
        else:
            assert mapped_labels == topic_model.topics_
|
||||
@@ -0,0 +1,126 @@
|
||||
import copy
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_get_topic(model, request):
    """Check ``get_topic`` for every assigned topic and for the id 500."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    assigned_ids = set(topic_model.topics_)
    representations = [topic_model.get_topic(topic_id) for topic_id in assigned_ids]
    missing = topic_model.get_topic(500)

    for representation in representations:
        assert representation is not False

    assert len(representations) == len(topic_model.get_topic_info())
    # The lookup of id 500 must give a falsy result.
    assert not missing
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_get_topics(model, request):
    """Check that ``get_topics`` equals ``topic_representations_`` and has one key per assigned topic."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    all_topics = topic_model.get_topics()
    distinct_assigned = set(topic_model.topics_)

    assert all_topics == topic_model.topic_representations_
    assert len(all_topics.keys()) == len(distinct_assigned)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_get_topic_freq(model, request):
    """Check the return types of ``get_topic_freq`` and that its topics agree with the topic mapper."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # Asking for a single topic must not give back a DataFrame.
    for topic_id in set(topic_model.topics_):
        single = topic_model.get_topic_freq(topic_id)
        assert not isinstance(single, pd.DataFrame)

    frequency_table = topic_model.get_topic_freq()
    assigned = set(topic_model.topics_)
    # Last column of the mapper's ``mappings_``.
    mapped = set(np.array(topic_model.topic_mapper_.mappings_)[:, -1])

    assert isinstance(frequency_table, pd.DataFrame)

    assert len(frequency_table) == len(set(topic_model.topics_))
    # Neither side may contain a topic the other one lacks.
    assert len(mapped - assigned) == 0
    assert len(assigned - mapped) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
    ],
)
def test_get_representative_docs(model, request):
    """Check that representative docs exist for every topic, three per topic."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    docs_per_topic = topic_model.get_representative_docs()
    assigned = set(topic_model.topics_)
    # Last column of the mapper's ``mappings_``.
    mapped = set(np.array(topic_model.topic_mapper_.mappings_)[:, -1])

    # The number of entries must agree with every other per-topic structure.
    entry_count = len(docs_per_topic)
    assert entry_count == len(topic_model.topic_sizes_.keys())
    assert entry_count == len(mapped)
    assert entry_count == topic_model.c_tf_idf_.shape[0]
    assert entry_count == len(topic_model.topic_labels_)
    assert all(len(docs) == 3 for docs in docs_per_topic.values())

    # No key may fall outside the assigned or mapped topics.
    keyed_topics = set(docs_per_topic)
    assert len(keyed_topics - assigned) == 0
    assert len(keyed_topics - mapped) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_get_topic_info(model, request):
    """Check the first row of the topic info and the row counts of single-topic lookups."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    info = topic_model.get_topic_info()

    # The first row is topic -1 when ``_outliers`` is truthy, topic 0 otherwise.
    expected_first_topic = -1 if topic_model._outliers else 0
    assert info.iloc[0].Topic == expected_first_topic

    for topic_id in set(topic_model.topics_):
        assert len(topic_model.get_topic_info(topic_id)) == 1

    # Looking up id 200 must give an empty result.
    assert len(topic_model.get_topic_info(200)) == 0
|
||||
@@ -0,0 +1,72 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_generate_topic_labels(model, request):
    """Check generated labels with and without the topic prefix, and with a single word."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # Without the prefix, fewer than 20% of the labels may start with a digit
    # (the first label is left out of the count but not of the denominator).
    unprefixed = topic_model.generate_topic_labels(topic_prefix=False)
    digit_led = sum(1 for label in unprefixed[1:] if label[0].isdigit())
    assert digit_led / len(unprefixed) < 0.2

    # With default arguments, the part before the first "_" is the topic id, in sorted order.
    label_ids = [int(label.split("_")[0]) for label in topic_model.generate_topic_labels()]
    assert label_ids == sorted(set(topic_model.topics_))

    # Single-word labels must be shorter than 15 characters.
    one_word = topic_model.generate_topic_labels(nr_words=1, topic_prefix=False)
    assert all(len(label) < 15 for label in one_word)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_set_labels(model, request):
    """Check ``set_topic_labels`` with a full list of labels and with partial dict updates."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    def label_at(position):
        # Index into ``custom_labels_`` shifted by the model's ``_outliers`` value.
        return topic_model.custom_labels_[position + topic_model._outliers]

    # Setting a complete list of generated labels stores it unchanged.
    generated = topic_model.generate_topic_labels()
    topic_model.set_topic_labels(generated)
    assert topic_model.custom_labels_ == generated

    if model == "online_topic_model":
        # The online model is addressed through its sorted distinct topic ids.
        ordered = sorted(set(topic_model.topics_))
        topic_model.set_topic_labels({ordered[0]: "My label", ordered[1]: "Another label"})
        assert topic_model.custom_labels_[0] == "My label"
        assert topic_model.custom_labels_[1] == "Another label"

        ordered = sorted(set(topic_model.topics_))
        topic_model.set_topic_labels({ordered[0]: "Change label", ordered[2]: "New label"})
        assert label_at(0) == "Change label"
        assert label_at(2) == "New label"
        return

    # All other models are addressed through fixed topic ids.
    topic_model.set_topic_labels({1: "My label", 2: "Another label"})
    assert label_at(1) == "My label"
    assert label_at(2) == "Another label"

    topic_model.set_topic_labels({1: "Change label", 3: "New label"})
    assert label_at(1) == "Change label"
    assert label_at(3) == "New label"
|
||||
@@ -0,0 +1,184 @@
|
||||
import copy
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
    ],
)
def test_update_topics(model, documents, request):
    """Verify ``update_topics`` with a wider n-gram range and with merged topics.

    Widening the n-gram range must grow the c-TF-IDF vocabulary while leaving
    the topic assignments untouched. Re-running the update with one topic
    folded into topic 0 must reduce the number of distinct topics by one.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    ctfidf_before = topic_model.c_tf_idf_
    topics_before = topic_model.topics_

    topic_model.update_topics(documents, n_gram_range=(1, 3))

    assert ctfidf_before.shape[1] < topic_model.c_tf_idf_.shape[1]
    assert topics_before == topic_model.topics_

    # Fold topic 1, then topic 2, into topic 0; each merge removes one topic.
    for merged_away in (1, 2):
        remapped = [0 if topic == merged_away else topic for topic in topics_before]
        topic_model.update_topics(documents, topics=remapped, n_gram_range=(1, 3))

        assert len(set(topics_before)) - 1 == len(set(topic_model.topics_))

        topics_before = topic_model.topics_
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_extract_topics(model, documents, request):
    """Verify topic extraction on randomly assigned topics.

    Each document gets a random topic drawn from ``[-1, nr_topics - 1)``.
    After updating topic sizes and extracting topics, the c-TF-IDF matrix and
    the frequency table must reflect the number of topics and documents.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    nr_topics = 5
    n_docs = len(documents)
    doc_frame = pd.DataFrame(
        {
            "Document": documents,
            "ID": range(n_docs),
            "Topic": np.random.randint(-1, nr_topics - 1, n_docs),
        }
    )

    topic_model._update_topic_size(doc_frame)
    topic_model._extract_topics(doc_frame)
    freq = topic_model.get_topic_freq()

    ctfidf = topic_model.c_tf_idf_
    assert ctfidf.shape[0] == 5
    assert ctfidf.shape[1] > 100
    assert isinstance(freq, pd.DataFrame)
    assert nr_topics == len(freq.Topic.unique())
    assert freq.Count.sum() == len(doc_frame)
    # Every topic appears exactly once in the frequency table.
    assert len(freq.Topic.unique()) == len(freq)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_extract_topics_custom_cv(model, documents, request):
    """Verify topic extraction after swapping in a custom ``CountVectorizer``.

    Same checks as the plain extraction test, but the model's vectorizer is
    replaced by a unigram+bigram ``CountVectorizer`` before extraction.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    nr_topics = 5
    n_docs = len(documents)
    doc_frame = pd.DataFrame(
        {
            "Document": documents,
            "ID": range(n_docs),
            "Topic": np.random.randint(-1, nr_topics - 1, n_docs),
        }
    )

    topic_model.vectorizer_model = CountVectorizer(ngram_range=(1, 2))
    topic_model._update_topic_size(doc_frame)
    topic_model._extract_topics(doc_frame)
    freq = topic_model.get_topic_freq()

    ctfidf = topic_model.c_tf_idf_
    assert ctfidf.shape[0] == 5
    assert ctfidf.shape[1] > 100
    assert isinstance(freq, pd.DataFrame)
    assert nr_topics == len(freq.Topic.unique())
    assert freq.Count.sum() == len(doc_frame)
    # Every topic appears exactly once in the frequency table.
    assert len(freq.Topic.unique()) == len(freq)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
@pytest.mark.parametrize("reduced_topics", [2, 4, 10])
def test_topic_reduction(model, reduced_topics, documents, request):
    """Verify that ``reduce_topics`` changes assignments but keeps tables consistent.

    The frequency tables before and after reduction must list each topic
    once, the number of assignments must be unchanged, and the assignments
    themselves must differ from the originals.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topics_before = copy.deepcopy(topic_model.topics_)
    freq_before = topic_model.get_topic_freq()

    topic_model.reduce_topics(documents, nr_topics=reduced_topics)

    freq_after = topic_model.get_topic_freq()

    # NOTE(review): the extracted source lost its indentation; it is assumed
    # that only the document-count check is skipped for the online model and
    # the remaining assertions apply to every model - confirm.
    if model != "online_topic_model":
        assert freq_before.Count.sum() == freq_after.Count.sum()
    assert len(freq_before.Topic.unique()) == len(freq_before)
    assert len(freq_after.Topic.unique()) == len(freq_after)
    assert len(topic_model.topics_) == len(topics_before)
    assert topic_model.topics_ != topics_before
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_topic_reduction_edge_cases(model, documents, request):
    """Verify that ``_reduce_topics`` leaves the data unchanged when ``nr_topics`` is 100.

    Topics are drawn at random from ``[-1, 4)`` while the model's
    ``nr_topics`` is set to 100; both the document frame and the frequency
    table must come back unchanged.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topic_model.nr_topics = 100
    nr_topics = 5
    random_topics = np.random.randint(-1, nr_topics - 1, len(documents))
    docs_before = pd.DataFrame(
        {
            "Document": documents,
            "ID": range(len(documents)),
            "Topic": random_topics,
        }
    )
    topic_model._update_topic_size(docs_before)
    docs_before = topic_model._sort_mappings_by_frequency(docs_before)
    topic_model._extract_topics(docs_before)
    freq_before = topic_model.get_topic_freq()

    docs_after = topic_model._reduce_topics(docs_before)
    freq_after = topic_model.get_topic_freq()

    # No topic present before the reduction may be missing afterwards.
    assert set(docs_before.Topic).issubset(set(docs_after.Topic))
    pd.testing.assert_frame_equal(docs_before, docs_after)
    pd.testing.assert_frame_equal(freq_before, freq_after)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_find_topics(model, request):
    """Verify that ``find_topics`` returns at least one reasonably similar topic for "car"."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    matches, scores = topic_model.find_topics("car")

    assert np.mean(scores) > 0.1
    assert len(matches) > 0
|
||||
@@ -0,0 +1,77 @@
|
||||
import pytest
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.cluster import KMeans
|
||||
from hdbscan import HDBSCAN
|
||||
|
||||
from bertopic import BERTopic
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cluster_model", ["hdbscan", "kmeans"])
@pytest.mark.parametrize(
    "samples,features,centers",
    [
        (200, 500, 1),
        (500, 200, 1),
        (200, 500, 2),
        (500, 200, 2),
        (200, 500, 4),
        (500, 200, 4),
    ],
)
def test_hdbscan_cluster_embeddings(cluster_model, samples, features, centers):
    """Verify that ``_cluster_embeddings`` recovers the number of synthetic blobs.

    Embeddings are generated with ``make_blobs`` and clustered with either
    KMeans or HDBSCAN. The number of distinct topics must equal ``centers``
    and every column other than ``Topic`` must be unchanged.
    """
    embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42)
    doc_ids = [str(idx) for idx in range(1, embeddings.shape[0] + 1)]
    frame_before = pd.DataFrame({"Document": doc_ids, "ID": range(len(doc_ids)), "Topic": None})

    if cluster_model != "kmeans":
        clusterer = HDBSCAN(
            min_cluster_size=10,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
        )
    else:
        clusterer = KMeans(n_clusters=centers)

    topic_model = BERTopic(hdbscan_model=clusterer)
    frame_after, _ = topic_model._cluster_embeddings(embeddings, frame_before)

    assert len(frame_after.Topic.unique()) == centers
    assert "Topic" in frame_after.columns
    pd.testing.assert_frame_equal(
        frame_before.drop("Topic", axis=1),
        frame_after.drop("Topic", axis=1),
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cluster_model", ["hdbscan", "kmeans"])
@pytest.mark.parametrize(
    "samples,features,centers",
    [
        (200, 500, 1),
        (500, 200, 1),
        (200, 500, 2),
        (500, 200, 2),
        (200, 500, 4),
        (500, 200, 4),
    ],
)
def test_custom_hdbscan_cluster_embeddings(cluster_model, samples, features, centers):
    """Verify ``_cluster_embeddings`` with an explicitly constructed cluster model.

    Synthetic blobs are clustered with a caller-supplied KMeans or HDBSCAN
    instance. The number of distinct topics must equal ``centers`` and all
    columns except ``Topic`` must be unchanged.
    """
    embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42)
    doc_ids = [str(idx) for idx in range(1, embeddings.shape[0] + 1)]
    frame_before = pd.DataFrame({"Document": doc_ids, "ID": range(len(doc_ids)), "Topic": None})

    if cluster_model != "kmeans":
        clusterer = HDBSCAN(
            min_cluster_size=10,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
        )
    else:
        clusterer = KMeans(n_clusters=centers)

    topic_model = BERTopic(hdbscan_model=clusterer)
    frame_after, _ = topic_model._cluster_embeddings(embeddings, frame_before)

    assert len(frame_after.Topic.unique()) == centers
    assert "Topic" in frame_after.columns
    pd.testing.assert_frame_equal(
        frame_before.drop("Topic", axis=1),
        frame_after.drop("Topic", axis=1),
    )
|
||||
@@ -0,0 +1,40 @@
|
||||
import copy
|
||||
import pytest
|
||||
import numpy as np
|
||||
from umap import UMAP
|
||||
from sklearn.decomposition import PCA
|
||||
|
||||
from bertopic import BERTopic
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dim_model", [UMAP, PCA])
@pytest.mark.parametrize(
    "embeddings,shape,n_components",
    [
        (np.random.rand(100, 128), 100, 5),
        (np.random.rand(10, 256), 10, 5),
        (np.random.rand(50, 15), 50, 10),
    ],
)
def test_reduce_dimensionality(dim_model, embeddings, shape, n_components):
    """Verify that ``_reduce_dimensionality`` yields ``(shape, n_components)`` output.

    ``dim_model`` is the reducer class (UMAP or PCA); it is instantiated with
    the requested number of components and handed to BERTopic.
    """
    reducer = dim_model(n_components=n_components)
    topic_model = BERTopic(umap_model=reducer)
    reduced = topic_model._reduce_dimensionality(embeddings)

    assert reduced.shape == (shape, n_components)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_custom_reduce_dimensionality(model, request):
    """Verify that each fixture model reduces 128-dimensional input to fewer dimensions."""
    raw_embeddings = np.random.rand(500, 128)
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    reduced = topic_model._reduce_dimensionality(raw_embeddings)

    assert reduced.shape[1] < raw_embeddings.shape[1]
|
||||
@@ -0,0 +1,65 @@
|
||||
import copy
|
||||
import pytest
|
||||
import numpy as np
|
||||
from bertopic import BERTopic
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_extract_embeddings(model, request):
    """Verify the shape, value range and similarity of extracted embeddings.

    A single string and a list of two strings are embedded. Both results
    must be 384 columns wide with values strictly inside (-5, 5). The single
    document must be less similar to "something different" than to
    "another document".
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    single_embedding = topic_model._extract_embeddings("a document")
    multiple_embeddings = topic_model._extract_embeddings(["something different", "another document"])
    similarities = cosine_similarity(single_embedding, multiple_embeddings)[0]

    for batch, expected_rows in ((single_embedding, 1), (multiple_embeddings, 2)):
        assert batch.shape[0] == expected_rows
        assert batch.shape[1] == 384
        assert np.min(batch) > -5
        assert np.max(batch) < 5

    assert similarities[0] < 0.5
    assert similarities[1] > 0.5
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_extract_embeddings_compare(model, embedding_model, request):
    """Verify that BERTopic's embeddings equal those of the ``embedding_model`` fixture."""
    docs = ["some document"]
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    from_bertopic = topic_model._extract_embeddings(docs)
    assert isinstance(from_bertopic, np.ndarray)
    assert from_bertopic.shape == (1, 384)

    # Encoding the same text directly must give an identical array.
    from_encoder = embedding_model.encode(docs, show_progress_bar=False)
    assert np.array_equal(from_bertopic, from_encoder)
|
||||
|
||||
|
||||
def test_extract_incorrect_embeddings():
    """Verify that an unknown ``language`` leads to a ``ValueError``.

    Construction and fitting both sit inside the ``raises`` block, so the
    error may come from either step.
    """
    with pytest.raises(ValueError):
        BERTopic(language="Unknown language").fit(["some document"])
|
||||
@@ -0,0 +1,94 @@
|
||||
import pytest
|
||||
import logging
|
||||
import numpy as np
|
||||
from typing import List
|
||||
from bertopic._utils import (
|
||||
check_documents_type,
|
||||
check_embeddings_shape,
|
||||
MyLogger,
|
||||
select_topic_representation,
|
||||
get_unique_distances,
|
||||
)
|
||||
from scipy.sparse import csr_matrix
|
||||
|
||||
|
||||
def test_logger():
    """Verify that ``MyLogger.configure`` sets the expected numeric log level."""
    expected_levels = (
        ("DEBUG", logging.DEBUG),  # numeric value 10
        ("WARNING", logging.WARNING),  # numeric value 30
    )
    for level_name, level_value in expected_levels:
        # A fresh wrapper is created for every level, as in a first-time setup.
        wrapper = MyLogger()
        wrapper.configure(level_name)
        assert isinstance(wrapper.logger, logging.Logger)
        assert wrapper.logger.level == level_value
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "docs",
    [
        "A document not in an iterable",
        [None],
        5,
    ],
)
def test_check_documents_type(docs):
    """Verify that a bare string, ``[None]`` and an int are each rejected with ``TypeError``."""
    with pytest.raises(TypeError):
        check_documents_type(docs)
|
||||
|
||||
|
||||
def test_check_embeddings_shape():
    """Verify that two embedding rows for two documents are accepted.

    There is no explicit assertion: the test passes when the check does not
    raise.
    """
    two_docs = ["doc_one", "doc_two"]
    two_rows = np.array([[1, 2, 3], [2, 3, 4]])
    check_embeddings_shape(two_rows, two_docs)
|
||||
|
||||
|
||||
def test_make_unique_distances():
    """Verify that ``get_unique_distances`` keeps the length and removes duplicates."""

    def assert_unique(dists: List[float], noise_max: float):
        """Check that the output has as many elements as the input, all distinct."""
        result = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max)
        assert len(result) == len(dists), "The number of elements must be the same"
        assert len(dists) == len(np.unique(result)), "The distances must be unique"

    cases = (
        # A few duplicates with a tiny noise bound.
        ([0, 0, 0.5, 0.75, 1, 1], 1e-7),
        # Noise bound far larger than the distances themselves.
        ([0, 0, 0, 0.5, 0.75, 1, 1], 20),
        # Every distance identical.
        ([0, 0, 0, 0, 0, 0, 0], 1e-7),
    )
    for dists, noise_max in cases:
        assert_unique(dists, noise_max=noise_max)
|
||||
|
||||
|
||||
def test_select_topic_representation():
    """Verify which representation ``select_topic_representation`` returns.

    Covers the preferred representation, the fallback when the preferred one
    is ``None``, and sparse c-TF-IDF input with and without conversion to an
    ndarray.
    """
    ctfidf_dense = np.array([[1, 1, 1]])
    ctfidf_sparse = csr_matrix(
        (ctfidf_dense.ravel().tolist(), ([0, 0, 0], [0, 1, 2])),
        shape=ctfidf_dense.shape,
    )
    topic_vectors = np.array([[2, 2, 2]])

    # Topic embeddings requested and available.
    chosen, used_ctfidf = select_topic_representation(ctfidf_dense, topic_vectors, use_ctfidf=False)
    np.testing.assert_array_equal(topic_vectors, chosen)
    assert not used_ctfidf

    # Topic embeddings requested but missing: c-TF-IDF is returned instead.
    chosen, used_ctfidf = select_topic_representation(ctfidf_dense, None, use_ctfidf=False)
    np.testing.assert_array_equal(ctfidf_dense, chosen)
    assert used_ctfidf

    # c-TF-IDF requested and available.
    chosen, used_ctfidf = select_topic_representation(ctfidf_dense, topic_vectors, use_ctfidf=True)
    np.testing.assert_array_equal(ctfidf_dense, chosen)
    assert used_ctfidf

    # c-TF-IDF requested but missing: topic embeddings are returned instead.
    chosen, used_ctfidf = select_topic_representation(None, topic_vectors, use_ctfidf=True)
    np.testing.assert_array_equal(topic_vectors, chosen)
    assert not used_ctfidf

    # A `scipy.sparse.csr_matrix` is accepted as c-TF-IDF and can be densified.
    densified = select_topic_representation(ctfidf_sparse, None, use_ctfidf=True, output_ndarray=True)[0]
    np.testing.assert_array_equal(ctfidf_dense, densified)

    # With `output_ndarray=False` the `csr_matrix` is returned without conversion.
    kept_sparse = select_topic_representation(ctfidf_sparse, None, output_ndarray=False)[0]

    assert isinstance(kept_sparse, csr_matrix)
|
||||
@@ -0,0 +1,29 @@
|
||||
import copy
|
||||
import pytest
|
||||
from sklearn.datasets import fetch_20newsgroups
|
||||
|
||||
# Module-level setup: runs once when this test module is imported.
# Loads 20 Newsgroups without headers, footers and quotes, then maps the first
# 1000 numeric targets to their class names.
# NOTE(review): presumably aligned with the first 1000 documents served by the
# `documents` fixture - confirm against conftest.
data = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))
classes = [data["target_names"][target] for target in data["target"][:1000]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_class(model, documents, request):
    """Verify ``topics_per_class`` with and without global tuning.

    Both results must account for every document, cover exactly the model's
    topics, and contain one entry per distinct class.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    tuned_globally = topic_model.topics_per_class(documents, classes=classes, global_tuning=True)
    tuned_locally = topic_model.topics_per_class(documents, classes=classes, global_tuning=False)
    results = (tuned_globally, tuned_locally)

    for per_class in results:
        assert per_class.Frequency.sum() == len(documents)
    for per_class in results:
        assert set(per_class.Topic.unique()) == set(topic_model.topics_)
    for per_class in results:
        assert len(per_class.Class.unique()) == len(set(classes))
|
||||
@@ -0,0 +1,22 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_dynamic(model, documents, request):
    """Verify ``topics_over_time`` using synthetic timestamps cycling through 0-9."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    # Timestamp for each document is its position modulo 10.
    timestamps = [position % 10 for position, _ in enumerate(documents)]
    over_time = topic_model.topics_over_time(documents, timestamps)

    assert over_time.Frequency.sum() == len(documents)
    assert set(over_time.Topic.unique()) == set(topic_model.topics_)
    assert len(over_time.Timestamp.unique()) == len(set(timestamps))
|
||||
@@ -0,0 +1,69 @@
|
||||
import copy
|
||||
import pytest
|
||||
from scipy.cluster import hierarchy as sch
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_hierarchy(model, documents, request):
    """Verify that ``hierarchical_topics`` covers every topic except ``-1``."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    hierarchy = topic_model.hierarchical_topics(documents)

    # Flatten the per-row topic lists into one set.
    covered = {topic for row in hierarchy.Topics.values for topic in row}

    assert len(hierarchy) > 0
    assert covered == set(topic_model.topics_).difference({-1})
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_linkage(model, documents, request):
    """Verify ``hierarchical_topics`` with a custom single-linkage function.

    The resulting hierarchy must be non-empty and cover all topics except
    ``-1``. The rendered tree must be longer than 50 characters and have at
    most two lines per distinct topic.
    """

    def linkage_function(x):
        return sch.linkage(x, "single", optimal_ordering=True)

    topic_model = copy.deepcopy(request.getfixturevalue(model))
    hierarchy = topic_model.hierarchical_topics(documents, linkage_function=linkage_function)
    covered = {topic for row in hierarchy.Topics.values for topic in row}
    tree = topic_model.get_topic_tree(hierarchy)

    assert len(hierarchy) > 0
    assert len(tree) > 50
    assert len(tree.split("\n")) <= 2 * len(set(topic_model.topics_))
    assert covered == set(topic_model.topics_).difference({-1})
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_tree(model, documents, request):
    """Verify the text tree produced by ``get_topic_tree`` from a single-linkage hierarchy.

    The tree must be longer than 50 characters and have at most two lines
    per distinct topic. The hierarchy it is built from must be non-empty and
    cover all topics except ``-1``.
    """

    def linkage_function(x):
        return sch.linkage(x, "single", optimal_ordering=True)

    topic_model = copy.deepcopy(request.getfixturevalue(model))
    hierarchy = topic_model.hierarchical_topics(documents, linkage_function=linkage_function)
    covered = {topic for row in hierarchy.Topics.values for topic in row}
    tree = topic_model.get_topic_tree(hierarchy)

    assert len(hierarchy) > 0
    assert len(tree) > 50
    assert len(tree.split("\n")) <= 2 * len(set(topic_model.topics_))
    assert covered == set(topic_model.topics_).difference({-1})
|
||||
@@ -0,0 +1,101 @@
|
||||
import copy
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from packaging import version
|
||||
from scipy.sparse import csr_matrix
|
||||
from sklearn import __version__ as sklearn_version
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from bertopic.vectorizers import ClassTfidfTransformer
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_ctfidf(model, documents, request):
    """Verify the bag-of-words and c-TF-IDF matrices built from per-topic documents.

    Documents are joined per topic, preprocessed, vectorized with the model's
    own vectorizer and transformed with ``ClassTfidfTransformer``. Both
    matrices must be sparse with one row per topic and one column per word.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topics = topic_model.topics_
    doc_frame = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
    # One concatenated document per topic.
    joined_per_topic = doc_frame.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
    cleaned_docs = topic_model._preprocess_text(joined_per_topic.Document.values)
    count = topic_model.vectorizer_model.fit(cleaned_docs)

    # scikit-learn deprecated get_feature_names in 1.0 (removed in 1.2) in
    # favour of get_feature_names_out, so pick the accessor by version.
    if version.parse(sklearn_version) < version.parse("1.0.0"):
        words = count.get_feature_names()
    else:
        words = count.get_feature_names_out()

    X = count.transform(cleaned_docs)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    assert isinstance(X, csr_matrix)
    assert isinstance(c_tf_idf, csr_matrix)

    n_topics = len(set(topics))
    assert X.shape[0] == n_topics
    assert X.shape[1] == len(words)

    assert c_tf_idf.shape[0] == n_topics
    assert c_tf_idf.shape[1] == len(words)

    assert np.min(X) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_ctfidf_custom_cv(model, documents, request):
    """Verify the c-TF-IDF pipeline with a custom 1-3 gram English-stop-word vectorizer.

    The model's vectorizer is replaced before the per-topic documents are
    vectorized. Both matrices must be sparse with one row per topic and one
    column per word.
    """
    custom_vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words="english")
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topic_model.vectorizer_model = custom_vectorizer
    topics = topic_model.topics_
    doc_frame = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
    # One concatenated document per topic.
    joined_per_topic = doc_frame.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
    cleaned_docs = topic_model._preprocess_text(joined_per_topic.Document.values)
    count = topic_model.vectorizer_model.fit(cleaned_docs)

    # scikit-learn deprecated get_feature_names in 1.0 (removed in 1.2) in
    # favour of get_feature_names_out, so pick the accessor by version.
    if version.parse(sklearn_version) < version.parse("1.0.0"):
        words = count.get_feature_names()
    else:
        words = count.get_feature_names_out()

    X = count.transform(cleaned_docs)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    assert isinstance(X, csr_matrix)
    assert isinstance(c_tf_idf, csr_matrix)

    n_topics = len(set(topics))
    assert X.shape[0] == n_topics
    assert X.shape[1] == len(words)

    assert c_tf_idf.shape[0] == n_topics
    assert c_tf_idf.shape[1] == len(words)

    assert np.min(X) == 0
|
||||
@@ -0,0 +1,38 @@
|
||||
import copy
|
||||
import pytest
|
||||
from bertopic.vectorizers import OnlineCountVectorizer
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_online_cv(model, documents, request):
    """Verify that updating topics with a bigram ``OnlineCountVectorizer`` changes them.

    Topic representations are collected before and after the update. Every
    topic whose original top word is non-empty must have changed.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    bigram_vectorizer = OnlineCountVectorizer(stop_words="english", ngram_range=(2, 2))

    before = [topic_model.get_topic(topic) for topic in set(topic_model.topics_)]
    topic_model.update_topics(documents, vectorizer_model=bigram_vectorizer)
    after = [topic_model.get_topic(topic) for topic in set(topic_model.topics_)]

    for old_topic, new_topic in zip(before, after):
        # Topics whose top word is the empty string are exempt from the check.
        if old_topic[0][0] == "":
            continue
        assert old_topic != new_topic
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["online_topic_model"])
def test_clean_bow(model, request):
    """Verify that ``_clean_bow`` drops columns but keeps rows once ``delete_min_df`` is 2."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    vectorizer = topic_model.vectorizer_model

    rows_before, cols_before = vectorizer.X_.shape
    vectorizer.delete_min_df = 2
    vectorizer._clean_bow()

    assert rows_before == vectorizer.X_.shape[0]
    assert cols_before > vectorizer.X_.shape[1]
|
||||
Reference in New Issue
Block a user