Add BERTopic.
This commit is contained in:
@@ -0,0 +1,34 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_size", [50, None])
@pytest.mark.parametrize("padding", [True, False])
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
    ],
)
def test_approximate_distribution(batch_size, padding, model, documents, request):
    """Check the shapes produced by ``approximate_distribution``.

    The model fixture named by ``model`` is resolved through ``request`` and
    deep-copied before use. The test then verifies that:

    * the document-level distribution has one column per topic label minus
      ``_outliers``, for every ``padding`` / ``batch_size`` combination;
    * ``visualize_distribution`` can be called on the first three rows;
    * with ``calculate_tokens=True`` on the first 100 documents, one
      token-level matrix is returned per document, each with the same
      number of columns as the document-level distribution.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    def expected_width():
        # Re-evaluated at every assertion, exactly like the inline expression
        # it replaces, so any change to the model in between is picked up.
        return len(topic_model.topic_labels_) - topic_model._outliers

    # Document-level distribution only (no token-level output requested).
    doc_level_result = topic_model.approximate_distribution(
        documents,
        padding=padding,
        batch_size=batch_size,
    )
    topic_distr, _ = doc_level_result
    assert topic_distr.shape[1] == expected_width()

    # The distribution plot must be callable for the first three documents.
    for row in range(3):
        topic_model.visualize_distribution(topic_distr[row])

    # Token-level distributions on the first 100 documents.
    topic_distr, topic_token_distr = topic_model.approximate_distribution(
        documents[:100],
        calculate_tokens=True,
    )
    assert topic_distr.shape[1] == expected_width()
    assert len(topic_token_distr) == len(documents[:100])

    for per_document_tokens in topic_token_distr:
        assert per_document_tokens.shape[1] == expected_width()
|
||||
@@ -0,0 +1,55 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_barchart(model, request):
    """Check the annotations of ``visualize_barchart``.

    Works on a deep copy of the fixture named by ``model``. The figure is
    built once with default arguments (8 annotations expected) and once
    with ``top_n_topics=5`` (5 annotations expected). In both cases the
    last space-separated token of every annotation text must not parse
    to ``-1``.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # (keyword arguments for the call, expected number of annotations)
    scenarios = (
        ({}, 8),
        ({"top_n_topics": 5}, 5),
    )
    for call_kwargs, expected_count in scenarios:
        fig = topic_model.visualize_barchart(**call_kwargs)
        annotations = fig.to_dict()["layout"]["annotations"]

        assert len(annotations) == expected_count
        for annotation in annotations:
            trailing_token = annotation["text"].split(" ")[-1]
            assert int(trailing_token) != -1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_barchart_outlier(model, request):
    """Check ``visualize_barchart`` when ``topic_sizes_`` has a ``-1`` entry.

    Works on a deep copy of the fixture named by ``model`` and sets
    ``topic_sizes_[-1]`` to 4 before plotting. The figure is built once
    with default arguments (8 annotations expected) and once with
    ``top_n_topics=5`` (5 annotations expected). In both cases the last
    space-separated token of every annotation text must not parse to
    ``-1``.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    # Force an entry under key -1 on the copy, whatever the fixture held.
    topic_model.topic_sizes_[-1] = 4

    # (keyword arguments for the call, expected number of annotations)
    scenarios = (
        ({}, 8),
        ({"top_n_topics": 5}, 5),
    )
    for call_kwargs, expected_count in scenarios:
        fig = topic_model.visualize_barchart(**call_kwargs)
        annotations = fig.to_dict()["layout"]["annotations"]

        assert len(annotations) == expected_count
        for annotation in annotations:
            trailing_token = annotation["text"].split(" ")[-1]
            assert int(trailing_token) != -1
|
||||
@@ -0,0 +1,22 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
    ],
)
def test_documents(model, reduced_embeddings, documents, request):
    """Check that ``visualize_documents`` plots exactly the model's topics.

    Works on a deep copy of the fixture named by ``model``. The expected
    set is every value in ``topics_`` except ``-1``. The plotted set is
    read from the ``name`` of each figure trace after the first one, taking
    the integer before the first underscore.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    expected_topics = set(topic_model.topics_)
    expected_topics.discard(-1)  # no-op when -1 is absent

    fig = topic_model.visualize_documents(
        documents,
        embeddings=reduced_embeddings,
        hide_document_hover=True,
    )

    # The first trace is skipped; presumably it is not a per-topic trace
    # (TODO confirm against visualize_documents).
    topic_traces = fig.to_dict()["data"][1:]
    plotted_topics = {int(trace["name"].split("_")[0]) for trace in topic_traces}

    assert plotted_topics == expected_topics
|
||||
@@ -0,0 +1,22 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_dynamic(model, documents, request):
    """Check the number of traces drawn by ``visualize_topics_over_time``.

    Works on a deep copy of the fixture named by ``model``. Each document
    gets a synthetic timestamp equal to its index modulo 10. The figure
    must contain as many traces as there are distinct values in
    ``topics_`` minus ``_outliers``.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # Ten synthetic time bins: 0, 1, ..., 9, 0, 1, ...
    timestamps = [index % 10 for index in range(len(documents))]

    topics_over_time = topic_model.topics_over_time(documents, timestamps)
    fig = topic_model.visualize_topics_over_time(topics_over_time)

    trace_count = len(fig.to_dict()["data"])
    expected_count = len(set(topic_model.topics_)) - topic_model._outliers
    assert trace_count == expected_count
|
||||
@@ -0,0 +1,23 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
    ],
)
def test_heatmap(model, request):
    """Check that ``visualize_heatmap`` covers exactly the model's topics.

    Works on a deep copy of the fixture named by ``model``. The expected
    set is every value in ``topics_`` except ``-1``. The plotted set is
    read from the ``x`` labels of the first figure trace, taking the
    integer before the first underscore of each label.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    expected_topics = set(topic_model.topics_)
    expected_topics.discard(-1)  # no-op when -1 is absent

    fig = topic_model.visualize_heatmap()
    x_labels = fig.to_dict()["data"][0]["x"]
    plotted_topics = {int(label.split("_")[0]) for label in x_labels}

    assert plotted_topics == expected_topics
|
||||
@@ -0,0 +1,8 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
    ],
)
def test_term_rank(model, request):
    """Smoke-test ``visualize_term_rank``.

    Resolves the fixture named by ``model``, deep-copies it and calls
    ``visualize_term_rank`` with default arguments. The test makes no
    assertion on the result; it only passes if the call does not raise.
    """
    fixture_model = request.getfixturevalue(model)
    topic_model = copy.deepcopy(fixture_model)
    topic_model.visualize_term_rank()
|
||||
@@ -0,0 +1,52 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_topics(model, request):
    """Check the slider step labels of ``visualize_topics``.

    Works on a deep copy of the fixture named by ``model``. The figure is
    built once with default arguments and once with ``top_n_topics=5``.
    For every step of every slider, the last space-separated token of the
    step label must not parse to ``-1``.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # Default call first, then the top-5 variant.
    for call_kwargs in ({}, {"top_n_topics": 5}):
        fig = topic_model.visualize_topics(**call_kwargs)
        sliders = fig.to_dict()["layout"]["sliders"]

        for slider in sliders:
            for step in slider["steps"]:
                trailing_token = step["label"].split(" ")[-1]
                assert int(trailing_token) != -1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_topics_outlier(model, request):
    """Check ``visualize_topics`` when ``topic_sizes_`` has a ``-1`` entry.

    Works on a deep copy of the fixture named by ``model`` and sets
    ``topic_sizes_[-1]`` to 4 before plotting. The figure is built once
    with default arguments and once with ``top_n_topics=5``. For every
    step of every slider, the last space-separated token of the step label
    must not parse to ``-1``.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    # Force an entry under key -1 on the copy, whatever the fixture held.
    topic_model.topic_sizes_[-1] = 4

    # Default call first, then the top-5 variant.
    for call_kwargs in ({}, {"top_n_topics": 5}):
        fig = topic_model.visualize_topics(**call_kwargs)
        sliders = fig.to_dict()["layout"]["sliders"]

        for slider in sliders:
            for step in slider["steps"]:
                trailing_token = step["label"].split(" ")[-1]
                assert int(trailing_token) != -1
|
||||
Reference in New Issue
Block a user