Add BERTopic.
This commit is contained in:
@@ -0,0 +1,29 @@
|
||||
import copy
|
||||
import pytest
|
||||
from sklearn.datasets import fetch_20newsgroups
|
||||
|
||||
# Load the 20 newsgroups corpus through scikit-learn, with headers, footers
# and quoted replies removed from each post.
data = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))

# Newsgroup name for each of the first 1000 entries of the corpus.
# NOTE(review): presumably lines up one-to-one with the `documents` fixture
# used by the test below -- confirm against the fixture definition.
classes = [data["target_names"][label] for label in data["target"][:1000]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_class(model, documents, request):
    """Check the output of ``topics_per_class`` with and without global tuning.

    Args:
        model: Name of the fixture that provides the topic model.
        documents: Documents passed to ``topics_per_class``.
        request: pytest request object, used to resolve ``model`` by name.
    """
    # Work on a deep copy of whatever the fixture returns.
    # NOTE(review): presumably so a shared fixture is never mutated -- confirm.
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # Run both variants (global tuning first, then local) before asserting anything.
    per_class_global, per_class_local = [
        topic_model.topics_per_class(documents, classes=classes, global_tuning=flag)
        for flag in (True, False)
    ]

    n_documents = len(documents)
    model_topics = set(topic_model.topics_)
    n_classes = len(set(classes))

    # Summed frequencies must equal the number of documents.
    assert per_class_global.Frequency.sum() == n_documents
    assert per_class_local.Frequency.sum() == n_documents

    # The reported topics must be exactly the topics stored on the model.
    assert set(per_class_global.Topic.unique()) == model_topics
    assert set(per_class_local.Topic.unique()) == model_topics

    # The number of distinct reported classes must match the distinct input classes.
    assert len(per_class_global.Class.unique()) == n_classes
    assert len(per_class_local.Class.unique()) == n_classes
|
||||
@@ -0,0 +1,22 @@
|
||||
import copy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_dynamic(model, documents, request):
    """Check the output of ``topics_over_time`` on synthetic timestamps.

    Args:
        model: Name of the fixture that provides the topic model.
        documents: Documents passed to ``topics_over_time``.
        request: pytest request object, used to resolve ``model`` by name.
    """
    # Work on a deep copy of whatever the fixture returns.
    topic_model = copy.deepcopy(request.getfixturevalue(model))

    # Synthetic timestamps: each document's position modulo 10.
    n_documents = len(documents)
    timestamps = [position % 10 for position in range(n_documents)]

    over_time = topic_model.topics_over_time(documents, timestamps)

    # Summed frequencies must equal the number of documents.
    assert over_time.Frequency.sum() == n_documents
    # The reported topics must be exactly the topics stored on the model.
    assert set(over_time.Topic.unique()) == set(topic_model.topics_)
    # Every distinct input timestamp must appear in the result, and no others.
    assert len(over_time.Timestamp.unique()) == len(set(timestamps))
|
||||
@@ -0,0 +1,69 @@
|
||||
import copy
|
||||
import pytest
|
||||
from scipy.cluster import hierarchy as sch
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_hierarchy(model, documents, request):
    """Check that ``hierarchical_topics`` covers every topic except ``-1``.

    Args:
        model: Name of the fixture that provides the topic model.
        documents: Documents passed to ``hierarchical_topics``.
        request: pytest request object, used to resolve ``model`` by name.
    """
    # Work on a deep copy of whatever the fixture returns.
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    hierarchy = topic_model.hierarchical_topics(documents)

    # Flatten the per-row topic collections in the ``Topics`` column into one set.
    merged_topics = {topic for group in hierarchy.Topics.values for topic in group}

    assert len(hierarchy) > 0
    # Topic -1 is excluded from the comparison.
    assert merged_topics == set(topic_model.topics_) - {-1}
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_linkage(model, documents, request):
    """Check ``hierarchical_topics`` when a custom linkage function is supplied.

    NOTE(review): the body of this test is identical to ``test_tree`` in the
    same module -- confirm whether both are intentionally kept.

    Args:
        model: Name of the fixture that provides the topic model.
        documents: Documents passed to ``hierarchical_topics``.
        request: pytest request object, used to resolve ``model`` by name.
    """

    def linkage_function(x):
        # Single linkage with optimal ordering, delegated to SciPy.
        return sch.linkage(x, "single", optimal_ordering=True)

    # Work on a deep copy of whatever the fixture returns.
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    hierarchy = topic_model.hierarchical_topics(documents, linkage_function=linkage_function)

    # Flatten the per-row topic collections in the ``Topics`` column into one set.
    merged_topics = {topic for group in hierarchy.Topics.values for topic in group}
    tree = topic_model.get_topic_tree(hierarchy)

    unique_topics = set(topic_model.topics_)
    tree_lines = tree.split("\n")

    assert len(hierarchy) > 0
    # The rendered tree must be longer than 50 characters ...
    assert len(tree) > 50
    # ... and contain at most two lines per distinct topic.
    assert len(tree_lines) <= 2 * len(unique_topics)
    # Topic -1 is excluded from the comparison.
    assert merged_topics == unique_topics - {-1}
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_tree(model, documents, request):
    """Check the text tree that ``get_topic_tree`` builds from a topic hierarchy.

    NOTE(review): the body of this test is identical to ``test_linkage`` in the
    same module -- confirm whether both are intentionally kept.

    Args:
        model: Name of the fixture that provides the topic model.
        documents: Documents passed to ``hierarchical_topics``.
        request: pytest request object, used to resolve ``model`` by name.
    """

    def linkage_function(x):
        # Single linkage with optimal ordering, delegated to SciPy.
        return sch.linkage(x, "single", optimal_ordering=True)

    # Work on a deep copy of whatever the fixture returns.
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    hierarchy = topic_model.hierarchical_topics(documents, linkage_function=linkage_function)

    # Flatten the per-row topic collections in the ``Topics`` column into one set.
    merged_topics = {topic for group in hierarchy.Topics.values for topic in group}
    tree = topic_model.get_topic_tree(hierarchy)

    distinct_topics = set(topic_model.topics_)
    rendered_lines = tree.split("\n")

    assert len(hierarchy) > 0
    # The rendered tree must be longer than 50 characters ...
    assert len(tree) > 50
    # ... and contain at most two lines per distinct topic.
    assert len(rendered_lines) <= 2 * len(distinct_topics)
    # Topic -1 is excluded from the comparison.
    assert merged_topics == distinct_topics - {-1}
|
||||
Reference in New Issue
Block a user