Add BERTopic.

2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,126 @@
+import copy
+import pytest
+import numpy as np
+import pandas as pd
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_get_topic(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    topics = [topic_model.get_topic(topic) for topic in set(topic_model.topics_)]
+    unknown_topic = topic_model.get_topic(500)
+
+    for topic in topics:
+        assert topic is not False
+
+    assert len(topics) == len(topic_model.get_topic_info())
+    assert not unknown_topic
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_get_topics(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    topics = topic_model.get_topics()
+
+    assert topics == topic_model.topic_representations_
+    assert len(topics.keys()) == len(set(topic_model.topics_))
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_get_topic_freq(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    for topic in set(topic_model.topics_):
+        assert not isinstance(topic_model.get_topic_freq(topic), pd.DataFrame)
+
+    topic_freq = topic_model.get_topic_freq()
+    unique_topics = set(topic_model.topics_)
+    topics_in_mapper = set(np.array(topic_model.topic_mapper_.mappings_)[:, -1])
+
+    assert isinstance(topic_freq, pd.DataFrame)
+
+    assert len(topic_freq) == len(set(topic_model.topics_))
+    assert len(topics_in_mapper.difference(unique_topics)) == 0
+    assert len(unique_topics.difference(topics_in_mapper)) == 0
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+    ],
+)
+def test_get_representative_docs(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    all_docs = topic_model.get_representative_docs()
+    unique_topics = set(topic_model.topics_)
+    topics_in_mapper = set(np.array(topic_model.topic_mapper_.mappings_)[:, -1])
+
+    assert len(all_docs) == len(topic_model.topic_sizes_.keys())
+    assert len(all_docs) == len(topics_in_mapper)
+    assert len(all_docs) == topic_model.c_tf_idf_.shape[0]
+    assert len(all_docs) == len(topic_model.topic_labels_)
+    assert all([True if len(docs) == 3 else False for docs in all_docs.values()])
+
+    topics = set(list(all_docs.keys()))
+
+    assert len(topics.difference(unique_topics)) == 0
+    assert len(topics.difference(topics_in_mapper)) == 0
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_get_topic_info(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    info = topic_model.get_topic_info()
+
+    if topic_model._outliers:
+        assert info.iloc[0].Topic == -1
+    else:
+        assert info.iloc[0].Topic == 0
+
+    for topic in set(topic_model.topics_):
+        assert len(topic_model.get_topic_info(topic)) == 1
+
+    assert len(topic_model.get_topic_info(200)) == 0
@@ -0,0 +1,72 @@
+import copy
+import pytest
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_generate_topic_labels(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    labels = topic_model.generate_topic_labels(topic_prefix=False)
+
+    assert sum([label[0].isdigit() for label in labels[1:]]) / len(labels) < 0.2
+
+    labels = [int(label.split("_")[0]) for label in topic_model.generate_topic_labels()]
+    assert labels == sorted(list(set(topic_model.topics_)))
+
+    labels = topic_model.generate_topic_labels(nr_words=1, topic_prefix=False)
+    assert all([True if len(label) < 15 else False for label in labels])
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_set_labels(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+
+    labels = topic_model.generate_topic_labels()
+    topic_model.set_topic_labels(labels)
+    assert topic_model.custom_labels_ == labels
+
+    if model != "online_topic_model":
+        labels = {1: "My label", 2: "Another label"}
+        topic_model.set_topic_labels(labels)
+        assert topic_model.custom_labels_[1 + topic_model._outliers] == "My label"
+        assert topic_model.custom_labels_[2 + topic_model._outliers] == "Another label"
+
+        labels = {1: "Change label", 3: "New label"}
+        topic_model.set_topic_labels(labels)
+        assert topic_model.custom_labels_[1 + topic_model._outliers] == "Change label"
+        assert topic_model.custom_labels_[3 + topic_model._outliers] == "New label"
+    else:
+        labels = {
+            sorted(set(topic_model.topics_))[0]: "My label",
+            sorted(set(topic_model.topics_))[1]: "Another label",
+        }
+        topic_model.set_topic_labels(labels)
+        assert topic_model.custom_labels_[0] == "My label"
+        assert topic_model.custom_labels_[1] == "Another label"
+
+        labels = {
+            sorted(set(topic_model.topics_))[0]: "Change label",
+            sorted(set(topic_model.topics_))[2]: "New label",
+        }
+        topic_model.set_topic_labels(labels)
+        assert topic_model.custom_labels_[0 + topic_model._outliers] == "Change label"
+        assert topic_model.custom_labels_[2 + topic_model._outliers] == "New label"
@@ -0,0 +1,184 @@
+import copy
+import pytest
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+    ],
+)
+def test_update_topics(model, documents, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    old_ctfidf = topic_model.c_tf_idf_
+    old_topics = topic_model.topics_
+
+    topic_model.update_topics(documents, n_gram_range=(1, 3))
+
+    assert old_ctfidf.shape[1] < topic_model.c_tf_idf_.shape[1]
+    assert old_topics == topic_model.topics_
+
+    updated_topics = [topic if topic != 1 else 0 for topic in old_topics]
+    topic_model.update_topics(documents, topics=updated_topics, n_gram_range=(1, 3))
+
+    assert len(set(old_topics)) - 1 == len(set(topic_model.topics_))
+
+    old_topics = topic_model.topics_
+    updated_topics = [topic if topic != 2 else 0 for topic in old_topics]
+    topic_model.update_topics(documents, topics=updated_topics, n_gram_range=(1, 3))
+
+    assert len(set(old_topics)) - 1 == len(set(topic_model.topics_))
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_extract_topics(model, documents, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    nr_topics = 5
+    documents = pd.DataFrame(
+        {
+            "Document": documents,
+            "ID": range(len(documents)),
+            "Topic": np.random.randint(-1, nr_topics - 1, len(documents)),
+        }
+    )
+    topic_model._update_topic_size(documents)
+    topic_model._extract_topics(documents)
+    freq = topic_model.get_topic_freq()
+
+    assert topic_model.c_tf_idf_.shape[0] == 5
+    assert topic_model.c_tf_idf_.shape[1] > 100
+    assert isinstance(freq, pd.DataFrame)
+    assert nr_topics == len(freq.Topic.unique())
+    assert freq.Count.sum() == len(documents)
+    assert len(freq.Topic.unique()) == len(freq)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_extract_topics_custom_cv(model, documents, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    nr_topics = 5
+    documents = pd.DataFrame(
+        {
+            "Document": documents,
+            "ID": range(len(documents)),
+            "Topic": np.random.randint(-1, nr_topics - 1, len(documents)),
+        }
+    )
+
+    cv = CountVectorizer(ngram_range=(1, 2))
+    topic_model.vectorizer_model = cv
+    topic_model._update_topic_size(documents)
+    topic_model._extract_topics(documents)
+    freq = topic_model.get_topic_freq()
+
+    assert topic_model.c_tf_idf_.shape[0] == 5
+    assert topic_model.c_tf_idf_.shape[1] > 100
+    assert isinstance(freq, pd.DataFrame)
+    assert nr_topics == len(freq.Topic.unique())
+    assert freq.Count.sum() == len(documents)
+    assert len(freq.Topic.unique()) == len(freq)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+@pytest.mark.parametrize("reduced_topics", [2, 4, 10])
+def test_topic_reduction(model, reduced_topics, documents, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    old_topics = copy.deepcopy(topic_model.topics_)
+    old_freq = topic_model.get_topic_freq()
+
+    topic_model.reduce_topics(documents, nr_topics=reduced_topics)
+
+    new_freq = topic_model.get_topic_freq()
+
+    if model != "online_topic_model":
+        assert old_freq.Count.sum() == new_freq.Count.sum()
+    assert len(old_freq.Topic.unique()) == len(old_freq)
+    assert len(new_freq.Topic.unique()) == len(new_freq)
+    assert len(topic_model.topics_) == len(old_topics)
+    assert topic_model.topics_ != old_topics
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_topic_reduction_edge_cases(model, documents, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    topic_model.nr_topics = 100
+    nr_topics = 5
+    topics = np.random.randint(-1, nr_topics - 1, len(documents))
+    old_documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
+    topic_model._update_topic_size(old_documents)
+    old_documents = topic_model._sort_mappings_by_frequency(old_documents)
+    topic_model._extract_topics(old_documents)
+    old_freq = topic_model.get_topic_freq()
+
+    new_documents = topic_model._reduce_topics(old_documents)
+    new_freq = topic_model.get_topic_freq()
+
+    assert not set(old_documents.Topic).difference(set(new_documents.Topic))
+    pd.testing.assert_frame_equal(old_documents, new_documents)
+    pd.testing.assert_frame_equal(old_freq, new_freq)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_find_topics(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    similar_topics, similarity = topic_model.find_topics("car")
+
+    assert np.mean(similarity) > 0.1
+    assert len(similar_topics) > 0