Add BERTopic.

2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,101 @@
+import copy
+import pytest
+import numpy as np
+import pandas as pd
+from packaging import version
+from scipy.sparse import csr_matrix
+from sklearn import __version__ as sklearn_version
+from sklearn.feature_extraction.text import CountVectorizer
+from bertopic.vectorizers import ClassTfidfTransformer
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_ctfidf(model, documents, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    topics = topic_model.topics_
+    documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
+    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
+    documents = topic_model._preprocess_text(documents_per_topic.Document.values)
+    count = topic_model.vectorizer_model.fit(documents)
+
+    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
+    # and will be removed in 1.2. Please use get_feature_names_out instead.
+    if version.parse(sklearn_version) >= version.parse("1.0.0"):
+        words = count.get_feature_names_out()
+    else:
+        words = count.get_feature_names()
+
+    X = count.transform(documents)
+    transformer = ClassTfidfTransformer().fit(X)
+    c_tf_idf = transformer.transform(X)
+
+    assert len(words) > 1000
+    assert all([isinstance(x, str) for x in words])
+
+    assert isinstance(X, csr_matrix)
+    assert isinstance(c_tf_idf, csr_matrix)
+
+    assert X.shape[0] == len(set(topics))
+    assert X.shape[1] == len(words)
+
+    assert c_tf_idf.shape[0] == len(set(topics))
+    assert c_tf_idf.shape[1] == len(words)
+
+    assert np.min(X) == 0
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("base_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_ctfidf_custom_cv(model, documents, request):
+    cv = CountVectorizer(ngram_range=(1, 3), stop_words="english")
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    topic_model.vectorizer_model = cv
+    topics = topic_model.topics_
+    documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
+    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
+    documents = topic_model._preprocess_text(documents_per_topic.Document.values)
+    count = topic_model.vectorizer_model.fit(documents)
+
+    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
+    # and will be removed in 1.2. Please use get_feature_names_out instead.
+    if version.parse(sklearn_version) >= version.parse("1.0.0"):
+        words = count.get_feature_names_out()
+    else:
+        words = count.get_feature_names()
+
+    X = count.transform(documents)
+    transformer = ClassTfidfTransformer().fit(X)
+    c_tf_idf = transformer.transform(X)
+
+    assert len(words) > 1000
+    assert all([isinstance(x, str) for x in words])
+
+    assert isinstance(X, csr_matrix)
+    assert isinstance(c_tf_idf, csr_matrix)
+
+    assert X.shape[0] == len(set(topics))
+    assert X.shape[1] == len(words)
+
+    assert c_tf_idf.shape[0] == len(set(topics))
+    assert c_tf_idf.shape[1] == len(words)
+
+    assert np.min(X) == 0
@@ -0,0 +1,38 @@
+import copy
+import pytest
+from bertopic.vectorizers import OnlineCountVectorizer
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        ("kmeans_pca_topic_model"),
+        ("custom_topic_model"),
+        ("merged_topic_model"),
+        ("reduced_topic_model"),
+        ("online_topic_model"),
+    ],
+)
+def test_online_cv(model, documents, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    vectorizer_model = OnlineCountVectorizer(stop_words="english", ngram_range=(2, 2))
+
+    topics = [topic_model.get_topic(topic) for topic in set(topic_model.topics_)]
+    topic_model.update_topics(documents, vectorizer_model=vectorizer_model)
+    new_topics = [topic_model.get_topic(topic) for topic in set(topic_model.topics_)]
+
+    for old_topic, new_topic in zip(topics, new_topics):
+        if old_topic[0][0] != "":
+            assert old_topic != new_topic
+
+
+@pytest.mark.parametrize("model", [("online_topic_model")])
+def test_clean_bow(model, request):
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+
+    original_shape = topic_model.vectorizer_model.X_.shape
+    topic_model.vectorizer_model.delete_min_df = 2
+    topic_model.vectorizer_model._clean_bow()
+
+    assert original_shape[0] == topic_model.vectorizer_model.X_.shape[0]
+    assert original_shape[1] > topic_model.vectorizer_model.X_.shape[1]