Add BERTopic.

2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,77 @@
+import pytest
+import pandas as pd
+
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans
+from hdbscan import HDBSCAN
+
+from bertopic import BERTopic
+
+
+@pytest.mark.parametrize("cluster_model", ["hdbscan", "kmeans"])
+@pytest.mark.parametrize(
+    "samples,features,centers",
+    [(200, 500, 1), (500, 200, 1), (200, 500, 2), (500, 200, 2), (200, 500, 4), (500, 200, 4)],
+)
+def test_hdbscan_cluster_embeddings(cluster_model, samples, features, centers):
+    """Cluster blob embeddings with KMeans or HDBSCAN and validate the document frame.
+
+    The frame returned by ``_cluster_embeddings`` must contain exactly ``centers``
+    distinct ``Topic`` values, and all of its other columns must match the input.
+    """
+    # ``cluster_model`` arrives as a name; build the matching estimator from it.
+    if cluster_model == "kmeans":
+        clusterer = KMeans(n_clusters=centers)
+    else:
+        hdbscan_kwargs = {
+            "min_cluster_size": 10,
+            "metric": "euclidean",
+            "cluster_selection_method": "eom",
+            "prediction_data": True,
+        }
+        clusterer = HDBSCAN(**hdbscan_kwargs)
+
+    embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42)
+    documents = [str(number) for number in range(1, embeddings.shape[0] + 1)]
+    old_df = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None})
+
+    new_df, _ = BERTopic(hdbscan_model=clusterer)._cluster_embeddings(embeddings, old_df)
+    topics = new_df.Topic.unique()
+
+    assert len(topics) == centers
+    assert "Topic" in new_df.columns
+    pd.testing.assert_frame_equal(old_df.drop(columns="Topic"), new_df.drop(columns="Topic"))
+
+
+@pytest.mark.parametrize("cluster_model", ["hdbscan", "kmeans"])
+@pytest.mark.parametrize(
+    "samples,features,centers",
+    [(200, 500, 1), (500, 200, 1), (200, 500, 2), (500, 200, 2), (200, 500, 4), (500, 200, 4)],
+)
+def test_custom_hdbscan_cluster_embeddings(cluster_model, samples, features, centers):
+    """Run ``_cluster_embeddings`` with an explicitly constructed clusterer on blob embeddings.
+
+    Passes when the output frame has ``centers`` unique ``Topic`` values and is
+    otherwise equal to the input frame (``Topic`` column excluded).
+    """
+    embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42)
+    n_documents = embeddings.shape[0]
+    documents = list(map(str, range(1, n_documents + 1)))
+    old_df = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None})
+
+    # Anything other than "kmeans" falls back to the HDBSCAN configuration.
+    use_kmeans = cluster_model == "kmeans"
+    clusterer = (
+        KMeans(n_clusters=centers)
+        if use_kmeans
+        else HDBSCAN(min_cluster_size=10, metric="euclidean", cluster_selection_method="eom", prediction_data=True)
+    )
+
+    topic_model = BERTopic(hdbscan_model=clusterer)
+    new_df, _ = topic_model._cluster_embeddings(embeddings, old_df)
+    unique_topics = new_df.Topic.unique()
+
+    assert len(unique_topics) == centers
+    assert "Topic" in new_df.columns
+    # Apart from the Topic column, the output frame must equal the input frame.
+    pd.testing.assert_frame_equal(old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1))
@@ -0,0 +1,40 @@
+import copy
+import pytest
+import numpy as np
+from umap import UMAP
+from sklearn.decomposition import PCA
+
+from bertopic import BERTopic
+
+
+@pytest.mark.parametrize("dim_model", [UMAP, PCA])
+@pytest.mark.parametrize(
+    "embeddings,shape,n_components",
+    [(np.random.rand(100, 128), 100, 5), (np.random.rand(10, 256), 10, 5), (np.random.rand(50, 15), 50, 10)],
+)
+def test_reduce_dimensionality(dim_model, embeddings, shape, n_components):
+    """Check that a custom reducer yields ``n_components`` columns per input row."""
+    reducer = dim_model(n_components=n_components)
+    topic_model = BERTopic(umap_model=reducer)
+    reduced = topic_model._reduce_dimensionality(embeddings)
+
+    # Row count is unchanged; column count equals ``n_components``.
+    assert reduced.shape == (shape, n_components)
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "kmeans_pca_topic_model", "base_topic_model", "custom_topic_model",
+        "merged_topic_model", "reduced_topic_model", "online_topic_model",
+    ],
+)
+def test_custom_reduce_dimensionality(model, request):
+    """Check that every fixture model shrinks the embedding width.
+
+    Operates on a deep copy of the value returned for the named fixture.
+    """
+    embeddings = np.random.rand(500, 128)
+    fitted_model = copy.deepcopy(request.getfixturevalue(model))
+    reduced = fitted_model._reduce_dimensionality(embeddings)
+    assert embeddings.shape[1] > reduced.shape[1]
@@ -0,0 +1,65 @@
+import copy
+import pytest
+import numpy as np
+from bertopic import BERTopic
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "kmeans_pca_topic_model", "base_topic_model", "custom_topic_model",
+        "merged_topic_model", "reduced_topic_model", "online_topic_model",
+    ],
+)
+def test_extract_embeddings(model, request):
+    """Embed a single string and a list of strings with each fixture model.
+
+    Every result must have one row per document, 384 columns, values strictly
+    inside (-5, 5), and similarity scores on the expected side of 0.5.
+    """
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    single_embedding = topic_model._extract_embeddings("a document")
+    multiple_embeddings = topic_model._extract_embeddings(["something different", "another document"])
+    similarities = cosine_similarity(single_embedding, multiple_embeddings)[0]
+
+    # Same checks for both outputs, in the order: single first, then multiple.
+    for vectors, expected_rows in ((single_embedding, 1), (multiple_embeddings, 2)):
+        assert vectors.shape[0] == expected_rows
+        assert vectors.shape[1] == 384
+        assert -5 < np.min(vectors)
+        assert np.max(vectors) < 5
+
+    # "a document" should be far from "something different" and
+    # close to "another document".
+    assert similarities[0] < 0.5
+    assert similarities[1] > 0.5
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "kmeans_pca_topic_model", "base_topic_model", "custom_topic_model",
+        "merged_topic_model", "reduced_topic_model", "online_topic_model",
+    ],
+)
+def test_extract_embeddings_compare(model, embedding_model, request):
+    """Compare ``_extract_embeddings`` output with the ``embedding_model`` fixture's encoding.
+
+    The array from the topic model must be element-wise equal to ``embedding_model.encode``.
+    """
+    texts = ["some document"]
+    topic_model = copy.deepcopy(request.getfixturevalue(model))
+    from_topic_model = topic_model._extract_embeddings(texts)
+
+    assert isinstance(from_topic_model, np.ndarray)
+    assert from_topic_model.shape == (1, 384)
+
+    from_backend = embedding_model.encode(texts, show_progress_bar=False)
+    assert np.array_equal(from_topic_model, from_backend)
+
+
+def test_extract_incorrect_embeddings():
+    """Expect a ``ValueError`` when building and fitting with an unknown language."""
+    with pytest.raises(ValueError):
+        BERTopic(language="Unknown language").fit(["some document"])