Add BERTopic.
This commit is contained in:
@@ -0,0 +1,101 @@
|
||||
import copy
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from packaging import version
|
||||
from scipy.sparse import csr_matrix
|
||||
from sklearn import __version__ as sklearn_version
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from bertopic.vectorizers import ClassTfidfTransformer
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_ctfidf(model, documents, request):
    """Check the count and c-TF-IDF matrices built with the model's own vectorizer.

    The documents are joined per topic, preprocessed by the topic model, and
    vectorized with ``topic_model.vectorizer_model``. The resulting count
    matrix is then passed through a default ``ClassTfidfTransformer``. Both
    matrices must be CSR matrices with one row per distinct topic and one
    column per vocabulary term.

    Args:
        model: Name of the topic-model fixture to load via ``request``.
        documents: Fixture providing the documents, aligned with ``topics_``.
        request: Pytest fixture-request object used to resolve ``model``.
    """
    # Work on a copy so the fixture instance itself is not modified.
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topic_labels = topic_model.topics_

    # Concatenate all documents that share a topic into one string per topic.
    doc_frame = pd.DataFrame(
        {
            "Document": documents,
            "ID": range(len(documents)),
            "Topic": topic_labels,
        }
    )
    grouped = doc_frame.groupby(["Topic"], as_index=False)
    per_topic = grouped.agg({"Document": " ".join})
    topic_texts = topic_model._preprocess_text(per_topic.Document.values)

    vectorizer = topic_model.vectorizer_model.fit(topic_texts)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    has_feature_names_out = version.parse(sklearn_version) >= version.parse("1.0.0")
    words = (
        vectorizer.get_feature_names_out()
        if has_feature_names_out
        else vectorizer.get_feature_names()
    )

    X = vectorizer.transform(topic_texts)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    # Vocabulary: reasonably large and made of strings only.
    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    # Both matrices stay sparse.
    for matrix in (X, c_tf_idf):
        assert isinstance(matrix, csr_matrix)

    # One row per distinct topic, one column per vocabulary term.
    n_topics = len(set(topic_labels))
    n_words = len(words)
    for matrix in (X, c_tf_idf):
        assert matrix.shape[0] == n_topics
        assert matrix.shape[1] == n_words

    assert np.min(X) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "kmeans_pca_topic_model",
        "base_topic_model",
        "custom_topic_model",
        "merged_topic_model",
        "reduced_topic_model",
        "online_topic_model",
    ],
)
def test_ctfidf_custom_cv(model, documents, request):
    """Check the count and c-TF-IDF matrices built with a custom CountVectorizer.

    The model's vectorizer is replaced by a ``CountVectorizer`` using 1- to
    3-grams and English stop words. The documents are then joined per topic,
    preprocessed by the topic model, vectorized, and passed through a default
    ``ClassTfidfTransformer``. Both matrices must be CSR matrices with one row
    per distinct topic and one column per vocabulary term.

    Args:
        model: Name of the topic-model fixture to load via ``request``.
        documents: Fixture providing the documents, aligned with ``topics_``.
        request: Pytest fixture-request object used to resolve ``model``.
    """
    custom_vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words="english")

    # Work on a copy so swapping the vectorizer does not touch the fixture instance.
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topic_model.vectorizer_model = custom_vectorizer
    topic_labels = topic_model.topics_

    # Concatenate all documents that share a topic into one string per topic.
    doc_frame = pd.DataFrame(
        {
            "Document": documents,
            "ID": range(len(documents)),
            "Topic": topic_labels,
        }
    )
    grouped = doc_frame.groupby(["Topic"], as_index=False)
    per_topic = grouped.agg({"Document": " ".join})
    topic_texts = topic_model._preprocess_text(per_topic.Document.values)

    vectorizer = topic_model.vectorizer_model.fit(topic_texts)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    has_feature_names_out = version.parse(sklearn_version) >= version.parse("1.0.0")
    words = (
        vectorizer.get_feature_names_out()
        if has_feature_names_out
        else vectorizer.get_feature_names()
    )

    X = vectorizer.transform(topic_texts)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    # Vocabulary: reasonably large and made of strings only.
    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    # Both matrices stay sparse.
    for matrix in (X, c_tf_idf):
        assert isinstance(matrix, csr_matrix)

    # One row per distinct topic, one column per vocabulary term.
    n_topics = len(set(topic_labels))
    n_words = len(words)
    for matrix in (X, c_tf_idf):
        assert matrix.shape[0] == n_topics
        assert matrix.shape[1] == n_words

    assert np.min(X) == 0
|
||||
Reference in New Issue
Block a user