Add BERTopic.
This commit is contained in:
@@ -0,0 +1,40 @@
|
||||
import pandas as pd
|
||||
from scipy.sparse import csr_matrix
|
||||
from sklearn.base import BaseEstimator
|
||||
from typing import Mapping, List, Tuple
|
||||
|
||||
|
||||
class BaseRepresentation(BaseEstimator):
    """Common parent for models that fine-tune topic representations."""

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Return the topic representations for the given model.

        Subclasses override this method to produce fine-tuned
        representations. The four arguments below are supplied
        automatically to every representation model, so they are the
        only topic-related information such a model can rely on.

        This base implementation does no fine-tuning: it ignores
        `documents`, `c_tf_idf`, and `topics` and simply hands back the
        representations already stored on `topic_model`.

        Arguments:
            topic_model: The BERTopic model, fitted up to the point
                         where topic representations are calculated.
            documents: A dataframe with the columns "Document" and
                       "Topic", holding every document together with
                       its corresponding topic.
            c_tf_idf: A c-TF-IDF representation. Typically the same as
                      `topic_model.c_tf_idf_`, except for dynamic,
                      class-based, and hierarchical topic modeling,
                      where it is calculated on a subset of the
                      documents.
            topics: A dictionary mapping each topic (key) to its
                    (word, weight) tuples (value) as calculated by
                    c-TF-IDF. These are the default topics returned
                    when no representation model is used.

        Returns:
            The value of `topic_model.topic_representations_`.
        """
        stored_representations = topic_model.topic_representations_
        return stored_representations
|
||||
Reference in New Issue
Block a user