41 lines
1.7 KiB
Python
41 lines
1.7 KiB
Python
import pandas as pd
|
|
from scipy.sparse import csr_matrix
|
|
from sklearn.base import BaseEstimator
|
|
from typing import Mapping, List, Tuple
|
|
|
|
|
|
class BaseRepresentation(BaseEstimator):
    """Base class for models that fine-tune topic representations."""

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Return the topic representations stored on the topic model.

        Every representation model inheriting from this class is handed
        the same four arguments (topic_model, documents, c_tf_idf,
        topics) automatically, so the information it can draw on about
        the topics is limited to what those arguments carry.

        This base implementation does not read `documents`, `c_tf_idf`
        or `topics`; it only looks up an attribute on `topic_model`.

        Arguments:
            topic_model: The BERTopic model, fitted up to the point
                         where topic representations are calculated.
            documents: A dataframe holding every document alongside its
                       assigned topic, in the columns "Document" and
                       "Topic".
            c_tf_idf: A c-TF-IDF representation. It is typically the
                      same as `topic_model.c_tf_idf_`, except in
                      dynamic, class-based, and hierarchical topic
                      modeling, where it is computed on a subset of
                      the documents.
            topics: A dictionary mapping each topic (key) to tuples of
                    word and weight (value) as computed by c-TF-IDF.
                    These are the default topics that are returned if
                    no representation model is used.

        Returns:
            The value of `topic_model.topic_representations_`, passed
            through unchanged.
        """
        # Pass-through: hand back whatever the model already holds.
        representations = topic_model.topic_representations_
        return representations
|