The LLM-based topic recognition model is complete and has been adapted to handle rapidly changing Weibo topics.

This commit is contained in:
戒酒的李白
2025-08-07 11:14:38 +08:00
parent 1e780876c9
commit d88d5edd99
32 changed files with 8352 additions and 1 deletions
@@ -0,0 +1,12 @@
class Client:
    """Thin wrapper that creates an OpenAI or Azure OpenAI SDK client and proxies to it.

    Any attribute that is not found on the wrapper itself is looked up on the
    wrapped ``self.client`` object.
    """

    def __init__(self, api_key: str, azure_endpoint: dict = None) -> None:
        """Create the underlying SDK client.

        Args:
            api_key (str): API key handed to the SDK client.
            azure_endpoint (dict, optional): If given and non-empty, an ``AzureOpenAI``
                client is created; the dict must provide the keys ``'api_version'``
                and ``'endpoint'``. Otherwise a plain ``OpenAI`` client is created.
        """
        # The SDK is imported lazily so that only the class that is actually needed is loaded.
        if azure_endpoint:
            from openai import AzureOpenAI
            self.client = AzureOpenAI(
                api_key=api_key,
                api_version=azure_endpoint['api_version'],
                azure_endpoint=azure_endpoint['endpoint'],
            )
        else:
            from openai import OpenAI
            self.client = OpenAI(api_key=api_key)

    def __getattr__(self, name):
        """Delegate attribute access to the self.client object.

        Raises:
            AttributeError: If ``client`` itself has not been set on this instance.
        """
        # __getattr__ only runs after normal lookup failed. If ``client`` is the
        # missing attribute (e.g. __init__ raised, or the object is being copied or
        # unpickled), evaluating ``self.client`` below would call __getattr__ again
        # and recurse until RecursionError. Fail with a regular AttributeError instead.
        if name == "client":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self.client, name)
@@ -0,0 +1,286 @@
import numpy as np
import umap
import hdbscan
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import umap.plot
from copy import deepcopy
from sklearn.cluster import AgglomerativeClustering
from typing import Tuple
class Clustering_and_DimRed():
    """
    Class to perform dimensionality reduction with UMAP followed by clustering with HDBSCAN.
    """

    def __init__(self,
                 n_dims_umap: int = 5,
                 n_neighbors_umap: int = 15,
                 min_dist_umap: float = 0,
                 metric_umap: str = "cosine",
                 min_cluster_size_hdbscan: int = 30,
                 metric_hdbscan: str = "euclidean",
                 cluster_selection_method_hdbscan: str = "eom",
                 number_clusters_hdbscan: int = None,
                 random_state: int = 42,
                 verbose: bool = True,
                 UMAP_hyperparams: dict = None,
                 HDBSCAN_hyperparams: dict = None) -> None:
        """
        Initializes the clustering and dimensionality reduction parameters for topic modeling.

        Args:
            n_dims_umap (int, optional): Number of dimensions to reduce to using UMAP.
            n_neighbors_umap (int, optional): Number of neighbors for UMAP.
            min_dist_umap (float, optional): Minimum distance for UMAP.
            metric_umap (str, optional): Metric for UMAP.
            min_cluster_size_hdbscan (int, optional): Minimum cluster size for HDBSCAN.
            metric_hdbscan (str, optional): Metric for HDBSCAN.
            cluster_selection_method_hdbscan (str, optional): Cluster selection method for HDBSCAN.
            number_clusters_hdbscan (int, optional): Fixed number of clusters. If None, HDBSCAN determines the number of clusters automatically. Otherwise the data is re-clustered with AgglomerativeClustering into this many clusters (see cluster_hdbscan).
            random_state (int, optional): Random state passed to UMAP.
            verbose (bool, optional): Whether UMAP prints progress.
            UMAP_hyperparams (dict, optional): Additional hyperparameters for UMAP. The explicit arguments above override equally named keys.
            HDBSCAN_hyperparams (dict, optional): Additional hyperparameters for HDBSCAN. The explicit arguments above override equally named keys.
        """
        # do some checks on the input arguments
        assert n_dims_umap > 0, "n_dims_umap must be greater than 0"
        assert n_neighbors_umap > 0, "n_neighbors_umap must be greater than 0"
        assert min_dist_umap >= 0, "min_dist_umap must be greater than or equal to 0"
        assert min_cluster_size_hdbscan > 0, "min_cluster_size_hdbscan must be greater than 0"
        assert number_clusters_hdbscan is None or number_clusters_hdbscan > 0, "number_clusters_hdbscan must be greater than 0 or None"
        assert random_state is None or random_state >= 0, "random_state must be greater than or equal to 0"

        self.random_state = random_state
        self.verbose = verbose

        # Work on private copies: the dicts are modified below, and must neither be
        # shared between instances nor leak changes back into the caller's dict.
        self.UMAP_hyperparams = dict(UMAP_hyperparams) if UMAP_hyperparams else {}
        self.HDBSCAN_hyperparams = dict(HDBSCAN_hyperparams) if HDBSCAN_hyperparams else {}

        # update hyperparameters for UMAP
        self.UMAP_hyperparams.update(
            n_components=n_dims_umap,
            n_neighbors=n_neighbors_umap,
            min_dist=min_dist_umap,
            metric=metric_umap,
            random_state=random_state,
            verbose=verbose,
        )
        self.umap = umap.UMAP(**self.UMAP_hyperparams)

        # update hyperparameters for HDBSCAN
        self.HDBSCAN_hyperparams.update(
            min_cluster_size=min_cluster_size_hdbscan,
            metric=metric_hdbscan,
            cluster_selection_method=cluster_selection_method_hdbscan,
        )
        self.number_clusters_hdbscan = number_clusters_hdbscan
        self.hdbscan = hdbscan.HDBSCAN(**self.HDBSCAN_hyperparams)

    def reduce_dimensions_umap(self, embeddings: np.ndarray) -> Tuple[np.ndarray, "umap.UMAP"]:
        """
        Reduces dimensions of embeddings using UMAP.

        Args:
            embeddings (np.ndarray): Embeddings to reduce.

        Returns:
            tuple: A tuple containing two items:
                - reduced_embeddings (np.ndarray): Reduced embeddings, each row scaled to unit L2 norm.
                - umap_mapper (umap.UMAP): UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER)
        """
        # A fresh mapper is fitted on every call (self.umap is not reused here).
        mapper = umap.UMAP(**self.UMAP_hyperparams).fit(embeddings)
        dim_red_embeddings = mapper.transform(embeddings)
        row_norms = np.linalg.norm(dim_red_embeddings, axis=1).reshape(-1, 1)
        dim_red_embeddings = dim_red_embeddings / row_norms
        return dim_red_embeddings, mapper

    def cluster_hdbscan(self, embeddings: np.ndarray) -> np.ndarray:
        """
        Cluster embeddings using HDBSCAN.

        If self.number_clusters_hdbscan is not None, the data is clustered again with
        AgglomerativeClustering to achieve a fixed number of clusters; points that HDBSCAN
        marked as outliers keep the label -1.

        Args:
            embeddings (np.ndarray): Embeddings to cluster.

        Returns:
            np.ndarray: Cluster labels, consecutive integers starting at 0; -1 marks outliers.
        """
        labels = self.hdbscan.fit_predict(embeddings)

        if self.number_clusters_hdbscan is not None:
            outlier_mask = labels == -1
            clusterer = AgglomerativeClustering(n_clusters=self.number_clusters_hdbscan)
            labels = clusterer.fit_predict(embeddings)
            # the agglomerative step labels every point, so restore the HDBSCAN outliers
            labels[outlier_mask] = -1

        # reindex to make the labels consecutive numbers from -1 to the number of clusters. -1 is reserved for outliers
        unique_labels = np.unique(labels)
        unique_labels_no_outliers = unique_labels[unique_labels != -1]
        map2newlabel = {label: i for i, label in enumerate(unique_labels_no_outliers)}
        map2newlabel[-1] = -1
        labels = np.array([map2newlabel[label] for label in labels])
        return labels

    def cluster_and_reduce(self, embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray, "umap.UMAP"]:
        """
        Reduce dimensions with UMAP, then cluster the reduced embeddings using HDBSCAN.

        Args:
            embeddings (np.ndarray): Embeddings to cluster and reduce.

        Returns:
            tuple: A tuple containing three items:
                - reduced_embeddings (np.ndarray): Reduced embeddings.
                - cluster_labels (np.ndarray): Cluster labels.
                - umap_mapper (umap.UMAP): UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER)
        """
        dim_red_embeddings, umap_mapper = self.reduce_dimensions_umap(embeddings)
        clusters = self.cluster_hdbscan(dim_red_embeddings)
        return dim_red_embeddings, clusters, umap_mapper

    def visualize_clusters_static(self, embeddings: np.ndarray, labels: np.ndarray):
        """
        Reduce dimensionality with UMAP to two dimensions and plot the clusters.

        Args:
            embeddings (np.ndarray): Embeddings for which to plot clustering.
            labels (np.ndarray): Cluster labels.
        """
        # Reduce dimensionality with UMAP
        reducer = umap.UMAP(n_components=2, random_state=self.random_state, n_neighbors=30, metric="cosine", min_dist=0)
        embeddings_2d = reducer.fit_transform(embeddings)

        unique_labels = np.unique(labels)
        # Create a color palette, then map the labels to the colors.
        # We add one to the number of unique labels to account for the noise points labelled as -1.
        # plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated in recent matplotlib.
        palette = plt.get_cmap("tab20", len(unique_labels) + 1)

        fig, ax = plt.subplots(figsize=(10, 8))

        has_outliers = False
        # Iterate through all unique labels (clusters and outliers)
        for label in unique_labels:
            cluster_points = embeddings_2d[labels == label]
            if label == -1:
                # outliers are drawn in grey and are the only entry in the legend
                ax.scatter(cluster_points[:, 0], cluster_points[:, 1], color='grey', label='outlier', s=0.1)
                has_outliers = True
            else:
                # regular clusters get no label so they do not show up in the legend;
                # `color=` avoids matplotlib misreading an RGBA tuple as per-point values
                ax.scatter(cluster_points[:, 0], cluster_points[:, 1], color=palette(label), s=0.1)

        # Only add a legend when there is a labelled artist to show
        if has_outliers:
            ax.legend()
        plt.show()

    def visualize_clusters_dynamic(self, embeddings: np.ndarray, labels: np.ndarray, texts: list[str], class_names: list[str] = None):
        """
        Visualize clusters using Plotly and enable hovering over clusters to see the beginning of the texts of the documents.

        Args:
            embeddings (np.ndarray): Embeddings for which to visualize clustering.
            labels (np.ndarray): Cluster labels. -1 marks outliers.
            texts (list[str]): Texts of the documents.
            class_names (list[str], optional): Names of the classes, indexed by cluster label. If omitted, the label itself (as a string) is shown.
        """
        # Reduce dimensionality with UMAP
        reducer = umap.UMAP(n_components=2, random_state=self.random_state, n_neighbors=30, metric="cosine", min_dist=0)
        embeddings_2d = reducer.fit_transform(embeddings)

        unique_labels = np.unique(labels)
        # Displayed class per label. Values are strings so that plotly treats the column
        # as categorical; the discrete color map below is keyed by these same strings.
        if class_names is None:
            display_names = {label: str(label) for label in unique_labels}
        else:
            # -1 is the outlier label and must not index class_names from the end
            display_names = {label: ("outlier" if label == -1 else class_names[label]) for label in unique_labels}

        df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
        df['text'] = [text[:200] for text in texts]
        df["class"] = [display_names[label] for label in labels]

        # Create a color palette for the non-outlier labels; outliers are grey.
        topic_labels = [label for label in unique_labels if label != -1]
        palette = plt.get_cmap("tab20", max(len(topic_labels), 1))
        color_discrete_map = {
            display_names[label]: 'rgb' + str(tuple(int(val * 255) for val in palette(i)[:3]))
            for i, label in enumerate(topic_labels)
        }
        if -1 in display_names:
            color_discrete_map[display_names[-1]] = 'grey'

        # plot data points where the color represents the class
        fig = px.scatter(df, x='x', y='y', hover_data=['text', 'class'], color='class', color_discrete_map=color_discrete_map)
        fig.update_traces(mode='markers', marker=dict(size=3))

        # make plot quadratic
        fig.update_layout(
            autosize=False,
            width=1500,
            height=1500,
            margin=dict(
                l=50,
                r=50,
                b=100,
                t=100,
                pad=4
            )
        )
        # set title
        fig.update_layout(title_text='UMAP projection of the document embeddings', title_x=0.5)
        # show plot
        fig.show()

    def umap_diagnostics(self, embeddings, hammer_edges = False):
        """
        Fit UMAP on the provided embeddings and generate diagnostic plots.

        Params:
        ------
        embeddings : array-like
            The high-dimensional data for UMAP to reduce and visualize.
        hammer_edges : bool, default False. Whether to also draw the connectivity plot with hammer edge bundling. Is computationally expensive.
        """
        # same hyperparameters as the main mapper, but projected to two dimensions
        new_hyperparams = deepcopy(self.UMAP_hyperparams)
        new_hyperparams["n_components"] = 2
        mapper = umap.UMAP(**new_hyperparams).fit(embeddings)

        # 1. Connectivity plot with points
        print("UMAP Connectivity Plot with Points")
        umap.plot.connectivity(mapper, show_points=True)
        plt.show()

        if hammer_edges:
            # 2. Connectivity plot with edge bundling
            print("UMAP Connectivity Plot with Hammer Edge Bundling")
            umap.plot.connectivity(mapper, edge_bundling='hammer')
            plt.show()

        # 3. PCA diagnostic plot
        print("UMAP PCA Diagnostic Plot")
        umap.plot.diagnostic(mapper, diagnostic_type='pca')
        plt.show()

        # 4. Local dimension diagnostic plot
        print("UMAP Local Dimension Diagnostic Plot")
        umap.plot.diagnostic(mapper, diagnostic_type='local_dim')
        plt.show()
@@ -0,0 +1,429 @@
import nltk
import string
import collections
from tqdm import tqdm
from typing import List
import numpy as np
import re
from nltk.tokenize import word_tokenize
import umap
from collections import Counter
import warnings
from typing import List
# make sure the import works even if the package has not been installed and just the files are used
try:
    from topicgpt.GetEmbeddingsOpenAI import GetEmbeddingsOpenAI
except ImportError:
    # only a failed import should trigger the fallback; a bare ``except`` would also
    # hide unrelated errors such as KeyboardInterrupt
    from GetEmbeddingsOpenAI import GetEmbeddingsOpenAI
nltk.download('stopwords', quiet=True)  # download stopwords
nltk.download('punkt', quiet=True)  # download tokenizer
class ExtractTopWords:
    """Helpers to build a corpus vocabulary, count words per cluster and extract top words per cluster."""

    def extract_centroids(self, embeddings: np.ndarray, labels: np.ndarray) -> dict:
        """
        Extract centroids of clusters.

        Args:
            embeddings (np.ndarray): Embeddings, one row per document.
            labels (np.ndarray): Cluster labels. -1 means outlier.

        Returns:
            dict: Dictionary of cluster labels and their centroids (outliers are skipped).
        """
        return {
            label: np.mean(embeddings[labels == label], axis=0)
            for label in np.unique(labels)
            if label != -1
        }

    def extract_centroid(self, embeddings: np.ndarray) -> np.ndarray:
        """
        Extract the single centroid of a cluster.

        Args:
            embeddings (np.ndarray): Embeddings to extract the centroid from.

        Returns:
            np.ndarray: The centroid of the cluster.
        """
        return np.mean(embeddings, axis=0)

    def compute_centroid_similarity(self, embeddings: np.ndarray, centroid_dict: dict, cluster_label: int) -> np.ndarray:
        """
        Compute the similarity of the document embeddings to the centroid of the cluster via cosine similarity.

        Args:
            embeddings (np.ndarray): Document embeddings, one row per document.
            centroid_dict (dict): Dictionary of cluster labels and their centroids.
            cluster_label (int): Cluster label for which to compute the similarity.

        Returns:
            np.ndarray: Cosine similarity of each document embedding to the centroid of the cluster. Zero vectors get similarity 0.
        """
        centroid = centroid_dict[cluster_label]
        dots = np.dot(embeddings, centroid)
        # Normalise by the norm of each row; the norm of the whole matrix would only
        # rescale the dot products by one constant and not give a cosine similarity.
        denom = np.linalg.norm(embeddings, axis=-1) * np.linalg.norm(centroid)
        similarity = np.divide(dots, denom, out=np.zeros_like(dots, dtype=float), where=denom > 0)
        return similarity

    def get_most_similar_docs(self, corpus: list[str], embeddings: np.ndarray, labels: np.ndarray, centroid_dict: dict, cluster_label: int, top_n: int = 10) -> List[str]:
        """
        Get the most similar documents to the centroid of a cluster.

        Args:
            corpus (list[str]): List of documents, aligned with the rows of embeddings.
            embeddings (np.ndarray): Document embeddings, one row per document.
            labels (np.ndarray): Cluster labels. -1 means outlier. Not used by this method; all documents are ranked.
            centroid_dict (dict): Dictionary of cluster labels and their centroids.
            cluster_label (int): Cluster label for which to compute the similarity.
            top_n (int, optional): Number of top documents to extract.

        Returns:
            List[str]: List of the most similar documents to the centroid of a cluster, most similar first.
        """
        similarity = self.compute_centroid_similarity(embeddings, centroid_dict, cluster_label)
        ranked = np.argsort(similarity)[-top_n:][::-1]
        return [corpus[i] for i in ranked]

    def compute_corpus_vocab(self,
                             corpus: list[str],
                             remove_stopwords: bool = True,
                             remove_punction: bool = True,
                             min_word_length: int = 3,
                             max_word_length: int = 20,
                             remove_short_words: bool = True,
                             remove_numbers: bool = True,
                             verbose: bool = True,
                             min_doc_frequency: int = 3,
                             min_freq: float = 0.1,
                             max_freq: float = 0.9) -> list[str]:
        """
        Compute the vocabulary of the corpus and perform preprocessing of the corpus.

        Args:
            corpus (list[str]): List of documents.
            remove_stopwords (bool, optional): Whether to remove (English NLTK) stopwords.
            remove_punction (bool, optional): Whether to remove punctuation.
            min_word_length (int, optional): Minimum word length to retain.
            max_word_length (int, optional): Maximum word length to retain.
            remove_short_words (bool, optional): Whether to remove short words.
            remove_numbers (bool, optional): Whether to remove words containing digits.
            verbose (bool, optional): Whether to print progress and describe what is happening.
            min_doc_frequency (int, optional): Minimum number of documents a word should appear in to be considered in the vocabulary.
            min_freq (float, optional): Lower quantile (0-1) of the relative word frequencies; rarer words are dropped.
            max_freq (float, optional): Upper quantile (0-1) of the relative word frequencies; more frequent words are dropped.

        Returns:
            list[str]: List of (lower-cased) words in the corpus sorted alphabetically. Empty if no word survives the filters.
        """
        stopwords = set(nltk.corpus.stopwords.words('english'))

        word_counter = collections.Counter()
        doc_frequency = collections.defaultdict(set)

        for doc_id, doc in enumerate(tqdm(corpus, disable=not verbose, desc="Processing corpus")):
            for word in nltk.word_tokenize(doc):
                word_lower = word.lower()
                if remove_punction and word in string.punctuation:
                    continue
                if remove_stopwords and word_lower in stopwords:
                    continue
                if remove_numbers and re.search(r'\d', word):  # any digit disqualifies the word
                    continue
                if not re.search('[a-zA-Z]', word):  # needs at least one ASCII letter
                    continue
                # remove words that do not begin with an alphabetic character
                if not word[0].isalpha():
                    continue
                if len(word) > max_word_length or (remove_short_words and len(word) < min_word_length):
                    continue

                word_counter[word_lower] += 1
                doc_frequency[word_lower].add(doc_id)

        # print most common words and their frequencies
        if verbose:
            print("Most common words in the vocabulary:")
            for word, count in word_counter.most_common(10):
                print(f"{word}: {count}")

        # nothing survived the filters: the quantiles below are undefined on empty input
        if not word_counter:
            return []

        total_words = sum(word_counter.values())
        freq_counter = {word: count / total_words for word, count in word_counter.items()}

        freq_arr = np.array(list(freq_counter.values()))
        min_freq_value = np.quantile(freq_arr, min_freq, method="lower")
        max_freq_value = np.quantile(freq_arr, max_freq, method="higher")

        vocab = {
            word for word, freq in freq_counter.items()
            if min_freq_value <= freq <= max_freq_value
            and len(doc_frequency[word]) >= min_doc_frequency
        }

        # Sorting the vocabulary alphabetically
        return sorted(vocab)

    def compute_words_topics(self, corpus: list[str], vocab: list[str], labels: np.ndarray) -> dict:
        """
        Compute the words per topic.

        Args:
            corpus (list[str]): List of documents.
            vocab (list[str]): List of words in the corpus sorted alphabetically.
            labels (np.ndarray): Cluster labels. -1 means outlier.

        Returns:
            dict: Dictionary mapping each non-outlier topic to the list of (lower-cased) vocabulary words occurring in its documents, with repetitions.
        """
        # Download NLTK resources (only required once)
        nltk.download("punkt")

        vocab = set(vocab)
        words_per_topic = {label: [] for label in np.unique(labels) if label != -1}

        for doc, label in tqdm(zip(corpus, labels), desc="Computing words per topic", total=len(corpus)):
            if label == -1:
                continue
            for word in word_tokenize(doc):
                word_lower = word.lower()
                if word_lower in vocab:
                    words_per_topic[label].append(word_lower)

        return words_per_topic

    def embed_vocab_openAI(self, client, vocab: list[str], embedder: "GetEmbeddingsOpenAI" = None) -> dict[str, np.ndarray]:
        """
        Embed the vocabulary using the OpenAI embedding API.

        Args:
            client: Client.
            vocab (list[str]): List of words in the corpus sorted alphabetically.
            embedder (GetEmbeddingsOpenAI, optional): Embedding object. If None, a new GetEmbeddingsOpenAI is created from client.

        Returns:
            dict[str, np.ndarray]: Dictionary of words and their embeddings.
        """
        vocab = sorted(set(vocab))
        if embedder is None:
            # GetEmbeddingsOpenAI is imported as the class itself, so it is instantiated directly
            embedder = GetEmbeddingsOpenAI(client)
        result = embedder.get_embeddings(vocab)

        return dict(zip(vocab, result["embeddings"]))

    def compute_bow_representation(self, document: str, vocab: list[str], vocab_set: set[str] = None) -> np.ndarray:
        """
        Compute the bag-of-words representation of a document.

        Args:
            document (str): Document to compute the bag-of-words representation of.
            vocab (list[str]): List of words in the corpus sorted alphabetically.
            vocab_set (set[str], optional): Set of the words in vocab, used for the membership test. Built from vocab if None.

        Returns:
            np.ndarray: Bag-of-words representation of the document, aligned with vocab.
        """
        bow = np.zeros(len(vocab))

        # word -> first position in vocab; replaces a linear vocab.index() scan per token
        position = {}
        for idx, word in enumerate(vocab):
            position.setdefault(word, idx)
        if vocab_set is None:
            vocab_set = position.keys()

        for word in word_tokenize(document):
            word_lower = word.lower()
            if word_lower in vocab_set:
                idx = position.get(word_lower)
                if idx is not None:
                    bow[idx] += 1
        return bow

    def compute_word_topic_mat_old(self, corpus: list[str], vocab: list[str], labels: np.ndarray, consider_outliers: bool = False) -> np.ndarray:
        """
        Compute the word-topic matrix (older, slower implementation).

        Args:
            corpus (list[str]): List of documents.
            vocab (list[str]): List of words in the corpus sorted alphabetically.
            labels (np.ndarray): Cluster labels. -1 means outlier.
            consider_outliers (bool, optional): Kept for interface compatibility. Outlier documents are always skipped and the matrix shape is the same for both values.

        Returns:
            np.ndarray: Word-topic matrix with one row per vocabulary word and one column per unique label; column k holds the counts of the documents labelled k.
        """
        # The shape must be passed as a tuple; np.zeros(a, b) would interpret b as a dtype.
        word_topic_mat = np.zeros((len(vocab), len(np.unique(labels))))

        vocab_set = set(vocab)
        for i, doc in tqdm(enumerate(corpus), desc="Computing word-topic matrix", total=len(corpus)):
            if labels[i] >= 0:  # skip outliers (-1)
                bow = self.compute_bow_representation(doc, vocab, vocab_set)
                word_topic_mat[:, labels[i]] += bow

        return word_topic_mat

    def compute_word_topic_mat(self, corpus: list[str], vocab: list[str], labels: np.ndarray, consider_outliers=False) -> np.ndarray:
        """
        Compute the word-topic matrix efficiently.

        Args:
            corpus (list[str]): List of documents.
            vocab (list[str]): List of words in the corpus, sorted alphabetically.
            labels (np.ndarray): Cluster labels. -1 indicates outliers.
            consider_outliers (bool, optional): Kept for interface compatibility; the result does not depend on it. Defaults to False.

        Returns:
            np.ndarray: Word-topic matrix with one row per vocabulary word and one column per unique label in sorted order (so outliers, if present, are column 0).
        """
        corpus_arr = np.array(corpus)
        labels = np.asarray(labels)
        unique_labels = np.unique(labels)

        word_topic_mat = np.zeros((len(vocab), len(unique_labels)))

        for i, label in tqdm(enumerate(unique_labels), desc="Computing word-topic matrix", total=len(unique_labels)):
            topic_docs = corpus_arr[labels == label]
            topic_doc_words = word_tokenize(" ".join(topic_docs))
            # lower-case the tokens so they are matched the same way as in
            # compute_bow_representation and compute_words_topics
            topic_doc_counter = Counter(word.lower() for word in topic_doc_words)
            word_topic_mat[:, i] = np.array([topic_doc_counter.get(word, 0) for word in vocab])

        return word_topic_mat

    def extract_topwords_tfidf(self, word_topic_mat: np.ndarray, vocab: list[str], labels: np.ndarray, top_n_words: int = 10) -> tuple[dict, dict]:
        """
        Extract the top words for each topic using a class-based tf-idf score.

        Args:
            word_topic_mat (np.ndarray): Word-topic matrix. If labels contain -1, its first column is taken to be the outlier column and dropped.
            vocab (list[str]): List of words in the corpus sorted alphabetically.
            labels (np.ndarray): Cluster labels. -1 means outlier.
            top_n_words (int, optional): Number of top words to extract per topic.

        Returns:
            tuple: A tuple containing two items:
                - top_words (dict): Dictionary of topics and their top words.
                - top_word_scores (dict): Dictionary of topics and the tf-idf scores of those words.
        """
        if min(labels) == -1:
            word_topic_mat = word_topic_mat[:, 1:]

        with warnings.catch_warnings():
            # divisions by zero are expected here (empty topics, unused words)
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            tf = word_topic_mat / np.sum(word_topic_mat, axis=0)
            idf = np.log(1 + (word_topic_mat.shape[1] / np.sum(word_topic_mat > 0, axis=1)))
            tfidf = tf * idf[:, np.newaxis]

        # Zero every undefined score: tf is nan for topics without words, and
        # tf * idf is nan (0 * inf) for words that occur in no topic at all.
        tfidf[~np.isfinite(tfidf)] = 0

        # extract top words for each topic
        top_words = {}
        top_word_scores = {}
        for topic in np.unique(labels):
            if topic != -1:
                indices = np.argsort(-tfidf[:, topic])[:top_n_words]
                top_words[topic] = [vocab[word_idx] for word_idx in indices]
                top_word_scores[topic] = [tfidf[word_idx, topic] for word_idx in indices]

        return top_words, top_word_scores

    def compute_embedding_similarity_centroids(self, vocab: list[str], vocab_embedding_dict: dict, umap_mapper: "umap.UMAP", centroid_dict: dict, reduce_vocab_embeddings: bool = False, reduce_centroid_embeddings: bool = False) -> np.ndarray:
        """
        Compute the cosine similarity of each word in the vocabulary to each centroid.

        Args:
            vocab (list[str]): List of words in the corpus sorted alphabetically.
            vocab_embedding_dict (dict): Dictionary of words and their embeddings.
            umap_mapper (umap.UMAP): UMAP mapper to transform new embeddings in the same way as the document embeddings. Only used when one of the reduce flags is set.
            centroid_dict (dict): Dictionary of cluster labels and their centroids.
            reduce_vocab_embeddings (bool, optional): Whether to reduce the vocab embeddings with the UMAP mapper.
            reduce_centroid_embeddings (bool, optional): Whether to reduce the centroid embeddings with the UMAP mapper.

        Returns:
            np.ndarray: Cosine similarity of each word in the vocab to each centroid. Has shape (len(vocab), len(centroid_dict)); columns follow the iteration order of centroid_dict.
        """
        if not centroid_dict:
            return np.zeros((len(vocab), 0))

        # Stack the centroids with whatever dimension they have: centroids that still
        # need to be reduced live in the original space, not in the UMAP space.
        centroid_arr = np.vstack([np.asarray(centroid, dtype=float) for centroid in centroid_dict.values()])
        if reduce_centroid_embeddings:
            centroid_arr = umap_mapper.transform(centroid_arr)
        centroid_arr = centroid_arr / np.linalg.norm(centroid_arr, axis=1).reshape(-1, 1)

        org_embedding_dim = list(vocab_embedding_dict.values())[0].shape[0]
        vocab_arr = np.zeros((len(vocab), org_embedding_dim))
        for i, word in enumerate(vocab):
            vocab_arr[i] = vocab_embedding_dict[word]
        if reduce_vocab_embeddings:
            vocab_arr = umap_mapper.transform(vocab_arr)
        vocab_arr = vocab_arr / np.linalg.norm(vocab_arr, axis=1).reshape(-1, 1)

        similarity = vocab_arr @ centroid_arr.T  # cosine similarity
        return similarity

    def extract_topwords_centroid_similarity(self, word_topic_mat: np.ndarray, vocab: list[str], vocab_embedding_dict: dict, centroid_dict: dict, umap_mapper: "umap.UMAP", top_n_words: int = 10, reduce_vocab_embeddings: bool = True, reduce_centroid_embeddings: bool = False, consider_outliers: bool = False) -> tuple[dict, dict]:
        """
        Extract the top words for each cluster by computing the cosine similarity of the words that occur in the corpus to the centroid of the cluster.

        Words are ranked by cosine similarity weighted with their count in the topic, so
        words that do not occur in a topic are ranked last for it.

        Args:
            word_topic_mat (np.ndarray): Word-topic matrix. If it has more columns than there are centroids, its first column is taken to be the outlier column and dropped.
            vocab (list[str]): List of words in the corpus sorted alphabetically.
            vocab_embedding_dict (dict): Dictionary of words and their embeddings.
            centroid_dict (dict): Dictionary of cluster labels and their centroids.
            umap_mapper (umap.UMAP): UMAP mapper to transform new embeddings in the same way as the document embeddings.
            top_n_words (int, optional): Number of top words to extract per topic.
            reduce_vocab_embeddings (bool, optional): Whether to reduce the vocab embeddings with the UMAP mapper.
            reduce_centroid_embeddings (bool, optional): Whether to reduce the centroid embeddings with the UMAP mapper.
            consider_outliers (bool, optional): Kept for interface compatibility; not used by this method.

        Returns:
            tuple: A tuple containing two items:
                - top_words (dict): Dictionary of topics and their top words.
                - top_word_scores (dict): Dictionary of topics and the cosine similarity to the centroid of each of those top words, in the same order.
        """
        similarity_mat = self.compute_embedding_similarity_centroids(vocab, vocab_embedding_dict, umap_mapper, centroid_dict, reduce_vocab_embeddings, reduce_centroid_embeddings)
        topics = np.unique(list(centroid_dict.keys()))

        if word_topic_mat.shape[1] > len(topics):
            word_topic_mat = word_topic_mat[:, 1:]  # ignore outliers

        top_words = {}
        top_word_scores = {}
        # NOTE(review): the topic label is used directly as the column index, which
        # assumes labels are consecutive integers starting at 0 - confirm with callers.
        for topic in topics:
            if topic != -1:
                topic_similarity_mat = similarity_mat[:, topic] * word_topic_mat[:, topic]
                top_indices = np.argsort(-topic_similarity_mat)[:top_n_words]
                top_words[topic] = [vocab[word_idx] for word_idx in top_indices]
                # report the scores of exactly the words returned above
                top_word_scores[topic] = [similarity_mat[word_idx, topic] for word_idx in top_indices]

        return top_words, top_word_scores
@@ -0,0 +1,217 @@
from openai import OpenAI
import tiktoken
from tqdm import tqdm
import numpy as np
class GetEmbeddingsOpenAI:
"""
This class allows to compute embeddings of text using the OpenAI API.
"""
def __init__(self, client, azure_config: dict = {}, embedding_model: str = "text-embedding-ada-002", tokenizer: str = None, max_tokens: int = 8191) -> None:
"""
Constructor of the class.
Args:
client: Client.
embedding_model (str, optional): Name of the embedding model to use.
tokenizer (str, optional): Name of the tokenizer to use.
max_tokens (int, optional): Maximum number of tokens to use.
Note:
By default, the embedding model "text-embedding-ada-002" is used with the corresponding tokenizer "cl100k_base" and a maximum number of tokens of 8191.
"""
self.client = client
self.embedding_model = embedding_model
self.tokenizer_str = tokenizer
self.max_tokens = max_tokens
@staticmethod
def num_tokens_from_string(string: str, encoding) -> int:
"""
Returns the number of tokens in a text string.
Args:
string (str): Text string to compute the number of tokens.
encoding: A function to encode the string into tokens.
Returns:
int: Number of tokens in the text string.
"""
num_tokens = len(encoding.encode(string))
return num_tokens
def compute_number_of_tokens(self, corpus: list[str]) -> int:
"""
Computes the total number of tokens needed to embed the corpus.
Args:
corpus (list[str]): List of strings to embed, where each element in the list is a document.
Returns:
int: Total number of tokens needed to embed the corpus.
"""
if self.tokenizer_str is None:
tokenizer = tiktoken.encoding_for_model(self.embedding_model)
else:
tokenizer = tiktoken.get_encoding(self.tokenizer_str)
num_tokens = 0
for document in tqdm(corpus):
num_tokens += self.num_tokens_from_string(document, tokenizer)
return num_tokens
def split_doc(self, text):
"""
Splits a single document that is longer than the maximum number of tokens into a list of smaller documents.
Args:
self: The instance of the class.
text (str): The string to be split.
Returns:
List[str]: A list of strings to embed, where each element in the list is a list of chunks comprising the document.
"""
split_text = []
split_text.append(text[:self.max_tokens])
for i in range(1, len(text) // self.max_tokens):
split_text.append(text[i * self.max_tokens:(i + 1) * self.max_tokens])
split_text.append(text[(len(text) // self.max_tokens) * self.max_tokens:])
return split_text
def split_long_docs(self, text: list[str]) -> list[list[str]]:
"""
Splits all documents that are longer than the maximum number of tokens into a list of smaller documents.
Args:
self: The instance of the class.
text (list[str]): List of strings to embed, where each element in the list is a document.
Returns:
List[list[str]]: A list of lists of strings to embed, where each element in the outer list is a list of chunks comprising the document.
"""
if self.tokenizer_str is None:
tokenizer = tiktoken.encoding_for_model(self.embedding_model)
else:
tokenizer = tiktoken.get_encoding(self.tokenizer_str)
split_text = []
for document in tqdm(text):
if self.num_tokens_from_string(document, tokenizer) > self.max_tokens:
split_text.append(self.split_doc(document))
else:
split_text.append([document])
return split_text
def make_api_call(self, text: str):
"""
Makes an API call to the OpenAI API to embed a text string.
Args:
self: The instance of the class.
text (str): The string to embed.
Returns:
API response: The response from the API.
"""
response = self.client.embeddings.create(input = [text], model = self.embedding_model)
return response
def get_embeddings_doc_split(self, corpus: list[list[str]], n_tries=3) -> list[dict]:
"""
Computes the embeddings of a corpus for split documents.
Args:
self: The instance of the class.
corpus (list[list[str]]): List of strings to embed, where each element is a document represented by a list of its chunks.
n_tries (int, optional): Number of tries to make an API call (default is 3).
Returns:
List[dict]: A list of dictionaries, where each dictionary contains the embedding of the document, the text of the document, and a list of errors that occurred during the embedding process.
"""
api_res_list = []
for i in tqdm(range(len(corpus))):
chunk_lis = corpus[i]
api_res_doc = []
for chunk_n, chunk in enumerate(chunk_lis):
for i in range(n_tries + 1):
try:
api_res_doc.append(
{"api_res": self.make_api_call(chunk),
"error": None }
)
break
except Exception as e:
print(f"Error {e} occured for chunk {chunk_n} of document {i}")
print(chunk)
print("Trying again.")
if i == n_tries:
print("Maximum number of tries reached. Skipping chunk.")
api_res_doc.append(
{"api_res": None,
"error": e })
# average the embeddings of the chunks
emb_lis = []
for api_res in api_res_doc:
if api_res["api_res"] is not None:
emb_lis.append(np.array(api_res["api_res"].data[0].embedding))
text = " ".join(chunk_lis)
embedding = np.mean(emb_lis, axis = 0)
api_res_list.append(
{"embedding": embedding,
"text": text,
"errors": [api_res["error"] for api_res in api_res_doc]}
)
return api_res_list
def convert_api_res_list(self, api_res_list: list[dict]) -> dict:
"""
Converts the api_res list into a dictionary containing the embeddings as a matrix and the corpus as a list of strings.
Args:
self: The instance of the class.
api_res_list (list[dict]): List of dictionaries, where each dictionary contains the embedding of the document, the text of the document, and a list of errors that occurred during the embedding process.
Returns:
dict: A dictionary containing the embeddings as a matrix and the corpus as a list of strings.
"""
embeddings = np.array([api_res["embedding"] for api_res in api_res_list])
corpus = [api_res["text"] for api_res in api_res_list]
errors = [api_res["errors"] for api_res in api_res_list]
return {"embeddings": embeddings, "corpus": corpus, "errors": errors}
def get_embeddings(self, corpus: list[str]) -> dict:
    """
    Embeds every document of a corpus.

    The documents are first passed through split_long_docs, the resulting chunk lists are embedded with get_embeddings_doc_split, and the per-document results are kept on self.corpus_emb before being converted with convert_api_res_list.

    Args:
        self: The instance of the class.
        corpus (list[str]): List of strings to embed, where each element in the list is a document.

    Returns:
        dict: The dictionary produced by convert_api_res_list for the embedded corpus.
    """
    # keep the raw per-document results on the instance for later inspection
    self.corpus_emb = self.get_embeddings_doc_split(self.split_long_docs(corpus))
    return self.convert_api_res_list(self.corpus_emb)
@@ -0,0 +1,137 @@
from topicgpt.TopicRepresentation import Topic
import unittest
from sklearn.datasets import fetch_20newsgroups
from topicgpt.TopicGPT import TopicGPT
import sys
class QuickestTopicGPT_prompting(unittest.TestCase):
"""
This class is used to mainly test the prompting functionality of the TopicGPT class.
"""
@classmethod
def setUpClass(cls, sample_size:int = 500):
"""
download the necessary data and only keep a sample of it
params:
client: Client.
sample_size: the number of documents to use for the test
"""
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) #download the 20 Newsgroups dataset
corpus = data['data']# just select the first 1000 documents for this example
corpus = [doc for doc in corpus if doc != ""]
corpus = corpus[:sample_size]
cls.corpus = corpus
cls.tm = TopicGPT(client = client, n_topics = 1)
cls.tm.fit(cls.corpus)
def test_repr_topics(self):
"""
test the repr_topics function of the TopicGPT class
"""
print("Testing repr_topics...")
self.assertTrue(type(self.tm.repr_topics()) == str)
def test_promt_knn_search(self):
"""
test the ppromt function that calls knn_search of the TopicPrompting class
"""
print("Testing ppromt_knn_search...")
prompt_lis = ["Is topic 0 about Bananas? Use knn Search",
"Is topic 0 about Space? Use knn Search"]
for prompt in prompt_lis:
answer, function_result = self.tm.prompt(prompt)
print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'")
self.assertTrue(type(answer) == str)
self.assertTrue(type(function_result[0]) == list)
self.assertTrue(type(function_result[1]) == list)
self.assertTrue(type(function_result[0][0]) == str)
self.assertTrue(type(function_result[1][0]) == int)
def test_prompt_split_topic_kmeans_inplace(self):
"""
test the ppromt function that calls split_topic_kmeans of the TopicPrompting class
"""
print("Testing ppromt_split_topic_kmeans...")
prompt_lis = ["Split topic 0 into 2 subtopics using kmeans. Do this inplace"]
added_topic_lis_len = [2]
old_number_of_topics = len(self.tm.topic_lis)
for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len):
answer, function_result = self.tm.prompt(prompt)
print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'")
print("function_result: ", function_result)
self.assertTrue(type(answer) == str)
self.assertTrue(type(function_result) == list)
self.assertTrue(type(function_result[0]) == Topic)
self.assertTrue(len(self.tm.topic_lis) == old_number_of_topics + added_topic_len -1 )
self.assertTrue(self.tm.topic_lis == function_result)
def test_prompt_combine_topics_inplace(self):
"""
test the prompt function that calls combine_topics of the TopicPrompting class
"""
print("Testing ppromt_combine_topics...")
prompt_lis = ["Combine topic 0 and topic 1 into one topic. Do this inplace"]
# split topic first
self.tm.prompt("Please split topic 0 into two subtopic. Do this inplace.")
old_number_topics = len(self.tm.topic_lis)
for prompt in prompt_lis:
answer, function_result = self.tm.prompt(prompt)
print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'")
print("function_result: ", function_result)
print("topic_gpt_topic_list: ", self.tm.topic_lis)
self.assertTrue(type(answer) == str)
self.assertTrue(type(function_result) == list)
self.assertTrue(type(function_result[0]) == Topic)
self.assertTrue(self.tm.topic_lis == function_result)
self.assertTrue(len(self.tm.topic_lis) == old_number_topics -1)
if __name__ == "__main__":
    # Extract the custom --api-key option (and its value) from sys.argv before
    # unittest.main() parses the remaining command-line arguments.
    api_key = None  # stays None when the option (or its value) is missing
    if "--api-key" in sys.argv:
        flag_pos = sys.argv.index("--api-key")
        if flag_pos + 1 < len(sys.argv):
            api_key = sys.argv.pop(flag_pos + 1)
        sys.argv.pop(flag_pos)

    if api_key is None:
        print("API key must be provided with --api-key")
        sys.exit(1)

    unittest.main()
@@ -0,0 +1,120 @@
from topicgpt.TopicRepresentation import Topic
import unittest
from sklearn.datasets import fetch_20newsgroups
from topicgpt.TopicGPT import TopicGPT
class QuickTestTopicGPT_init_and_fit(unittest.TestCase):
"""
Run some basic tests on TopicGPT that do not require any saved data
"""
@classmethod
def setUpClass(cls, sample_size:int = 500):
"""
download the necessary data and only keep a sample of it
params:
api_key: the openai api key
sample_size: the number of documents to use for the test
"""
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) #download the 20 Newsgroups dataset
corpus = data['data']# just select the first 1000 documents for this example
corpus = [doc for doc in corpus if doc != ""]
corpus = corpus[:sample_size]
cls.corpus = corpus
def setUp(self):
self.api_key_openai = api_key
def test_init(self):
"""
test the init function of the TopicGPT class
"""
print("Testing init...")
topicgpt = TopicGPT(api_key = self.api_key_openai)
self.assertTrue(isinstance(topicgpt, TopicGPT))
topicgpt = TopicGPT(api_key = self.api_key_openai,
n_topics= 20)
self.assertTrue(isinstance(topicgpt, TopicGPT))
topicgpt = TopicGPT(api_key = self.api_key_openai,
n_topics= 20,
corpus_instruction="This is a corpus instruction")
self.assertTrue(isinstance(topicgpt, TopicGPT))
# check if assertions are triggered
with self.assertRaises(AssertionError):
topicgpt = TopicGPT(api_key = None,
n_topics= 32,
openai_prompting_model="gpt-4",
max_number_of_tokens=8000,
corpus_instruction="This is a corpus instruction")
with self.assertRaises(AssertionError):
topicgpt = TopicGPT(api_key = self.api_key_openai,
n_topics= 0,
max_number_of_tokens=8000,
corpus_instruction="This is a corpus instruction")
with self.assertRaises(AssertionError):
topicgpt = TopicGPT(api_key = self.api_key_openai,
n_topics= 20,
max_number_of_tokens=0,
corpus_instruction="This is a corpus instruction")
def test_fit(self):
"""
test the fit function of the TopicGPT class
"""
print("Testing fit...")
def instance_test(topicgpt):
topicgpt.fit(self.corpus)
self.assertTrue(hasattr(topicgpt, "vocab"))
self.assertTrue(hasattr(topicgpt, "topic_lis"))
self.assertTrue(isinstance(topicgpt.vocab, list))
self.assertTrue(isinstance(topicgpt.vocab[0], str))
self.assertTrue(isinstance(topicgpt.topic_lis, list))
self.assertTrue(type(topicgpt.topic_lis[0]) == Topic)
if topicgpt.n_topics is not None:
self.assertTrue(len(topicgpt.topic_lis) == topicgpt.n_topics)
self.assertTrue(topicgpt.topic_lis == topicgpt.topic_prompting.topic_lis)
self.assertTrue(topicgpt.vocab == topicgpt.topic_prompting.vocab)
self.assertTrue(topicgpt.vocab_embeddings == topicgpt.topic_prompting.vocab_embeddings)
topicgpt1 = TopicGPT(api_key = self.api_key_openai, n_topics = 1)
topic_gpt_list = [topicgpt1]
for topic_gpt in topic_gpt_list:
instance_test(topic_gpt)
import sys
if __name__ == "__main__":
    # Extract the custom --api-key option (and its value) from sys.argv before
    # unittest.main() parses the remaining command-line arguments.
    api_key = None  # stays None when the option (or its value) is missing
    if "--api-key" in sys.argv:
        flag_pos = sys.argv.index("--api-key")
        if flag_pos + 1 < len(sys.argv):
            api_key = sys.argv.pop(flag_pos + 1)
        sys.argv.pop(flag_pos)

    if api_key is None:
        print("API key must be provided with --api-key")
        sys.exit(1)

    unittest.main()
@@ -0,0 +1,378 @@
import numpy as np
import os
import pickle
# make sure the import works even if the package has not been installed and just the files are used
from topicgpt.Clustering import Clustering_and_DimRed
from topicgpt.ExtractTopWords import ExtractTopWords
from topicgpt.TopwordEnhancement import TopwordEnhancement
from topicgpt.GetEmbeddingsOpenAI import GetEmbeddingsOpenAI
from topicgpt.TopicPrompting import TopicPrompting
from topicgpt.TopicRepresentation import Topic
from topicgpt.Client import Client
import topicgpt.TopicRepresentation as TopicRepresentation
embeddings_path = "SavedEmbeddings/embeddings.pkl"  # global variable for the default path to the saved embeddings


class TopicGPT:
    """
    This is the main class for doing topic modelling with TopicGPT.

    Typical usage is ``fit(corpus)`` followed by ``print_topics()`` and/or ``prompt(query)``.
    """

    def __init__(self,
                 api_key: str = "",
                 azure_endpoint: dict = None,
                 n_topics: int = None,
                 openai_prompting_model: str = "gpt-3.5-turbo-16k",
                 max_number_of_tokens: int = 16384,
                 corpus_instruction: str = "",
                 document_embeddings: np.ndarray = None,
                 vocab_embeddings: dict[str, np.ndarray] = None,
                 embedding_model: str = "text-embedding-ada-002",
                 max_number_of_tokens_embedding: int = 8191,
                 use_saved_embeddings: bool = True,
                 path_saved_embeddings: str = embeddings_path,
                 clusterer: Clustering_and_DimRed = None,
                 n_topwords: int = 2000,
                 n_topwords_description: int = 500,
                 topword_extraction_methods: list[str] = None,
                 compute_vocab_hyperparams: dict = None,
                 enhancer: TopwordEnhancement = None,
                 topic_prompting: TopicPrompting = None,
                 verbose: bool = True) -> None:
        """
        Initializes the main class for conducting topic modeling with TopicGPT.

        Args:
            api_key (str): Your OpenAI API key. Obtain this key from https://beta.openai.com/account/api-keys.
            azure_endpoint (dict, optional): Azure configuration forwarded to the Client. When non-empty, the Client reads the keys 'api_version' and 'endpoint' from it; when None or empty, the regular OpenAI client is used.
            n_topics (int, optional): Number of topics to discover. If None, the Hdbscan algorithm (https://pypi.org/project/hdbscan/) is used to determine the number of topics automatically. Otherwise, agglomerative clustering is used. Note that with insufficient data, fewer topics may be found than specified.
            openai_prompting_model (str, optional): Model provided by OpenAI for topic description and prompts. Refer to https://platform.openai.com/docs/models for available models.
            max_number_of_tokens (int, optional): Maximum number of tokens to use for the OpenAI API.
            corpus_instruction (str, optional): Additional information about the corpus, if available, to benefit the model.
            document_embeddings (np.ndarray, optional): Document embeddings for the corpus. If None, they will be computed using the OpenAI API.
            vocab_embeddings (dict[str, np.ndarray], optional): Vocabulary embeddings for the corpus in a dictionary format where keys are words and values are embeddings. If None, they will be computed using the OpenAI API.
            embedding_model (str, optional): Name of the embedding model to use. See https://beta.openai.com/docs/api-reference/text-embedding for available models.
            max_number_of_tokens_embedding (int, optional): Maximum number of tokens to use for the OpenAI API when computing embeddings.
            use_saved_embeddings (bool, optional): Whether to use saved embeddings. If True and the file at path_saved_embeddings exists, document and vocabulary embeddings are loaded from it and replace the document_embeddings / vocab_embeddings arguments. Embeddings are never written automatically; call save_embeddings for that.
            path_saved_embeddings (str, optional): Path to the saved embeddings file.
            clusterer (Clustering_and_DimRed, optional): Clustering and dimensionality reduction object. If None, a clustering object with default parameters is used. If one is provided, the number of topics specified in the clusterer overwrites the n_topics argument.
            n_topwords (int, optional): Number of top words to extract and save for each topic. Note that fewer top words might be used later.
            n_topwords_description (int, optional): Number of top words to provide to the LLM (Language Model) to describe the topic.
            topword_extraction_methods (list[str], optional): List of methods for extracting top words. Available methods include "tfidf", "cosine_similarity", and "topword_enhancement". Defaults to ["tfidf", "cosine_similarity"] when None.
            compute_vocab_hyperparams (dict, optional): Hyperparameters for computing the vocabulary. A copy is stored and its "verbose" entry is set to the verbose argument. Defaults to an empty dictionary when None.
            enhancer (TopwordEnhancement, optional): Topword enhancement object. Used for describing topics. If None, a topword enhancement object based on openai_prompting_model is created.
            topic_prompting (TopicPrompting, optional): Topic prompting object for formulating prompts. If None, a topic prompting object based on openai_prompting_model is created.
            verbose (bool, optional): Whether to print detailed information about the process. This can be overridden by arguments in passed objects.

        Raises:
            AssertionError: If one of the argument checks below fails.
        """
        # Resolve the None sentinels into fresh per-instance containers, so that no
        # default object is shared between instances.
        if azure_endpoint is None:
            azure_endpoint = {}
        if topword_extraction_methods is None:
            topword_extraction_methods = ["tfidf", "cosine_similarity"]
        # Copy, because "verbose" is written into this dictionary further down and
        # that write must not leak into the caller's dictionary.
        compute_vocab_hyperparams = dict(compute_vocab_hyperparams) if compute_vocab_hyperparams else {}

        # Do some checks on the input arguments
        assert api_key is not None, "You need to provide an OpenAI API key."
        assert n_topics is None or n_topics > 0, "The number of topics needs to be a positive integer."
        assert max_number_of_tokens > 0, "The maximum number of tokens needs to be a positive integer."
        assert max_number_of_tokens_embedding > 0, "The maximum number of tokens for the embedding model needs to be a positive integer."
        assert n_topwords > 0, "The number of top words needs to be a positive integer."
        assert n_topwords_description > 0, "The number of top words for the topic description needs to be a positive integer."
        assert len(topword_extraction_methods) > 0, "You need to provide at least one topword extraction method."
        assert n_topwords_description <= n_topwords, "The number of top words for the topic description needs to be smaller or equal to the number of top words."

        self.client = Client(api_key = api_key, azure_endpoint = azure_endpoint)
        self.n_topics = n_topics
        self.openai_prompting_model = openai_prompting_model
        self.max_number_of_tokens = max_number_of_tokens
        self.corpus_instruction = corpus_instruction
        self.document_embeddings = document_embeddings
        self.vocab_embeddings = vocab_embeddings
        self.embedding_model = embedding_model
        self.max_number_of_tokens_embedding = max_number_of_tokens_embedding
        self.embedder = GetEmbeddingsOpenAI(client = self.client, embedding_model = self.embedding_model, max_tokens = self.max_number_of_tokens_embedding)
        self.clusterer = clusterer
        self.n_topwords = n_topwords
        self.n_topwords_description = n_topwords_description
        self.topword_extraction_methods = topword_extraction_methods
        self.compute_vocab_hyperparams = compute_vocab_hyperparams
        self.enhancer = enhancer
        self.topic_prompting = topic_prompting
        self.use_saved_embeddings = use_saved_embeddings
        self.verbose = verbose

        # State that is only filled by fit() / extract_topics(); initialised here so
        # that the "is None" checks in the other methods work before fitting.
        self.corpus = None
        self.vocab = None
        self.topic_lis = None

        self.compute_vocab_hyperparams["verbose"] = self.verbose

        # if embeddings have already been saved to path_saved_embeddings, then load them
        if self.use_saved_embeddings and os.path.exists(path_saved_embeddings):
            # NOTE(security): pickle.load can execute arbitrary code contained in the
            # file, so path_saved_embeddings must only point to trusted files.
            with open(path_saved_embeddings, "rb") as f:
                self.document_embeddings, self.vocab_embeddings = pickle.load(f)

        for elem in topword_extraction_methods:
            assert elem in ["tfidf", "cosine_similarity", "topword_enhancement"], "Invalid topword extraction method. Valid methods are 'tfidf', 'cosine_similarity', and 'topword_enhancement'."

        if clusterer is None:
            self.clusterer = Clustering_and_DimRed(number_clusters_hdbscan = self.n_topics, verbose = self.verbose)
        else:
            # a user-provided clusterer decides the number of topics
            self.n_topics = clusterer.number_clusters_hdbscan

        if enhancer is None:
            self.enhancer = TopwordEnhancement(client = self.client, openai_model = self.openai_prompting_model, max_context_length = self.max_number_of_tokens, corpus_instruction = self.corpus_instruction)

        if topic_prompting is None:
            # NOTE(review): the prompting context length is fixed to 16000 here and does
            # not follow max_number_of_tokens - confirm that this is intended.
            self.topic_prompting = TopicPrompting(topic_lis = [], client = self.client, openai_prompting_model = self.openai_prompting_model, max_context_length_promting = 16000, enhancer = self.enhancer, openai_embedding_model = self.embedding_model, max_context_length_embedding = self.max_number_of_tokens_embedding, corpus_instruction = corpus_instruction)

        self.extractor = ExtractTopWords()

    def __repr__(self) -> str:
        """
        Returns a multi-line summary of the most important parameters of this object.
        """
        lines = [
            "TopicGPT object with the following parameters:",
            "-" * 150,
            f"n_topics: {self.n_topics}",
            f"openai_prompting_model: {self.openai_prompting_model}",
            f"max_number_of_tokens: {self.max_number_of_tokens}",
            f"corpus_instruction: {self.corpus_instruction}",
            f"embedding_model: {self.embedding_model}",
            f"clusterer: {self.clusterer}",
            f"n_topwords: {self.n_topwords}",
            f"n_topwords_description: {self.n_topwords_description}",
            f"topword_extraction_methods: {self.topword_extraction_methods}",
            f"compute_vocab_hyperparams: {self.compute_vocab_hyperparams}",
            f"enhancer: {self.enhancer}",
            f"topic_prompting: {self.topic_prompting}",
        ]
        return "\n".join(lines) + "\n"

    def _preferred_topword_method(self) -> str:
        """
        Returns the topword method used for describing topics: 'cosine_similarity' if configured, otherwise 'tfidf'.

        Raises:
            ValueError: If neither of the two methods is configured.
        """
        for method in ("cosine_similarity", "tfidf"):
            if method in self.topword_extraction_methods:
                return method
        raise ValueError("You need to use either 'cosine_similarity' or 'tfidf' as topword extraction method.")

    def compute_embeddings(self, corpus: list[str]) -> tuple[np.ndarray, dict[str, np.ndarray]]:
        """
        Computes document and vocabulary embeddings for the given corpus.

        Args:
            corpus (list[str]): List of strings to embed, where each element is a document.

        Returns:
            tuple: A tuple containing two items:
                - document_embeddings (np.ndarray): Document embeddings for the corpus.
                - vocab_embeddings (dict[str, np.ndarray]): Vocabulary embeddings for the corpus, provided as a dictionary where keys are words and values are embeddings.
        """
        if self.vocab is None:
            # the vocabulary is needed to embed it; compute it when fit() has not done so yet
            self.vocab = self.extractor.compute_corpus_vocab(corpus, **self.compute_vocab_hyperparams)

        self.document_embeddings = self.embedder.get_embeddings(corpus)["embeddings"]
        self.vocab_embeddings = self.extractor.embed_vocab_openAI(self.client, self.vocab, embedder = self.embedder)

        return self.document_embeddings, self.vocab_embeddings

    def extract_topics(self, corpus: list[str]) -> list[Topic]:
        """
        Extracts topics from the given corpus.

        Args:
            corpus (list[str]): List of strings to process, where each element represents a document.

        Returns:
            list[Topic]: A list of Topic objects representing the extracted topics.

        Raises:
            AssertionError: If the document or vocabulary embeddings are not available yet.
        """
        assert self.document_embeddings is not None and self.vocab_embeddings is not None, "You need to compute the embeddings first."

        if self.vocab is None:
            # build the vocabulary from the documents that were passed in
            self.vocab = self.extractor.compute_corpus_vocab(corpus, **self.compute_vocab_hyperparams)

        self.topic_lis = TopicRepresentation.extract_topics_no_new_vocab_computation(
            corpus = corpus,
            vocab = self.vocab,
            document_embeddings = self.document_embeddings,
            clusterer = self.clusterer,
            vocab_embeddings = self.vocab_embeddings,
            n_topwords = self.n_topwords,
            topword_extraction_methods = self.topword_extraction_methods,
            consider_outliers = True
        )

        return self.topic_lis

    def describe_topics(self, topics: list[Topic]) -> list[Topic]:
        """
        Names and describes the provided topics using the OpenAI API.

        Args:
            topics (list[Topic]): List of Topic objects to be named and described.

        Returns:
            list[Topic]: A list of Topic objects with names and descriptions.

        Raises:
            AssertionError: If no topics have been extracted yet.
            ValueError: If neither 'cosine_similarity' nor 'tfidf' is a configured topword extraction method.
        """
        assert self.topic_lis is not None, "You need to extract the topics first."

        self.topic_lis = TopicRepresentation.describe_and_name_topics(
            topics = topics,
            enhancer = self.enhancer,
            topword_method = self._preferred_topword_method(),
            n_words = self.n_topwords_description
        )

        return self.topic_lis

    def fit(self, corpus: list[str], verbose: bool = True):
        """
        Compute embeddings if necessary, extract topics, and describe them.

        Args:
            corpus (list[str]): List of strings to embed, where each element represents a document. Empty strings are ignored; the list itself is not modified.
            verbose (bool, optional): Whether to print the progress and details of the process.
        """
        # remove empty documents in a single pass, without touching the caller's list
        self.corpus = [doc for doc in corpus if doc != '']
        if verbose:
            print("Removed " + str(len(corpus) - len(self.corpus)) + " empty documents.")

        if self.vocab_embeddings is None:
            if verbose:
                print("Computing vocabulary...")
            self.vocab = self.extractor.compute_corpus_vocab(self.corpus, **self.compute_vocab_hyperparams)
        else:
            if verbose:
                print('Vocab already computed')
            # the vocabulary is given by the words that already have an embedding
            self.vocab = list(self.vocab_embeddings.keys())

        if self.vocab_embeddings is None or self.document_embeddings is None:
            if verbose:
                print("Computing embeddings...")
            self.compute_embeddings(corpus = self.corpus)
        elif verbose:
            print('Embeddings already computed')

        if verbose:
            print("Extracting topics...")
        self.topic_lis = self.extract_topics(corpus = self.corpus)

        if verbose:
            print("Describing topics...")
        self.topic_lis = self.describe_topics(topics = self.topic_lis)

        # keep the prompting object in sync with the fitted state
        self.topic_prompting.topic_lis = self.topic_lis
        self.topic_prompting.vocab_embeddings = self.vocab_embeddings
        self.topic_prompting.vocab = self.vocab

    def visualize_clusters(self):
        """
        Visualizes the identified clusters representing the topics in a scatterplot.

        Raises:
            AssertionError: If no topics have been extracted yet.
        """
        assert self.topic_lis is not None, "You need to extract the topics first."

        all_document_embeddings = np.concatenate([topic.document_embeddings_hd for topic in self.topic_lis], axis = 0)
        all_texts = np.concatenate([topic.documents for topic in self.topic_lis], axis = 0)
        # every document is labelled with the position of its topic in topic_lis
        all_document_indices = np.concatenate([np.repeat(i, topic.document_embeddings_hd.shape[0]) for i, topic in enumerate(self.topic_lis)], axis = 0)
        class_names = [str(topic) for topic in self.topic_lis]

        self.clusterer.visualize_clusters_dynamic(all_document_embeddings, all_document_indices, all_texts, class_names)

    def repr_topics(self) -> str:
        """
        Returns a string explanation of the topics.

        For every topic the string contains its string representation, its description and its first ten top words, followed by a separator line.

        Raises:
            AssertionError: If no topics have been extracted yet.
            ValueError: If neither 'cosine_similarity' nor 'tfidf' is a configured topword extraction method.
        """
        assert self.topic_lis is not None, "You need to extract the topics first."

        topword_method = self._preferred_topword_method()

        parts = []
        for topic in self.topic_lis:
            parts.append(f"{topic}\n")
            parts.append(f"Topic_description: {topic.topic_description}\n")
            parts.append(f"Top words: {topic.top_words[topword_method][:10]}\n")
            parts.append("\n")
            parts.append("-" * 150 + "\n")

        return "".join(parts)

    def print_topics(self):
        """
        Prints a string explanation of the topics.
        """
        print(self.repr_topics())

    def prompt(self, query: str) -> tuple[str, object]:
        """
        Prompts the model with the given query.

        Args:
            query (str): The query to prompt the model with.

        Returns:
            tuple: A tuple containing two items:
                - answer (str): The answer from the model.
                - function_result (object): The result of the function call.

        Note:
            Please refer to the TopicPrompting class for more details on available functions for prompting the model.
        """
        result = self.topic_prompting.general_prompt(query)

        # the answer is the message content of the last response in the first result item
        answer = result[0][-1].choices[0].message.content
        function_result = result[1]

        # prompts may change the topics in place, so take over the updated topic list
        self.topic_prompting._fix_dictionary_topwords()
        self.topic_lis = self.topic_prompting.topic_lis

        return answer, function_result

    def pprompt(self, query: str, return_function_result: bool = True) -> object:
        """
        Prompts the model with the given query and prints the answer.

        Args:
            query (str): The query to prompt the model with.
            return_function_result (bool, optional): Whether to return the result of the function call by the Language Model (LLM).

        Returns:
            object: The result of the function call if return_function_result is True, otherwise None.
        """
        answer, function_result = self.prompt(query)

        print(answer)

        if return_function_result:
            return function_result

    def save_embeddings(self, path: str = embeddings_path) -> None:
        """
        Saves the document and vocabulary embeddings to a pickle file for later re-use.

        Args:
            path (str, optional): The path to save the embeddings to. Defaults to embeddings_path. Missing parent directories of the path are created.

        Raises:
            AssertionError: If the document or vocabulary embeddings are not available yet.
        """
        assert self.document_embeddings is not None and self.vocab_embeddings is not None, "You need to compute the embeddings first."

        # create the directory of the target file (not a fixed folder) if it doesn't exist yet
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok = True)

        with open(path, "wb") as f:
            pickle.dump([self.document_embeddings, self.vocab_embeddings], f)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,664 @@
import numpy as np
import umap
import sys
import os
import inspect
from tqdm import tqdm
import umap
import json
# make sure the import works even if the package has not been installed and just the files are used
from topicgpt.Clustering import Clustering_and_DimRed
from topicgpt.ExtractTopWords import ExtractTopWords
from topicgpt.TopwordEnhancement import TopwordEnhancement
class Topic:
    """
    class to represent a topic and all its attributes
    """

    def __init__(self,
                 topic_idx: str,
                 documents: list[str],
                 words: dict[str, int],
                 centroid_hd: np.ndarray = None,
                 centroid_ld: np.ndarray = None,
                 document_embeddings_hd: np.ndarray = None,
                 document_embeddings_ld: np.ndarray = None,
                 document_embedding_similarity: np.ndarray = None,
                 umap_mapper: umap.UMAP = None,
                 top_words: dict[str, list[str]] = None,
                 top_word_scores: dict[str, list[float]] = None
                 ) -> None:
        """
        Represents a topic and all its attributes.

        Args:
            topic_idx (str): Index or name of the topic.
            documents (list[str]): List of documents in the topic.
            words (dict[str, int]): Dictionary of words and their counts in the topic.
            centroid_hd (np.ndarray, optional): Centroid of the topic in high-dimensional space.
            centroid_ld (np.ndarray, optional): Centroid of the topic in low-dimensional space.
            document_embeddings_hd (np.ndarray, optional): Embeddings of documents in high-dimensional space that belong to this topic.
            document_embeddings_ld (np.ndarray, optional): Embeddings of documents in low-dimensional space that belong to this topic.
            document_embedding_similarity (np.ndarray, optional): Similarity array of document embeddings to the centroid in low-dimensional space.
            umap_mapper (umap.UMAP, optional): UMAP mapper object to map from high-dimensional space to low-dimensional space.
            top_words (dict[str, list[str]], optional): Dictionary of top words in the topic according to different metrics.
            top_word_scores (dict[str, list[float]], optional): Dictionary of how representative the top words are according to different metrics.

        Raises:
            AssertionError: If documents or words are empty, or if one of the provided per-document arrays does not have one entry per document.
        """
        # do some checks on the input; the per-document arrays are optional, so only
        # the ones that were actually provided can be compared against the documents
        per_document_arrays = [arr for arr in (document_embeddings_hd, document_embeddings_ld, document_embedding_similarity) if arr is not None]
        assert all(len(arr) == len(documents) for arr in per_document_arrays), "documents, document_embeddings_hd, document_embeddings_ld and document_embedding_similarity must have the same length"
        assert len(documents) > 0, "documents must not be empty"
        assert len(words) > 0, "words must not be empty"

        self.topic_idx = topic_idx
        self.documents = documents
        self.words = words
        self.centroid_hd = centroid_hd
        self.centroid_ld = centroid_ld
        self.document_embeddings_hd = document_embeddings_hd
        self.document_embeddings_ld = document_embeddings_ld
        self.document_embedding_similarity = document_embedding_similarity
        self.umap_mapper = umap_mapper
        self.top_words = top_words
        self.top_word_scores = top_word_scores

        self.topic_name = None  # initialize the name of the topic as none
        self.topic_description = None  # set later via set_topic_description; read by to_json / to_dict

    def __str__(self) -> str:
        """
        return "Topic: <idx>" for an unnamed topic and "Topic <idx>: <name>" for a named one, each followed by a newline
        """
        if self.topic_name is None:
            return f"Topic: {self.topic_idx}\n"
        return f"Topic {self.topic_idx}: {self.topic_name}\n"

    def __repr__(self) -> str:
        return self.__str__()

    def to_json(self) -> str:
        """
        return a json representation of the topic (index, name and description)
        """
        repr_dict = {
            "topic_idx": self.topic_idx,
            "topic_name": self.topic_name,
            "topic_description": self.topic_description
        }
        json_object = json.dumps(repr_dict, indent = 4)
        return json_object

    def to_dict(self) -> dict:
        """
        return a dict representation of the topic (index converted to int, name and description)
        """
        repr_dict = {
            "topic_idx": int(self.topic_idx),
            "topic_name": self.topic_name,
            "topic_description": self.topic_description
        }
        return repr_dict

    def set_topic_name(self, name: str):
        """
        add a name to the topic
        params:
            name: name of the topic
        """
        self.topic_name = name

    def set_topic_description(self, text: str):
        """
        add a text description to the topic
        params:
            text: text description of the topic
        """
        self.topic_description = text
def topic_to_json(topic: Topic) -> str:
    """
    Serialize the index, name and description of a topic as JSON.

    Args:
        topic (Topic): The topic object to convert to JSON.

    Returns:
        str: A JSON string (indented by 4 spaces) with the keys "topic_idx", "topic_name" and "topic_description".
    """
    fields = ("topic_idx", "topic_name", "topic_description")
    return json.dumps({field: getattr(topic, field) for field in fields}, indent = 4)
def topic_lis_to_json(topics: list[Topic]) -> str:
    """
    Serialize the names and descriptions of several topics as one JSON object.

    Args:
        topics (list[Topic]): The list of topic objects to convert to JSON.

    Returns:
        str: A JSON string (indented by 4 spaces) that maps each topic index to a dictionary with the keys "topic_name" and "topic_description".
    """
    # topics sharing an index overwrite each other; the last one wins
    summaries = {
        topic.topic_idx: {
            "topic_name": topic.topic_name,
            "topic_description": topic.topic_description
        }
        for topic in topics
    }
    return json.dumps(summaries, indent = 4)
@staticmethod
def extract_topics(corpus: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided clusterer object on the document embeddings.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings (np.ndarray): Embeddings of the documents.
        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        compute_vocab_hyperparams (dict, optional): Keyword arguments forwarded to the vocabulary computation.

    Returns:
        list[Topic]: List of Topic objects representing the extracted topics. Outliers (label -1) do not
            become topics. Within a topic, the documents, their embeddings and their centroid similarities
            are all ordered by decreasing similarity to the low-dimensional centroid.

    Raises:
        ValueError: If topword_extraction_methods is empty or contains an unsupported method.
    """
    for elem in topword_extraction_methods:
        if elem not in ["tfidf", "cosine_similarity"]:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if len(topword_extraction_methods) == 0:
        raise ValueError("topword_extraction_methods cannot be empty")
    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings)  # get dimensionality reduced embeddings, their labels and the umap mapper object

    # In case the cluster labels are not consecutive numbers, map them to consecutive ones (outliers keep -1)
    unique_labels = np.unique(labels)
    label_mapping = {label: i for i, label in enumerate(unique_labels[unique_labels != -1])}
    label_mapping[-1] = -1
    labels = np.array([label_mapping[label] for label in labels])

    extractor = ExtractTopWords()
    centroid_dict = extractor.extract_centroids(document_embeddings, labels)  # get the centroids of the clusters
    centroid_arr = np.array(list(centroid_dict.values()))
    if centroid_arr.ndim == 1:
        centroid_arr = centroid_arr.reshape(-1, 1)
    # use the reshaped array here; it was previously computed but never passed on
    dim_red_centroids = umap_mapper.transform(centroid_arr)  # map the centroids to low dimensional space
    dim_red_centroid_dict = dict(zip(centroid_dict.keys(), dim_red_centroids))

    vocab = extractor.compute_corpus_vocab(corpus, **compute_vocab_hyperparams)  # compute the vocabulary of the corpus
    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False)  # compute the word-topic matrix of the corpus
    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)  # extract the top-words according to tfidf
    if use_cosine:
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict = dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # dont include outliers
            continue
        topic_idx = f"{label}"
        in_topic = labels == label
        documents = [doc for doc, doc_label in zip(corpus, labels) if doc_label == label]
        embeddings_hd = document_embeddings[in_topic]
        embeddings_ld = dim_red_embeddings[in_topic]
        centroid_hd = centroid_dict[label]
        # look the reduced centroid up by label instead of relying on its position in the array
        centroid_ld = dim_red_centroid_dict[label]

        centroid_similarity = np.dot(embeddings_ld, centroid_ld) / (np.linalg.norm(embeddings_ld, axis = 1) * np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[i] for i in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # reorder the similarities as well so that they stay aligned with the sorted documents
        centroid_similarity = centroid_similarity[similarity_sorting]

        # cosine_topwords only exists when cosine similarity was requested
        if use_cosine and isinstance(cosine_topwords[label], dict):
            cosine_topwords[label] = cosine_topwords[label][0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }
        topic = Topic(topic_idx = topic_idx,
                      documents = documents,
                      words = vocab,
                      centroid_hd = centroid_hd,
                      centroid_ld = centroid_ld,
                      document_embeddings_hd = embeddings_hd,
                      document_embeddings_ld = embeddings_ld,
                      document_embedding_similarity = centroid_similarity,
                      umap_mapper = umap_mapper,
                      top_words = top_words,
                      top_word_scores = top_word_scores
                      )
        topics.append(topic)
    return topics
@staticmethod
def extract_topics_no_new_vocab_computation(corpus: list[str], vocab: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], consider_outliers: bool = False) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided clusterer object on the document embeddings.
    This version does not compute the vocabulary of the corpus and instead uses the provided vocabulary.

    Args:
        corpus (list[str]): List of documents.
        vocab (list[str]): Vocabulary of the corpus.
        document_embeddings (np.ndarray): Embeddings of the documents.
        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        consider_outliers (bool, optional): Forwarded to the computation of the word-topic matrix (default is False).

    Returns:
        list[Topic]: List of Topic objects representing the extracted topics. Outliers (label -1) do not
            become topics. Within a topic, the documents, their embeddings and their centroid similarities
            are all ordered by decreasing similarity to the low-dimensional centroid.

    Raises:
        ValueError: If topword_extraction_methods is empty or contains an unsupported method.
    """
    for elem in topword_extraction_methods:
        if elem not in ["tfidf", "cosine_similarity"]:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if len(topword_extraction_methods) == 0:
        raise ValueError("topword_extraction_methods cannot be empty")
    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings)  # get dimensionality reduced embeddings, their labels and the umap mapper object

    # In case the cluster labels are not consecutive numbers, map them to consecutive ones (outliers keep -1)
    unique_labels = np.unique(labels)
    label_mapping = {label: i for i, label in enumerate(unique_labels[unique_labels != -1])}
    label_mapping[-1] = -1
    labels = np.array([label_mapping[label] for label in labels])

    extractor = ExtractTopWords()
    centroid_dict = extractor.extract_centroids(document_embeddings, labels)  # get the centroids of the clusters
    centroid_arr = np.array(list(centroid_dict.values()))
    if centroid_arr.ndim == 1:
        centroid_arr = centroid_arr.reshape(-1, 1)
    # use the reshaped array here; it was previously computed but never passed on
    dim_red_centroids = umap_mapper.transform(centroid_arr)  # map the centroids to low dimensional space
    dim_red_centroid_dict = dict(zip(centroid_dict.keys(), dim_red_centroids))

    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = consider_outliers)  # compute the word-topic matrix of the corpus
    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)  # extract the top-words according to tfidf
    if use_cosine:
        # NOTE(review): consider_outliers is hard-coded to True here and does not follow the parameter - confirm this is intended
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict = dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = True)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # dont include outliers
            continue
        topic_idx = f"{label}"
        in_topic = labels == label
        documents = [doc for doc, doc_label in zip(corpus, labels) if doc_label == label]
        embeddings_hd = document_embeddings[in_topic]
        embeddings_ld = dim_red_embeddings[in_topic]
        centroid_hd = centroid_dict[label]
        # look the reduced centroid up by label instead of relying on its position in the array
        centroid_ld = dim_red_centroid_dict[label]

        centroid_similarity = np.dot(embeddings_ld, centroid_ld) / (np.linalg.norm(embeddings_ld, axis = 1) * np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[i] for i in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # reorder the similarities as well so that they stay aligned with the sorted documents
        centroid_similarity = centroid_similarity[similarity_sorting]

        # explicit checks replace a bare "except: pass" that hid the unbound-name error
        # raised when cosine similarity was not requested
        if use_cosine:
            label_topwords = cosine_topwords[label]
            if isinstance(label_topwords, dict) and 0 in label_topwords:
                cosine_topwords[label] = label_topwords[0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }
        topic = Topic(topic_idx = topic_idx,
                      documents = documents,
                      words = vocab,
                      centroid_hd = centroid_hd,
                      centroid_ld = centroid_ld,
                      document_embeddings_hd = embeddings_hd,
                      document_embeddings_ld = embeddings_ld,
                      document_embedding_similarity = centroid_similarity,
                      umap_mapper = umap_mapper,
                      top_words = top_words,
                      top_word_scores = top_word_scores
                      )
        topics.append(topic)
    return topics
@staticmethod
def extract_and_describe_topics(corpus: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, enhancer: TopwordEnhancement, n_topwords: int = 2000, n_topwords_description: int = 500, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}, topword_description_method: str = "cosine_similarity") -> list[Topic]:
    """
    Extract topics from the corpus with the clusterer, then name and describe them with the enhancer.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings (np.ndarray): Embeddings of the documents.
        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        enhancer (TopwordEnhancement): Enhancer object used to generate the names and descriptions.
        n_topwords (int, optional): Number of top-words to extract per topic (default is 2000).
        n_topwords_description (int, optional): Number of extracted top-words handed to the enhancer (default is 500).
        topword_extraction_methods (list[str], optional): Methods used to extract top-words.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        compute_vocab_hyperparams (dict, optional): Passed through to the topic extraction.
        topword_description_method (str, optional): Which top-words the enhancer receives.
            Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity").

    Returns:
        list[Topic]: The extracted topics with name and description set.
    """
    print("Extracting topics...")
    extracted_topics = extract_topics(
        corpus,
        document_embeddings,
        clusterer,
        vocab_embeddings,
        n_topwords = n_topwords,
        topword_extraction_methods = topword_extraction_methods,
        compute_vocab_hyperparams = compute_vocab_hyperparams,
    )
    print("Describing topics...")
    return describe_and_name_topics(
        extracted_topics,
        enhancer,
        topword_method = topword_description_method,
        n_words = n_topwords_description,
    )
@staticmethod
def extract_topics_labels_vocab(corpus: list[str], document_embeddings_hd: np.ndarray, document_embeddings_ld: np.ndarray, labels: np.ndarray, umap_mapper: umap.UMAP, vocab_embeddings: np.ndarray, vocab: list[str] = None, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"]) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided labels that indicate the topics (no -1 for outliers).
    The vocabulary is only computed from the corpus when none is passed in.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space.
        document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space.
        labels (np.ndarray): Labels indicating the topics.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        vocab (list[str], optional): Vocabulary of the corpus (default is None).
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).

    Returns:
        list[Topic]: List of Topic objects representing the extracted topics. Within a topic, the documents,
            their embeddings and their centroid similarities are all ordered by decreasing similarity to
            the low-dimensional centroid.

    Raises:
        ValueError: If topword_extraction_methods is empty or contains an unsupported method.
    """
    for elem in topword_extraction_methods:
        if elem not in ["tfidf", "cosine_similarity"]:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if len(topword_extraction_methods) == 0:
        raise ValueError("topword_extraction_methods cannot be empty")
    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    extractor = ExtractTopWords()
    if vocab is None:
        vocab = extractor.compute_corpus_vocab(corpus)  # compute the vocabulary of the corpus

    centroid_dict = extractor.extract_centroids(document_embeddings_hd, labels)  # get the centroids of the clusters
    centroid_arr = np.array(list(centroid_dict.values()))
    if centroid_arr.ndim == 1:
        centroid_arr = centroid_arr.reshape(-1, 1)
    # use the reshaped array here; it was previously computed but never passed on
    dim_red_centroids = umap_mapper.transform(centroid_arr)  # map the centroids to low dimensional space
    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False)  # compute the word-topic matrix of the corpus
    dim_red_centroid_dict = dict(zip(centroid_dict.keys(), dim_red_centroids))

    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)  # extract the top-words according to tfidf
    if use_cosine:
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict = dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # dont include outliers
            continue
        topic_idx = f"{label}"
        in_topic = labels == label
        documents = [doc for doc, doc_label in zip(corpus, labels) if doc_label == label]
        embeddings_hd = document_embeddings_hd[in_topic]
        embeddings_ld = document_embeddings_ld[in_topic]
        centroid_hd = centroid_dict[label]
        # the labels are not remapped to 0..k-1 in this function, so the reduced centroid
        # has to be looked up by label rather than by array position
        centroid_ld = dim_red_centroid_dict[label]

        centroid_similarity = np.dot(embeddings_ld, centroid_ld) / (np.linalg.norm(embeddings_ld, axis = 1) * np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[i] for i in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # reorder the similarities as well so that they stay aligned with the sorted documents
        centroid_similarity = centroid_similarity[similarity_sorting]

        # cosine_topwords only exists when cosine similarity was requested
        if use_cosine and isinstance(cosine_topwords[label], dict):
            cosine_topwords[label] = cosine_topwords[label][0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }
        topic = Topic(topic_idx = topic_idx,
                      documents = documents,
                      words = vocab,
                      centroid_hd = centroid_hd,
                      centroid_ld = centroid_ld,
                      document_embeddings_hd = embeddings_hd,
                      document_embeddings_ld = embeddings_ld,
                      document_embedding_similarity = centroid_similarity,
                      umap_mapper = umap_mapper,
                      top_words = top_words,
                      top_word_scores = top_word_scores
                      )
        topics.append(topic)
    return topics
@staticmethod
def extract_describe_topics_labels_vocab(
    corpus: list[str],
    document_embeddings_hd: np.ndarray,
    document_embeddings_ld: np.ndarray,
    labels: np.ndarray,
    umap_mapper: umap.UMAP,
    vocab_embeddings: np.ndarray,
    enhancer: TopwordEnhancement,
    vocab: list[str] = None,
    n_topwords: int = 2000,
    n_topwords_description: int = 500,
    topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"],
    topword_description_method: str = "cosine_similarity"
) -> list[Topic]:
    """
    Build topics from pre-computed labels, then name and describe them with the enhancer.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space.
        document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space.
        labels (np.ndarray): Labels indicating the topics (no -1 for outliers).
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        enhancer (TopwordEnhancement): Enhancer object used to generate the names and descriptions.
        vocab (list[str], optional): Vocabulary of the corpus (default is None).
        n_topwords (int, optional): Number of top-words to extract per topic (default is 2000).
        n_topwords_description (int, optional): Number of extracted top-words handed to the enhancer (default is 500).
        topword_extraction_methods (list[str], optional): Methods used to extract top-words.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        topword_description_method (str, optional): Which top-words the enhancer receives.
            Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity").

    Returns:
        list[Topic]: The extracted topics with name and description set.
    """
    extracted_topics = extract_topics_labels_vocab(
        corpus,
        document_embeddings_hd,
        document_embeddings_ld,
        labels,
        umap_mapper,
        vocab_embeddings,
        vocab = vocab,
        n_topwords = n_topwords,
        topword_extraction_methods = topword_extraction_methods,
    )
    return describe_and_name_topics(
        extracted_topics,
        enhancer,
        topword_method = topword_description_method,
        n_words = n_topwords_description,
    )
@staticmethod
def extract_topic_cos_sim(
    documents_topic: list[str],
    document_embeddings_topic: np.ndarray,
    words_topic: list[str],
    vocab_embeddings: dict,
    umap_mapper: umap.UMAP,
    n_topwords: int = 2000
) -> Topic:
    """
    Build a single Topic from the given documents and embeddings.
    The centroid is computed from the document embeddings and the top-words are extracted with cosine similarity only.

    Args:
        documents_topic (list[str]): List of documents in the topic.
        document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic.
        words_topic (list[str]): List of words in the topic.
        vocab_embeddings (dict): Embeddings of the vocabulary.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        n_topwords (int, optional): Number of top-words to extract (default is 2000).

    Returns:
        Topic: Topic object without an index (topic_idx is None) representing the extracted topic.
    """
    top_word_extractor = ExtractTopWords()

    hd_centroid = top_word_extractor.extract_centroid(document_embeddings_topic)
    ld_centroid = umap_mapper.transform(hd_centroid.reshape(1, -1))[0]

    # every document belongs to the one topic, which gets the label 0
    single_topic_labels = np.zeros(len(documents_topic), dtype = int)
    word_topic_mat = top_word_extractor.compute_word_topic_mat(documents_topic, words_topic, single_topic_labels, consider_outliers = False)
    cosine_topwords, cosine_scores = top_word_extractor.extract_topwords_centroid_similarity(
        word_topic_mat = word_topic_mat,
        vocab = words_topic,
        vocab_embedding_dict = vocab_embeddings,
        centroid_dict = {0: ld_centroid},
        umap_mapper = umap_mapper,
        top_n_words = n_topwords,
        reduce_vocab_embeddings = True,
        reduce_centroid_embeddings = False,
        consider_outliers = False,
    )

    ld_embeddings = umap_mapper.transform(document_embeddings_topic)
    # cosine similarity between each reduced document embedding and the reduced centroid
    norm_product = np.linalg.norm(ld_embeddings, axis = 1) * np.linalg.norm(ld_centroid)
    similarity_to_centroid = np.dot(ld_embeddings, ld_centroid) / norm_product

    return Topic(
        topic_idx = None,
        documents = documents_topic,
        words = words_topic,
        centroid_hd = hd_centroid,
        centroid_ld = ld_centroid,
        document_embeddings_hd = document_embeddings_topic,
        document_embeddings_ld = ld_embeddings,
        document_embedding_similarity = similarity_to_centroid,
        umap_mapper = umap_mapper,
        top_words = {"cosine_similarity": cosine_topwords},
        top_word_scores = {"cosine_similarity": cosine_scores},
    )
@staticmethod
def extract_and_describe_topic_cos_sim(
    documents_topic: list[str],
    document_embeddings_topic: np.ndarray,
    words_topic: list[str],
    vocab_embeddings: dict,
    umap_mapper: umap.UMAP,
    enhancer: TopwordEnhancement,
    n_topwords: int = 2000,
    n_topwords_description: int = 500
) -> Topic:
    """
    Create a Topic object from the given documents and embeddings by computing the centroid and the top-words.
    Only use cosine-similarity for top-word extraction.
    Describe and name the topic with the given enhancer object.

    Args:
        documents_topic (list[str]): List of documents in the topic.
        document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic.
        words_topic (list[str]): List of words in the topic.
        vocab_embeddings (dict): Embeddings of the vocabulary.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        enhancer (TopwordEnhancement): Enhancer object to enhance the top-words and generate the description.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        n_topwords_description (int, optional): Number of top-words to use from the extracted topics for the description and the name (default is 500).

    Returns:
        Topic: Topic object representing the extracted and described topic.
    """
    topic = extract_topic_cos_sim(documents_topic, document_embeddings_topic, words_topic, vocab_embeddings, umap_mapper, n_topwords)
    # the helper works on lists of topics, so wrap the single topic and unwrap the result
    topic = describe_and_name_topics([topic], enhancer, "cosine_similarity", n_topwords_description)[0]
    return topic
@staticmethod
def describe_and_name_topics(
    topics: list[Topic],
    enhancer: TopwordEnhancement,
    topword_method="tfidf",
    n_words=500
) -> list[Topic]:
    """
    Name and describe every topic with the given enhancer object.
    If generating the name or the description fails, the error is printed and both are requested once more;
    an error during that second attempt is not caught.

    Args:
        topics (list[Topic]): List of Topic objects.
        enhancer (TopwordEnhancement): Enhancer object that generates the names and descriptions.
        topword_method (str, optional): Which top-words of the topics to use. Can be "tfidf" or "cosine_similarity" (default is "tfidf").
        n_words (int, optional): Number of top-words handed to the enhancer (default is 500).

    Returns:
        list[Topic]: The same Topic objects with name and description set.

    Raises:
        ValueError: If topword_method is neither "tfidf" nor "cosine_similarity".
    """
    if topword_method not in ("tfidf", "cosine_similarity"):
        raise ValueError("topword_method can only be 'tfidf' or 'cosine_similarity'")

    def name_and_describe(topwords):
        # the name is requested before the description
        generated_name = enhancer.generate_topic_name_str(topwords, n_words = n_words)
        generated_description = enhancer.describe_topic_topwords_str(topwords, n_words = n_words)
        return generated_name, generated_description

    for current_topic in tqdm(topics):
        topwords = current_topic.top_words[topword_method]
        try:
            generated_name, generated_description = name_and_describe(topwords)
        except Exception as err:
            print(f"Error in topic {current_topic.topic_idx}: {err}")
            print("Trying again...")
            generated_name, generated_description = name_and_describe(topwords)
        current_topic.set_topic_name(generated_name)
        current_topic.set_topic_description(generated_description)
    return topics
@@ -0,0 +1,306 @@
import tiktoken
from openai import OpenAI
from typing import Callable
import numpy as np
# default system prompt of TopwordEnhancement
basic_instruction = (
    "You are a helpful assistant. "
    "You are excellent at inferring topics from top-words extracted via topic-modelling. "
    "You make sure that everything you output is strictly based on the provided text."
)
class TopwordEnhancement:
def __init__(
    self,
    client,
    openai_model: str = "gpt-3.5-turbo",
    max_context_length: int = 4000,
    openai_model_temperature: float = 0.5,
    basic_model_instruction: str = basic_instruction,
    corpus_instruction: str = "") -> None:
    """
    Initialize the TopwordEnhancement object with the specified parameters.

    Args:
        client: Client.
        openai_model (str, optional): The OpenAI model to use (default is "gpt-3.5-turbo").
        max_context_length (int, optional): The maximum length of the context for the OpenAI model (default is 4000).
        openai_model_temperature (float, optional): The temperature to use for the OpenAI model; 0 is allowed (default is 0.5).
        basic_model_instruction (str, optional): The basic instruction for the model.
        corpus_instruction (str, optional): The instruction for the corpus. Useful if specific information on the corpus is available.
            When empty, no corpus sentence is added to the system prompt.

    Returns:
        None

    Raises:
        AssertionError: If openai_model is None, max_context_length is not positive or openai_model_temperature is negative.
    """
    # do some checks on the input arguments
    assert openai_model is not None, "Please provide an openai model"
    assert max_context_length > 0, "Please provide a positive max_context_length"
    assert openai_model_temperature >= 0, "Please provide a non-negative openai_model_temperature"

    self.client = client
    self.openai_model = openai_model
    self.max_context_length = max_context_length
    self.openai_model_temperature = openai_model_temperature
    self.basic_model_instruction = basic_model_instruction
    if corpus_instruction:
        self.corpus_instruction = f" The following information is available about the corpus used to identify the topics: {corpus_instruction}"
    else:
        # without corpus information the sentence would end in a dangling colon
        self.corpus_instruction = ""
def __str__(self) -> str:
    """Return a short text that names the class and the configured model."""
    return f"TopwordEnhancement(openai_model = {self.openai_model})"
def __repr__(self) -> str:
    """Return a short text that names the class and the configured model."""
    return f"TopwordEnhancement(openai_model = {self.openai_model})"
def count_tokens_api_message(self, messages: list[dict[str]]) -> int:
    """
    Count the tokens of the "content" entries of the given API messages.
    All other entries of the messages are ignored.

    Args:
        messages (list[dict[str]]): List of messages from the API.

    Returns:
        int: Number of tokens in the messages, using the tiktoken encoding of the configured model.
    """
    tokenizer = tiktoken.encoding_for_model(self.openai_model)
    return sum(
        len(tokenizer.encode(text))
        for message in messages
        for field, text in message.items()
        if field == "content"
    )
def describe_topic_topwords_completion_object(self,
                                              topwords: list[str],
                                              n_words: int = None,
                                              query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Also describe the various aspects and sub-topics of the topic.") :
    """
    Describe the given topic based on its topwords using the OpenAI model.

    Args:
        topwords (list[str]): List of topwords. A dict is also accepted, in which case its entry under the key 0 is used.
        n_words (int, optional): Number of words to use for the query. If None, all words are used.
        query_function (Callable, optional): Function to query the model. The function should take a list of topwords and return a string.

    Returns:
        openai.ChatCompletion: A description of the topics by the model in the form of an OpenAI ChatCompletion object.
    """
    # unwrap a dict first, otherwise n_words would default to the number of dict keys
    # instead of the number of top-words
    if isinstance(topwords, dict):
        topwords = topwords[0]
    if n_words is None:
        n_words = len(topwords)
    topwords = np.array(topwords[:n_words])

    system_prompt = self.basic_model_instruction + " " + self.corpus_instruction
    encoding = tiktoken.encoding_for_model(self.openai_model)  # looked up once instead of once per word

    # if too many topwords are given, use only the first part of the topwords that fits into the context length
    tokens_cumsum = np.cumsum([len(encoding.encode(tw + ", ")) for tw in topwords]) + len(encoding.encode(system_prompt))
    # the size check avoids an IndexError when no top-words are given
    if tokens_cumsum.size > 0 and tokens_cumsum[-1] > self.max_context_length:
        n_words = np.argmax(tokens_cumsum > self.max_context_length)  # index of the first word that no longer fits
        print("Too many topwords given. Using only the first part of the topwords that fits into the context length. Number of topwords used: ", n_words)
        topwords = topwords[:n_words]

    completion = self.client.chat.completions.create(model=self.openai_model,
                                                     messages=[
                                                         {"role": "system", "content": system_prompt},
                                                         {"role": "user", "content": query_function(topwords)},
                                                     ],
                                                     temperature = self.openai_model_temperature)
    return completion
def describe_topic_topwords_str(self,
                                topwords: list[str],
                                n_words: int = None,
                                query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Also describe the various aspects and sub-topics of the topic. Make sure the descriptions are short and concise! Do not cite more than 5 words per sub-aspect!!!") -> str:
    """
    Query the model about the given topwords and return its answer as plain text.

    Args:
        topwords (list[str]): List of topwords.
        n_words (int, optional): Number of words to use for the query. If None, all words are used.
        query_function (Callable, optional): Function that builds the query. It should take a list of topwords and return a string.

    Returns:
        str: Message content of the first choice of the completion returned by the model.
    """
    first_choice = self.describe_topic_topwords_completion_object(topwords, n_words, query_function).choices[0]
    return first_choice.message.content
def generate_topic_name_str(self,
                            topwords: list[str],
                            n_words: int = None,
                            query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Give me only the title of the topic and nothing else please. Make sure the title is precise and not longer than 5 words, ideally even shorter.") -> str:
    """
    Ask the model for a short title that fits the given topwords.

    Args:
        topwords (list[str]): List of topwords.
        n_words (int, optional): Number of words to use for the query. If None, all words are used.
        query_function (Callable, optional): Function that builds the query. It should take a list of topwords and return a string.

    Returns:
        str: The answer of the model as a string.
    """
    return self.describe_topic_topwords_str(
        topwords = topwords,
        n_words = n_words,
        query_function = query_function,
    )
def describe_topic_documents_completion_object(self,
                                               documents: list[str],
                                               truncate_doc_thresh=100,
                                               n_documents: int = None,
                                               query_function: Callable = lambda docs: f"Please give me the common topic of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic."):
    """
    Describe the given topic based on its documents using the OpenAI model.

    The documents are first limited to ``n_documents``, then each one is truncated to
    ``truncate_doc_thresh`` space-separated words, and finally the list is cut so that the
    system prompt plus the documents stay within ``self.max_context_length`` tokens.

    Args:
        documents (list[str]): List of documents.
        truncate_doc_thresh (int, optional): Maximum number of space-separated words kept per document.
        n_documents (int, optional): Number of documents to use for the query. If None, all documents are used.
        query_function (Callable, optional): Builds the user prompt. It takes a list of documents and returns a string.
    Returns:
        The completion object returned by ``self.client.chat.completions.create``.
    """
    if n_documents is None:
        n_documents = len(documents)
    documents = documents[:n_documents]

    # Truncate each document by space-separated words (not by model tokens).
    # NOTE(review): text without spaces (e.g. unsegmented Chinese) counts as one "word"
    # here and is therefore never shortened - confirm this is acceptable for the corpus.
    documents = [" ".join(doc.split(" ")[:truncate_doc_thresh]) for doc in documents]

    system_prompt = self.basic_model_instruction + " " + self.corpus_instruction

    # Resolve the tokenizer once; it depends only on the model, not on the document.
    encoding = tiktoken.encoding_for_model(self.openai_model)
    prompt_tokens = len(encoding.encode(system_prompt))
    tokens_cumsum = np.cumsum([len(encoding.encode(doc + ", ")) for doc in documents]) + prompt_tokens

    # If too many documents are given, keep only the leading part that fits into the context length.
    # The size check avoids an IndexError on `[-1]` when no documents are left at this point.
    if tokens_cumsum.size > 0 and tokens_cumsum[-1] > self.max_context_length:
        # argmax on the boolean mask yields the index of the first document that overflows.
        n_documents = int(np.argmax(tokens_cumsum > self.max_context_length))
        print("Too many documents given. Using only the first part of the documents that fits into the context length. Number of documents used: ", n_documents)
        documents = documents[:n_documents]

    completion = self.client.chat.completions.create(
        model=self.openai_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query_function(documents)},
        ],
        temperature=self.openai_model_temperature)
    return completion
@staticmethod
def sample_identity(n_docs: int) -> np.ndarray:
"""
Generate an identity array of document indices without changing their order.
Args:
n_docs (int): Number of documents.
Returns:
np.ndarray: An array containing document indices from 0 to (n_docs - 1).
"""
return np.arange(n_docs)
@staticmethod
def sample_uniform(n_docs: int) -> np.ndarray:
"""
Randomly sample document indices without replacement.
Args:
n_docs (int): Number of documents.
Returns:
np.ndarray: An array containing randomly permuted document indices from 0 to (n_docs - 1).
"""
return np.random.permutation(n_docs)
@staticmethod
def sample_poisson(n_docs: int) -> np.ndarray:
"""
Randomly sample document indices according to a Poisson distribution, favoring documents from the beginning of the list.
Args:
n_docs (int): Number of documents.
Returns:
np.ndarray: An array containing randomly permuted document indices, with more documents drawn from the beginning of the list.
"""
return np.random.poisson(1, n_docs)
def describe_topic_documents_sampling_completion_object(
        self,
        documents: list[str],
        truncate_doc_thresh=100,
        n_documents: int = None,
        query_function: Callable = lambda docs: f"Please give me the common topic of the sample of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic.",
        sampling_strategy: str = None,):
    """
    Describe a topic based on a sample of its documents by using the openai model.

    Args:
        documents (list[str]): List of documents ordered by similarity to the topic's centroid.
        truncate_doc_thresh (int, optional): Threshold for the number of words in a document. If a document exceeds this threshold, it is truncated. Defaults to 100.
        n_documents (int, optional): Number of documents to use for the query. If None, all documents are used. Values larger than ``len(documents)`` are reduced to ``len(documents)``. Defaults to None.
        query_function (Callable, optional): Function to query the model. Defaults to a lambda function generating a query based on the provided documents.
        sampling_strategy (Union[Callable, str], optional): Strategy to sample the documents. If None, the first provided documents are used.
            A string selects one of the samplers of this class: "topk" or "identity" (``self.sample_identity``), "uniform" (``self.sample_uniform``), "poisson" (``self.sample_poisson``); the full method name (e.g. "sample_uniform") is accepted as well.
            Any other value is treated as a custom sampling function that takes the number of documents and returns an iterable of indices. Defaults to None.
    Returns:
        The completion object returned by ``self.describe_topic_documents_completion_object`` for the sampled documents.
    Raises:
        ValueError: If ``sampling_strategy`` is a string that names no known strategy.
    """
    # Honor the documented default: no explicit count means "use every document".
    if n_documents is None:
        n_documents = len(documents)
    # Never ask a sampler for more indices than there are documents to pick from.
    n_documents = min(n_documents, len(documents))

    if sampling_strategy is None:
        # Documented default: take the first documents in their given order.
        sampling_strategy = self.sample_identity
    elif isinstance(sampling_strategy, str):
        sampler_names = {
            "topk": "sample_identity",
            "identity": "sample_identity",
            "uniform": "sample_uniform",
            "poisson": "sample_poisson",
        }
        strategy_key = sampling_strategy
        if strategy_key.startswith("sample_"):
            strategy_key = strategy_key[len("sample_"):]
        if strategy_key not in sampler_names:
            # Fail with a clear message instead of "'str' object is not callable" further down.
            raise ValueError(
                f"Unknown sampling strategy {sampling_strategy!r}; expected one of {sorted(sampler_names)} or a callable."
            )
        sampling_strategy = getattr(self, sampler_names[strategy_key])

    new_documents = [documents[i] for i in sampling_strategy(n_documents)]
    return self.describe_topic_documents_completion_object(new_documents, truncate_doc_thresh, n_documents, query_function)
def describe_topic_document_sampling_str(
        self,
        documents: list[str],
        truncate_doc_thresh=100,
        n_documents: int = None,
        query_function: Callable = lambda docs: f"Please give me the common topic of the sample of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic.",
        sampling_strategy: str = None,) -> str:
    """
    Describe a topic based on a sample of its documents by using the openai model.

    Forwards all arguments to ``describe_topic_documents_sampling_completion_object`` and
    returns the message text of the first choice of the resulting completion.

    Args:
        documents (list[str]): List of documents ordered by similarity to the topic's centroid.
        truncate_doc_thresh (int, optional): Threshold for the number of words in a document. If a document exceeds this threshold, it is truncated. Defaults to 100.
        n_documents (int, optional): Number of documents to use for the query; forwarded unchanged. Defaults to None.
        query_function (Callable, optional): Function to query the model. Defaults to a lambda function generating a query based on the provided documents.
        sampling_strategy (Union[Callable, str], optional): Strategy to sample the documents; forwarded unchanged. Defaults to None.
    Returns:
        str: A description of the topic by the model in the form of a string.
    """
    # The completion method is spelled "documents" (plural); the singular spelling used
    # here before does not exist on the class and raised AttributeError.
    completion = self.describe_topic_documents_sampling_completion_object(documents, truncate_doc_thresh, n_documents, query_function, sampling_strategy)
    return completion.choices[0].message.content
@@ -0,0 +1 @@
# Version string, stored in the conventional `__version__` module attribute.
__version__ = '0.0.5'