The LLM-based topic recognition model is complete and has been adapted to handle rapidly changing Weibo topics.
This commit is contained in:
@@ -0,0 +1,12 @@
|
||||
class Client:
    """Thin wrapper that owns either an ``OpenAI`` or an ``AzureOpenAI`` client.

    Any attribute that is not found on the wrapper itself is looked up on the
    wrapped client object, so the wrapper can be used in place of the client.
    """

    def __init__(self, api_key: str, azure_endpoint: dict = None) -> None:
        """
        Create the wrapped client.

        Args:
            api_key (str): API key handed to the underlying client.
            azure_endpoint (dict, optional): If given (and non-empty), an ``AzureOpenAI`` client is
                created; the dictionary must provide the keys ``'api_version'`` and ``'endpoint'``.
                Otherwise a plain ``OpenAI`` client is created.
        """
        if azure_endpoint:
            # Imported lazily so that only the client flavour that is actually used is required.
            from openai import AzureOpenAI
            self.client = AzureOpenAI(
                api_key=api_key,
                api_version=azure_endpoint['api_version'],
                azure_endpoint=azure_endpoint['endpoint'],
            )
        else:
            from openai import OpenAI
            self.client = OpenAI(api_key=api_key)

    def __getattr__(self, name):
        """Delegate attribute access to the self.client object."""
        # __getattr__ is only called when normal lookup fails. If "client" itself is missing
        # (bare instance, copy/unpickle before __dict__ is restored), reading self.client below
        # would call __getattr__ again and recurse without end, so fail explicitly instead.
        if name == "client":
            raise AttributeError(name)
        return getattr(self.client, name)
|
||||
@@ -0,0 +1,286 @@
|
||||
import numpy as np
|
||||
import umap
|
||||
import hdbscan
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
import umap.plot
|
||||
from copy import deepcopy
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
class Clustering_and_DimRed():
|
||||
|
||||
"""
|
||||
Class to perform dimensionality reduction with UMAP followed by clustering with HDBSCAN.
|
||||
"""
|
||||
def __init__(self,
             n_dims_umap: int = 5,
             n_neighbors_umap: int = 15,
             min_dist_umap: float = 0,
             metric_umap: str = "cosine",
             min_cluster_size_hdbscan: int = 30,
             metric_hdbscan: str = "euclidean",
             cluster_selection_method_hdbscan: str = "eom",
             number_clusters_hdbscan: int = None,
             random_state: int = 42,
             verbose: bool = True,
             UMAP_hyperparams: dict = None,
             HDBSCAN_hyperparams: dict = None) -> None:
    """
    Initializes the clustering and dimensionality reduction parameters for topic modeling.

    Args:
        n_dims_umap (int, optional): Number of dimensions to reduce to using UMAP.
        n_neighbors_umap (int, optional): Number of neighbors for UMAP.
        min_dist_umap (float, optional): Minimum distance for UMAP.
        metric_umap (str, optional): Metric for UMAP.
        min_cluster_size_hdbscan (int, optional): Minimum cluster size for HDBSCAN.
        metric_hdbscan (str, optional): Metric for HDBSCAN.
        cluster_selection_method_hdbscan (str, optional): Cluster selection method for HDBSCAN.
        number_clusters_hdbscan (int, optional): Number of clusters for HDBSCAN. If None, HDBSCAN will determine the number of clusters automatically. Ensure that min_cluster_size is not too large to find enough clusters.
        random_state (int, optional): Random state; it is passed to UMAP.
        verbose (bool, optional): Whether to print progress.
        UMAP_hyperparams (dict, optional): Additional hyperparameters for UMAP. The explicit arguments above take precedence over equally named keys.
        HDBSCAN_hyperparams (dict, optional): Additional hyperparameters for HDBSCAN. The explicit arguments above take precedence over equally named keys.
    """

    # do some checks on the input arguments
    assert n_dims_umap > 0, "n_dims_umap must be greater than 0"
    assert n_neighbors_umap > 0, "n_neighbors_umap must be greater than 0"
    assert min_dist_umap >= 0, "min_dist_umap must be greater than or equal to 0"
    assert min_cluster_size_hdbscan > 0, "min_cluster_size_hdbscan must be greater than 0"
    assert number_clusters_hdbscan is None or number_clusters_hdbscan > 0, "number_clusters_hdbscan must be greater than 0 or None"
    assert random_state is None or random_state >= 0, "random_state must be greater than or equal to 0"

    self.random_state = random_state
    self.verbose = verbose

    # Work on private copies: the dictionaries are updated below, and writing into the caller's
    # dictionary (or into a shared default) would leak settings between instances.
    self.UMAP_hyperparams = dict(UMAP_hyperparams) if UMAP_hyperparams else {}
    self.HDBSCAN_hyperparams = dict(HDBSCAN_hyperparams) if HDBSCAN_hyperparams else {}

    # update hyperparameters for UMAP
    self.UMAP_hyperparams["n_components"] = n_dims_umap
    self.UMAP_hyperparams["n_neighbors"] = n_neighbors_umap
    self.UMAP_hyperparams["min_dist"] = min_dist_umap
    self.UMAP_hyperparams["metric"] = metric_umap
    self.UMAP_hyperparams["random_state"] = random_state
    self.UMAP_hyperparams["verbose"] = verbose
    self.umap = umap.UMAP(**self.UMAP_hyperparams)

    # update hyperparameters for HDBSCAN
    self.HDBSCAN_hyperparams["min_cluster_size"] = min_cluster_size_hdbscan
    self.HDBSCAN_hyperparams["metric"] = metric_hdbscan
    self.HDBSCAN_hyperparams["cluster_selection_method"] = cluster_selection_method_hdbscan
    self.number_clusters_hdbscan = number_clusters_hdbscan
    self.hdbscan = hdbscan.HDBSCAN(**self.HDBSCAN_hyperparams)
|
||||
|
||||
|
||||
def reduce_dimensions_umap(self, embeddings: np.ndarray) -> Tuple[np.ndarray, umap.UMAP]:
    """
    Fit a new UMAP model on the embeddings and return the row-normalized projection.

    Args:
        embeddings (np.ndarray): Embeddings to reduce.

    Returns:
        tuple: A tuple containing two items:
            - reduced_embeddings (np.ndarray): Reduced embeddings, each row scaled to unit L2 norm.
            - umap_mapper (umap.UMAP): Fitted UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER)
    """

    # A fresh model is built from the stored hyperparameters on every call.
    umap_mapper = umap.UMAP(**self.UMAP_hyperparams).fit(embeddings)
    reduced = umap_mapper.transform(embeddings)

    # Scale every row to unit length.
    row_norms = np.linalg.norm(reduced, axis=1).reshape(-1, 1)
    reduced = reduced / row_norms

    return reduced, umap_mapper
|
||||
|
||||
def cluster_hdbscan(self, embeddings: np.ndarray) -> np.ndarray:
    """
    Cluster embeddings using HDBSCAN.

    If self.number_clusters_hdbscan is not None, the data is additionally clustered with
    AgglomerativeClustering into that many clusters; points that HDBSCAN marked as outliers
    keep the label -1.

    Args:
        embeddings (np.ndarray): Embeddings to cluster.

    Returns:
        np.ndarray: Cluster labels, renumbered to consecutive integers starting at 0. -1 marks outliers.
    """

    labels = self.hdbscan.fit_predict(embeddings)
    outlier_positions = np.where(labels == -1)[0]

    if self.number_clusters_hdbscan is not None:
        agglomerative = AgglomerativeClustering(n_clusters=self.number_clusters_hdbscan)
        labels = agglomerative.fit_predict(embeddings)
        # HDBSCAN's outlier decision overrides the agglomerative assignment.
        labels[outlier_positions] = -1

    # Renumber the remaining cluster ids so that they are consecutive; -1 stays reserved for outliers.
    cluster_ids = [cluster_id for cluster_id in np.unique(labels) if cluster_id != -1]
    relabel = dict(zip(cluster_ids, range(len(cluster_ids))))
    relabel[-1] = -1

    return np.array([relabel[old_label] for old_label in labels])
|
||||
|
||||
def cluster_and_reduce(self, embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray, umap.UMAP]:
    """
    Reduce the embeddings with UMAP and cluster the reduced embeddings with HDBSCAN.

    Args:
        embeddings (np.ndarray): Embeddings to cluster and reduce.

    Returns:
        tuple: A tuple containing three items:
            - reduced_embeddings (np.ndarray): Reduced embeddings.
            - cluster_labels (np.ndarray): Cluster labels.
            - umap_mapper (umap.UMAP): UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER)
    """

    reduced_embeddings, mapper = self.reduce_dimensions_umap(embeddings)
    # Clustering runs on the reduced representation, not on the raw embeddings.
    cluster_labels = self.cluster_hdbscan(reduced_embeddings)
    return reduced_embeddings, cluster_labels, mapper
|
||||
|
||||
def visualize_clusters_static(self, embeddings: np.ndarray, labels: np.ndarray):
    """
    Reduce dimensionality with UMAP to two dimensions and plot the clusters.

    Args:
        embeddings (np.ndarray): Embeddings for which to plot clustering.
        labels (np.ndarray): Cluster labels. -1 marks outliers, which are drawn in grey.
    """

    # Reduce dimensionality with UMAP
    reducer = umap.UMAP(n_components=2, random_state=self.random_state, n_neighbors=30, metric="cosine", min_dist=0)
    embeddings_2d = reducer.fit_transform(embeddings)

    # Boolean masks below need an array; a plain list would compare as a whole.
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)

    # Create a color palette, then map the labels to the colors.
    # We add one to the number of unique labels to account for the noise points labelled as -1.
    # plt.get_cmap is used because matplotlib.cm.get_cmap is no longer available in recent Matplotlib.
    palette = plt.get_cmap("tab20", len(unique_labels) + 1)

    # Create a new figure
    _, ax = plt.subplots(figsize=(10, 8))

    # Iterate through all unique labels (clusters and outliers)
    for label in unique_labels:
        # Find the embeddings that are part of this cluster
        cluster_points = embeddings_2d[labels == label]

        if label == -1:
            # Outliers are the only group that gets a legend entry.
            ax.scatter(cluster_points[:, 0], cluster_points[:, 1], color='grey', label='outlier', s=0.1)
        else:
            # `color=` (not `c=`) so that a single RGBA tuple is never mistaken for per-point values.
            ax.scatter(cluster_points[:, 0], cluster_points[:, 1], color=palette(int(label)), s=0.1)

    # Add a legend only when there is a labelled artist to show
    if -1 in unique_labels:
        ax.legend()

    # Show the plot
    plt.show()
|
||||
|
||||
|
||||
def visualize_clusters_dynamic(self, embeddings: np.ndarray, labels: np.ndarray, texts: list[str], class_names: list[str] = None):
    """
    Visualize clusters using Plotly and enable hovering over clusters to see the beginning of the texts of the documents.

    Args:
        embeddings (np.ndarray): Embeddings for which to visualize clustering.
        labels (np.ndarray): Cluster labels.
        texts (list[str]): Texts of the documents. Only the first 200 characters of each text are shown.
        class_names (list[str], optional): Names of the classes, indexed by label.
    """

    # Reduce dimensionality with UMAP
    reducer = umap.UMAP(n_components=2, random_state=self.random_state, n_neighbors=30, metric="cosine", min_dist=0)
    embeddings_2d = reducer.fit_transform(embeddings)

    df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
    df['text'] = [text[:200] for text in texts]
    df["class"] = labels

    if class_names is not None:
        # NOTE(review): a label of -1 selects class_names[-1] here, and the color map below is keyed
        # by the numeric labels rather than by these names - confirm this is what callers expect.
        df["class"] = [class_names[label] for label in labels]

    # Create a color palette, then map the labels to the colors.
    # Exclude the outlier (-1) label from color palette assignment
    cluster_labels = [label for label in np.unique(labels) if label != -1]
    # plt.get_cmap is used because matplotlib.cm.get_cmap is no longer available in recent Matplotlib.
    # At least one palette entry is requested so that an outlier-only input does not ask for zero colors.
    palette = plt.get_cmap("tab20", max(len(cluster_labels), 1))

    # Create color map
    color_discrete_map = {
        label: 'rgb' + str(tuple(int(val * 255) for val in palette(i)[:3]))
        for i, label in enumerate(cluster_labels)
    }
    color_discrete_map[-1] = 'grey'

    # plot data points where the color represents the class
    fig = px.scatter(df, x='x', y='y', hover_data=['text', 'class'], color='class', color_discrete_map=color_discrete_map)

    fig.update_traces(mode='markers', marker=dict(size=3))  # Optional: Increase the marker size

    # make the plot square
    fig.update_layout(
        autosize=False,
        width=1500,
        height=1500,
        margin=dict(
            l=50,
            r=50,
            b=100,
            t=100,
            pad=4
        )
    )
    # set title
    fig.update_layout(title_text='UMAP projection of the document embeddings', title_x=0.5)

    # show plot
    fig.show()
|
||||
|
||||
|
||||
def umap_diagnostics(self, embeddings, hammer_edges = False):
    """
    Fit a two-dimensional UMAP model on the embeddings and show its diagnostic plots.

    The stored UMAP hyperparameters are reused (on a copy), with the number of
    components overridden to 2.

    Params:
    ------
    embeddings : array-like
        The high-dimensional data for UMAP to reduce and visualize.
    hammer_edges : bool, default False
        Whether to additionally draw the connectivity plot with hammer edge bundling.
        Is computationally expensive.
    """
    params_2d = deepcopy(self.UMAP_hyperparams)
    params_2d["n_components"] = 2
    mapper = umap.UMAP(**params_2d).fit(embeddings)

    # 1. Connectivity plot with points
    print("UMAP Connectivity Plot with Points")
    umap.plot.connectivity(mapper, show_points=True)
    plt.show()

    # 2. Connectivity plot with edge bundling (optional)
    if hammer_edges:
        print("UMAP Connectivity Plot with Hammer Edge Bundling")
        umap.plot.connectivity(mapper, edge_bundling='hammer')
        plt.show()

    # 3. PCA diagnostic plot, 4. local dimension diagnostic plot
    diagnostics = (
        ("UMAP PCA Diagnostic Plot", 'pca'),
        ("UMAP Local Dimension Diagnostic Plot", 'local_dim'),
    )
    for heading, diagnostic_type in diagnostics:
        print(heading)
        umap.plot.diagnostic(mapper, diagnostic_type=diagnostic_type)
        plt.show()
|
||||
@@ -0,0 +1,429 @@
|
||||
import nltk
|
||||
import string
|
||||
import collections
|
||||
from tqdm import tqdm
|
||||
from typing import List
|
||||
import numpy as np
|
||||
import re
|
||||
from nltk.tokenize import word_tokenize
|
||||
import umap
|
||||
from collections import Counter
|
||||
import warnings
|
||||
|
||||
from typing import List
|
||||
|
||||
# make sure the import works even if the package has not been installed and just the files are used
|
||||
try:
    from topicgpt.GetEmbeddingsOpenAI import GetEmbeddingsOpenAI
except ImportError:
    # The package is not installed; fall back to the module lying next to this file.
    # Only import failures are handled so that any other error is not silently hidden.
    from GetEmbeddingsOpenAI import GetEmbeddingsOpenAI
|
||||
|
||||
nltk.download('stopwords', quiet=True) # download stopwords
|
||||
nltk.download('punkt', quiet=True) # download tokenizer
|
||||
|
||||
class ExtractTopWords:
|
||||
|
||||
def extract_centroids(self, embeddings: np.ndarray, labels: np.ndarray) -> dict:
    """
    Compute the mean embedding of every cluster.

    Args:
        embeddings (np.ndarray): Embeddings, one row per document.
        labels (np.ndarray): Cluster labels. -1 means outlier; no centroid is computed for it.

    Returns:
        dict: Dictionary mapping each cluster label to its centroid.
    """

    return {
        cluster: np.mean(embeddings[labels == cluster], axis=0)
        for cluster in np.unique(labels)
        if cluster != -1
    }
|
||||
|
||||
def extract_centroid(self, embeddings: np.ndarray) -> np.ndarray:
    """
    Compute the centroid of a single cluster.

    Args:
        embeddings (np.ndarray): Embeddings of the cluster members, one row per member.

    Returns:
        np.ndarray: Mean of the embeddings along the first axis.
    """

    centroid = np.mean(embeddings, axis=0)
    return centroid
|
||||
|
||||
def compute_centroid_similarity(self, embeddings: np.ndarray, centroid_dict: dict, cluster_label: int) -> np.ndarray:
    """
    Compute the similarity of the document embeddings to the centroid of the cluster via cosine similarity.

    Args:
        embeddings (np.ndarray): Document embeddings, one row per document.
        centroid_dict (dict): Dictionary of cluster labels and their centroids.
        cluster_label (int): Cluster label for which to compute the similarity.

    Returns:
        np.ndarray: Cosine similarity of each document embedding to the centroid of the cluster.
    """

    centroid = centroid_dict[cluster_label]
    # The norm has to be taken per document (last axis). A norm over the whole matrix would only
    # rescale the dot products by one constant and would not yield cosine similarities.
    document_norms = np.linalg.norm(embeddings, axis=-1)
    similarity = np.dot(embeddings, centroid) / (document_norms * np.linalg.norm(centroid))
    return similarity
|
||||
|
||||
def get_most_similar_docs(self, corpus: list[str], embeddings: np.ndarray, labels: np.ndarray, centroid_dict: dict, cluster_label: int, top_n: int = 10) -> List[str]:
    """
    Get the most similar documents to the centroid of a cluster.

    All documents of the corpus are ranked; the ranking is not restricted to members of the cluster.

    Args:
        corpus (list[str]): List of documents.
        embeddings (np.ndarray): Document embeddings, aligned with corpus.
        labels (np.ndarray): Cluster labels. -1 means outlier. Currently not used by this method.
        centroid_dict (dict): Dictionary of cluster labels and their centroids.
        cluster_label (int): Cluster label for which to compute the similarity.
        top_n (int, optional): Number of top documents to extract.

    Returns:
        List[str]: The top_n documents, ordered from most to least similar to the centroid.
    """

    similarity = self.compute_centroid_similarity(embeddings, centroid_dict, cluster_label)
    # Reverse first, then cut: slicing with [-top_n:] would return every document for top_n == 0.
    ranked_indices = np.argsort(similarity)[::-1][:top_n]
    return [corpus[i] for i in ranked_indices]
|
||||
|
||||
def compute_corpus_vocab(self,
                         corpus: list[str],
                         remove_stopwords: bool = True,
                         remove_punction: bool = True,
                         min_word_length: int = 3,
                         max_word_length: int = 20,
                         remove_short_words: bool = True,
                         remove_numbers: bool = True,
                         verbose: bool = True,
                         min_doc_frequency: int = 3,
                         min_freq: float = 0.1,
                         max_freq: float = 0.9) -> list[str]:
    """
    Compute the vocabulary of the corpus and perform preprocessing of the corpus.

    Args:
        corpus (list[str]): List of documents.
        remove_stopwords (bool, optional): Whether to remove (English) stopwords.
        remove_punction (bool, optional): Whether to remove punctuation.
        min_word_length (int, optional): Minimum word length to retain.
        max_word_length (int, optional): Maximum word length to retain.
        remove_short_words (bool, optional): Whether to remove words shorter than min_word_length.
        remove_numbers (bool, optional): Whether to remove tokens that contain a digit.
        verbose (bool, optional): Whether to print progress and describe what is happening.
        min_doc_frequency (int, optional): Minimum number of documents a word should appear in to be considered in the vocabulary.
        min_freq (float, optional): Lower quantile (between 0 and 1) of the word frequencies; rarer words are dropped.
        max_freq (float, optional): Upper quantile (between 0 and 1) of the word frequencies; more frequent words are dropped.

    Returns:
        list[str]: Lower-cased words of the vocabulary, sorted alphabetically. Empty if no word survives the filtering.
    """

    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Compile the patterns once instead of looking them up for every token.
    contains_digit = re.compile(r'\d')
    contains_letter = re.compile('[a-zA-Z]')

    word_counter = collections.Counter()
    doc_frequency = collections.defaultdict(set)

    for doc_id, doc in enumerate(tqdm(corpus, disable=not verbose, desc="Processing corpus")):
        for word in nltk.word_tokenize(doc):
            if remove_punction and word in string.punctuation:
                continue
            if remove_stopwords and word.lower() in stopwords:
                continue
            if remove_numbers and contains_digit.search(word):
                continue
            if not contains_letter.search(word):  # word must contain at least one alphabetic character
                continue
            # remove words that do not begin with an alphabetic character
            if not word[0].isalpha():
                continue
            if len(word) > max_word_length or (remove_short_words and len(word) < min_word_length):
                continue

            word_lower = word.lower()
            word_counter[word_lower] += 1
            doc_frequency[word_lower].add(doc_id)

    # print most common words and their frequencies
    if verbose:
        print("Most common words in the vocabulary:")
        for word, count in word_counter.most_common(10):
            print(f"{word}: {count}")

    # Nothing survived the filters (e.g. empty corpus): quantiles of an empty array are undefined.
    if not word_counter:
        return []

    total_words = sum(word_counter.values())
    freq_counter = {word: count / total_words for word, count in word_counter.items()}

    freq_arr = np.array(list(freq_counter.values()))
    min_freq_value = np.quantile(freq_arr, min_freq, method="lower")
    max_freq_value = np.quantile(freq_arr, max_freq, method="higher")

    # Keep words inside the frequency band that also occur in enough distinct documents.
    vocab = {
        word for word, freq in freq_counter.items()
        if min_freq_value <= freq <= max_freq_value
        and len(doc_frequency[word]) >= min_doc_frequency
    }

    # Sorting the vocabulary alphabetically
    return sorted(vocab)
|
||||
|
||||
def compute_words_topics(self, corpus: list[str], vocab: list[str], labels: np.ndarray) -> dict:
    """
    Collect, for every topic, the vocabulary words that occur in its documents.

    Args:
        corpus (list[str]): List of documents.
        vocab (list[str]): List of words in the corpus sorted alphabetically.
        labels (np.ndarray): Cluster labels. -1 means outlier; outlier documents are skipped.

    Returns:
        dict: Dictionary mapping each topic label to the list of (lower-cased) vocabulary words
        found in its documents, with repetitions.
    """

    # Download NLTK resources (only required once)
    nltk.download("punkt")
    known_words = set(vocab)

    topic_ids = [topic for topic in np.unique(labels) if topic != -1]
    words_per_topic = {topic: [] for topic in topic_ids}

    progress = tqdm(zip(corpus, labels), desc="Computing words per topic", total=len(corpus))
    for document, topic in progress:
        if topic == -1:
            continue
        lowered_tokens = (token.lower() for token in word_tokenize(document))
        words_per_topic[topic].extend(token for token in lowered_tokens if token in known_words)

    return words_per_topic
|
||||
|
||||
def embed_vocab_openAI(self, client, vocab: list[str], embedder: GetEmbeddingsOpenAI = None) -> dict[str, np.ndarray]:
    """
    Embed the vocabulary using the OpenAI embedding API.

    Args:
        client: Client.
        vocab (list[str]): List of words in the corpus. Duplicates are removed and the words are sorted before embedding.
        embedder (GetEmbeddingsOpenAI, optional): Embedding object. If None, a new one is created from client.

    Returns:
        dict[str, np.ndarray]: Dictionary of words and their embeddings.
    """

    vocab = sorted(set(vocab))
    if embedder is None:
        # GetEmbeddingsOpenAI is imported as the class itself, so it is instantiated directly.
        embedder = GetEmbeddingsOpenAI(client)
    result = embedder.get_embeddings(vocab)

    # Pair each word with the embedding at the same position.
    return dict(zip(vocab, result["embeddings"]))
|
||||
|
||||
def compute_bow_representation(self, document: str, vocab: list[str], vocab_set: set[str] = None) -> np.ndarray:
    """
    Compute the bag-of-words representation of a document.

    Args:
        document (str): Document to compute the bag-of-words representation of.
        vocab (list[str]): List of words in the corpus sorted alphabetically.
        vocab_set (set[str], optional): Set of the words to count. If None, it is built from vocab.

    Returns:
        np.ndarray: Bag-of-words representation of the document; entry i counts occurrences of vocab[i].
    """

    if vocab_set is None:
        vocab_set = set(vocab)

    # Position lookup built once, instead of a linear vocab.index() scan for every token.
    # setdefault keeps the first position of a word, which is what vocab.index() returns.
    position = {}
    for idx, vocab_word in enumerate(vocab):
        position.setdefault(vocab_word, idx)

    bow = np.zeros(len(vocab))
    for token in word_tokenize(document):
        token = token.lower()
        if token in vocab_set:
            bow[position[token]] += 1
    return bow
|
||||
|
||||
def compute_word_topic_mat_old(self, corpus: list[str], vocab: list[str], labels: np.ndarray, consider_outliers: bool = False) -> np.ndarray:
    """
    Compute the word-topic matrix.

    Args:
        corpus (list[str]): List of documents.
        vocab (list[str]): List of words in the corpus sorted alphabetically.
        labels (np.ndarray): Cluster labels. -1 means outlier.
        consider_outliers (bool, optional): Whether to consider outliers when computing the top words. I.e. whether the labels contain -1 to indicate outliers. Outlier documents are skipped in either case, and the matrix has the same shape in both cases.

    Returns:
        np.ndarray: Word-topic matrix with one row per vocabulary word and one column per unique label; a document with label k is added to column k.
    """

    # The shape has to be passed as one tuple; a second positional argument would be read as dtype.
    # The column count equals the number of unique labels in both modes: this is what the previous
    # expression evaluated to when outliers are ignored, and it keeps every non-negative label in
    # range even if no outlier is present.
    n_columns = len(np.unique(labels))
    word_topic_mat = np.zeros((len(vocab), n_columns))

    vocab_set = set(vocab)
    for i, doc in tqdm(enumerate(corpus), desc="Computing word-topic matrix", total=len(corpus)):
        if labels[i] > - 0.5:  # skip outliers (label -1)
            bow = self.compute_bow_representation(doc, vocab, vocab_set)
            idx_to_add = labels[i]
            word_topic_mat[:, idx_to_add] += bow

    return word_topic_mat
|
||||
|
||||
def compute_word_topic_mat(self, corpus: list[str], vocab: list[str], labels: np.ndarray, consider_outliers=False) -> np.ndarray:
    """
    Compute the word-topic matrix efficiently.

    Args:
        corpus (list[str]): List of documents.
        vocab (list[str]): List of words in the corpus, sorted alphabetically.
        labels (np.ndarray): Cluster labels. -1 indicates outliers.
        consider_outliers (bool, optional): Whether to consider outliers when computing the top words. Defaults to False. Currently has no effect: a column is produced for every unique label, including -1 if present.

    Returns:
        np.ndarray: Word-topic matrix with one row per vocabulary word and one column per unique label, in sorted label order.
    """

    corpus_arr = np.array(corpus)
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)

    word_topic_mat = np.zeros((len(vocab), len(unique_labels)))

    for col, label in tqdm(enumerate(unique_labels), desc="Computing word-topic matrix", total=len(unique_labels)):
        # Tokenize all documents of the topic at once.
        topic_text = " ".join(corpus_arr[labels == label])
        # Lower-case the tokens so that they match the lower-cased vocabulary
        # (the other counting methods of this class do the same).
        token_counts = Counter(token.lower() for token in word_tokenize(topic_text))

        word_topic_mat[:, col] = np.array([token_counts.get(word, 0) for word in vocab])

    return word_topic_mat
|
||||
|
||||
def extract_topwords_tfidf(self, word_topic_mat: np.ndarray, vocab: list[str], labels: np.ndarray, top_n_words: int = 10) -> tuple[dict, dict]:
    """
    Extract the top words for each topic using a class-based tf-idf score.

    Args:
        word_topic_mat (np.ndarray): Word-topic matrix. If the labels contain -1, its first column is taken to be the outlier column and is dropped.
        vocab (list[str]): List of words in the corpus sorted alphabetically.
        labels (np.ndarray): Cluster labels. -1 means outlier. Topic labels are used directly as column indices.
        top_n_words (int, optional): Number of top words to extract per topic.

    Returns:
        tuple: A tuple containing two items:
            - top_words (dict): Dictionary of topics and their top words.
            - top_word_scores (dict): Dictionary of topics and the tf-idf scores of those words.
    """

    if min(labels) == -1:
        word_topic_mat = word_topic_mat[:, 1:]

    with warnings.catch_warnings():
        # Divisions by zero are expected for empty topics / unused words and handled below.
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        tf = word_topic_mat / np.sum(word_topic_mat, axis=0)
        idf = np.log(1 + (word_topic_mat.shape[1] / np.sum(word_topic_mat > 0, axis=1)))
        tfidf = tf * idf[:, np.newaxis]

    # Set the score to zero wherever it is undefined: tf is nan for topics without any words, and
    # a word that occurs in no topic has idf = inf, which makes tf * idf = 0 * inf = nan.
    tfidf[~np.isfinite(tfidf)] = 0

    # extract top words for each topic
    top_words = {}
    top_word_scores = {}
    for topic in np.unique(labels):
        if topic != -1:
            indices = np.argsort(-tfidf[:, topic])[:top_n_words]
            top_words[topic] = [vocab[word_idx] for word_idx in indices]
            top_word_scores[topic] = [tfidf[word_idx, topic] for word_idx in indices]

    return top_words, top_word_scores
|
||||
|
||||
def compute_embedding_similarity_centroids(self, vocab: list[str], vocab_embedding_dict: dict, umap_mapper: umap.UMAP, centroid_dict: dict, reduce_vocab_embeddings: bool = False, reduce_centroid_embeddings: bool = False) -> np.ndarray:
    """
    Compute the cosine similarity of each word in the vocabulary to each centroid.

    Args:
        vocab (list[str]): List of words in the corpus sorted alphabetically.
        vocab_embedding_dict (dict): Dictionary of words and their embeddings.
        umap_mapper (umap.UMAP): UMAP mapper to transform new embeddings in the same way as the document embeddings.
        centroid_dict (dict): Dictionary of cluster labels and their centroids. -1 means outlier.
        reduce_vocab_embeddings (bool, optional): Whether to reduce the vocab embeddings with the UMAP mapper.
        reduce_centroid_embeddings (bool, optional): Whether to reduce the centroid embeddings with the UMAP mapper.

    Returns:
        np.ndarray: Cosine similarity of each word in the vocab to each centroid. Has shape (len(vocab), len(centroid_dict)); column j belongs to the j-th entry of centroid_dict.
    """

    # Take the centroid dimensionality from the centroids themselves: they are still in the
    # original embedding space when reduce_centroid_embeddings is True, so a buffer sized to
    # umap_mapper.n_components would not fit them.
    centroids = list(centroid_dict.values())
    if centroids:
        centroid_arr = np.asarray(centroids, dtype=float)
    else:
        centroid_arr = np.zeros((0, umap_mapper.n_components))
    if reduce_centroid_embeddings:
        centroid_arr = umap_mapper.transform(centroid_arr)

    # normalize rows to unit length
    centroid_arr = centroid_arr / np.linalg.norm(centroid_arr, axis=1).reshape(-1, 1)

    org_embedding_dim = list(vocab_embedding_dict.values())[0].shape[0]
    vocab_arr = np.zeros((len(vocab), org_embedding_dim))
    for i, word in enumerate(vocab):
        vocab_arr[i] = vocab_embedding_dict[word]
    if reduce_vocab_embeddings:
        vocab_arr = umap_mapper.transform(vocab_arr)

    vocab_arr = vocab_arr / np.linalg.norm(vocab_arr, axis=1).reshape(-1, 1)

    # Both sides have unit-length rows, so the dot product is the cosine similarity.
    similarity = vocab_arr @ centroid_arr.T
    return similarity
|
||||
|
||||
def extract_topwords_centroid_similarity(self, word_topic_mat: np.ndarray, vocab: list[str], vocab_embedding_dict: dict, centroid_dict: dict, umap_mapper: umap.UMAP, top_n_words: int = 10, reduce_vocab_embeddings: bool = True, reduce_centroid_embeddings: bool = False, consider_outliers: bool = False) -> tuple[dict, dict]:
    """
    Extract the top words for each cluster by computing the cosine similarity of the words that occur in the corpus to the centroid of the cluster.

    Words are ranked by their cosine similarity to the centroid multiplied by their count in the topic.

    Args:
        word_topic_mat (np.ndarray): Word-topic matrix. If it has more columns than there are centroids, its first column is taken to be the outlier column and is dropped.
        vocab (list[str]): List of words in the corpus sorted alphabetically.
        vocab_embedding_dict (dict): Dictionary of words and their embeddings.
        centroid_dict (dict): Dictionary of cluster labels and their centroids. -1 means outlier.
        umap_mapper (umap.UMAP): UMAP mapper to transform new embeddings in the same way as the document embeddings.
        top_n_words (int, optional): Number of top words to extract per topic.
        reduce_vocab_embeddings (bool, optional): Whether to reduce the vocab embeddings with the UMAP mapper.
        reduce_centroid_embeddings (bool, optional): Whether to reduce the centroid embeddings with the UMAP mapper.
        consider_outliers (bool, optional): Whether to consider outliers when computing the top words. I.e., whether the labels contain -1 to indicate outliers. Currently not used by this method.

    Returns:
        tuple: A tuple containing two items:
            - top_words (dict): Dictionary of topics and their top words.
            - top_word_scores (dict): Dictionary of topics and the cosine similarities of those words to the topic centroid, in the same order as the words.
    """

    similarity_mat = self.compute_embedding_similarity_centroids(vocab, vocab_embedding_dict, umap_mapper, centroid_dict, reduce_vocab_embeddings, reduce_centroid_embeddings)
    top_words = {}
    top_word_scores = {}

    topics = np.unique(list(centroid_dict.keys()))
    # The columns of similarity_mat follow the order of centroid_dict, so look the column up by position.
    centroid_column = {label: position for position, label in enumerate(centroid_dict)}

    if word_topic_mat.shape[1] > len(topics):
        word_topic_mat = word_topic_mat[:, 1:]  # ignore outliers

    for topic in topics:
        if topic == -1:
            continue
        topic_similarity = similarity_mat[:, centroid_column[topic]]
        weighted_similarity = topic_similarity * word_topic_mat[:, topic]
        # One ranking for both outputs, so that every score belongs to the word at the same position.
        top_indices = np.argsort(-weighted_similarity)[:top_n_words]
        top_words[topic] = [vocab[word_idx] for word_idx in top_indices]
        top_word_scores[topic] = [topic_similarity[word_idx] for word_idx in top_indices]

    return top_words, top_word_scores
|
||||
@@ -0,0 +1,217 @@
|
||||
from openai import OpenAI
|
||||
|
||||
import tiktoken
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
|
||||
class GetEmbeddingsOpenAI:
    """
    This class allows to compute embeddings of text using the OpenAI API.
    """

    def __init__(self, client, azure_config: dict = None, embedding_model: str = "text-embedding-ada-002", tokenizer: str = None, max_tokens: int = 8191) -> None:
        """
        Constructor of the class.

        Args:
            client: Client object exposing ``embeddings.create``.
            azure_config (dict, optional): Accepted for interface compatibility; not read by this class.
            embedding_model (str, optional): Name of the embedding model to use.
            tokenizer (str, optional): Name of the tokenizer to use. If None, the tokenizer is looked up from the embedding model name.
            max_tokens (int, optional): Maximum number of tokens to use.

        Note:
            By default, the embedding model "text-embedding-ada-002" is used with the corresponding tokenizer "cl100k_base" and a maximum number of tokens of 8191.
        """
        self.client = client
        self.embedding_model = embedding_model
        self.tokenizer_str = tokenizer
        self.max_tokens = max_tokens

    def _get_tokenizer(self):
        """Return the tiktoken encoding selected by ``tokenizer_str``, or the one registered for the embedding model."""
        if self.tokenizer_str is None:
            return tiktoken.encoding_for_model(self.embedding_model)
        return tiktoken.get_encoding(self.tokenizer_str)

    @staticmethod
    def num_tokens_from_string(string: str, encoding) -> int:
        """
        Returns the number of tokens in a text string.

        Args:
            string (str): Text string to compute the number of tokens.
            encoding: Object with an ``encode`` method that turns the string into tokens.

        Returns:
            int: Number of tokens in the text string.
        """
        return len(encoding.encode(string))

    def compute_number_of_tokens(self, corpus: list[str]) -> int:
        """
        Computes the total number of tokens needed to embed the corpus.

        Args:
            corpus (list[str]): List of strings to embed, where each element in the list is a document.

        Returns:
            int: Total number of tokens needed to embed the corpus.
        """
        tokenizer = self._get_tokenizer()
        return sum(self.num_tokens_from_string(document, tokenizer) for document in tqdm(corpus))

    def split_doc(self, text):
        """
        Splits a single document into consecutive chunks of at most ``max_tokens`` characters.

        Args:
            text (str): The string to be split.

        Returns:
            list[str]: The chunks of the document in their original order. A text shorter than
            ``max_tokens`` characters is returned as a single chunk, and no empty trailing chunk is produced.

        Note:
            The split counts characters, not tokens.
            NOTE(review): for text where one character maps to several tokens, a chunk may still exceed the token limit - confirm against the tokenizer in use.
        """
        step = self.max_tokens
        chunks = [text[start:start + step] for start in range(0, len(text), step)]
        # an empty text yields no slices; keep it as one (empty) chunk so the document is not lost
        return chunks or [text]

    def split_long_docs(self, text: list[str]) -> list[list[str]]:
        """
        Splits all documents that are longer than the maximum number of tokens into a list of smaller documents.

        Args:
            text (list[str]): List of strings to embed, where each element in the list is a document.

        Returns:
            list[list[str]]: One list of chunks per document, in corpus order. Documents within the token limit become a one-element list.
        """
        tokenizer = self._get_tokenizer()

        split_text = []
        for document in tqdm(text):
            if self.num_tokens_from_string(document, tokenizer) > self.max_tokens:
                split_text.append(self.split_doc(document))
            else:
                split_text.append([document])
        return split_text

    def make_api_call(self, text: str):
        """
        Makes an API call to the OpenAI API to embed a text string.

        Args:
            text (str): The string to embed.

        Returns:
            API response: The response from the API.
        """
        return self.client.embeddings.create(input = [text], model = self.embedding_model)

    def get_embeddings_doc_split(self, corpus: list[list[str]], n_tries=3) -> list[dict]:
        """
        Computes the embeddings of a corpus for split documents.

        Each chunk is embedded separately; the document embedding is the mean of the
        embeddings of the chunks that could be embedded.

        Args:
            corpus (list[list[str]]): List of documents, each represented by a list of its chunks.
            n_tries (int, optional): Number of retries after a failed API call, i.e. up to ``n_tries + 1`` calls are made per chunk (default is 3).

        Returns:
            list[dict]: One dictionary per document with the keys "embedding" (mean chunk embedding),
            "text" (the chunks joined with a space) and "errors" (one entry per chunk: None or the last exception).
        """
        api_res_list = []
        for doc_idx, chunk_lis in enumerate(tqdm(corpus)):
            api_res_doc = []
            for chunk_n, chunk in enumerate(chunk_lis):
                # separate counter name so the message below reports the document index, not the attempt number
                for attempt in range(n_tries + 1):
                    try:
                        api_res = self.make_api_call(chunk)
                    except Exception as e:
                        print(f"Error {e} occured for chunk {chunk_n} of document {doc_idx}")
                        print(chunk)
                        if attempt == n_tries:
                            print("Maximum number of tries reached. Skipping chunk.")
                            api_res_doc.append({"api_res": None, "error": e})
                        else:
                            print("Trying again.")
                    else:
                        api_res_doc.append({"api_res": api_res, "error": None})
                        break

            # average the embeddings of the chunks
            emb_lis = [
                np.array(api_res["api_res"].data[0].embedding)
                for api_res in api_res_doc
                if api_res["api_res"] is not None
            ]
            text = " ".join(chunk_lis)
            # NOTE(review): if every chunk of a document failed, emb_lis is empty and np.mean yields nan
            embedding = np.mean(emb_lis, axis = 0)
            api_res_list.append(
                {"embedding": embedding,
                 "text": text,
                 "errors": [api_res["error"] for api_res in api_res_doc]}
            )
        return api_res_list

    def convert_api_res_list(self, api_res_list: list[dict]) -> dict:
        """
        Converts the api_res list into a dictionary containing the embeddings as a matrix and the corpus as a list of strings.

        Args:
            api_res_list (list[dict]): List of dictionaries with the keys "embedding", "text" and "errors", one per document.

        Returns:
            dict: Dictionary with the keys "embeddings" (array built from all document embeddings),
            "corpus" (list of document texts) and "errors" (list of per-document error lists).
        """
        embeddings = np.array([api_res["embedding"] for api_res in api_res_list])
        corpus = [api_res["text"] for api_res in api_res_list]
        errors = [api_res["errors"] for api_res in api_res_list]
        return {"embeddings": embeddings, "corpus": corpus, "errors": errors}

    def get_embeddings(self, corpus: list[str]) -> dict:
        """
        Computes the embeddings of a corpus.

        Args:
            corpus (list[str]): List of strings to embed, where each element in the list is a document.

        Returns:
            dict: A dictionary containing the embeddings as a matrix and the corpus as a list of strings
            (see convert_api_res_list). The per-document results are also kept in ``self.corpus_emb``.
        """
        corpus_split = self.split_long_docs(corpus)
        corpus_emb = self.get_embeddings_doc_split(corpus_split)
        self.corpus_emb = corpus_emb
        return self.convert_api_res_list(corpus_emb)
|
||||
@@ -0,0 +1,137 @@
|
||||
from topicgpt.TopicRepresentation import Topic
|
||||
|
||||
import unittest
|
||||
from sklearn.datasets import fetch_20newsgroups
|
||||
|
||||
from topicgpt.TopicGPT import TopicGPT
|
||||
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
class QuickestTopicGPT_prompting(unittest.TestCase):
    """
    This class is used to mainly test the prompting functionality of the TopicGPT class.

    All tests share the single fitted model ``cls.tm``; the split/combine tests modify its topic list in place.
    """

    @classmethod
    def setUpClass(cls, sample_size: int = 500):
        """
        Download the necessary data, only keep a sample of it and fit a one-topic model on the sample.

        The OpenAI key is read from the module-level ``api_key`` that the ``__main__`` section
        sets from the ``--api-key`` command line argument.

        params:
            sample_size: the number of documents to use for the test
        """
        data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))  # download the 20 Newsgroups dataset
        # drop empty documents, then keep only the first sample_size documents
        corpus = [doc for doc in data['data'] if doc != ""]
        cls.corpus = corpus[:sample_size]

        # TopicGPT takes the API key itself and builds its own client
        cls.tm = TopicGPT(api_key = api_key, n_topics = 1)
        cls.tm.fit(cls.corpus)

    def test_repr_topics(self):
        """
        test the repr_topics function of the TopicGPT class
        """
        print("Testing repr_topics...")
        self.assertIsInstance(self.tm.repr_topics(), str)

    def test_promt_knn_search(self):
        """
        test the prompt function that calls knn_search of the TopicPrompting class
        """
        print("Testing ppromt_knn_search...")

        prompt_lis = ["Is topic 0 about Bananas? Use knn Search",
                      "Is topic 0 about Space? Use knn Search"]

        for prompt in prompt_lis:
            answer, function_result = self.tm.prompt(prompt)

            print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'")

            self.assertIsInstance(answer, str)
            self.assertIsInstance(function_result[0], list)
            self.assertIsInstance(function_result[1], list)
            self.assertIsInstance(function_result[0][0], str)
            self.assertIsInstance(function_result[1][0], int)

    def test_prompt_split_topic_kmeans_inplace(self):
        """
        test the prompt function that calls split_topic_kmeans of the TopicPrompting class
        """
        print("Testing ppromt_split_topic_kmeans...")

        prompt_lis = ["Split topic 0 into 2 subtopics using kmeans. Do this inplace"]
        added_topic_lis_len = [2]

        old_number_of_topics = len(self.tm.topic_lis)

        for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len):
            answer, function_result = self.tm.prompt(prompt)

            print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'")
            print("function_result: ", function_result)

            self.assertIsInstance(answer, str)
            self.assertIsInstance(function_result, list)
            self.assertIsInstance(function_result[0], Topic)

            # the split topic is replaced by its subtopics, hence the -1
            self.assertEqual(len(self.tm.topic_lis), old_number_of_topics + added_topic_len - 1)
            self.assertEqual(self.tm.topic_lis, function_result)

    def test_prompt_combine_topics_inplace(self):
        """
        test the prompt function that calls combine_topics of the TopicPrompting class
        """
        print("Testing ppromt_combine_topics...")

        prompt_lis = ["Combine topic 0 and topic 1 into one topic. Do this inplace"]

        # split topic first so that there are at least two topics to combine
        self.tm.prompt("Please split topic 0 into two subtopic. Do this inplace.")

        old_number_topics = len(self.tm.topic_lis)

        for prompt in prompt_lis:
            answer, function_result = self.tm.prompt(prompt)

            print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'")
            print("function_result: ", function_result)
            print("topic_gpt_topic_list: ", self.tm.topic_lis)

            self.assertIsInstance(answer, str)
            self.assertIsInstance(function_result, list)
            self.assertIsInstance(function_result[0], Topic)
            self.assertEqual(self.tm.topic_lis, function_result)
            self.assertEqual(len(self.tm.topic_lis), old_number_topics - 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # unittest.main() parses sys.argv itself, so the custom --api-key flag and its
    # value are removed from sys.argv before it runs.
    api_key = None  # stays None when the flag is missing, so the check below can report it
    for i, arg in enumerate(sys.argv):
        if arg == "--api-key":
            # only read a value if one actually follows the flag
            if i + 1 < len(sys.argv):
                api_key = sys.argv.pop(i + 1)
            sys.argv.pop(i)
            break

    if api_key is None:
        print("API key must be provided with --api-key")
        sys.exit(1)

    unittest.main()
|
||||
@@ -0,0 +1,120 @@
|
||||
from topicgpt.TopicRepresentation import Topic
|
||||
|
||||
import unittest
|
||||
from sklearn.datasets import fetch_20newsgroups
|
||||
|
||||
from topicgpt.TopicGPT import TopicGPT
|
||||
|
||||
|
||||
class QuickTestTopicGPT_init_and_fit(unittest.TestCase):
    """
    Run some basic tests on TopicGPT that do not require any saved data
    """

    @classmethod
    def setUpClass(cls, sample_size: int = 500):
        """
        Download the necessary data and only keep a sample of it.

        params:
            sample_size: the number of documents to use for the test
        """
        data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))  # download the 20 Newsgroups dataset
        # drop empty documents, then keep only the first sample_size documents
        corpus = [doc for doc in data['data'] if doc != ""]
        cls.corpus = corpus[:sample_size]

    def setUp(self):
        # module-level api_key is set by the __main__ section from the --api-key argument
        self.api_key_openai = api_key

    def test_init(self):
        """
        test the init function of the TopicGPT class
        """
        print("Testing init...")
        topicgpt = TopicGPT(api_key = self.api_key_openai)
        self.assertIsInstance(topicgpt, TopicGPT)

        topicgpt = TopicGPT(api_key = self.api_key_openai,
                            n_topics= 20)
        self.assertIsInstance(topicgpt, TopicGPT)

        topicgpt = TopicGPT(api_key = self.api_key_openai,
                            n_topics= 20,
                            corpus_instruction="This is a corpus instruction")
        self.assertIsInstance(topicgpt, TopicGPT)

        # check if assertions are triggered

        # missing API key
        with self.assertRaises(AssertionError):
            topicgpt = TopicGPT(api_key = None,
                                n_topics= 32,
                                openai_prompting_model="gpt-4",
                                max_number_of_tokens=8000,
                                corpus_instruction="This is a corpus instruction")

        # non-positive number of topics
        with self.assertRaises(AssertionError):
            topicgpt = TopicGPT(api_key = self.api_key_openai,
                                n_topics= 0,
                                max_number_of_tokens=8000,
                                corpus_instruction="This is a corpus instruction")

        # non-positive token limit
        with self.assertRaises(AssertionError):
            topicgpt = TopicGPT(api_key = self.api_key_openai,
                                n_topics= 20,
                                max_number_of_tokens=0,
                                corpus_instruction="This is a corpus instruction")

    def test_fit(self):
        """
        test the fit function of the TopicGPT class
        """
        print("Testing fit...")

        def instance_test(topicgpt):
            """Fit the given model on the sample corpus and check the attributes that fit sets."""
            topicgpt.fit(self.corpus)

            self.assertTrue(hasattr(topicgpt, "vocab"))
            self.assertTrue(hasattr(topicgpt, "topic_lis"))

            self.assertIsInstance(topicgpt.vocab, list)
            self.assertIsInstance(topicgpt.vocab[0], str)

            self.assertIsInstance(topicgpt.topic_lis, list)
            self.assertIsInstance(topicgpt.topic_lis[0], Topic)

            if topicgpt.n_topics is not None:
                self.assertEqual(len(topicgpt.topic_lis), topicgpt.n_topics)

            self.assertEqual(topicgpt.topic_lis, topicgpt.topic_prompting.topic_lis)
            self.assertEqual(topicgpt.vocab, topicgpt.topic_prompting.vocab)
            # fit hands the very same dict to topic_prompting; an identity check avoids
            # comparing the embedding arrays element-wise
            self.assertIs(topicgpt.vocab_embeddings, topicgpt.topic_prompting.vocab_embeddings)

        topicgpt1 = TopicGPT(api_key = self.api_key_openai, n_topics = 1)

        topic_gpt_list = [topicgpt1]

        for topic_gpt in topic_gpt_list:
            instance_test(topic_gpt)
|
||||
|
||||
|
||||
import sys
|
||||
|
||||
if __name__ == "__main__":
    # unittest.main() parses sys.argv itself, so the custom --api-key flag and its
    # value are removed from sys.argv before it runs.
    api_key = None  # stays None when the flag is missing, so the check below can report it
    for i, arg in enumerate(sys.argv):
        if arg == "--api-key":
            # only read a value if one actually follows the flag
            if i + 1 < len(sys.argv):
                api_key = sys.argv.pop(i + 1)
            sys.argv.pop(i)
            break

    if api_key is None:
        print("API key must be provided with --api-key")
        sys.exit(1)

    unittest.main()
|
||||
@@ -0,0 +1,378 @@
|
||||
import numpy as np
|
||||
import os
|
||||
import pickle
|
||||
# make sure the import works even if the package has not been installed and just the files are used
|
||||
from topicgpt.Clustering import Clustering_and_DimRed
|
||||
from topicgpt.ExtractTopWords import ExtractTopWords
|
||||
from topicgpt.TopwordEnhancement import TopwordEnhancement
|
||||
from topicgpt.GetEmbeddingsOpenAI import GetEmbeddingsOpenAI
|
||||
from topicgpt.TopicPrompting import TopicPrompting
|
||||
from topicgpt.TopicRepresentation import Topic
|
||||
from topicgpt.Client import Client
|
||||
import topicgpt.TopicRepresentation as TopicRepresentation
|
||||
|
||||
|
||||
# Default location of the pickle file holding (document_embeddings, vocab_embeddings);
# used as the default value of TopicGPT's path_saved_embeddings argument.
embeddings_path= "SavedEmbeddings/embeddings.pkl" #global variable for the path to the embeddings
|
||||
|
||||
class TopicGPT:
|
||||
"""
|
||||
This is the main class for doing topic modelling with TopicGPT.
|
||||
"""
|
||||
|
||||
def __init__(self,
             api_key: str = "",
             azure_endpoint: dict = None,
             n_topics: int = None,
             openai_prompting_model: str = "gpt-3.5-turbo-16k",
             max_number_of_tokens: int = 16384,
             corpus_instruction: str = "",
             document_embeddings: np.ndarray = None,
             vocab_embeddings: dict[str, np.ndarray] = None,
             embedding_model: str = "text-embedding-ada-002",
             max_number_of_tokens_embedding: int = 8191,
             use_saved_embeddings: bool = True,
             path_saved_embeddings: str = embeddings_path,
             clusterer: Clustering_and_DimRed = None,
             n_topwords: int = 2000,
             n_topwords_description: int = 500,
             topword_extraction_methods: list[str] = None,
             compute_vocab_hyperparams: dict = None,
             enhancer: TopwordEnhancement = None,
             topic_prompting: TopicPrompting = None,
             verbose: bool = True) -> None:
    """
    Initializes the main class for conducting topic modeling with TopicGPT.

    Args:
        api_key (str): Your OpenAI API key. Obtain this key from https://beta.openai.com/account/api-keys.
        azure_endpoint (dict, optional): If given and non-empty, an Azure OpenAI client is created from its 'api_version' and 'endpoint' entries; otherwise the regular OpenAI client is used.
        n_topics (int, optional): Number of topics to discover. If None, the Hdbscan algorithm (https://pypi.org/project/hdbscan/) is used to determine the number of topics automatically. Otherwise, agglomerative clustering is used. Note that with insufficient data, fewer topics may be found than specified.
        openai_prompting_model (str, optional): Model provided by OpenAI for topic description and prompts. Refer to https://platform.openai.com/docs/models for available models.
        max_number_of_tokens (int, optional): Maximum number of tokens to use for the OpenAI API.
        corpus_instruction (str, optional): Additional information about the corpus, if available, to benefit the model.
        document_embeddings (np.ndarray, optional): Document embeddings for the corpus. If None, they will be computed using the OpenAI API.
        vocab_embeddings (dict[str, np.ndarray], optional): Vocabulary embeddings for the corpus in a dictionary format where keys are words and values are embeddings. If None, they will be computed using the OpenAI API.
        embedding_model (str, optional): Name of the embedding model to use. See https://beta.openai.com/docs/api-reference/text-embedding for available models.
        max_number_of_tokens_embedding (int, optional): Maximum number of tokens to use for the OpenAI API when computing embeddings.
        use_saved_embeddings (bool, optional): Whether to use saved embeddings. If True and the file path_saved_embeddings exists, document and vocabulary embeddings are loaded from it and replace the ones passed as arguments.
        path_saved_embeddings (str, optional): Path to the saved embeddings file (default 'SavedEmbeddings/embeddings.pkl').
        clusterer (Clustering_and_DimRed, optional): Clustering and dimensionality reduction object. If None, a clustering object with default parameters is used. If given, the number of topics specified in the clusterer overwrites the n_topics argument.
        n_topwords (int, optional): Number of top words to extract and save for each topic. Note that fewer top words might be used later.
        n_topwords_description (int, optional): Number of top words to provide to the LLM (Language Model) to describe the topic.
        topword_extraction_methods (list[str], optional): List of methods for extracting top words. Available methods include "tfidf", "cosine_similarity", and "topword_enhancement". Defaults to ["tfidf", "cosine_similarity"].
        compute_vocab_hyperparams (dict, optional): Hyperparameters for computing the vocabulary. The dict is copied; a "verbose" entry is added to the copy.
        enhancer (TopwordEnhancement, optional): Topword enhancement object. Used for describing topics. If None, a topword enhancement object with default parameters is used.
        topic_prompting (TopicPrompting, optional): Topic prompting object for formulating prompts. If None, a topic prompting object with default parameters is used.
        verbose (bool, optional): Whether to print detailed information about the process. This can be overridden by arguments in passed objects.

    Raises:
        AssertionError: If one of the argument checks below fails.
    """

    # fresh objects per instance instead of mutable default arguments shared between calls
    if topword_extraction_methods is None:
        topword_extraction_methods = ["tfidf", "cosine_similarity"]
    # copy, because "verbose" is written into this dict further down and must not leak
    # into the caller's dict or into other instances
    compute_vocab_hyperparams = dict(compute_vocab_hyperparams or {})

    # Do some checks on the input arguments
    assert api_key is not None, "You need to provide an OpenAI API key."
    assert n_topics is None or n_topics > 0, "The number of topics needs to be a positive integer."
    assert max_number_of_tokens > 0, "The maximum number of tokens needs to be a positive integer."
    assert max_number_of_tokens_embedding > 0, "The maximum number of tokens for the embedding model needs to be a positive integer."
    assert n_topwords > 0, "The number of top words needs to be a positive integer."
    assert n_topwords_description > 0, "The number of top words for the topic description needs to be a positive integer."
    assert len(topword_extraction_methods) > 0, "You need to provide at least one topword extraction method."
    assert n_topwords_description <= n_topwords, "The number of top words for the topic description needs to be smaller or equal to the number of top words."

    self.client = Client(api_key = api_key, azure_endpoint = azure_endpoint)

    self.n_topics = n_topics
    self.openai_prompting_model = openai_prompting_model
    self.max_number_of_tokens = max_number_of_tokens
    self.corpus_instruction = corpus_instruction
    self.document_embeddings = document_embeddings
    self.vocab_embeddings = vocab_embeddings
    self.embedding_model = embedding_model
    self.max_number_of_tokens_embedding = max_number_of_tokens_embedding
    self.embedder = GetEmbeddingsOpenAI(client = self.client, embedding_model = self.embedding_model, max_tokens = self.max_number_of_tokens_embedding)
    self.clusterer = clusterer
    self.n_topwords = n_topwords
    self.n_topwords_description = n_topwords_description
    self.topword_extraction_methods = topword_extraction_methods
    self.compute_vocab_hyperparams = compute_vocab_hyperparams
    self.enhancer = enhancer
    self.topic_prompting = topic_prompting
    self.use_saved_embeddings = use_saved_embeddings
    self.path_saved_embeddings = path_saved_embeddings
    self.verbose = verbose

    # state filled in by fit / extract_topics; defined here so the "is None" checks in
    # other methods work before fit has been called
    self.corpus = None
    self.vocab = None
    self.topic_lis = None

    self.compute_vocab_hyperparams["verbose"] = self.verbose

    # if embeddings have already been downloaded to the folder SavedEmbeddings, then load them
    if self.use_saved_embeddings and os.path.exists(path_saved_embeddings):
        # SECURITY NOTE: pickle.load executes code embedded in the file; only load embedding files from a trusted source
        with open(path_saved_embeddings, "rb") as f:
            self.document_embeddings, self.vocab_embeddings = pickle.load(f)

    for elem in topword_extraction_methods:
        assert elem in ["tfidf", "cosine_similarity", "topword_enhancement"], "Invalid topword extraction method. Valid methods are 'tfidf', 'cosine_similarity', and 'topword_enhancement'."

    if clusterer is None:
        self.clusterer = Clustering_and_DimRed(number_clusters_hdbscan = self.n_topics, verbose = self.verbose)
    else:
        # a user-supplied clusterer decides the number of topics
        self.n_topics = clusterer.number_clusters_hdbscan

    if enhancer is None:
        self.enhancer = TopwordEnhancement(client = self.client, openai_model = self.openai_prompting_model, max_context_length = self.max_number_of_tokens, corpus_instruction = self.corpus_instruction)

    if topic_prompting is None:
        # NOTE(review): the prompting context length is fixed at 16000 rather than taken from max_number_of_tokens - confirm this is intended
        self.topic_prompting = TopicPrompting(topic_lis = [], client = self.client, openai_prompting_model = self.openai_prompting_model, max_context_length_promting = 16000, enhancer = self.enhancer, openai_embedding_model = self.embedding_model, max_context_length_embedding = self.max_number_of_tokens_embedding, corpus_instruction = corpus_instruction)

    self.extractor = ExtractTopWords()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
repr = "TopicGPT object with the following parameters:\n"
|
||||
repr += "-"*150 + "\n"
|
||||
repr += "n_topics: " + str(self.n_topics) + "\n"
|
||||
repr += "openai_prompting_model: " + self.openai_prompting_model + "\n"
|
||||
repr += "max_number_of_tokens: " + str(self.max_number_of_tokens) + "\n"
|
||||
repr += "corpus_instruction: " + self.corpus_instruction + "\n"
|
||||
repr += "embedding_model: " + self.embedding_model + "\n"
|
||||
repr += "clusterer: " + str(self.clusterer) + "\n"
|
||||
repr += "n_topwords: " + str(self.n_topwords) + "\n"
|
||||
repr += "n_topwords_description: " + str(self.n_topwords_description) + "\n"
|
||||
repr += "topword_extraction_methods: " + str(self.topword_extraction_methods) + "\n"
|
||||
repr += "compute_vocab_hyperparams: " + str(self.compute_vocab_hyperparams) + "\n"
|
||||
repr += "enhancer: " + str(self.enhancer) + "\n"
|
||||
repr += "topic_prompting: " + str(self.topic_prompting) + "\n"
|
||||
|
||||
return repr
|
||||
|
||||
def compute_embeddings(self, corpus: list[str]) -> tuple[np.ndarray, dict[str, np.ndarray]]:
|
||||
"""
|
||||
Computes document and vocabulary embeddings for the given corpus.
|
||||
|
||||
Args:
|
||||
corpus (list[str]): List of strings to embed, where each element is a document.
|
||||
|
||||
Returns:
|
||||
tuple: A tuple containing two items:
|
||||
- document_embeddings (np.ndarray): Document embeddings for the corpus, with shape (len(corpus), n_embedding_dimensions).
|
||||
- vocab_embeddings (dict[str, np.ndarray]): Vocabulary embeddings for the corpus, provided as a dictionary where keys are words and values are embeddings.
|
||||
"""
|
||||
|
||||
|
||||
self.document_embeddings = self.embedder.get_embeddings(corpus)["embeddings"]
|
||||
|
||||
self.vocab_embeddings = self.extractor.embed_vocab_openAI(self.client, self.vocab, embedder = self.embedder)
|
||||
|
||||
return self.document_embeddings, self.vocab_embeddings
|
||||
|
||||
def extract_topics(self, corpus: list[str]) -> list[Topic]:
    """
    Extracts topics from the given corpus.

    Requires document and vocabulary embeddings to be available. If no vocabulary has been
    computed yet, it is computed from ``corpus``. The result is also stored in ``self.topic_lis``.

    Args:
        corpus (list[str]): List of strings to process, where each element represents a document.

    Returns:
        list[Topic]: A list of Topic objects representing the extracted topics.

    Raises:
        AssertionError: If the document or vocabulary embeddings are missing.
    """

    assert self.document_embeddings is not None and self.vocab_embeddings is not None, "You need to compute the embeddings first."

    # getattr: the attribute may not exist yet when this method is called without a prior fit
    if getattr(self, "vocab", None) is None:
        # use the corpus passed in, not self.corpus, which is only set by fit
        self.vocab = self.extractor.compute_corpus_vocab(corpus, **self.compute_vocab_hyperparams)

    self.topic_lis = TopicRepresentation.extract_topics_no_new_vocab_computation(
        corpus = corpus,
        vocab = self.vocab,
        document_embeddings = self.document_embeddings,
        clusterer = self.clusterer,
        vocab_embeddings = self.vocab_embeddings,
        n_topwords = self.n_topwords,
        topword_extraction_methods = self.topword_extraction_methods,
        consider_outliers = True
    )

    return self.topic_lis
|
||||
|
||||
def describe_topics(self, topics: list[Topic]) -> list[Topic]:
    """
    Names and describes the provided topics using the enhancer.

    The top words used for the description are the "cosine_similarity" ones if that
    extraction method is configured, otherwise the "tfidf" ones. The result is also
    stored in ``self.topic_lis``.

    Args:
        topics (list[Topic]): List of Topic objects to be named and described.

    Returns:
        list[Topic]: A list of Topic objects with names and descriptions.

    Raises:
        ValueError: If neither "cosine_similarity" nor "tfidf" is among the configured extraction methods.
    """

    assert self.topic_lis is not None, "You need to extract the topics first."

    # preference order: cosine_similarity first, tfidf as fallback
    topword_method = next(
        (candidate for candidate in ("cosine_similarity", "tfidf") if candidate in self.topword_extraction_methods),
        None,
    )
    if topword_method is None:
        raise ValueError("You need to use either 'cosine_similarity' or 'tfidf' as topword extraction method.")

    described = TopicRepresentation.describe_and_name_topics(
        topics = topics,
        enhancer = self.enhancer,
        topword_method= topword_method,
        n_words = self.n_topwords_description
    )
    self.topic_lis = described

    return self.topic_lis
|
||||
|
||||
def fit(self, corpus: list[str], verbose: bool = True):
|
||||
"""
|
||||
Compute embeddings if necessary, extract topics, and describe them.
|
||||
|
||||
Args:
|
||||
corpus (list[str]): List of strings to embed, where each element represents a document.
|
||||
verbose (bool, optional): Whether to print the progress and details of the process.
|
||||
"""
|
||||
|
||||
self.corpus = corpus
|
||||
|
||||
# remove empty documents
|
||||
len_before_removing = len(self.corpus)
|
||||
while '' in self.corpus:
|
||||
corpus.remove('')
|
||||
len_after_removing = len(self.corpus)
|
||||
if verbose:
|
||||
print("Removed " + str(len_before_removing - len_after_removing) + " empty documents.")
|
||||
|
||||
if self.vocab_embeddings is None:
|
||||
if verbose:
|
||||
print("Computing vocabulary...")
|
||||
|
||||
self.vocab = self.extractor.compute_corpus_vocab(self.corpus, **self.compute_vocab_hyperparams)
|
||||
else:
|
||||
print('Vocab already computed')
|
||||
self.vocab = list(self.vocab_embeddings.keys())
|
||||
|
||||
if self.vocab_embeddings is None or self.document_embeddings is None:
|
||||
if verbose:
|
||||
print("Computing embeddings...")
|
||||
self.compute_embeddings(corpus = self.corpus)
|
||||
else:
|
||||
print('Embeddings already computed')
|
||||
if verbose:
|
||||
print("Extracting topics...")
|
||||
self.topic_lis = self.extract_topics(corpus = self.corpus)
|
||||
|
||||
if verbose:
|
||||
print("Describing topics...")
|
||||
self.topic_lis = self.describe_topics(topics = self.topic_lis)
|
||||
|
||||
self.topic_prompting.topic_lis = self.topic_lis
|
||||
self.topic_prompting.vocab_embeddings = self.vocab_embeddings
|
||||
self.topic_prompting.vocab = self.vocab
|
||||
|
||||
def visualize_clusters(self):
|
||||
"""
|
||||
Visualizes the identified clusters representing the topics in a scatterplot.
|
||||
"""
|
||||
|
||||
assert self.topic_lis is not None, "You need to extract the topics first."
|
||||
|
||||
all_document_embeddings = np.concatenate([topic.document_embeddings_hd for topic in self.topic_lis], axis = 0)
|
||||
all_texts = np.concatenate([topic.documents for topic in self.topic_lis], axis = 0)
|
||||
all_document_indices = np.concatenate([np.repeat(i, topic.document_embeddings_hd.shape[0]) for i, topic in enumerate(self.topic_lis)], axis = 0)
|
||||
class_names = [str(topic) for topic in self.topic_lis]
|
||||
|
||||
self.clusterer.visualize_clusters_dynamic(all_document_embeddings, all_document_indices, all_texts, class_names)
|
||||
|
||||
def repr_topics(self) -> str:
    """
    Build a human-readable summary of all topics.

    For every topic the summary contains its string form, its description and
    its first ten top-words, followed by a separator line. Top-words come from
    "cosine_similarity" when that method was used, otherwise from "tfidf".

    Returns:
        str: The concatenated summary of all topics.

    Raises:
        ValueError: If neither "cosine_similarity" nor "tfidf" is among the
            configured top-word extraction methods.
    """
    topics = self.topic_lis
    assert topics is not None, "You need to extract the topics first."

    # "cosine_similarity" takes precedence over "tfidf"
    for candidate in ("cosine_similarity", "tfidf"):
        if candidate in self.topword_extraction_methods:
            topword_method = candidate
            break
    else:
        raise ValueError("You need to use either 'cosine_similarity' or 'tfidf' as topword extraction method.")

    separator = "-"*150 + "\n"
    pieces = []
    for topic in topics:
        pieces.append(str(topic) + "\n")
        pieces.append("Topic_description: " + topic.topic_description + "\n")
        pieces.append("Top words: " + str(topic.top_words[topword_method][:10]) + "\n")
        pieces.append("\n")
        pieces.append(separator)

    return "".join(pieces)
|
||||
|
||||
def print_topics(self):
    """
    Write the topic summary produced by ``repr_topics`` to standard output.
    """
    summary = self.repr_topics()
    print(summary)
|
||||
|
||||
def prompt(self, query: str) -> tuple[str, object]:
    """
    Send a query to the model through the topic-prompting helper.

    After the call the top-word dictionaries of the prompting helper are
    repaired via ``_fix_dictionary_topwords`` and ``self.topic_lis`` is
    refreshed from the helper's topic list.

    Args:
        query (str): The query to prompt the model with.

    Returns:
        tuple: A tuple containing two items:
            - answer (str): The message content of the last model response.
            - function_result (object): The result of the function call.

    Note:
        Please refer to the TopicPrompting class for more details on available functions for prompting the model.
    """
    prompter = self.topic_prompting
    response = prompter.general_prompt(query)

    # response[0] holds the model responses, the last one carries the final answer
    last_response = response[0][-1]
    answer = last_response.choices[0].message.content
    function_result = response[1]

    prompter._fix_dictionary_topwords()
    self.topic_lis = prompter.topic_lis

    return answer, function_result
|
||||
|
||||
def pprompt(self, query: str, return_function_result: bool = True) -> object:
    """
    Prompt the model with the given query and print its answer.

    Args:
        query (str): The query to prompt the model with.
        return_function_result (bool, optional): Whether to return the result of the function call by the Language Model (LLM).

    Returns:
        object: The result of the function call if return_function_result is True, otherwise None.
    """
    answer, function_result = self.prompt(query)
    print(answer)

    if not return_function_result:
        return None
    return function_result
|
||||
|
||||
def save_embeddings(self, path: str = embeddings_path) -> None:
    """
    Saves the document and vocabulary embeddings to a pickle file for later re-use.

    The file contains the list ``[document_embeddings, vocab_embeddings]``.

    Args:
        path (str, optional): The path to save the embeddings to. Defaults to embeddings_path.
    """

    assert self.document_embeddings is not None and self.vocab_embeddings is not None, "You need to compute the embeddings first."

    # Always create the "SavedEmbeddings" directory, as before;
    # presumably the default path lives there (not visible from here).
    os.makedirs("SavedEmbeddings", exist_ok = True)

    # A custom path may point somewhere else; make sure its directory exists
    # so that opening the file does not fail.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok = True)

    with open(path, "wb") as f:
        pickle.dump([self.document_embeddings, self.vocab_embeddings], f)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,664 @@
|
||||
import numpy as np
|
||||
import umap
|
||||
import sys
|
||||
import os
|
||||
import inspect
|
||||
from tqdm import tqdm
|
||||
import umap
|
||||
import json
|
||||
|
||||
# make sure the import works even if the package has not been installed and just the files are used
|
||||
|
||||
from topicgpt.Clustering import Clustering_and_DimRed
|
||||
from topicgpt.ExtractTopWords import ExtractTopWords
|
||||
from topicgpt.TopwordEnhancement import TopwordEnhancement
|
||||
|
||||
class Topic:
    """
    Represents a topic and all its attributes.

    ``topic_name`` and ``topic_description`` start out as ``None`` and are
    filled in later via ``set_topic_name`` and ``set_topic_description``.
    """

    def __init__(self,
                 topic_idx: str,
                 documents: list[str],
                 words: dict[str, int],
                 centroid_hd: np.ndarray = None,
                 centroid_ld: np.ndarray = None,
                 document_embeddings_hd: np.ndarray = None,
                 document_embeddings_ld: np.ndarray = None,
                 document_embedding_similarity: np.ndarray = None,
                 umap_mapper: umap.UMAP = None,
                 top_words: dict[str, list[str]] = None,
                 top_word_scores: dict[str, list[float]] = None
                 ) -> None:
        """
        Represents a topic and all its attributes.

        Args:
            topic_idx (str): Index or name of the topic.
            documents (list[str]): List of documents in the topic.
            words (dict[str, int]): Dictionary of words and their counts in the topic.
            centroid_hd (np.ndarray, optional): Centroid of the topic in high-dimensional space.
            centroid_ld (np.ndarray, optional): Centroid of the topic in low-dimensional space.
            document_embeddings_hd (np.ndarray, optional): Embeddings of documents in high-dimensional space that belong to this topic.
            document_embeddings_ld (np.ndarray, optional): Embeddings of documents in low-dimensional space that belong to this topic.
            document_embedding_similarity (np.ndarray, optional): Similarity array of document embeddings to the centroid in low-dimensional space.
            umap_mapper (umap.UMAP, optional): UMAP mapper object to map from high-dimensional space to low-dimensional space.
            top_words (dict[str, list[str]], optional): Dictionary of top words in the topic according to different metrics.
            top_word_scores (dict[str, list[float]], optional): Dictionary of how representative the top words are according to different metrics.

        Raises:
            AssertionError: If a supplied per-document array does not have one
                entry per document, or if ``documents`` or ``words`` is empty.
        """

        # do some checks on the input
        # The per-document arrays are optional, so only those that were
        # actually supplied can be compared against the number of documents.
        length_message = "documents, document_embeddings_hd, document_embeddings_ld and document_embedding_similarity must have the same length"
        for per_document_values in (document_embeddings_hd, document_embeddings_ld, document_embedding_similarity):
            if per_document_values is not None:
                assert len(per_document_values) == len(documents), length_message
        assert len(documents) > 0, "documents must not be empty"
        assert len(words) > 0, "words must not be empty"

        self.topic_idx = topic_idx
        self.documents = documents
        self.words = words
        self.centroid_hd = centroid_hd
        self.centroid_ld = centroid_ld
        self.document_embeddings_hd = document_embeddings_hd
        self.document_embeddings_ld = document_embeddings_ld
        self.document_embedding_similarity = document_embedding_similarity
        self.umap_mapper = umap_mapper
        self.top_words = top_words
        self.top_word_scores = top_word_scores

        self.topic_name = None  # initialize the name of the topic as none
        # Initialized as well so that to_json/to_dict work before a
        # description has been set.
        self.topic_description = None

    def __str__(self) -> str:
        """
        Return "Topic: <idx>" for an unnamed topic and "Topic <idx>: <name>"
        otherwise, each terminated by a newline.
        """
        if self.topic_name is None:
            return f"Topic: {self.topic_idx}\n"
        return f"Topic {self.topic_idx}: {self.topic_name}\n"

    def __repr__(self) -> str:
        return self.__str__()

    def to_json(self) -> str:
        """
        Return a JSON representation of the topic.

        Returns:
            str: JSON object (indented by 4) with the keys "topic_idx",
                "topic_name" and "topic_description".
        """
        repr_dict = {
            "topic_idx": self.topic_idx,
            "topic_name": self.topic_name,
            "topic_description": self.topic_description
        }

        return json.dumps(repr_dict, indent = 4)

    def to_dict(self) -> dict:
        """
        Return a dict representation of the topic.

        Unlike ``to_json`` the topic index is converted with ``int``.

        Returns:
            dict: Dictionary with the keys "topic_idx", "topic_name" and
                "topic_description".
        """
        return {
            "topic_idx": int(self.topic_idx),
            "topic_name": self.topic_name,
            "topic_description": self.topic_description
        }

    def set_topic_name(self, name: str):
        """
        Add a name to the topic.

        Args:
            name (str): Name of the topic.
        """
        self.topic_name = name

    def set_topic_description(self, text: str):
        """
        Add a text description to the topic.

        Args:
            text (str): Text description of the topic.
        """
        self.topic_description = text
|
||||
|
||||
def topic_to_json(topic: Topic) -> str:
    """
    Return a JSON representation of the topic.

    Args:
        topic (Topic): The topic object to convert to JSON.

    Returns:
        str: A JSON string (indented by 4) with the keys "topic_idx",
            "topic_name" and "topic_description".
    """
    repr_dict = {
        "topic_idx": topic.topic_idx,
        "topic_name": topic.topic_name,
        # A topic that has not been described yet may lack the attribute;
        # serialize it as null instead of failing.
        "topic_description": getattr(topic, "topic_description", None)
    }

    return json.dumps(repr_dict, indent = 4)
|
||||
|
||||
def topic_lis_to_json(topics: list[Topic]) -> str:
    """
    Return a JSON representation of a list of topics.

    Args:
        topics (list[Topic]): The list of topic objects to convert to JSON.

    Returns:
        str: A JSON string (indented by 4) mapping each topic index to an
            object with the keys "topic_name" and "topic_description".
    """
    repr_dict = {
        topic.topic_idx: {
            "topic_name": topic.topic_name,
            # A topic that has not been described yet may lack the attribute;
            # serialize it as null instead of failing.
            "topic_description": getattr(topic, "topic_description", None)
        }
        for topic in topics
    }

    return json.dumps(repr_dict, indent = 4)
|
||||
|
||||
def extract_topics(corpus: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided clusterer object on the document embeddings.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings (np.ndarray): Embeddings of the documents.
        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        compute_vocab_hyperparams (dict, optional): Keyword arguments forwarded to the vocabulary computation.

    Returns:
        list[Topic]: List of Topic objects representing the extracted topics. Outliers (label -1) are not
            turned into a topic. Within a topic, documents, embeddings and similarities are ordered by
            descending cosine similarity to the low-dimensional topic centroid.

    Raises:
        ValueError: If topword_extraction_methods is empty or contains an unknown method.
    """

    for elem in topword_extraction_methods:
        if elem not in ["tfidf", "cosine_similarity"]:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if not topword_extraction_methods:
        raise ValueError("topword_extraction_methods cannot be empty")

    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings)  # get dimensionality reduced embeddings, their labels and the umap mapper object

    unique_labels = np.unique(labels)  # In case the cluster labels are not consecutive numbers, we need to map them to consecutive
    label_mapping = {label: i for i, label in enumerate(unique_labels[unique_labels != -1])}
    label_mapping[-1] = -1
    labels = np.array([label_mapping[label] for label in labels])

    extractor = ExtractTopWords()
    centroid_dict = extractor.extract_centroids(document_embeddings, labels)  # get the centroids of the clusters
    dim_red_centroids = umap_mapper.transform(np.array(list(centroid_dict.values())))  # map the centroids to low dimensional space

    dim_red_centroid_dict = {label: centroid for label, centroid in zip(centroid_dict.keys(), dim_red_centroids)}

    vocab = extractor.compute_corpus_vocab(corpus, **compute_vocab_hyperparams)  # compute the vocabulary of the corpus

    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False)  # compute the word-topic matrix of the corpus
    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)  # extract the top-words according to tfidf
    if use_cosine:
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict = dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # dont include outliers
            continue
        topic_idx = f"{label}"
        member_mask = labels == label
        documents = [doc for doc, is_member in zip(corpus, member_mask) if is_member]
        embeddings_hd = document_embeddings[member_mask]
        embeddings_ld = dim_red_embeddings[member_mask]
        centroid_hd = centroid_dict[label]
        # Look the reduced centroid up by label: the row position in
        # dim_red_centroids only equals the label if the centroid dict is
        # keyed 0..k-1 in order.
        centroid_ld = dim_red_centroid_dict[label]

        centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[i] for i in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # keep the similarities aligned with the re-ordered documents
        centroid_similarity = centroid_similarity[similarity_sorting]

        # the cosine top-words only exist when that method was requested
        if use_cosine and isinstance(cosine_topwords[label], dict):
            cosine_topwords[label] = cosine_topwords[label][0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }

        topic = Topic(topic_idx = topic_idx,
                      documents = documents,
                      words = vocab,
                      centroid_hd = centroid_hd,
                      centroid_ld = centroid_ld,
                      document_embeddings_hd = embeddings_hd,
                      document_embeddings_ld = embeddings_ld,
                      document_embedding_similarity = centroid_similarity,
                      umap_mapper = umap_mapper,
                      top_words = top_words,
                      top_word_scores = top_word_scores
                      )

        topics.append(topic)

    return topics
|
||||
|
||||
def extract_topics_no_new_vocab_computation(corpus: list[str], vocab: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], consider_outliers: bool = False) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided clusterer object on the document embeddings.
    This version does not compute the vocabulary of the corpus and instead uses the provided vocabulary.

    Args:
        corpus (list[str]): List of documents.
        vocab (list[str]): Vocabulary of the corpus.
        document_embeddings (np.ndarray): Embeddings of the documents.
        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        consider_outliers (bool, optional): Passed to the word-topic matrix computation (default is False).

    Returns:
        list[Topic]: List of Topic objects representing the extracted topics. Outliers (label -1) are not
            turned into a topic. Within a topic, documents, embeddings and similarities are ordered by
            descending cosine similarity to the low-dimensional topic centroid.

    Raises:
        ValueError: If topword_extraction_methods is empty or contains an unknown method.
    """

    for elem in topword_extraction_methods:
        if elem not in ["tfidf", "cosine_similarity"]:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if not topword_extraction_methods:
        raise ValueError("topword_extraction_methods cannot be empty")

    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings)  # get dimensionality reduced embeddings, their labels and the umap mapper object

    unique_labels = np.unique(labels)  # In case the cluster labels are not consecutive numbers, we need to map them to consecutive
    label_mapping = {label: i for i, label in enumerate(unique_labels[unique_labels != -1])}
    label_mapping[-1] = -1
    labels = np.array([label_mapping[label] for label in labels])

    extractor = ExtractTopWords()
    centroid_dict = extractor.extract_centroids(document_embeddings, labels)  # get the centroids of the clusters
    dim_red_centroids = umap_mapper.transform(np.array(list(centroid_dict.values())))  # map the centroids to low dimensional space

    dim_red_centroid_dict = {label: centroid for label, centroid in zip(centroid_dict.keys(), dim_red_centroids)}

    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = consider_outliers)  # compute the word-topic matrix of the corpus
    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)  # extract the top-words according to tfidf
    if use_cosine:
        # NOTE(review): outliers are always considered here, independent of
        # the consider_outliers argument - confirm this is intended.
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict = dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = True)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # dont include outliers
            continue
        topic_idx = f"{label}"
        member_mask = labels == label
        documents = [doc for doc, is_member in zip(corpus, member_mask) if is_member]
        embeddings_hd = document_embeddings[member_mask]
        embeddings_ld = dim_red_embeddings[member_mask]
        centroid_hd = centroid_dict[label]
        # Look the reduced centroid up by label: the row position in
        # dim_red_centroids only equals the label if the centroid dict is
        # keyed 0..k-1 in order.
        centroid_ld = dim_red_centroid_dict[label]

        centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[i] for i in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # keep the similarities aligned with the re-ordered documents
        centroid_similarity = centroid_similarity[similarity_sorting]

        # Unwrap a nested {0: words} entry when present. Explicit guards
        # replace a blanket try/except so that unrelated errors stay visible.
        if use_cosine:
            cosine_entry = cosine_topwords[label]
            if isinstance(cosine_entry, dict) and 0 in cosine_entry:
                cosine_topwords[label] = cosine_entry[0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }

        topic = Topic(topic_idx = topic_idx,
                      documents = documents,
                      words = vocab,
                      centroid_hd = centroid_hd,
                      centroid_ld = centroid_ld,
                      document_embeddings_hd = embeddings_hd,
                      document_embeddings_ld = embeddings_ld,
                      document_embedding_similarity = centroid_similarity,
                      umap_mapper = umap_mapper,
                      top_words = top_words,
                      top_word_scores = top_word_scores
                      )

        topics.append(topic)

    return topics
|
||||
|
||||
def extract_and_describe_topics(corpus: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, enhancer: TopwordEnhancement, n_topwords: int = 2000, n_topwords_description: int = 500, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}, topword_description_method: str = "cosine_similarity") -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided clusterer object on the document embeddings and describes/names them using the given enhancer object.

    This is ``extract_topics`` followed by ``describe_and_name_topics``; progress is printed before each step.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings (np.ndarray): Embeddings of the documents.
        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        enhancer (TopwordEnhancement): Enhancer object for enhancing top-words and generating descriptions/names for topics.
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        n_topwords_description (int, optional): Number of top-words to use from the extracted topics for description and naming (default is 500).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        compute_vocab_hyperparams (dict, optional): Keyword arguments forwarded to the vocabulary computation.
        topword_description_method (str, optional): Method to use for top-word extraction for description/naming.
            Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity").

    Returns:
        list[Topic]: List of Topic objects representing the extracted and described topics.
    """

    print("Extracting topics...")
    topics = extract_topics(corpus, document_embeddings, clusterer, vocab_embeddings, n_topwords, topword_extraction_methods, compute_vocab_hyperparams)

    print("Describing topics...")
    topics = describe_and_name_topics(topics, enhancer, topword_description_method, n_topwords_description)

    return topics
|
||||
|
||||
def extract_topics_labels_vocab(corpus: list[str], document_embeddings_hd: np.ndarray, document_embeddings_ld: np.ndarray, labels: np.ndarray, umap_mapper: umap.UMAP, vocab_embeddings: np.ndarray, vocab: list[str] = None, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"]) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided labels that indicate the topics (no -1 for outliers).
    The vocabulary is only computed from the corpus when none is passed in.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space.
        document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space.
        labels (np.ndarray): Labels indicating the topics.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        vocab (list[str], optional): Vocabulary of the corpus (default is None).
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).

    Returns:
        list[Topic]: List of Topic objects representing the extracted topics. Negative labels are skipped.
            Within a topic, documents, embeddings and similarities are ordered by descending cosine
            similarity to the low-dimensional topic centroid.

    Raises:
        ValueError: If topword_extraction_methods is empty or contains an unknown method.
    """

    for elem in topword_extraction_methods:
        if elem not in ["tfidf", "cosine_similarity"]:
            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
    if not topword_extraction_methods:
        raise ValueError("topword_extraction_methods cannot be empty")

    use_tfidf = "tfidf" in topword_extraction_methods
    use_cosine = "cosine_similarity" in topword_extraction_methods

    extractor = ExtractTopWords()
    if vocab is None:
        vocab = extractor.compute_corpus_vocab(corpus)  # compute the vocabulary of the corpus

    centroid_dict = extractor.extract_centroids(document_embeddings_hd, labels)  # get the centroids of the clusters
    dim_red_centroids = umap_mapper.transform(np.array(list(centroid_dict.values())))  # map the centroids to low dimensional space

    dim_red_centroid_dict = {label: centroid for label, centroid in zip(centroid_dict.keys(), dim_red_centroids)}

    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False)  # compute the word-topic matrix of the corpus

    if use_tfidf:
        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)  # extract the top-words according to tfidf
    if use_cosine:
        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict = dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)

    topics = []
    for label in np.unique(labels):
        if label < -0.5:  # dont include outliers
            continue
        topic_idx = f"{label}"
        member_mask = labels == label
        documents = [doc for doc, is_member in zip(corpus, member_mask) if is_member]
        embeddings_hd = document_embeddings_hd[member_mask]
        embeddings_ld = document_embeddings_ld[member_mask]
        centroid_hd = centroid_dict[label]
        # Labels are used as given here, so they need not be 0..k-1; look
        # the reduced centroid up by label rather than by row position.
        centroid_ld = dim_red_centroid_dict[label]

        centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))
        similarity_sorting = np.argsort(centroid_similarity)[::-1]
        documents = [documents[i] for i in similarity_sorting]
        embeddings_hd = embeddings_hd[similarity_sorting]
        embeddings_ld = embeddings_ld[similarity_sorting]
        # keep the similarities aligned with the re-ordered documents
        centroid_similarity = centroid_similarity[similarity_sorting]

        # the cosine top-words only exist when that method was requested
        if use_cosine and isinstance(cosine_topwords[label], dict):
            cosine_topwords[label] = cosine_topwords[label][0]

        top_words = {
            "tfidf": tfidf_topwords[label] if use_tfidf else None,
            "cosine_similarity": cosine_topwords[label] if use_cosine else None
        }
        top_word_scores = {
            "tfidf": tfidf_dict[label] if use_tfidf else None,
            "cosine_similarity": cosine_dict[label] if use_cosine else None
        }

        topic = Topic(topic_idx = topic_idx,
                      documents = documents,
                      words = vocab,
                      centroid_hd = centroid_hd,
                      centroid_ld = centroid_ld,
                      document_embeddings_hd = embeddings_hd,
                      document_embeddings_ld = embeddings_ld,
                      document_embedding_similarity = centroid_similarity,
                      umap_mapper = umap_mapper,
                      top_words = top_words,
                      top_word_scores = top_word_scores
                      )

        topics.append(topic)

    return topics
|
||||
|
||||
def extract_describe_topics_labels_vocab(
    corpus: list[str],
    document_embeddings_hd: np.ndarray,
    document_embeddings_ld: np.ndarray,
    labels: np.ndarray,
    umap_mapper: umap.UMAP,
    vocab_embeddings: np.ndarray,
    enhancer: TopwordEnhancement,
    vocab: list[str] = None,
    n_topwords: int = 2000,
    n_topwords_description: int = 500,
    topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"],
    topword_description_method: str = "cosine_similarity"
    ) -> list[Topic]:
    """
    Extracts topics from the given corpus using the provided labels that indicate the topics (no -1 for outliers)
    and then describes and names them with the given enhancer object.

    This is ``extract_topics_labels_vocab`` followed by ``describe_and_name_topics``.

    Args:
        corpus (list[str]): List of documents.
        document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space.
        document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space.
        labels (np.ndarray): Labels indicating the topics.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
        enhancer (TopwordEnhancement): Enhancer object to enhance the top-words and generate the description.
        vocab (list[str], optional): Vocabulary of the corpus (default is None).
        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
        n_topwords_description (int, optional): Number of top-words to use from the extracted topics for the description and the name (default is 500).
        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics.
            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
        topword_description_method (str, optional): Method to use for top-word extraction. Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity").

    Returns:
        list[Topic]: List of Topic objects representing the extracted and described topics.
    """

    topics = extract_topics_labels_vocab(corpus, document_embeddings_hd, document_embeddings_ld, labels, umap_mapper, vocab_embeddings, vocab, n_topwords, topword_extraction_methods)
    topics = describe_and_name_topics(topics, enhancer, topword_description_method, n_topwords_description)
    return topics
|
||||
|
||||
@staticmethod
def extract_topic_cos_sim(
    documents_topic: list[str],
    document_embeddings_topic: np.ndarray,
    words_topic: list[str],
    vocab_embeddings: dict,
    umap_mapper: umap.UMAP,
    n_topwords: int = 2000
) -> Topic:
    """
    Build a single Topic object from the documents of one topic.

    The centroid of the topic is extracted from the high-dimensional document embeddings and
    mapped into the low-dimensional space with the UMAP mapper. Top-words are extracted with
    the cosine-similarity method only.

    Args:
        documents_topic (list[str]): List of documents in the topic.
        document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic.
        words_topic (list[str]): List of words in the topic.
        vocab_embeddings (dict): Embeddings of the vocabulary.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        n_topwords (int, optional): Number of top-words to extract from the topic (default is 2000).

    Returns:
        Topic: Topic object (with topic_idx = None) representing the extracted topic.
    """
    method = "cosine_similarity"
    top_word_extractor = ExtractTopWords()

    centroid_hd = top_word_extractor.extract_centroid(document_embeddings_topic)
    centroid_ld = umap_mapper.transform(centroid_hd.reshape(1, -1))[0]

    # All documents belong to the same (single) topic, so every label is 0.
    single_topic_labels = np.zeros(len(documents_topic), dtype=int)
    word_topic_mat = top_word_extractor.compute_word_topic_mat(
        documents_topic,
        words_topic,
        single_topic_labels,
        consider_outliers=False,
    )

    # The centroid passed in is already low-dimensional, hence only the vocabulary
    # embeddings are reduced by the extractor.
    similarity_topwords, similarity_scores = top_word_extractor.extract_topwords_centroid_similarity(
        word_topic_mat=word_topic_mat,
        vocab=words_topic,
        vocab_embedding_dict=vocab_embeddings,
        centroid_dict={0: centroid_ld},
        umap_mapper=umap_mapper,
        top_n_words=n_topwords,
        reduce_vocab_embeddings=True,
        reduce_centroid_embeddings=False,
        consider_outliers=False,
    )

    embeddings_ld = umap_mapper.transform(document_embeddings_topic)

    # Cosine similarity between every low-dimensional document embedding and the centroid.
    norm_product = np.linalg.norm(embeddings_ld, axis=1) * np.linalg.norm(centroid_ld)
    similarity_to_centroid = np.dot(embeddings_ld, centroid_ld) / norm_product

    return Topic(
        topic_idx=None,
        documents=documents_topic,
        words=words_topic,
        centroid_hd=centroid_hd,
        centroid_ld=centroid_ld,
        document_embeddings_hd=document_embeddings_topic,
        document_embeddings_ld=embeddings_ld,
        document_embedding_similarity=similarity_to_centroid,
        umap_mapper=umap_mapper,
        top_words={method: similarity_topwords},
        top_word_scores={method: similarity_scores},
    )
|
||||
|
||||
@staticmethod
def extract_and_describe_topic_cos_sim(
    documents_topic: list[str],
    document_embeddings_topic: np.ndarray,
    words_topic: list[str],
    vocab_embeddings: dict,
    umap_mapper: umap.UMAP,
    enhancer: TopwordEnhancement,
    n_topwords: int = 2000,
    n_topwords_description: int = 500
) -> Topic:
    """
    Create a Topic object from the given documents and embeddings, then describe and name it.

    The topic is built with extract_topic_cos_sim (cosine-similarity top-words only) and
    afterwards passed through describe_and_name_topics with the given enhancer object.

    Args:
        documents_topic (list[str]): List of documents in the topic.
        document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic.
        words_topic (list[str]): List of words in the topic.
        vocab_embeddings (dict): Embeddings of the vocabulary.
        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
        enhancer (TopwordEnhancement): Enhancer object to enhance the top-words and generate the description.
        n_topwords (int, optional): Number of top-words to extract from the topic (default is 2000).
        n_topwords_description (int, optional): Number of top-words to use from the extracted topic for the description and the name (default is 500).

    Returns:
        Topic: Topic object representing the extracted and described topic.
    """
    topic = extract_topic_cos_sim(
        documents_topic,
        document_embeddings_topic,
        words_topic,
        vocab_embeddings,
        umap_mapper,
        n_topwords,
    )
    # describe_and_name_topics works on lists, so wrap the single topic and unwrap the result.
    return describe_and_name_topics([topic], enhancer, "cosine_similarity", n_topwords_description)[0]
|
||||
|
||||
@staticmethod
def describe_and_name_topics(
    topics: list[Topic],
    enhancer: TopwordEnhancement,
    topword_method="tfidf",
    n_words=500
) -> list[Topic]:
    """
    Attach a generated name and description to every topic, using the given enhancer object.

    If generating the name or the description raises, the error is printed and both calls
    are attempted exactly once more; an error in that second attempt propagates.

    Args:
        topics (list[Topic]): List of Topic objects.
        enhancer (TopwordEnhancement): Enhancer object used to generate the name and the description.
        topword_method (str, optional): Which top-words of the topic to use. Can be "tfidf" or "cosine_similarity" (default is "tfidf").
        n_words (int, optional): Number of top-words to use for the description and the name (default is 500).

    Returns:
        list[Topic]: The same Topic objects with the name and description set.

    Raises:
        ValueError: If topword_method is neither "tfidf" nor "cosine_similarity".
    """
    if topword_method not in ("tfidf", "cosine_similarity"):
        raise ValueError("topword_method can only be 'tfidf' or 'cosine_similarity'")

    def _name_and_describe(topwords):
        # Name first, description second - same order for the first attempt and the retry.
        name = enhancer.generate_topic_name_str(topwords, n_words=n_words)
        description = enhancer.describe_topic_topwords_str(topwords, n_words=n_words)
        return name, description

    for current_topic in tqdm(topics):
        topwords = current_topic.top_words[topword_method]
        try:
            generated_name, generated_description = _name_and_describe(topwords)
        except Exception as e:
            print(f"Error in topic {current_topic.topic_idx}: {e}")
            print("Trying again...")
            generated_name, generated_description = _name_and_describe(topwords)

        current_topic.set_topic_name(generated_name)
        current_topic.set_topic_description(generated_description)

    return topics
|
||||
|
||||
@@ -0,0 +1,306 @@
|
||||
import tiktoken
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
from typing import Callable
|
||||
import numpy as np
|
||||
|
||||
# Default system prompt; used as the default value of `basic_model_instruction` in TopwordEnhancement.
basic_instruction = "You are a helpful assistant. You are excellent at inferring topics from top-words extracted via topic-modelling. You make sure that everything you output is strictly based on the provided text."
|
||||
|
||||
class TopwordEnhancement:
|
||||
|
||||
def __init__(
    self,
    client,
    openai_model: str = "gpt-3.5-turbo",
    max_context_length: int = 4000,
    openai_model_temperature: float = 0.5,
    basic_model_instruction: str = basic_instruction,
    corpus_instruction: str = "") -> None:
    """
    Set up the TopwordEnhancement with the client and the model parameters.

    Args:
        client: Client object used to send the chat completion requests.
        openai_model (str, optional): The OpenAI model to use (default is "gpt-3.5-turbo").
        max_context_length (int, optional): The maximum length of the context for the OpenAI model (default is 4000).
        openai_model_temperature (float, optional): The temperature to use for the OpenAI model; must be strictly positive (default is 0.5).
        basic_model_instruction (str, optional): The basic (system) instruction for the model.
        corpus_instruction (str, optional): Additional information on the corpus. It is always embedded into a fixed sentence, even if it is empty.

    Returns:
        None
    """
    # sanity checks on the input arguments
    assert openai_model is not None, "Please provide an openai model"
    assert max_context_length > 0, "Please provide a positive max_context_length"
    assert openai_model_temperature > 0, "Please provide a positive openai_model_temperature"

    self.client = client

    # model configuration
    self.openai_model = openai_model
    self.openai_model_temperature = openai_model_temperature
    self.max_context_length = max_context_length

    # prompt building blocks
    self.basic_model_instruction = basic_model_instruction
    self.corpus_instruction = f" The following information is available about the corpus used to identify the topics: {corpus_instruction}"
|
||||
|
||||
def __str__(self) -> str:
|
||||
repr = f"TopwordEnhancement(openai_model = {self.openai_model})"
|
||||
return repr
|
||||
|
||||
def __repr__(self) -> str:
|
||||
repr = f"TopwordEnhancement(openai_model = {self.openai_model})"
|
||||
return repr
|
||||
|
||||
def count_tokens_api_message(self, messages: list[dict[str]]) -> int:
    """
    Sum up the tokens of the "content" entries of the given API messages.

    Only the values stored under the key "content" are counted; all other keys
    (e.g. "role") are ignored. Tokenization uses the tiktoken encoding of self.openai_model.

    Args:
        messages (list[dict[str]]): List of messages from the API.

    Returns:
        int: Number of tokens in the messages.
    """
    encoder = tiktoken.encoding_for_model(self.openai_model)
    return sum(
        len(encoder.encode(message["content"]))
        for message in messages
        if "content" in message
    )
|
||||
|
||||
def describe_topic_topwords_completion_object(self,
                                              topwords: list[str],
                                              n_words: int = None,
                                              query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Also describe the various aspects and sub-topics of the topic."):
    """
    Describe the given topic based on its topwords using the OpenAI model.

    Args:
        topwords (list[str]): List of topwords. If a dict is passed instead, its entry under key 0 is used.
        n_words (int, optional): Number of words to use for the query. If None, all words are used.
        query_function (Callable, optional): Function to query the model. The function should take a list of topwords and return a string.

    Returns:
        openai.ChatCompletion: A description of the topics by the model in the form of an OpenAI ChatCompletion object.
    """
    # Unwrap a dict BEFORE deriving n_words; otherwise n_words would be the number
    # of dict keys instead of the number of topwords.
    if isinstance(topwords, dict):
        topwords = topwords[0]

    if n_words is None:
        n_words = len(topwords)

    topwords = np.array(topwords[:n_words])

    system_message = self.basic_model_instruction + " " + self.corpus_instruction

    # Look the encoder up once instead of once per topword.
    encoding = tiktoken.encoding_for_model(self.openai_model)
    n_system_tokens = len(encoding.encode(system_message))

    # Running token total after each topword, including the system message.
    # Note that the text added by query_function itself is not counted.
    tokens_cumsum = np.cumsum([len(encoding.encode(tw + ", ")) for tw in topwords]) + n_system_tokens

    # If too many topwords are given, use only the leading topwords that fit into the context length.
    # The length check avoids an IndexError when no topwords are given at all.
    if len(tokens_cumsum) > 0 and tokens_cumsum[-1] > self.max_context_length:
        # Index of the first topword that exceeds the limit == number of topwords that still fit.
        n_words = np.argmax(tokens_cumsum > self.max_context_length)
        print("Too many topwords given. Using only the first part of the topwords that fits into the context length. Number of topwords used: ", n_words)
        topwords = topwords[:n_words]

    completion = self.client.chat.completions.create(
        model=self.openai_model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": query_function(topwords)},
        ],
        temperature=self.openai_model_temperature,
    )

    return completion
|
||||
|
||||
def describe_topic_topwords_str(self,
|
||||
topwords: list[str],
|
||||
n_words: int = None,
|
||||
query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Also describe the various aspects and sub-topics of the topic. Make sure the descriptions are short and concise! Do not cite more than 5 words per sub-aspect!!!") -> str:
|
||||
"""
|
||||
Describe the given topic based on its topwords using the OpenAI model.
|
||||
|
||||
Args:
|
||||
topwords (list[str]): List of topwords.
|
||||
n_words (int, optional): Number of words to use for the query. If None, all words are used.
|
||||
query_function (Callable, optional): Function to query the model. The function should take a list of topwords and return a string.
|
||||
|
||||
Returns:
|
||||
str: A description of the topics by the model in the form of a string.
|
||||
"""
|
||||
|
||||
completion = self.describe_topic_topwords_completion_object(topwords, n_words, query_function)
|
||||
return completion.choices[0].message.content
|
||||
|
||||
def generate_topic_name_str(self,
|
||||
topwords: list[str],
|
||||
n_words: int = None,
|
||||
query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Give me only the title of the topic and nothing else please. Make sure the title is precise and not longer than 5 words, ideally even shorter.") -> str:
|
||||
"""
|
||||
Generate a topic name based on the given topwords using the OpenAI model.
|
||||
|
||||
Args:
|
||||
topwords (list[str]): List of topwords.
|
||||
n_words (int, optional): Number of words to use for the query. If None, all words are used.
|
||||
query_function (Callable, optional): Function to query the model. The function should take a list of topwords and return a string.
|
||||
|
||||
Returns:
|
||||
str: A topic name generated by the model in the form of a string.
|
||||
"""
|
||||
|
||||
return self.describe_topic_topwords_str(topwords, n_words, query_function)
|
||||
|
||||
def describe_topic_documents_completion_object(self,
                                               documents: list[str],
                                               truncate_doc_thresh=100,
                                               n_documents: int = None,
                                               query_function: Callable = lambda docs: f"Please give me the common topic of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic."):
    """
    Describe the given topic based on its documents using the OpenAI model.

    Args:
        documents (list[str]): List of documents.
        truncate_doc_thresh (int, optional): Threshold for the number of words (split on single spaces) in a document. If a document has more words than this threshold, it is pruned to this threshold.
        n_documents (int, optional): Number of documents to use for the query. If None, all documents are used.
        query_function (Callable, optional): Function to query the model. The function should take a list of documents and return a string.

    Returns:
        openai.ChatCompletion: A description of the topics by the model in the form of an openai.ChatCompletion object.
    """
    if n_documents is None:
        n_documents = len(documents)
    documents = documents[:n_documents]

    # Prune each document to at most truncate_doc_thresh space-separated words.
    documents = [" ".join(doc.split(" ")[:truncate_doc_thresh]) for doc in documents]

    system_message = self.basic_model_instruction + " " + self.corpus_instruction

    # Look the encoder up once instead of once per document.
    encoding = tiktoken.encoding_for_model(self.openai_model)
    n_system_tokens = len(encoding.encode(system_message))

    # Running token total after each document, including the system message.
    # Note that the text added by query_function itself is not counted.
    tokens_cumsum = np.cumsum([len(encoding.encode(doc + ", ")) for doc in documents]) + n_system_tokens

    # If too many documents are given, use only the leading documents that fit into the context length.
    # The length check avoids an IndexError when no documents are given at all.
    if len(tokens_cumsum) > 0 and tokens_cumsum[-1] > self.max_context_length:
        # Index of the first document that exceeds the limit == number of documents that still fit.
        n_documents = np.argmax(tokens_cumsum > self.max_context_length)
        print("Too many documents given. Using only the first part of the documents that fits into the context length. Number of documents used: ", n_documents)
        documents = documents[:n_documents]

    completion = self.client.chat.completions.create(
        model=self.openai_model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": query_function(documents)},
        ],
        temperature=self.openai_model_temperature,
    )

    return completion
|
||||
|
||||
|
||||
@staticmethod
|
||||
def sample_identity(n_docs: int) -> np.ndarray:
|
||||
"""
|
||||
Generate an identity array of document indices without changing their order.
|
||||
|
||||
Args:
|
||||
n_docs (int): Number of documents.
|
||||
|
||||
Returns:
|
||||
np.ndarray: An array containing document indices from 0 to (n_docs - 1).
|
||||
"""
|
||||
|
||||
return np.arange(n_docs)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def sample_uniform(n_docs: int) -> np.ndarray:
|
||||
"""
|
||||
Randomly sample document indices without replacement.
|
||||
|
||||
Args:
|
||||
n_docs (int): Number of documents.
|
||||
|
||||
Returns:
|
||||
np.ndarray: An array containing randomly permuted document indices from 0 to (n_docs - 1).
|
||||
"""
|
||||
|
||||
return np.random.permutation(n_docs)
|
||||
|
||||
@staticmethod
|
||||
def sample_poisson(n_docs: int) -> np.ndarray:
|
||||
"""
|
||||
Randomly sample document indices according to a Poisson distribution, favoring documents from the beginning of the list.
|
||||
|
||||
Args:
|
||||
n_docs (int): Number of documents.
|
||||
|
||||
Returns:
|
||||
np.ndarray: An array containing randomly permuted document indices, with more documents drawn from the beginning of the list.
|
||||
"""
|
||||
|
||||
return np.random.poisson(1, n_docs)
|
||||
|
||||
def describe_topic_documents_sampling_completion_object(
|
||||
self,
|
||||
documents: list[str],
|
||||
truncate_doc_thresh=100,
|
||||
n_documents: int = None,
|
||||
query_function: Callable = lambda docs: f"Please give me the common topic of the sample of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic.",
|
||||
sampling_strategy: str = None,):
|
||||
"""
|
||||
Describe a topic based on a sample of its documents by using the openai model.
|
||||
|
||||
Args:
|
||||
documents (list[str]): List of documents ordered by similarity to the topic's centroid.
|
||||
truncate_doc_thresh (int, optional): Threshold for the number of words in a document. If a document exceeds this threshold, it is truncated. Defaults to 100.
|
||||
n_documents (int, optional): Number of documents to use for the query. If None, all documents are used. Defaults to None.
|
||||
query_function (Callable, optional): Function to query the model. Defaults to a lambda function generating a query based on the provided documents.
|
||||
sampling_strategy (Union[Callable, str], optional): Strategy to sample the documents. If None, the first provided documents are used.
|
||||
If it's a string, it's interpreted as a method of the class (e.g., "sample_uniform" is interpreted as self.sample_uniform). It can also be a custom sampling function. Defaults to None.
|
||||
|
||||
Returns:
|
||||
openai.ChatCompletion: A description of the topic by the model in the form of an openai.ChatCompletion object.
|
||||
"""
|
||||
|
||||
if type(sampling_strategy) == str:
|
||||
if sampling_strategy == "topk":
|
||||
sampling_strategy = self.sample_identity
|
||||
if sampling_strategy=="identity":
|
||||
sampling_strategy = self.sample_identity
|
||||
elif sampling_strategy=="uniform":
|
||||
sampling_strategy = self.sample_uniform
|
||||
elif sampling_strategy=="poisson":
|
||||
sampling_strategy = self.sample_poisson
|
||||
|
||||
new_documents = [documents[i] for i in sampling_strategy(n_documents)]
|
||||
|
||||
result = self.describe_topic_documents_completion_object(new_documents, truncate_doc_thresh, n_documents, query_function)
|
||||
return result
|
||||
|
||||
def describe_topic_document_sampling_str(
|
||||
self,
|
||||
documents: list[str],
|
||||
truncate_doc_thresh=100,
|
||||
n_documents: int = None,
|
||||
query_function: Callable = lambda docs: f"Please give me the common topic of the sample of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic.",
|
||||
sampling_strategy: str = None,) -> str:
|
||||
"""
|
||||
Describe a topic based on a sample of its documents by using the openai model.
|
||||
|
||||
Args:
|
||||
documents (list[str]): List of documents ordered by similarity to the topic's centroid.
|
||||
truncate_doc_thresh (int, optional): Threshold for the number of words in a document. If a document exceeds this threshold, it is truncated. Defaults to 100.
|
||||
n_documents (int, optional): Number of documents to use for the query. If None, all documents are used. Defaults to None.
|
||||
query_function (Callable, optional): Function to query the model. Defaults to a lambda function generating a query based on the provided documents.
|
||||
sampling_strategy (Union[Callable, str], optional): Strategy to sample the documents. If None, the first provided documents are used.
|
||||
If it's a string, it's interpreted as a method of the class (e.g., "sample_uniform" is interpreted as self.sample_uniform). It can also be a custom sampling function. Defaults to None.
|
||||
|
||||
Returns:
|
||||
str: A description of the topic by the model in the form of a string.
|
||||
"""
|
||||
|
||||
completion = self.describe_topic_document_sampling_completion_object(documents, truncate_doc_thresh, n_documents, query_function, sampling_strategy)
|
||||
return completion.choices[0].message.content
|
||||
@@ -0,0 +1 @@
|
||||
# Version identifier exposed as `__version__`.
__version__ = '0.0.5'
|
||||
Reference in New Issue
Block a user