The LLM-based topic recognition model is complete and has been adapted to keep up with rapidly changing Weibo topics.

2025-08-07 11:14:38 +08:00
parent 1e780876c9
commit d88d5edd99
32 changed files with 8352 additions and 1 deletions
@@ -0,0 +1,12 @@
+class Client:
+    """
+    Thin wrapper that builds an OpenAI or Azure OpenAI SDK client and proxies to it.
+
+    Attributes that are not found on the wrapper itself are looked up on the
+    wrapped SDK client stored in ``self.client``.
+    """
+
+    def __init__(self, api_key: str, azure_endpoint: dict = None) -> None:
+        """
+        Create the underlying SDK client.
+
+        Args:
+            api_key (str): API key passed to the SDK client.
+            azure_endpoint (dict, optional): If given and non-empty, an ``AzureOpenAI`` client is created; the dict must provide the keys ``'api_version'`` and ``'endpoint'``. Otherwise a plain ``OpenAI`` client is created.
+        """
+        # The SDK classes are imported lazily, so only the selected one is loaded.
+        if azure_endpoint:
+            from openai import AzureOpenAI
+            self.client = AzureOpenAI(
+                api_key=api_key,
+                api_version=azure_endpoint['api_version'],
+                azure_endpoint=azure_endpoint['endpoint'],
+            )
+        else:
+            from openai import OpenAI
+            self.client = OpenAI(api_key=api_key)
+
+    def __getattr__(self, name):
+        """Delegate attribute access to the self.client object."""
+        # __getattr__ only runs when normal lookup fails. If "client" itself is
+        # missing (instance created via __new__, copy or unpickling), reading
+        # self.client below would re-enter __getattr__ without end, so report a
+        # regular AttributeError instead.
+        if name == "client":
+            raise AttributeError(name)
+        return getattr(self.client, name)
@@ -0,0 +1,286 @@
+import numpy as np
+import umap
+import hdbscan
+import matplotlib.pyplot as plt
+import pandas as pd
+import plotly.express as px
+import umap.plot
+from copy import deepcopy
+from sklearn.cluster import AgglomerativeClustering
+
+from typing import Tuple
+
+class Clustering_and_DimRed():
+
+    """
+    Class to perform dimensionality reduction with UMAP followed by clustering with HDBSCAN.
+    """
+    def __init__(self,
+             n_dims_umap: int = 5,
+             n_neighbors_umap: int = 15,
+             min_dist_umap: float = 0,
+             metric_umap: str = "cosine",
+             min_cluster_size_hdbscan: int = 30,
+             metric_hdbscan: str = "euclidean",
+             cluster_selection_method_hdbscan: str = "eom",
+             number_clusters_hdbscan: int = None,
+             random_state: int = 42,
+             verbose: bool = True,
+             UMAP_hyperparams: dict = None,
+             HDBSCAN_hyperparams: dict = None) -> None:
+        """
+        Initializes the clustering and dimensionality reduction parameters for topic modeling.
+
+        Args:
+            n_dims_umap (int, optional): Number of dimensions to reduce to using UMAP.
+            n_neighbors_umap (int, optional): Number of neighbors for UMAP.
+            min_dist_umap (float, optional): Minimum distance for UMAP.
+            metric_umap (str, optional): Metric for UMAP.
+            min_cluster_size_hdbscan (int, optional): Minimum cluster size for HDBSCAN.
+            metric_hdbscan (str, optional): Metric for HDBSCAN.
+            cluster_selection_method_hdbscan (str, optional): Cluster selection method for HDBSCAN.
+            number_clusters_hdbscan (int, optional): Number of clusters for HDBSCAN. If None, HDBSCAN will determine the number of clusters automatically. Ensure that min_cluster_size is not too large to find enough clusters.
+            random_state (int, optional): Random state. It is stored on the instance and passed to UMAP.
+            verbose (bool, optional): Whether to print progress.
+            UMAP_hyperparams (dict, optional): Additional hyperparameters for UMAP. The dict is copied, never modified. The explicit arguments above override entries with the same key.
+            HDBSCAN_hyperparams (dict, optional): Additional hyperparameters for HDBSCAN. The dict is copied, never modified. The explicit arguments above override entries with the same key.
+        """
+
+        # do some checks on the input arguments
+        assert n_dims_umap > 0, "n_dims_umap must be greater than 0"
+        assert n_neighbors_umap > 0, "n_neighbors_umap must be greater than 0"
+        assert min_dist_umap >= 0, "min_dist_umap must be greater than or equal to 0"
+        assert min_cluster_size_hdbscan > 0, "min_cluster_size_hdbscan must be greater than 0"
+        assert number_clusters_hdbscan is None or number_clusters_hdbscan > 0, "number_clusters_hdbscan must be greater than 0 or None"
+        assert random_state is None or random_state >= 0, "random_state must be greater than or equal to 0"
+
+        self.random_state = random_state
+        self.verbose = verbose
+
+        # Work on private copies: the dictionaries are updated below, and
+        # mutating a shared default (or the caller's own dict) would leak
+        # settings from one instance into the next.
+        self.UMAP_hyperparams = dict(UMAP_hyperparams or {})
+        self.HDBSCAN_hyperparams = dict(HDBSCAN_hyperparams or {})
+
+        # update hyperparameters for UMAP
+        self.UMAP_hyperparams.update({
+            "n_components": n_dims_umap,
+            "n_neighbors": n_neighbors_umap,
+            "min_dist": min_dist_umap,
+            "metric": metric_umap,
+            "random_state": random_state,
+            "verbose": verbose,
+        })
+        self.umap = umap.UMAP(**self.UMAP_hyperparams)
+
+        # update hyperparameters for HDBSCAN
+        self.HDBSCAN_hyperparams.update({
+            "min_cluster_size": min_cluster_size_hdbscan,
+            "metric": metric_hdbscan,
+            "cluster_selection_method": cluster_selection_method_hdbscan,
+        })
+        self.number_clusters_hdbscan = number_clusters_hdbscan
+        self.hdbscan = hdbscan.HDBSCAN(**self.HDBSCAN_hyperparams)
+
+    
+    def reduce_dimensions_umap(self, embeddings: np.ndarray) -> Tuple[np.ndarray, umap.UMAP]:
+        """
+        Fit a fresh UMAP model on the embeddings and return their L2-normalized projection.
+
+        Args:
+            embeddings (np.ndarray): Embeddings to reduce.
+
+        Returns:
+            tuple: A tuple containing two items:
+                - reduced_embeddings (np.ndarray): Reduced embeddings, each row scaled to unit Euclidean length.
+                - umap_mapper (umap.UMAP): Fitted UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER)
+        """
+
+        # A new mapper is built from the stored hyperparameters on every call.
+        umap_mapper = umap.UMAP(**self.UMAP_hyperparams)
+        umap_mapper = umap_mapper.fit(embeddings)
+
+        projected = umap_mapper.transform(embeddings)
+        row_norms = np.linalg.norm(projected, axis=1).reshape(-1,1)
+        normalized = projected/row_norms
+        return normalized, umap_mapper
+    
+    def cluster_hdbscan(self, embeddings: np.ndarray) -> np.ndarray:
+        """
+        Cluster embeddings using HDBSCAN.
+
+        If self.number_clusters_hdbscan is not None, the data is clustered again with AgglomerativeClustering using that number of clusters, and the points HDBSCAN marked as outliers are set back to -1.
+
+        Args:
+            embeddings (np.ndarray): Embeddings to cluster.
+
+        Returns:
+            np.ndarray: Cluster labels, renumbered to consecutive integers starting at 0. -1 is reserved for outliers.
+        """
+
+        hdbscan_labels = self.hdbscan.fit_predict(embeddings)
+        outlier_positions = np.where(hdbscan_labels == -1)[0]
+
+        if self.number_clusters_hdbscan is None:
+            labels = hdbscan_labels
+        else:
+            agglomerative = AgglomerativeClustering(n_clusters=self.number_clusters_hdbscan)
+            labels = agglomerative.fit_predict(embeddings)
+            # keep the outlier decision made by HDBSCAN
+            labels[outlier_positions] = -1
+
+        # Renumber the non-outlier labels in ascending order; -1 maps to itself.
+        relabel = {}
+        next_id = 0
+        for old_label in np.unique(labels):
+            if old_label != -1:
+                relabel[old_label] = next_id
+                next_id += 1
+        relabel[-1] = -1
+
+        return np.array([relabel[old_label] for old_label in labels])
+    
+    def cluster_and_reduce(self, embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray, umap.UMAP]:
+        """
+        Reduce the embeddings with UMAP, then cluster the reduced embeddings with HDBSCAN.
+
+        Args:
+            embeddings (np.ndarray): Embeddings to cluster and reduce.
+
+        Returns:
+            tuple: A tuple containing three items:
+                - reduced_embeddings (np.ndarray): Reduced embeddings.
+                - cluster_labels (np.ndarray): Cluster labels.
+                - umap_mapper (umap.UMAP): UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER)
+        """
+
+        reduced_embeddings, fitted_mapper = self.reduce_dimensions_umap(embeddings)
+        # clustering runs on the reduced representation, not on the raw input
+        cluster_labels = self.cluster_hdbscan(reduced_embeddings)
+        return (reduced_embeddings, cluster_labels, fitted_mapper)
+    
+    def visualize_clusters_static(self, embeddings: np.ndarray, labels: np.ndarray):
+        """
+        Reduce dimensionality with UMAP to two dimensions and plot the clusters.
+
+        Outliers (label -1) are drawn in grey and are the only entry in the legend.
+
+        Args:
+            embeddings (np.ndarray): Embeddings for which to plot clustering.
+            labels (np.ndarray): Cluster labels.
+        """
+
+        # Reduce dimensionality with UMAP
+        reducer = umap.UMAP(n_components=2, random_state = self.random_state, n_neighbors=30, metric="cosine", min_dist=0)
+        embeddings_2d = reducer.fit_transform(embeddings)
+
+        unique_labels = np.unique(labels)
+
+        # Create a color palette, then map the labels to the colors.
+        # We add one to the number of unique labels to account for the noise points labelled as -1.
+        # plt.get_cmap is used because plt.cm.get_cmap was removed in Matplotlib 3.9.
+        palette = plt.get_cmap("tab20", len(unique_labels) + 1)
+
+        # Create a new figure
+        fig, ax = plt.subplots(figsize=(10, 8))
+
+        # Iterate through all unique labels (clusters and outliers)
+        for label in unique_labels:
+            # Find the embeddings that are part of this cluster
+            cluster_points = embeddings_2d[labels == label]
+
+            if label == -1:
+                # Outliers are displayed in grey and get the only legend entry.
+                ax.scatter(cluster_points[:, 0], cluster_points[:, 1], color='grey', label='outlier', s = 0.1)
+            else:
+                # `color=` (not `c=`) is required for a single RGBA tuple: with `c=`,
+                # a cluster of exactly 3 or 4 points would have the tuple interpreted
+                # as per-point values instead of one colour.
+                ax.scatter(cluster_points[:, 0], cluster_points[:, 1], color=palette(label), s = 0.1)
+
+        # Add a legend only when there is a labelled artist to show
+        if -1 in unique_labels:
+            ax.legend()
+
+        # Show the plot
+        plt.show()
+
+
+    def visualize_clusters_dynamic(self, embeddings: np.ndarray, labels: np.ndarray, texts: list[str], class_names: list[str] = None):
+        """
+        Visualize clusters using Plotly and enable hovering over clusters to see the beginning of the texts of the documents.
+
+        Args:
+            embeddings (np.ndarray): Embeddings for which to visualize clustering.
+            labels (np.ndarray): Cluster labels.
+            texts (list[str]): Texts of the documents. Only the first 200 characters of each text are shown on hover.
+            class_names (list[str], optional): Names of the classes, indexed by cluster label.
+        """
+
+        # Reduce dimensionality with UMAP
+        reducer = umap.UMAP(n_components=2, random_state = self.random_state, n_neighbors=30, metric="cosine", min_dist=0)
+        embeddings_2d = reducer.fit_transform(embeddings)
+
+        def displayed_class(label):
+            # Value stored in the "class" column for a given cluster label.
+            # NOTE(review): an outlier label (-1) selects class_names[-1], i.e. the
+            # last entry; confirm that callers put the outlier name there.
+            return label if class_names is None else class_names[label]
+
+        df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
+        df['text'] = [text[:200] for text in texts]
+        df["class"] = [displayed_class(label) for label in labels] if class_names is not None else labels
+
+        # Create a color palette, then map the labels to the colors.
+        # Exclude the outlier (-1) label from color palette assignment
+        # plt.get_cmap is used because plt.cm.get_cmap was removed in Matplotlib 3.9.
+        unique_labels = [label for label in np.unique(labels) if label != -1]
+        palette = plt.get_cmap("tab20", len(unique_labels))
+
+        # Create color map. It is keyed by the values that actually appear in the
+        # "class" column (class names when given, otherwise the numeric labels),
+        # so that Plotly can match the entries.
+        color_discrete_map = {
+            displayed_class(label): 'rgb'+str(tuple(int(val*255) for val in palette(i)[:3]))
+            for i, label in enumerate(unique_labels)
+        }
+        if class_names is None:
+            color_discrete_map[-1] = 'grey'
+        elif -1 in np.unique(labels):
+            color_discrete_map[displayed_class(-1)] = 'grey'
+
+        # plot data points where the color represents the class
+        fig = px.scatter(df, x='x', y='y', hover_data=['text', 'class'], color='class', color_discrete_map=color_discrete_map)
+
+        fig.update_traces(mode='markers', marker=dict(size=3))  # Optional: Increase the marker size
+
+        # make plot quadratic
+        fig.update_layout(
+            autosize=False,
+            width=1500,
+            height=1500,
+            margin=dict(
+                l=50,
+                r=50,
+                b=100,
+                t=100,
+                pad=4
+            )
+        )
+        # set title
+        fig.update_layout(title_text='UMAP projection of the document embeddings', title_x=0.5)
+
+        # show plot
+        fig.show()
+
+
+    def umap_diagnostics(self, embeddings, hammer_edges = False):
+        """
+        Fit a two-dimensional UMAP on the provided embeddings and show diagnostic plots.
+
+        The stored UMAP hyperparameters are reused, except that the number of
+        components is set to 2. The plots are shown one after another: the
+        connectivity plot with points, optionally the connectivity plot with
+        hammer edge bundling, the PCA diagnostic and the local dimension diagnostic.
+
+        Params:
+        ------
+        embeddings : array-like
+            The high-dimensional data for UMAP to reduce and visualize.
+        hammer_edges : bool, default False
+            Whether to also draw the edge-bundled connectivity plot. Is computationally expensive.
+
+        """
+        # deep copy so the instance's own hyperparameters stay untouched
+        diagnostic_hyperparams = deepcopy(self.UMAP_hyperparams)
+        diagnostic_hyperparams["n_components"] = 2
+        mapper = umap.UMAP(**diagnostic_hyperparams).fit(embeddings)
+
+        # 1. Connectivity plot with points
+        print("UMAP Connectivity Plot with Points")
+        umap.plot.connectivity(mapper, show_points=True)
+        plt.show()
+
+        # 2. Connectivity plot with edge bundling (optional)
+        if hammer_edges:
+            print("UMAP Connectivity Plot with Hammer Edge Bundling")
+            umap.plot.connectivity(mapper, edge_bundling='hammer')
+            plt.show()
+
+        # 3. PCA diagnostic plot, 4. local dimension diagnostic plot
+        diagnostic_plots = (
+            ("UMAP PCA Diagnostic Plot", 'pca'),
+            ("UMAP Local Dimension Diagnostic Plot", 'local_dim'),
+        )
+        for heading, diagnostic_type in diagnostic_plots:
+            print(heading)
+            umap.plot.diagnostic(mapper, diagnostic_type=diagnostic_type)
+            plt.show()
@@ -0,0 +1,429 @@
+import nltk
+import string
+import collections
+from tqdm import tqdm
+from typing import List
+import numpy as np
+import re  
+from nltk.tokenize import word_tokenize
+import umap
+from collections import Counter
+import warnings
+
+from typing import List
+
+# make sure the import works even if the package has not been installed and just the files are used
+try:
+    from topicgpt.GetEmbeddingsOpenAI import GetEmbeddingsOpenAI
+except:
+    from GetEmbeddingsOpenAI import GetEmbeddingsOpenAI
+
+nltk.download('stopwords', quiet=True)  # download stopwords
+nltk.download('punkt', quiet=True) # download tokenizer
+
+class ExtractTopWords:
+    
+    def extract_centroids(self, embeddings: np.ndarray, labels: np.ndarray) -> dict:
+        """
+        Compute the mean embedding of every non-outlier cluster.
+
+        Args:
+            embeddings (np.ndarray): Embeddings, one row per document.
+            labels (np.ndarray): Cluster labels. -1 means outlier.
+
+        Returns:
+            dict: Dictionary mapping each cluster label (except -1) to its centroid.
+        """
+
+        # np.unique returns the labels in ascending order; outliers get no centroid
+        return {
+            cluster: np.mean(embeddings[labels == cluster], axis = 0)
+            for cluster in np.unique(labels)
+            if cluster != -1
+        }
+    
+    def extract_centroid(self, embeddings: np.ndarray) -> np.ndarray:
+        """
+        Compute the centroid of a single cluster as the mean over its embeddings.
+
+        Args:
+            embeddings (np.ndarray): Embeddings of the cluster, one row per document.
+
+        Returns:
+            np.ndarray: The centroid of the cluster.
+        """
+
+        # average over the first axis
+        centroid = np.mean(embeddings, axis = 0)
+        return centroid
+    
+    def compute_centroid_similarity(self, embeddings: np.ndarray, centroid_dict: dict, cluster_label: int) -> np.ndarray:
+        """
+        Compute the similarity of the document embeddings to the centroid of the cluster via cosine similarity.
+
+        Args:
+            embeddings (np.ndarray): Document embeddings, one row per document.
+            centroid_dict (dict): Dictionary of cluster labels and their centroids.
+            cluster_label (int): Cluster label for which to compute the similarity.
+
+        Returns:
+            np.ndarray: Cosine similarity of each document embedding to the centroid of the cluster.
+
+        Raises:
+            KeyError: If cluster_label is not a key of centroid_dict.
+        """
+
+        centroid = centroid_dict[cluster_label]
+        # Each document must be divided by its OWN norm. Without `axis`,
+        # np.linalg.norm returns one norm for the whole matrix, which is
+        # not cosine similarity and lets long vectors dominate the ranking.
+        embedding_norms = np.linalg.norm(embeddings, axis=-1)
+        similarity = np.dot(embeddings, centroid) / (embedding_norms * np.linalg.norm(centroid))
+        return similarity
+    
+    def get_most_similar_docs(self, corpus: list[str], embeddings: np.ndarray, labels: np.ndarray, centroid_dict: dict, cluster_label: int, top_n: int = 10) -> List[str]:
+        """
+        Get the most similar documents to the centroid of a cluster.
+
+        All documents of the corpus are ranked, not only those assigned to the cluster.
+
+        Args:
+            corpus (list[str]): List of documents.
+            embeddings (np.ndarray): Document embeddings, aligned with corpus.
+            labels (np.ndarray): Cluster labels. -1 means outlier. Currently not used by this method.
+            centroid_dict (dict): Dictionary of cluster labels and their centroids.
+            cluster_label (int): Cluster label for which to compute the similarity.
+            top_n (int, optional): Number of top documents to extract. Values of 0 or less yield an empty list.
+
+        Returns:
+            List[str]: The documents most similar to the centroid, most similar first.
+        """
+
+        # Guard first: slicing with [-0:] would select every document.
+        if top_n <= 0:
+            return []
+
+        similarity = self.compute_centroid_similarity(embeddings, centroid_dict, cluster_label)
+        best_first = np.argsort(similarity)[::-1][:top_n]
+        return [corpus[doc_idx] for doc_idx in best_first]
+    
+    def compute_corpus_vocab(self,
+                        corpus: list[str],
+                        remove_stopwords: bool = True,
+                        remove_punction: bool = True,
+                        min_word_length: int = 3,
+                        max_word_length: int = 20,
+                        remove_short_words: bool = True,
+                        remove_numbers: bool = True,
+                        verbose: bool = True,
+                        min_doc_frequency: int = 3,
+                        min_freq: float = 0.1,
+                        max_freq: float = 0.9) -> list[str]:
+        """
+        Compute the vocabulary of the corpus and perform preprocessing of the corpus.
+
+        Tokens are produced by the NLTK word tokenizer and counted in lower case.
+        A token is only counted if it contains at least one ASCII letter and its
+        first character is alphabetic; the optional filters below are applied in
+        addition. The English NLTK stopword list is used.
+
+        Args:
+            corpus (list[str]): List of documents.
+            remove_stopwords (bool, optional): Whether to remove stopwords.
+            remove_punction (bool, optional): Whether to remove punctuation.
+            min_word_length (int, optional): Minimum word length to retain.
+            max_word_length (int, optional): Maximum word length to retain.
+            remove_short_words (bool, optional): Whether to remove short words.
+            remove_numbers (bool, optional): Whether to remove tokens that contain a digit.
+            verbose (bool, optional): Whether to print progress and describe what is happening.
+            min_doc_frequency (int, optional): Minimum number of documents a word should appear in to be considered in the vocabulary.
+            min_freq (float, optional): Minimum frequency percentile of words to be considered in the vocabulary.
+            max_freq (float, optional): Maximum frequency percentile of words to be considered in the vocabulary.
+
+        Returns:
+            list[str]: List of words in the corpus sorted alphabetically. Empty if no token survives the filtering.
+        """
+
+        stopwords = set(nltk.corpus.stopwords.words('english'))
+
+        # compiled once instead of being looked up for every token
+        digit_pattern = re.compile(r'\d')
+        letter_pattern = re.compile('[a-zA-Z]')
+
+        word_counter = collections.Counter()
+        doc_frequency = collections.defaultdict(set)
+
+        for doc_id, doc in enumerate(tqdm(corpus, disable=not verbose, desc="Processing corpus")):
+            for word in nltk.word_tokenize(doc):
+                word_lower = word.lower()
+                if remove_punction and word in string.punctuation:
+                    continue
+                if remove_stopwords and word_lower in stopwords:
+                    continue
+                if remove_numbers and digit_pattern.search(word):
+                    continue
+                # the word must contain at least one ASCII letter ...
+                if not letter_pattern.search(word):
+                    continue
+                # ... and must begin with an alphabetic character
+                if not word[0].isalpha():
+                    continue
+                if len(word) > max_word_length or (remove_short_words and len(word) < min_word_length):
+                    continue
+
+                word_counter[word_lower] += 1
+                doc_frequency[word_lower].add(doc_id)
+
+        # Nothing survived the filters (e.g. empty corpus): the quantiles below
+        # are undefined for an empty array, so return an empty vocabulary.
+        if not word_counter:
+            return []
+
+        total_words = sum(word_counter.values())
+        freq_counter = {word: count / total_words for word, count in word_counter.items()}
+
+        # print most common words and their frequencies
+        if verbose:
+            print("Most common words in the vocabulary:")
+            for word, count in word_counter.most_common(10):
+                print(f"{word}: {count}")
+
+        freq_arr = np.array(list(freq_counter.values()))
+
+        min_freq_value = np.quantile(freq_arr, min_freq, method="lower")
+        max_freq_value = np.quantile(freq_arr, max_freq, method="higher")
+
+        # keep words inside the frequency band that occur in enough documents,
+        # sorted alphabetically
+        vocab = sorted(
+            word for word, freq in freq_counter.items()
+            if min_freq_value <= freq <= max_freq_value
+            and len(doc_frequency[word]) >= min_doc_frequency
+        )
+
+        return vocab
+
+    def compute_words_topics(self, corpus: list[str], vocab: list[str], labels: np.ndarray) -> dict:
+        """
+        Collect, for every non-outlier topic, the vocabulary words occurring in its documents.
+
+        Words are lower-cased before the vocabulary lookup and are kept with
+        repetitions, in order of occurrence.
+
+        Args:
+            corpus (list[str]): List of documents.
+            vocab (list[str]): List of words in the corpus sorted alphabetically.
+            labels (np.ndarray): Cluster labels. -1 means outlier.
+
+        Returns:
+            dict: Dictionary mapping each topic label (except -1) to the list of its words.
+        """
+
+        # Download NLTK resources (only required once)
+        nltk.download("punkt")
+        vocab_lookup = set(vocab)
+
+        words_per_topic = {}
+        for topic in np.unique(labels):
+            if topic != -1:
+                words_per_topic[topic] = []
+
+        for doc, label in tqdm(zip(corpus, labels), desc="Computing words per topic", total=len(corpus)):
+            # documents marked as outliers do not contribute to any topic
+            if label == -1:
+                continue
+            lowered_tokens = (token.lower() for token in word_tokenize(doc))
+            words_per_topic[label].extend(token for token in lowered_tokens if token in vocab_lookup)
+
+        return words_per_topic
+                    
+    def embed_vocab_openAI(self, client, vocab: list[str], embedder: GetEmbeddingsOpenAI = None) -> dict[str, np.ndarray]:
+        """
+        Embed the vocabulary using the OpenAI embedding API.
+
+        Args:
+            client: Client. Only used to construct a default embedder when none is passed.
+            vocab (list[str]): List of words in the corpus. Duplicates are removed and the words are sorted before embedding.
+            embedder (GetEmbeddingsOpenAI, optional): Embedding object. Its ``get_embeddings`` method must return a mapping whose ``"embeddings"`` entry is aligned with the list of words it was given.
+
+        Returns:
+            dict[str, np.ndarray]: Dictionary of words and their embeddings.
+        """
+
+        vocab = sorted(set(vocab))
+        if embedder is None:
+            # GetEmbeddingsOpenAI is imported as the class itself, so it is
+            # instantiated directly (it has no attribute of the same name).
+            embedder = GetEmbeddingsOpenAI(client)
+        result = embedder.get_embeddings(vocab)
+
+        return dict(zip(vocab, result["embeddings"]))
+    
+    def compute_bow_representation(self, document: str, vocab: list[str], vocab_set: set[str] = None) -> np.ndarray:
+        """
+        Compute the bag-of-words representation of a document.
+
+        Tokens are lower-cased before they are matched against the vocabulary.
+
+        Args:
+            document (str): Document to compute the bag-of-words representation of.
+            vocab (list[str]): List of words in the corpus sorted alphabetically.
+            vocab_set (set[str], optional): The vocabulary as a set, used for the membership test. Built from vocab if None.
+
+        Returns:
+            np.ndarray: Bag-of-words representation of the document; entry i counts the occurrences of vocab[i].
+
+        Raises:
+            ValueError: If a token is contained in vocab_set but not in vocab.
+        """
+
+        bow = np.zeros(len(vocab))
+        if vocab_set is None:
+            vocab_set = set(vocab)
+
+        # Position of the first occurrence of every word (what list.index would
+        # return), looked up in O(1) instead of scanning vocab for every token.
+        first_position = {}
+        for position, vocab_word in enumerate(vocab):
+            first_position.setdefault(vocab_word, position)
+
+        for token in word_tokenize(document):
+            token = token.lower()
+            if token in vocab_set:
+                if token not in first_position:
+                    # same exception type that vocab.index(token) raises
+                    raise ValueError(f"{token!r} is not in list")
+                bow[first_position[token]] += 1
+        return bow
+    
+    def compute_word_topic_mat_old(self, corpus: list[str], vocab: list[str], labels: np.ndarray, consider_outliers: bool = False) -> np.ndarray:
+        """
+        Compute the word-topic matrix document by document.
+
+        Column j holds the summed bag-of-words vectors of the documents with
+        label j, so labels are expected to be consecutive integers starting at 0.
+        Documents labelled -1 are never accumulated.
+
+        Args:
+            corpus (list[str]): List of documents.
+            vocab (list[str]): List of words in the corpus sorted alphabetically.
+            labels (np.ndarray): Cluster labels. -1 means outlier.
+            consider_outliers (bool, optional): If True, the matrix has one column per distinct label including -1; otherwise one column per non-outlier label.
+
+        Returns:
+            np.ndarray: Word-topic matrix of shape (len(vocab), number of columns).
+        """
+
+        unique_labels = np.unique(labels)
+        if consider_outliers:
+            n_columns = len(unique_labels)
+        else:
+            # count only real topics; subtracting a fixed 1 would be wrong when
+            # the labels contain no outliers
+            n_columns = int(np.sum(unique_labels != -1))
+
+        # the shape must be passed as ONE tuple; a second positional argument
+        # of np.zeros is the dtype
+        word_topic_mat = np.zeros((len(vocab), n_columns))
+
+        vocab_set = set(vocab)
+        for i, doc in tqdm(enumerate(corpus), desc="Computing word-topic matrix", total=len(corpus)):
+            if labels[i] > - 0.5:
+                bow = self.compute_bow_representation(doc, vocab, vocab_set)
+                idx_to_add = labels[i]
+                word_topic_mat[:, idx_to_add] += bow
+
+        return word_topic_mat
+    
+    def compute_word_topic_mat(self, corpus: list[str], vocab: list[str], labels: np.ndarray, consider_outliers=False) -> np.ndarray:
+        """
+        Compute the word-topic matrix efficiently.
+
+        All documents of a label are joined and tokenized once. Column i belongs
+        to the i-th smallest distinct label, so if outliers (-1) are present
+        they occupy column 0. Matching is case-insensitive.
+
+        Args:
+            corpus (list[str]): List of documents.
+            vocab (list[str]): List of words in the corpus, sorted alphabetically.
+            labels (np.ndarray): Cluster labels. -1 indicates outliers.
+            consider_outliers (bool, optional): Accepted for compatibility; it does not change the result, the matrix always has one column per distinct label. Defaults to False.
+
+        Returns:
+            np.ndarray: Word-topic matrix of shape (len(vocab), number of distinct labels).
+        """
+
+        corpus_arr = np.array(corpus)
+        labels = np.asarray(labels)
+        unique_labels = np.unique(labels)
+
+        word_topic_mat = np.zeros((len(vocab), len(unique_labels)))
+
+        # Tokens are counted in lower case, and vocabulary words are looked up in
+        # lower case too. Without this, capitalized occurrences (e.g. at the
+        # start of a sentence) would not be counted.
+        vocab_lower = [word.lower() for word in vocab]
+
+        for i, label in tqdm(enumerate(unique_labels), desc="Computing word-topic matrix", total=len(unique_labels)):
+            topic_doc_string = " ".join(corpus_arr[labels == label])
+            topic_doc_counter = Counter(token.lower() for token in word_tokenize(topic_doc_string))
+
+            word_topic_mat[:, i] = np.array([topic_doc_counter.get(word, 0) for word in vocab_lower])
+
+        return word_topic_mat
+
+    def extract_topwords_tfidf(self, word_topic_mat: np.ndarray, vocab: list[str], labels: np.ndarray, top_n_words: int = 10) -> tuple[dict, dict]:
+        """
+        Extract the top words for each topic using a class-based tf-idf score.
+
+        Args:
+            word_topic_mat (np.ndarray): Word-topic matrix with one column per distinct label in ascending label order. If labels contain -1, the first column is treated as the outlier column and ignored.
+            vocab (list[str]): List of words in the corpus sorted alphabetically.
+            labels (np.ndarray): Cluster labels. -1 means outlier.
+            top_n_words (int, optional): Number of top words to extract per topic.
+
+        Returns:
+            tuple: A tuple containing two items:
+                - top_words (dict): Dictionary of topics and their top words, best first.
+                - top_word_scores (dict): Dictionary of topics and the tf-idf scores of those words, in the same order.
+        """
+
+        unique_labels = np.unique(labels)
+
+        # drop the outlier column, which comes first when outliers are present
+        if min(labels) == -1:
+            word_topic_mat = word_topic_mat[:, 1:]
+
+        with warnings.catch_warnings():
+            # divisions by zero are expected here and handled below
+            warnings.filterwarnings("ignore", category=RuntimeWarning)
+            tf = word_topic_mat / np.sum(word_topic_mat, axis=0)
+            idf = np.log(1 + (word_topic_mat.shape[1] / np.sum(word_topic_mat > 0, axis=1)))
+
+            tfidf = tf * idf[:, np.newaxis]
+
+            # NaN appears when a topic has no words (tf is NaN) and when a word
+            # occurs in no topic (tf == 0 times idf == inf). Both mean "no
+            # evidence", so the score is set to zero.
+            tfidf[np.isnan(tfidf)] = 0
+
+        # extract top words for each topic; columns are addressed by the position
+        # of the topic among the non-outlier labels
+        top_words = {}
+        top_word_scores = {}
+        topics = unique_labels[unique_labels != -1]
+        for column, topic in enumerate(topics):
+            indices = np.argsort(-tfidf[:, column])[:top_n_words]
+            top_words[topic] = [vocab[word_idx] for word_idx in indices]
+            top_word_scores[topic] = [tfidf[word_idx, column] for word_idx in indices]
+
+        return top_words, top_word_scores
+    
+    def compute_embedding_similarity_centroids(self, vocab: list[str], vocab_embedding_dict: dict, umap_mapper: umap.UMAP, centroid_dict: dict, reduce_vocab_embeddings: bool = False, reduce_centroid_embeddings: bool = False) -> np.ndarray:
+        """
+        Compute the cosine similarity of each word in the vocabulary to each centroid.
+
+        After the optional reductions, word vectors and centroids must have the same dimension.
+
+        Args:
+            vocab (list[str]): List of words in the corpus sorted alphabetically.
+            vocab_embedding_dict (dict): Dictionary of words and their embeddings.
+            umap_mapper (umap.UMAP): UMAP mapper to transform new embeddings in the same way as the document embeddings.
+            centroid_dict (dict): Dictionary of cluster labels and their centroids.
+            reduce_vocab_embeddings (bool, optional): Whether to reduce the vocab embeddings with the UMAP mapper.
+            reduce_centroid_embeddings (bool, optional): Whether to reduce the centroid embeddings with the UMAP mapper.
+
+        Returns:
+            np.ndarray: Cosine similarity of each word in the vocab to each centroid. Has shape (len(vocab), len(centroid_dict)); columns follow the iteration order of centroid_dict.
+        """
+
+        centroids = list(centroid_dict.values())
+        # Size the buffer from the centroids themselves. They only have
+        # umap_mapper.n_components entries if they are already reduced, which
+        # is not the case when reduce_centroid_embeddings is True.
+        centroid_dim = np.shape(centroids[0])[0] if centroids else umap_mapper.n_components
+        centroid_arr = np.zeros((len(centroids), centroid_dim))
+        for i, centroid in enumerate(centroids):
+            centroid_arr[i] = centroid
+        if reduce_centroid_embeddings:
+            centroid_arr = umap_mapper.transform(centroid_arr)
+
+        # scale rows to unit length so that the dot product is the cosine
+        centroid_arr = centroid_arr / np.linalg.norm(centroid_arr, axis=1).reshape(-1,1)
+
+        org_embedding_dim = list(vocab_embedding_dict.values())[0].shape[0]
+        vocab_arr = np.zeros((len(vocab), org_embedding_dim))
+        for i, word in enumerate(vocab):
+            vocab_arr[i] = vocab_embedding_dict[word]
+        if reduce_vocab_embeddings:
+            vocab_arr = umap_mapper.transform(vocab_arr)
+
+        vocab_arr = vocab_arr / np.linalg.norm(vocab_arr, axis=1).reshape(-1,1)
+
+        similarity = vocab_arr @ centroid_arr.T # cosine similarity
+        return similarity
+    
+    def extract_topwords_centroid_similarity(self, word_topic_mat: np.ndarray, vocab: list[str], vocab_embedding_dict: dict, centroid_dict: dict, umap_mapper: umap.UMAP, top_n_words: int = 10, reduce_vocab_embeddings: bool = True, reduce_centroid_embeddings: bool = False, consider_outliers: bool = False) -> tuple[dict, dict]:
+        """
+        Extract the top words for each cluster by computing the cosine similarity of the words that occur in the corpus to the centroid of the cluster.
+
+        Words are ranked by their similarity to the centroid multiplied by their count in the topic.
+
+        Args:
+            word_topic_mat (np.ndarray): Word-topic matrix with columns in ascending label order. If it has more columns than centroid_dict has entries, its first column is taken to be the outlier column and dropped.
+            vocab (list[str]): List of words in the corpus sorted alphabetically.
+            vocab_embedding_dict (dict): Dictionary of words and their embeddings.
+            centroid_dict (dict): Dictionary of cluster labels and their centroids. -1 means outlier.
+            umap_mapper (umap.UMAP): UMAP mapper to transform new embeddings in the same way as the document embeddings.
+            top_n_words (int, optional): Number of top words to extract per topic.
+            reduce_vocab_embeddings (bool, optional): Whether to reduce the vocab embeddings with the UMAP mapper.
+            reduce_centroid_embeddings (bool, optional): Whether to reduce the centroid embeddings with the UMAP mapper.
+            consider_outliers (bool, optional): Currently not used by this method.
+
+        Returns:
+            tuple: A tuple containing two items:
+                - top_words (dict): Dictionary of topics and their top words, best first.
+                - top_word_scores (dict): Dictionary of topics and the cosine similarity to the centroid of exactly those words, in the same order.
+        """
+
+        similarity_mat = self.compute_embedding_similarity_centroids(vocab, vocab_embedding_dict, umap_mapper, centroid_dict, reduce_vocab_embeddings, reduce_centroid_embeddings)
+        top_words = {}
+        top_word_scores = {}
+
+        topic_keys = list(centroid_dict.keys())
+        sorted_topics = np.unique(topic_keys)
+
+        if word_topic_mat.shape[1] > len(sorted_topics):
+            word_topic_mat = word_topic_mat[:, 1:] #ignore outliers
+
+        # similarity_mat columns follow the iteration order of centroid_dict,
+        # word_topic_mat columns follow the ascending label order. Both are
+        # addressed by position so non-consecutive labels work as well.
+        similarity_column = {topic: position for position, topic in enumerate(topic_keys)}
+
+        for count_column, topic in enumerate(sorted_topics):
+            if topic == -1:
+                continue
+            topic_similarity = similarity_mat[:, similarity_column[topic]]
+            weighted_similarity = topic_similarity * word_topic_mat[:, count_column]
+            best_indices = np.argsort(-weighted_similarity)[:top_n_words]
+            top_words[topic] = [vocab[word_idx] for word_idx in best_indices]
+            # report the scores of the SAME words that were selected above
+            top_word_scores[topic] = [topic_similarity[word_idx] for word_idx in best_indices]
+
+        return top_words, top_word_scores
@@ -0,0 +1,217 @@
+from openai import OpenAI
+
+import tiktoken
+from tqdm import tqdm
+import numpy as np
+
+class GetEmbeddingsOpenAI:
+    """
+    This class allows to compute embeddings of text using the OpenAI API.
+    """
+
+    def __init__(self, client, azure_config: dict = {}, embedding_model: str = "text-embedding-ada-002", tokenizer: str = None, max_tokens: int = 8191) -> None:
+        """
+        Constructor of the class.
+
+        Args:
+            client: Client.
+            embedding_model (str, optional): Name of the embedding model to use.
+            tokenizer (str, optional): Name of the tokenizer to use.
+            max_tokens (int, optional): Maximum number of tokens to use.
+
+        Note:
+            By default, the embedding model "text-embedding-ada-002" is used with the corresponding tokenizer "cl100k_base" and a maximum number of tokens of 8191.
+        """
+
+        self.client = client
+        self.embedding_model = embedding_model
+        self.tokenizer_str = tokenizer
+        self.max_tokens = max_tokens
+
+    @staticmethod
+    def num_tokens_from_string(string: str, encoding) -> int:
+        """
+        Returns the number of tokens in a text string.
+
+        Args:
+            string (str): Text string to compute the number of tokens.
+            encoding: A function to encode the string into tokens.
+
+        Returns:
+            int: Number of tokens in the text string.
+        """
+        num_tokens = len(encoding.encode(string))
+        return num_tokens
+
+    def compute_number_of_tokens(self, corpus: list[str]) -> int:
+        """
+        Computes the total number of tokens needed to embed the corpus.
+
+        Args:
+            corpus (list[str]): List of strings to embed, where each element in the list is a document.
+
+        Returns:
+            int: Total number of tokens needed to embed the corpus.
+        """
+
+
+        if self.tokenizer_str is None:
+             tokenizer = tiktoken.encoding_for_model(self.embedding_model)
+
+        else: 
+             tokenizer = tiktoken.get_encoding(self.tokenizer_str)
+
+        num_tokens = 0
+        for document in tqdm(corpus):
+            num_tokens += self.num_tokens_from_string(document, tokenizer)
+
+        return num_tokens
+
+    def split_doc(self, text):
+        """
+        Splits a single document that is longer than the maximum number of tokens into a list of smaller documents.
+
+        Args:
+            self: The instance of the class.
+            text (str): The string to be split.
+
+        Returns:
+            List[str]: A list of strings to embed, where each element in the list is a list of chunks comprising the document.
+        """
+
+        split_text = []
+        split_text.append(text[:self.max_tokens])
+        for i in range(1, len(text) // self.max_tokens):
+            split_text.append(text[i * self.max_tokens:(i + 1) * self.max_tokens])
+        split_text.append(text[(len(text) // self.max_tokens) * self.max_tokens:])
+        return split_text
+
+    def split_long_docs(self, text: list[str]) -> list[list[str]]:
+        """
+        Splits all documents that are longer than the maximum number of tokens into a list of smaller documents.
+
+        Args:
+            self: The instance of the class.
+            text (list[str]): List of strings to embed, where each element in the list is a document.
+
+        Returns:
+            List[list[str]]: A list of lists of strings to embed, where each element in the outer list is a list of chunks comprising the document.
+        """
+
+        if self.tokenizer_str is None:
+            tokenizer = tiktoken.encoding_for_model(self.embedding_model)
+        else:
+            tokenizer = tiktoken.get_encoding(self.tokenizer_str)
+
+
+        split_text = []
+        for document in tqdm(text):
+            if self.num_tokens_from_string(document, tokenizer) > self.max_tokens:
+                split_text.append(self.split_doc(document))
+            else:
+                split_text.append([document])
+        return split_text   
+
+    def make_api_call(self, text: str):
+        """
+        Makes an API call to the OpenAI API to embed a text string.
+
+        Args:
+            self: The instance of the class.
+            text (str): The string to embed.
+
+        Returns:
+            API response: The response from the API.
+        """
+        response = self.client.embeddings.create(input = [text], model = self.embedding_model)
+        return response
+
+
+
+    def get_embeddings_doc_split(self, corpus: list[list[str]], n_tries=3) -> list[dict]:
+        """
+        Computes the embeddings of a corpus for split documents.
+
+        Args:
+            self: The instance of the class.
+            corpus (list[list[str]]): List of strings to embed, where each element is a document represented by a list of its chunks.
+            n_tries (int, optional): Number of tries to make an API call (default is 3).
+
+        Returns:
+            List[dict]: A list of dictionaries, where each dictionary contains the embedding of the document, the text of the document, and a list of errors that occurred during the embedding process.
+        """
+
+        api_res_list = [] 
+        for i in tqdm(range(len(corpus))):
+            chunk_lis = corpus[i]
+            api_res_doc = []
+            for chunk_n, chunk in enumerate(chunk_lis):
+
+                for i in range(n_tries + 1):
+                    try: 
+                        api_res_doc.append(
+                            {"api_res": self.make_api_call(chunk), 
+                            "error": None }
+                         )
+                        break
+                    except Exception as e:
+                            print(f"Error {e} occured for chunk {chunk_n} of document {i}")
+                            print(chunk)
+                            print("Trying again.")
+                            if i == n_tries: 
+                                print("Maximum number of tries reached. Skipping chunk.")
+                                api_res_doc.append(
+                                    {"api_res": None, 
+                                    "error": e })
+
+
+            # average the embeddings of the chunks
+            emb_lis = []
+            for api_res in api_res_doc:
+                if api_res["api_res"] is not None:
+                    emb_lis.append(np.array(api_res["api_res"].data[0].embedding))
+            text = " ".join(chunk_lis)
+            embedding = np.mean(emb_lis, axis = 0)
+            api_res_list.append(
+                {"embedding": embedding, 
+                "text": text, 
+                "errors": [api_res["error"] for api_res in api_res_doc]}
+                )
+        return api_res_list
+
+    def convert_api_res_list(self, api_res_list: list[dict]) -> dict:
+        """
+        Converts the api_res list into a dictionary containing the embeddings as a matrix and the corpus as a list of strings.
+
+        Args:
+            self: The instance of the class.
+            api_res_list (list[dict]): List of dictionaries, where each dictionary contains the embedding of the document, the text of the document, and a list of errors that occurred during the embedding process.
+
+        Returns:
+            dict: A dictionary containing the embeddings as a matrix and the corpus as a list of strings.
+        """
+
+
+        embeddings = np.array([api_res["embedding"] for api_res in api_res_list])
+        corpus = [api_res["text"] for api_res in api_res_list]
+        errors = [api_res["errors"] for api_res in api_res_list]
+        return {"embeddings": embeddings, "corpus": corpus, "errors": errors}
+
+
+    def get_embeddings(self, corpus: list[str]) -> dict:
+        """
+        Computes the embeddings of a corpus.
+
+        Args:
+            self: The instance of the class.
+            corpus (list[str]): List of strings to embed, where each element in the list is a document.
+
+        Returns:
+            dict: A dictionary containing the embeddings as a matrix and the corpus as a list of strings.
+        """
+
+        corpus_split = self.split_long_docs(corpus)
+        corpus_emb = self.get_embeddings_doc_split(corpus_split)
+        self.corpus_emb = corpus_emb
+        res = self.convert_api_res_list(corpus_emb)
+        return res
@@ -0,0 +1,137 @@
+from topicgpt.TopicRepresentation import Topic
+
+import unittest
+from sklearn.datasets import fetch_20newsgroups 
+
+from topicgpt.TopicGPT import TopicGPT
+
+    
+import sys
+
+
+class QuickestTopicGPT_prompting(unittest.TestCase):
+    """
+    This class is used to mainly test the prompting functionality of the TopicGPT class.
+    """
+
+
+    @classmethod
+    def setUpClass(cls, sample_size:int = 500):
+        """
+        download the necessary data and only keep a sample of it 
+        params: 
+            client: Client.
+            sample_size: the number of documents to use for the test
+        """
+
+        data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) #download the 20 Newsgroups dataset
+        corpus = data['data']# just select the first 1000 documents for this example
+        corpus = [doc for doc in corpus if doc != ""]
+        corpus = corpus[:sample_size]
+
+        cls.corpus = corpus
+
+        cls.tm = TopicGPT(client = client, n_topics = 1)
+        cls.tm.fit(cls.corpus)
+
+    def test_repr_topics(self):
+        """
+        test the repr_topics function of the TopicGPT class
+        """
+        print("Testing repr_topics...")
+        self.assertTrue(type(self.tm.repr_topics()) == str)
+
+    def test_promt_knn_search(self):
+        """
+        test the ppromt function that calls knn_search of the TopicPrompting class
+        """
+        print("Testing ppromt_knn_search...")
+        
+        prompt_lis = ["Is topic 0 about Bananas? Use knn Search",
+                      "Is topic 0 about Space? Use knn Search"]
+        
+        for prompt in prompt_lis:
+
+            answer, function_result = self.tm.prompt(prompt)
+
+            print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'")
+
+            self.assertTrue(type(answer) == str)
+            self.assertTrue(type(function_result[0]) == list)
+            self.assertTrue(type(function_result[1]) == list)
+            self.assertTrue(type(function_result[0][0]) == str)
+            self.assertTrue(type(function_result[1][0]) == int)
+
+
+    def test_prompt_split_topic_kmeans_inplace(self):
+        """
+        test the ppromt function that calls split_topic_kmeans of the TopicPrompting class
+        """
+
+        print("Testing ppromt_split_topic_kmeans...")
+
+        prompt_lis = ["Split topic 0 into 2 subtopics using kmeans. Do this inplace"]
+        added_topic_lis_len  = [2]
+
+        old_number_of_topics = len(self.tm.topic_lis)
+
+        for prompt, added_topic_len in zip(prompt_lis, added_topic_lis_len):
+                
+                answer, function_result = self.tm.prompt(prompt)
+    
+                print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'")
+                print("function_result: ", function_result)
+    
+                self.assertTrue(type(answer) == str)
+                self.assertTrue(type(function_result) == list)
+                self.assertTrue(type(function_result[0]) == Topic)
+
+                self.assertTrue(len(self.tm.topic_lis) == old_number_of_topics + added_topic_len -1 )
+                self.assertTrue(self.tm.topic_lis == function_result)
+
+   
+    def test_prompt_combine_topics_inplace(self):
+        """
+        test the prompt function that calls combine_topics of the TopicPrompting class
+        """
+
+        print("Testing ppromt_combine_topics...")
+
+        prompt_lis = ["Combine topic 0 and topic 1 into one topic. Do this inplace"]
+
+        # split topic first
+        self.tm.prompt("Please split topic 0 into two subtopic. Do this inplace.")
+
+        old_number_topics = len(self.tm.topic_lis)
+
+
+
+        for prompt in prompt_lis:
+                
+                answer, function_result = self.tm.prompt(prompt)
+    
+                print(f"Answer to the prompt '{prompt}' \n is \n '{answer}'")
+                print("function_result: ", function_result)
+                print("topic_gpt_topic_list: ", self.tm.topic_lis)
+    
+                self.assertTrue(type(answer) == str)
+                self.assertTrue(type(function_result) == list)
+                self.assertTrue(type(function_result[0]) == Topic)
+                self.assertTrue(self.tm.topic_lis == function_result)
+                self.assertTrue(len(self.tm.topic_lis) == old_number_topics -1)
+
+
+if __name__ == "__main__":
+    
+    for i, arg in enumerate(sys.argv):
+        if arg == "--api-key":
+            api_key = sys.argv.pop(i + 1)
+            sys.argv.pop(i)
+            break
+
+    if api_key is None:
+        print("API key must be provided with --api-key")
+        sys.exit(1)
+
+    
+    unittest.main()
@@ -0,0 +1,120 @@
+from topicgpt.TopicRepresentation import Topic
+
+import unittest
+from sklearn.datasets import fetch_20newsgroups 
+
+from topicgpt.TopicGPT import TopicGPT
+
+
+class QuickTestTopicGPT_init_and_fit(unittest.TestCase):
+    """
+    Run some basic tests on TopicGPT that do not require any saved data
+    """
+
+
+    @classmethod
+    def setUpClass(cls, sample_size:int = 500):
+        """
+        download the necessary data and only keep a sample of it 
+        params: 
+            api_key: the openai api key
+            sample_size: the number of documents to use for the test
+        """
+
+        data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) #download the 20 Newsgroups dataset
+        corpus = data['data']# just select the first 1000 documents for this example
+        corpus = [doc for doc in corpus if doc != ""]
+        corpus = corpus[:sample_size]
+
+        cls.corpus = corpus
+
+    def setUp(self):
+        self.api_key_openai = api_key
+
+
+    def test_init(self):
+        """
+        test the init function of the TopicGPT class
+        """
+        print("Testing init...")
+        topicgpt = TopicGPT(api_key = self.api_key_openai)
+        self.assertTrue(isinstance(topicgpt, TopicGPT))
+
+        topicgpt = TopicGPT(api_key = self.api_key_openai, 
+                            n_topics= 20)
+        self.assertTrue(isinstance(topicgpt, TopicGPT))
+        
+        topicgpt = TopicGPT(api_key = self.api_key_openai, 
+                            n_topics= 20,
+                            corpus_instruction="This is a corpus instruction")
+        self.assertTrue(isinstance(topicgpt, TopicGPT))
+
+        # check if assertions are triggered
+
+        with self.assertRaises(AssertionError):
+            topicgpt = TopicGPT(api_key = None, 
+                                n_topics= 32,
+                                openai_prompting_model="gpt-4",
+                                max_number_of_tokens=8000,
+                                corpus_instruction="This is a corpus instruction")
+
+        with self.assertRaises(AssertionError):
+            topicgpt = TopicGPT(api_key = self.api_key_openai, 
+                                n_topics= 0,
+                                max_number_of_tokens=8000,
+                                corpus_instruction="This is a corpus instruction")
+            
+        with self.assertRaises(AssertionError):
+            topicgpt = TopicGPT(api_key = self.api_key_openai, 
+                                n_topics= 20,
+                                max_number_of_tokens=0,
+                                corpus_instruction="This is a corpus instruction")
+            
+
+    def test_fit(self):
+        """
+        test the fit function of the TopicGPT class
+        """
+        print("Testing fit...")
+
+        def instance_test(topicgpt):
+            topicgpt.fit(self.corpus)
+
+            self.assertTrue(hasattr(topicgpt, "vocab"))
+            self.assertTrue(hasattr(topicgpt, "topic_lis"))
+
+            self.assertTrue(isinstance(topicgpt.vocab, list))
+            self.assertTrue(isinstance(topicgpt.vocab[0], str))
+
+            self.assertTrue(isinstance(topicgpt.topic_lis, list))
+            self.assertTrue(type(topicgpt.topic_lis[0]) == Topic)
+
+            if topicgpt.n_topics is not None:
+                self.assertTrue(len(topicgpt.topic_lis) == topicgpt.n_topics)
+
+            self.assertTrue(topicgpt.topic_lis == topicgpt.topic_prompting.topic_lis)
+            self.assertTrue(topicgpt.vocab == topicgpt.topic_prompting.vocab)
+            self.assertTrue(topicgpt.vocab_embeddings == topicgpt.topic_prompting.vocab_embeddings)
+
+        
+        topicgpt1 = TopicGPT(api_key = self.api_key_openai, n_topics = 1)
+
+        topic_gpt_list = [topicgpt1]
+
+        for topic_gpt in topic_gpt_list:
+            instance_test(topic_gpt)
+
+
+import sys
+
+if __name__ == "__main__":
+    for i, arg in enumerate(sys.argv):
+        if arg == "--api-key":
+            api_key = sys.argv.pop(i + 1)
+            sys.argv.pop(i)
+            break
+
+    if api_key is None:
+        print("API key must be provided with --api-key")
+        sys.exit(1)
+    unittest.main()
@@ -0,0 +1,378 @@
+import numpy as np
+import os
+import pickle
+# make sure the import works even if the package has not been installed and just the files are used
+from topicgpt.Clustering import Clustering_and_DimRed
+from topicgpt.ExtractTopWords import ExtractTopWords
+from topicgpt.TopwordEnhancement import TopwordEnhancement
+from topicgpt.GetEmbeddingsOpenAI import GetEmbeddingsOpenAI
+from topicgpt.TopicPrompting import TopicPrompting
+from topicgpt.TopicRepresentation import Topic
+from topicgpt.Client import Client
+import topicgpt.TopicRepresentation as TopicRepresentation
+
+
+embeddings_path= "SavedEmbeddings/embeddings.pkl" # module-level default path of the pickled embeddings file; used as the default value of TopicGPT's path_saved_embeddings argument
+
+class TopicGPT:
+    """
+    This is the main class for doing topic modelling with TopicGPT. 
+    """
+
+    def __init__(self,
+             api_key: str = "",
+             azure_endpoint: dict = {},
+             n_topics: int = None,
+             openai_prompting_model: str = "gpt-3.5-turbo-16k",
+             max_number_of_tokens: int = 16384,
+             corpus_instruction: str = "",
+             document_embeddings: np.ndarray = None,
+             vocab_embeddings: dict[str, np.ndarray] = None,
+             embedding_model: str = "text-embedding-ada-002",
+             max_number_of_tokens_embedding: int = 8191,
+             use_saved_embeddings: bool = True,
+             path_saved_embeddings: str = embeddings_path,
+             clusterer: Clustering_and_DimRed = None,
+             n_topwords: int = 2000,
+             n_topwords_description: int = 500,
+             topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"],
+             compute_vocab_hyperparams: dict = {},
+             enhancer: TopwordEnhancement = None,
+             topic_prompting: TopicPrompting = None,
+             verbose: bool = True) -> None:
+        
+        """
+        Initializes the main class for conducting topic modeling with TopicGPT.
+
+        Args:
+            api_key (str): Your OpenAI API key. Obtain this key from https://beta.openai.com/account/api-keys.
+            n_topics (int, optional): Number of topics to discover. If None, the Hdbscan algorithm (https://pypi.org/project/hdbscan/) is used to determine the number of topics automatically. Otherwise, agglomerative clustering is used. Note that with insufficient data, fewer topics may be found than specified.
+            openai_prompting_model (str, optional): Model provided by OpenAI for topic description and prompts. Refer to https://platform.openai.com/docs/models for available models.
+            max_number_of_tokens (int, optional): Maximum number of tokens to use for the OpenAI API.
+            corpus_instruction (str, optional): Additional information about the corpus, if available, to benefit the model.
+            document_embeddings (np.ndarray, optional): Document embeddings for the corpus. If None, they will be computed using the OpenAI API.
+            vocab_embeddings (dict[str, np.ndarray], optional): Vocabulary embeddings for the corpus in a dictionary format where keys are words and values are embeddings. If None, they will be computed using the OpenAI API.
+            embedding_model (str, optional): Name of the embedding model to use. See https://beta.openai.com/docs/api-reference/text-embedding for available models.
+            max_number_of_tokens_embedding (int, optional): Maximum number of tokens to use for the OpenAI API when computing embeddings.
+            use_saved_embeddings (bool, optional): Whether to use saved embeddings. If True, embeddings are loaded from the file 'SavedEmbeddings/embeddings.pkl' or path_saved_embeddings if different. If False, embeddings are computed using the OpenAI API and saved to the file.
+            path_saved_embeddings (str, optional): Path to the saved embeddings file.
+            clusterer (Clustering_and_DimRed, optional): Clustering and dimensionality reduction object. Find the class in the "Clustering/Clustering" folder. If None, a clustering object with default parameters is used. Note that providing document and vocab embeddings and an embedding object at the same time is not sensible; the number of topics specified in the clusterer will overwrite the n_topics argument.
+            n_topwords (int, optional): Number of top words to extract and save for each topic. Note that fewer top words might be used later.
+            n_topwords_description (int, optional): Number of top words to provide to the LLM (Language Model) to describe the topic.
+            topword_extraction_methods (list[str], optional): List of methods for extracting top words. Available methods include "tfidf", "cosine_similarity", and "topword_enhancement". Refer to the file 'ExtractTopWords/ExtractTopWords.py' for more details.
+            compute_vocab_hyperparams (dict, optional): Hyperparameters for computing vocabulary embeddings. Refer to the file 'ExtractTopWords/ExtractTopWords.py' for more details.
+            enhancer (TopwordEnhancement, optional): Topword enhancement object. Used for describing topics. Find the class in the "TopwordEnhancement/TopwordEnhancement.py" folder. If None, a topword enhancement object with default parameters is used. If an openai model is specified here, it will overwrite the openai_prompting_model argument for topic description.
+            topic_prompting (TopicPrompting, optional): Topic prompting object for formulating prompts. Find the class in the "TopicPrompting/TopicPrompting.py" folder. If None, a topic prompting object with default parameters is used. If an openai model is specified here, it will overwrite the openai_prompting_model argument for topic description.
+            verbose (bool, optional): Whether to print detailed information about the process. This can be overridden by arguments in passed objects.
+        """
+        
+
+
+        # Do some checks on the input arguments
+        assert api_key is not None, "You need to provide an OpenAI API key."
+        assert n_topics is None or n_topics > 0, "The number of topics needs to be a positive integer."
+        assert max_number_of_tokens > 0, "The maximum number of tokens needs to be a positive integer."
+        assert max_number_of_tokens_embedding > 0, "The maximum number of tokens for the embedding model needs to be a positive integer."
+        assert n_topwords > 0, "The number of top words needs to be a positive integer."
+        assert n_topwords_description > 0, "The number of top words for the topic description needs to be a positive integer."
+        assert len(topword_extraction_methods) > 0, "You need to provide at least one topword extraction method."
+        assert n_topwords_description <= n_topwords, "The number of top words for the topic description needs to be smaller or equal to the number of top words."
+
+        self.client = Client(api_key = api_key, azure_endpoint = azure_endpoint)
+
+
+        self.n_topics = n_topics
+        self.openai_prompting_model = openai_prompting_model
+        self.max_number_of_tokens = max_number_of_tokens
+        self.corpus_instruction = corpus_instruction
+        self.document_embeddings = document_embeddings
+        self.vocab_embeddings = vocab_embeddings
+        self.embedding_model = embedding_model
+        self.max_number_of_tokens_embedding = max_number_of_tokens_embedding
+        self.embedder = GetEmbeddingsOpenAI(client = self.client, embedding_model = self.embedding_model, max_tokens = self.max_number_of_tokens_embedding)
+        self.clusterer = clusterer
+        self.n_topwords = n_topwords
+        self.n_topwords_description = n_topwords_description
+        self.topword_extraction_methods = topword_extraction_methods
+        self.compute_vocab_hyperparams = compute_vocab_hyperparams
+        self.enhancer = enhancer
+        self.topic_prompting = topic_prompting	
+        self.use_saved_embeddings = use_saved_embeddings
+        self.verbose = verbose
+
+        self.compute_vocab_hyperparams["verbose"] = self.verbose
+        
+        # if embeddings have already been downloaded to the folder SavedEmbeddings, then load them
+        if self.use_saved_embeddings and os.path.exists(path_saved_embeddings):
+            with open(path_saved_embeddings, "rb") as f:
+                self.document_embeddings, self.vocab_embeddings = pickle.load(f)
+
+        for elem in topword_extraction_methods:
+            assert elem in ["tfidf", "cosine_similarity", "topword_enhancement"], "Invalid topword extraction method. Valid methods are 'tfidf', 'cosine_similarity', and 'topword_enhancement'."
+        
+        if clusterer is None:
+            self.clusterer = Clustering_and_DimRed(number_clusters_hdbscan = self.n_topics, verbose = self.verbose)
+        else:
+            self.n_topics = clusterer.number_clusters_hdbscan
+        
+        if enhancer is None:
+            self.enhancer = TopwordEnhancement(client = self.client, openai_model = self.openai_prompting_model, max_context_length = self.max_number_of_tokens, corpus_instruction = self.corpus_instruction)
+
+        if topic_prompting is None:
+            self.topic_prompting = TopicPrompting(topic_lis = [], client = self.client, openai_prompting_model = self.openai_prompting_model,  max_context_length_promting = 16000, enhancer = self.enhancer, openai_embedding_model = self.embedding_model, max_context_length_embedding = self.max_number_of_tokens_embedding, corpus_instruction = corpus_instruction)
+        
+        self.extractor = ExtractTopWords()
+    
+    def __repr__(self) -> str:
+        repr = "TopicGPT object with the following parameters:\n"
+        repr += "-"*150 + "\n"
+        repr += "n_topics: " + str(self.n_topics) + "\n"
+        repr += "openai_prompting_model: " + self.openai_prompting_model + "\n"
+        repr += "max_number_of_tokens: " + str(self.max_number_of_tokens) + "\n"
+        repr += "corpus_instruction: " + self.corpus_instruction + "\n"
+        repr += "embedding_model: " + self.embedding_model + "\n"
+        repr += "clusterer: " + str(self.clusterer) + "\n"
+        repr += "n_topwords: " + str(self.n_topwords) + "\n"
+        repr += "n_topwords_description: " + str(self.n_topwords_description) + "\n"
+        repr += "topword_extraction_methods: " + str(self.topword_extraction_methods) + "\n"
+        repr += "compute_vocab_hyperparams: " + str(self.compute_vocab_hyperparams) + "\n"
+        repr += "enhancer: " + str(self.enhancer) + "\n"
+        repr += "topic_prompting: " + str(self.topic_prompting) + "\n"
+
+        return repr
+
+    def compute_embeddings(self, corpus: list[str]) -> tuple[np.ndarray, dict[str, np.ndarray]]:
+        """
+        Computes document and vocabulary embeddings for the given corpus.
+
+        Args:
+            corpus (list[str]): List of strings to embed, where each element is a document.
+
+        Returns:
+            tuple: A tuple containing two items:
+                - document_embeddings (np.ndarray): Document embeddings for the corpus, with shape (len(corpus), n_embedding_dimensions).
+                - vocab_embeddings (dict[str, np.ndarray]): Vocabulary embeddings for the corpus, provided as a dictionary where keys are words and values are embeddings.
+        """
+
+        
+        self.document_embeddings = self.embedder.get_embeddings(corpus)["embeddings"]
+
+        self.vocab_embeddings = self.extractor.embed_vocab_openAI(self.client, self.vocab, embedder = self.embedder)
+
+        return self.document_embeddings, self.vocab_embeddings
+    
+    def extract_topics(self, corpus: list[str]) -> list[Topic]:
+        """
+        Extracts topics from the given corpus.
+
+        Args:
+            corpus (list[str]): List of strings to process, where each element represents a document.
+
+        Returns:
+            list[Topic]: A list of Topic objects representing the extracted topics.
+        """
+
+        assert self.document_embeddings is not None and self.vocab_embeddings is not None, "You need to compute the embeddings first."
+
+        if self.vocab is None: 
+            self.vocab = self.extractor.compute_corpus_vocab(self.corpus, **self.compute_vocab_hyperparams)
+        
+        self.topic_lis = TopicRepresentation.extract_topics_no_new_vocab_computation(
+            corpus = corpus,
+            vocab = self.vocab,
+            document_embeddings = self.document_embeddings,
+            clusterer = self.clusterer,
+            vocab_embeddings = self.vocab_embeddings,
+            n_topwords = self.n_topwords,
+            topword_extraction_methods = self.topword_extraction_methods,
+            consider_outliers = True
+        )
+
+        return self.topic_lis
+    
+    def describe_topics(self, topics: list[Topic]) -> list[Topic]:
+        """
+        Names and describes the provided topics using the OpenAI API.
+
+        Args:
+            topics (list[Topic]): List of Topic objects to be named and described.
+
+        Returns:
+            list[Topic]: A list of Topic objects with names and descriptions.
+        """
+
+
+        assert self.topic_lis is not None, "You need to extract the topics first."
+
+        if "cosine_similarity" in self.topword_extraction_methods:
+            topword_method = "cosine_similarity"
+        elif "tfidf" in self.topword_extraction_methods:
+            topword_method = "tfidf"
+        else:
+            raise ValueError("You need to use either 'cosine_similarity' or 'tfidf' as topword extraction method.")
+
+        self.topic_lis = TopicRepresentation.describe_and_name_topics(
+            topics = topics,
+            enhancer = self.enhancer,
+            topword_method= topword_method,
+            n_words = self.n_topwords_description
+        )
+
+        return self.topic_lis
+    
+    def fit(self, corpus: list[str], verbose: bool = True):
+        """
+        Compute embeddings if necessary, extract topics, and describe them.
+
+        Empty documents ('') are dropped first. The remaining documents are stored in
+        self.corpus as a new list, so the list passed in by the caller is not modified.
+
+        Args:
+            corpus (list[str]): List of strings to embed, where each element represents a document.
+            verbose (bool, optional): Whether to print the progress and details of the process.
+        """
+
+        # remove empty documents in a single pass (no repeated list.remove() on the caller's list)
+        len_before_removing = len(corpus)
+        self.corpus = [doc for doc in corpus if doc != '']
+        len_after_removing = len(self.corpus)
+        if verbose:
+            print("Removed " + str(len_before_removing - len_after_removing) + " empty documents.")
+
+        if self.vocab_embeddings is None:
+            if verbose:
+                print("Computing vocabulary...")
+
+            self.vocab = self.extractor.compute_corpus_vocab(self.corpus, **self.compute_vocab_hyperparams)
+        else:
+            # the keys of the precomputed vocabulary embeddings define the vocabulary
+            if verbose:
+                print('Vocab already computed')
+            self.vocab = list(self.vocab_embeddings.keys())
+
+        if self.vocab_embeddings is None or self.document_embeddings is None:
+            if verbose:
+                print("Computing embeddings...")
+            self.compute_embeddings(corpus = self.corpus)
+        elif verbose:
+            print('Embeddings already computed')
+
+        if verbose:
+            print("Extracting topics...")
+        self.topic_lis = self.extract_topics(corpus = self.corpus)
+
+        if verbose:
+            print("Describing topics...")
+        self.topic_lis = self.describe_topics(topics = self.topic_lis)
+
+        # hand the fitted state over to the prompting helper
+        self.topic_prompting.topic_lis = self.topic_lis
+        self.topic_prompting.vocab_embeddings = self.vocab_embeddings
+        self.topic_prompting.vocab = self.vocab
+
+    def visualize_clusters(self):
+        """
+        Visualizes the identified clusters representing the topics in a scatterplot.
+
+        The high-dimensional document embeddings and the documents of all topics are
+        concatenated, every document is labelled with the position of its topic in
+        self.topic_lis, and the result is passed to self.clusterer.visualize_clusters_dynamic.
+
+        Raises:
+            AssertionError: If self.topic_lis is None.
+        """
+
+        assert self.topic_lis is not None, "You need to extract the topics first."
+
+        embedding_blocks = []
+        text_blocks = []
+        index_blocks = []
+        class_names = []
+        for position, topic in enumerate(self.topic_lis):
+            embedding_blocks.append(topic.document_embeddings_hd)
+            text_blocks.append(topic.documents)
+            # one copy of the topic position per document of that topic
+            index_blocks.append(np.repeat(position, topic.document_embeddings_hd.shape[0]))
+            class_names.append(str(topic))
+
+        all_document_embeddings = np.concatenate(embedding_blocks, axis = 0)
+        all_texts = np.concatenate(text_blocks, axis = 0)
+        all_document_indices = np.concatenate(index_blocks, axis = 0)
+
+        self.clusterer.visualize_clusters_dynamic(all_document_embeddings, all_document_indices, all_texts, class_names)
+    
+    def repr_topics(self) -> str:
+        """
+        Returns a string explanation of the topics.
+
+        For every topic the string contains str(topic), its description, the first ten
+        top words of the preferred extraction method ("cosine_similarity" if configured,
+        otherwise "tfidf") and a separator line of 150 dashes.
+
+        Raises:
+            AssertionError: If self.topic_lis is None.
+            ValueError: If neither 'cosine_similarity' nor 'tfidf' is configured.
+        """
+
+        assert self.topic_lis is not None, "You need to extract the topics first."
+
+        methods = self.topword_extraction_methods
+        if "cosine_similarity" in methods:
+            topword_method = "cosine_similarity"
+        elif "tfidf" not in methods:
+            raise ValueError("You need to use either 'cosine_similarity' or 'tfidf' as topword extraction method.")
+        else:
+            topword_method = "tfidf"
+
+        separator = "-"*150 + "\n"
+        sections = [
+            str(topic) + "\n"
+            + "Topic_description: " + topic.topic_description + "\n"
+            + "Top words: " + str(topic.top_words[topword_method][:10]) + "\n"
+            + "\n"
+            + separator
+            for topic in self.topic_lis
+        ]
+
+        return "".join(sections)
+
+    def print_topics(self):
+        """
+        Prints the string explanation of the topics produced by repr_topics().
+        """
+
+        topic_summary = self.repr_topics()
+        print(topic_summary)
+
+    def prompt(self, query: str) -> tuple[str, object]:
+        """
+        Prompts the model with the given query.
+
+        Args:
+            query (str): The query to prompt the model with.
+
+        Returns:
+            tuple: A tuple containing two items:
+                - answer (str): The message content of the first choice of the last
+                  element in the first item returned by topic_prompting.general_prompt.
+                - function_result (object): The second item returned by general_prompt.
+
+        Note:
+            After prompting, self.topic_prompting._fix_dictionary_topwords() is called and
+            self.topic_lis is refreshed from self.topic_prompting.topic_lis.
+            Please refer to the TopicPrompting class for more details on available functions for prompting the model.
+        """
+
+        prompter = self.topic_prompting
+        result = prompter.general_prompt(query)
+
+        last_response = result[0][-1]
+        answer = last_response.choices[0].message.content
+        function_result = result[1]
+
+        # the prompt may have changed the topics; bring our own list back in sync
+        prompter._fix_dictionary_topwords()
+        self.topic_lis = prompter.topic_lis
+
+        return answer, function_result
+    
+    def pprompt(self, query: str, return_function_result: bool = True) -> object:
+        """
+        Prompts the model with the given query and prints the answer.
+
+        Args:
+            query (str): The query to prompt the model with.
+            return_function_result (bool, optional): Whether to return the result of the function call by the Language Model (LLM).
+
+        Returns:
+            object: The result of the function call if return_function_result is True, otherwise None.
+        """
+
+        answer, function_result = self.prompt(query)
+        print(answer)
+
+        return function_result if return_function_result else None
+        
+    def save_embeddings(self, path: str = embeddings_path) -> None:
+        """
+        Saves the document and vocabulary embeddings to a pickle file for later re-use.
+
+        The file contains the list [self.document_embeddings, self.vocab_embeddings].
+        The directory that `path` points into is created if it does not exist yet.
+
+        Args:
+            path (str, optional): The path to save the embeddings to. Defaults to embeddings_path.
+
+        Raises:
+            AssertionError: If the document or vocabulary embeddings have not been computed.
+        """
+
+        assert self.document_embeddings is not None and self.vocab_embeddings is not None, "You need to compute the embeddings first."
+
+        # create the directory the file is actually written to (not a hard-coded folder);
+        # exist_ok avoids the check-then-create race
+        target_directory = os.path.dirname(path)
+        if target_directory:
+            os.makedirs(target_directory, exist_ok = True)
+
+        with open(path, "wb") as f:
+            pickle.dump([self.document_embeddings, self.vocab_embeddings], f)
+
@@ -0,0 +1,664 @@
+import numpy as np
+import umap
+import sys
+import os
+import inspect
+from tqdm import tqdm
+import umap
+import json
+
+# make sure the import works even if the package has not been installed and just the files are used
+
+from topicgpt.Clustering import Clustering_and_DimRed
+from topicgpt.ExtractTopWords import ExtractTopWords
+from topicgpt.TopwordEnhancement import TopwordEnhancement
+
+class Topic:
+    """
+    Class to represent a topic and all its attributes.
+    """
+
+    def __init__(self, 
+             topic_idx: str, 
+             documents: list[str], 
+             words: dict[str, int],
+             centroid_hd: np.ndarray = None, 
+             centroid_ld: np.ndarray = None,
+             document_embeddings_hd: np.ndarray = None,
+             document_embeddings_ld: np.ndarray = None,
+             document_embedding_similarity: np.ndarray = None,
+             umap_mapper: umap.UMAP = None,
+             top_words: dict[str, list[str]] = None,
+             top_word_scores: dict[str, list[float]] = None
+             ) -> None:
+        """
+        Represents a topic and all its attributes.
+
+        Args:
+            topic_idx (str): Index or name of the topic.
+            documents (list[str]): List of documents in the topic.
+            words (dict[str, int]): Dictionary of words and their counts in the topic.
+            centroid_hd (np.ndarray, optional): Centroid of the topic in high-dimensional space.
+            centroid_ld (np.ndarray, optional): Centroid of the topic in low-dimensional space.
+            document_embeddings_hd (np.ndarray, optional): Embeddings of documents in high-dimensional space that belong to this topic.
+            document_embeddings_ld (np.ndarray, optional): Embeddings of documents in low-dimensional space that belong to this topic.
+            document_embedding_similarity (np.ndarray, optional): Similarity array of document embeddings to the centroid in low-dimensional space.
+            umap_mapper (umap.UMAP, optional): UMAP mapper object to map from high-dimensional space to low-dimensional space.
+            top_words (dict[str, list[str]], optional): Dictionary of top words in the topic according to different metrics.
+            top_word_scores (dict[str, list[float]], optional): Dictionary of how representative the top words are according to different metrics.
+
+        Raises:
+            AssertionError: If a provided per-document array does not have as many entries
+                as `documents`, or if `documents` or `words` is empty.
+        """
+
+        # do some checks on the input; optional arrays left at None are skipped
+        # (calling len() on them would raise a TypeError)
+        n_documents = len(documents)
+        for per_document_values in (document_embeddings_hd, document_embeddings_ld, document_embedding_similarity):
+            assert per_document_values is None or len(per_document_values) == n_documents, "documents, document_embeddings_hd, document_embeddings_ld and document_embedding_similarity must have the same length"
+        assert n_documents > 0, "documents must not be empty"
+        assert len(words) > 0, "words must not be empty"
+
+
+        self.topic_idx = topic_idx
+        self.documents = documents
+        self.words = words
+        self.centroid_hd = centroid_hd
+        self.centroid_ld = centroid_ld
+        self.document_embeddings_hd = document_embeddings_hd
+        self.document_embeddings_ld = document_embeddings_ld
+        self.document_embedding_similarity = document_embedding_similarity
+        self.umap_mapper = umap_mapper
+        self.top_words = top_words
+        self.top_word_scores = top_word_scores
+
+        self.topic_name = None # initialize the name of the topic as none
+        self.topic_description = None # set later via set_topic_description; initialized so that to_json/to_dict work before that
+
+    def __str__(self) -> str:
+        """
+        Return "Topic: <idx>" while the topic is unnamed and "Topic <idx>: <name>" afterwards (each followed by a newline).
+        """
+
+        if self.topic_name is None:
+            return f"Topic: {self.topic_idx}\n"
+
+        return f"Topic {self.topic_idx}: {self.topic_name}\n"
+    
+    def __repr__(self) -> str:
+        return self.__str__()
+    
+    def to_json(self) -> str:
+        """
+        Return a JSON string (indent 4) with the index, name and description of the topic.
+        """
+        repr_dict = {
+            "topic_idx": self.topic_idx,
+            "topic_name": self.topic_name,
+            "topic_description": self.topic_description
+        }
+
+        return json.dumps(repr_dict, indent = 4)
+    
+    def to_dict(self) -> dict:
+        """
+        Return a dict with the index (converted to int), name and description of the topic.
+        """
+        return {
+            "topic_idx": int(self.topic_idx),
+            "topic_name": self.topic_name,
+            "topic_description": self.topic_description
+        }
+    
+    def set_topic_name(self, name:str):
+        """
+        Add a name to the topic.
+
+        Args:
+            name (str): Name of the topic.
+        """
+        self.topic_name = name
+
+    def set_topic_description(self, text: str):
+        """
+        Add a text description to the topic.
+
+        Args:
+            text (str): Text description of the topic.
+        """
+        self.topic_description = text
+
+def topic_to_json(topic: Topic) -> str:
+    """
+    Serialize the index, name and description of a topic to JSON.
+
+    Args:
+        topic (Topic): The topic object to convert to JSON.
+
+    Returns:
+        str: A JSON string (indent 4) with the keys "topic_idx", "topic_name" and "topic_description".
+    """
+    field_names = ("topic_idx", "topic_name", "topic_description")
+    payload = {field: getattr(topic, field) for field in field_names}
+
+    return json.dumps(payload, indent = 4)
+
+def topic_lis_to_json(topics: list[Topic]) -> str:
+    """
+    Serialize a list of topics to JSON.
+
+    Args:
+        topics (list[Topic]): The list of topic objects to convert to JSON.
+
+    Returns:
+        str: A JSON string (indent 4) that maps every topic_idx to the name and description of that topic.
+    """
+    payload = {
+        topic.topic_idx: {
+            "topic_name": topic.topic_name,
+            "topic_description": topic.topic_description
+        }
+        for topic in topics
+    }
+
+    return json.dumps(payload, indent = 4)
+
+def extract_topics(corpus: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}) -> list[Topic]:
+    """
+    Extracts topics from the given corpus using the provided clusterer object on the document embeddings.
+
+    This is a plain module-level function: a @staticmethod wrapper outside of a class is
+    only callable on Python >= 3.10 and is not needed here.
+
+    Args:
+        corpus (list[str]): List of documents.
+        document_embeddings (np.ndarray): Embeddings of the documents.
+        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
+        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
+        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
+        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics. 
+            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
+        compute_vocab_hyperparams (dict, optional): Keyword arguments forwarded to ExtractTopWords.compute_corpus_vocab.
+
+    Returns:
+        list[Topic]: List of Topic objects representing the extracted topics. Within each topic the
+            documents, their embeddings and their centroid similarities are sorted by descending
+            similarity to the low-dimensional centroid. Top words of a method that was not requested are None.
+
+    Raises:
+        ValueError: If topword_extraction_methods is empty or contains an unknown method.
+    """
+
+    for elem in topword_extraction_methods:
+        if elem not in ["tfidf", "cosine_similarity"]:
+            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
+    if len(topword_extraction_methods) == 0:
+        raise ValueError("topword_extraction_methods cannot be empty")
+
+    use_tfidf = "tfidf" in topword_extraction_methods
+    use_cosine = "cosine_similarity" in topword_extraction_methods
+
+    dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings)  # get dimensionality reduced embeddings, their labels and the umap mapper object
+
+    unique_labels = np.unique(labels)  # In case the cluster labels are not consecutive numbers, we need to map them to consecutive 
+    label_mapping = {label: i for i, label in enumerate(unique_labels[unique_labels != -1])}
+    label_mapping[-1] = -1
+    labels = np.array([label_mapping[label] for label in labels])
+
+    extractor = ExtractTopWords()
+    centroid_dict = extractor.extract_centroids(document_embeddings, labels)  # get the centroids of the clusters
+    centroid_arr = np.array(list(centroid_dict.values()))
+    if centroid_arr.ndim == 1:
+        centroid_arr = centroid_arr.reshape(-1, 1)
+    dim_red_centroids = umap_mapper.transform(centroid_arr)  # map the (reshaped) centroids to low dimensional space
+    
+    dim_red_centroid_dict = dict(zip(centroid_dict.keys(), dim_red_centroids))
+
+    vocab = extractor.compute_corpus_vocab(corpus, **compute_vocab_hyperparams)  # compute the vocabulary of the corpus
+
+    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False)  # compute the word-topic matrix of the corpus
+    if use_tfidf:
+        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)  # extract the top-words according to tfidf
+    if use_cosine:
+        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict= dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)
+
+    topics = []
+    for label in np.unique(labels):
+        if label < -0.5: # dont include outliers
+            continue
+        topic_idx = f"{label}"
+        in_topic = labels == label
+        documents = [doc for j, doc in enumerate(corpus) if labels[j] == label]
+        embeddings_hd = document_embeddings[in_topic]
+        embeddings_ld = dim_red_embeddings[in_topic]
+        centroid_hd = centroid_dict[label]
+        # look the centroid up by label instead of by array position, so the result does
+        # not depend on the order/content of centroid_dict
+        centroid_ld = dim_red_centroid_dict[label]
+        
+        centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))
+        similarity_sorting = np.argsort(centroid_similarity)[::-1]
+        # apply the same ordering to everything that is stored per document
+        documents = [documents[k] for k in similarity_sorting]
+        embeddings_hd = embeddings_hd[similarity_sorting]
+        embeddings_ld = embeddings_ld[similarity_sorting]
+        centroid_similarity = centroid_similarity[similarity_sorting]
+
+        # only touch the cosine results when they were actually computed
+        if use_cosine and isinstance(cosine_topwords[label], dict):
+            cosine_topwords[label] = cosine_topwords[label][0]
+
+        top_words = {
+            "tfidf": tfidf_topwords[label] if use_tfidf else None,
+            "cosine_similarity": cosine_topwords[label] if use_cosine else None
+        }
+        top_word_scores = {
+            "tfidf": tfidf_dict[label] if use_tfidf else None,
+            "cosine_similarity": cosine_dict[label] if use_cosine else None
+        }
+
+        topic = Topic(topic_idx = topic_idx,
+                        documents = documents,
+                        words = vocab,
+                        centroid_hd = centroid_hd,
+                        centroid_ld = centroid_ld,
+                        document_embeddings_hd = embeddings_hd,
+                        document_embeddings_ld = embeddings_ld,
+                        document_embedding_similarity = centroid_similarity,
+                        umap_mapper = umap_mapper,
+                        top_words = top_words, 
+                        top_word_scores = top_word_scores
+                        )
+                      
+        topics.append(topic)
+    
+    return topics
+
+def extract_topics_no_new_vocab_computation(corpus: list[str], vocab: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], consider_outliers: bool = False) -> list[Topic]:
+    """
+    Extracts topics from the given corpus using the provided clusterer object on the document embeddings. 
+    This version does not compute the vocabulary of the corpus and instead uses the provided vocabulary.
+
+    This is a plain module-level function: a @staticmethod wrapper outside of a class is
+    only callable on Python >= 3.10 and is not needed here.
+
+    Args:
+        corpus (list[str]): List of documents.
+        vocab (list[str]): Vocabulary of the corpus.
+        document_embeddings (np.ndarray): Embeddings of the documents.
+        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
+        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
+        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
+        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics. 
+            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
+        consider_outliers (bool, optional): Forwarded to the computation of the word-topic matrix (default is False).
+
+    Returns:
+        list[Topic]: List of Topic objects representing the extracted topics. Within each topic the
+            documents, their embeddings and their centroid similarities are sorted by descending
+            similarity to the low-dimensional centroid. Top words of a method that was not requested are None.
+
+    Raises:
+        ValueError: If topword_extraction_methods is empty or contains an unknown method.
+    """
+
+
+    for elem in topword_extraction_methods:
+        if elem not in ["tfidf", "cosine_similarity"]:
+            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
+    if len(topword_extraction_methods) == 0:
+        raise ValueError("topword_extraction_methods cannot be empty")
+
+    use_tfidf = "tfidf" in topword_extraction_methods
+    use_cosine = "cosine_similarity" in topword_extraction_methods
+
+    dim_red_embeddings, labels, umap_mapper = clusterer.cluster_and_reduce(document_embeddings)  # get dimensionality reduced embeddings, their labels and the umap mapper object
+
+    unique_labels = np.unique(labels)  # In case the cluster labels are not consecutive numbers, we need to map them to consecutive 
+    label_mapping = {label: i for i, label in enumerate(unique_labels[unique_labels != -1])}
+    label_mapping[-1] = -1
+    labels = np.array([label_mapping[label] for label in labels])
+
+    extractor = ExtractTopWords()
+    centroid_dict = extractor.extract_centroids(document_embeddings, labels)  # get the centroids of the clusters
+
+    centroid_arr = np.array(list(centroid_dict.values()))
+    if centroid_arr.ndim == 1:
+        centroid_arr = centroid_arr.reshape(-1, 1)
+    dim_red_centroids = umap_mapper.transform(centroid_arr)  # map the (reshaped) centroids to low dimensional space
+
+    dim_red_centroid_dict = dict(zip(centroid_dict.keys(), dim_red_centroids))
+
+    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = consider_outliers)  # compute the word-topic matrix of the corpus
+    if use_tfidf:
+        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)  # extract the top-words according to tfidf
+    if use_cosine:
+        # NOTE(review): consider_outliers is hard-coded to True here while the word-topic matrix
+        # above uses the consider_outliers argument - confirm that this is intended
+        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict= dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = True)
+
+    topics = []
+    for label in np.unique(labels):
+        if label < -0.5: # dont include outliers
+            continue
+        topic_idx = f"{label}"
+        in_topic = labels == label
+        documents = [doc for j, doc in enumerate(corpus) if labels[j] == label]
+        embeddings_hd = document_embeddings[in_topic]
+        embeddings_ld = dim_red_embeddings[in_topic]
+        centroid_hd = centroid_dict[label]
+        # look the centroid up by label instead of by array position, so the result does
+        # not depend on the order/content of centroid_dict
+        centroid_ld = dim_red_centroid_dict[label]
+        
+        centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))
+        similarity_sorting = np.argsort(centroid_similarity)[::-1]
+        # apply the same ordering to everything that is stored per document
+        documents = [documents[k] for k in similarity_sorting]
+        embeddings_hd = embeddings_hd[similarity_sorting]
+        embeddings_ld = embeddings_ld[similarity_sorting]
+        centroid_similarity = centroid_similarity[similarity_sorting]
+
+        # unwrap dict-valued cosine results; explicit checks replace the former bare
+        # "except: pass" so that unrelated errors are no longer hidden
+        if use_cosine:
+            words_for_label = cosine_topwords[label]
+            if isinstance(words_for_label, dict) and 0 in words_for_label:
+                cosine_topwords[label] = words_for_label[0]
+
+        top_words = {
+            "tfidf": tfidf_topwords[label] if use_tfidf else None,
+            "cosine_similarity": cosine_topwords[label] if use_cosine else None
+        }
+        top_word_scores = {
+            "tfidf": tfidf_dict[label] if use_tfidf else None,
+            "cosine_similarity": cosine_dict[label] if use_cosine else None
+        }
+
+        topic = Topic(topic_idx = topic_idx,
+                        documents = documents,
+                        words = vocab,
+                        centroid_hd = centroid_hd,
+                        centroid_ld = centroid_ld,
+                        document_embeddings_hd = embeddings_hd,
+                        document_embeddings_ld = embeddings_ld,
+                        document_embedding_similarity = centroid_similarity,
+                        umap_mapper = umap_mapper,
+                        top_words = top_words, 
+                        top_word_scores = top_word_scores
+                        )
+                      
+        topics.append(topic)
+    
+    return topics
+
+def extract_and_describe_topics(corpus: list[str], document_embeddings: np.ndarray, clusterer: Clustering_and_DimRed, vocab_embeddings: np.ndarray, enhancer: TopwordEnhancement, n_topwords: int = 2000, n_topwords_description: int = 500, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"], compute_vocab_hyperparams: dict = {}, topword_description_method: str = "cosine_similarity") -> list[Topic]:
+    """
+    Extracts topics from the given corpus using the provided clusterer object on the document embeddings and describes/names them using the given enhancer object.
+
+    This is a plain module-level function: a @staticmethod wrapper outside of a class is
+    only callable on Python >= 3.10 and is not needed here.
+
+    Args:
+        corpus (list[str]): List of documents.
+        document_embeddings (np.ndarray): Embeddings of the documents.
+        clusterer (Clustering_and_DimRed): Clustering and dimensionality reduction object to cluster the documents.
+        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
+        enhancer (TopwordEnhancement): Enhancer object for enhancing top-words and generating descriptions/names for topics.
+        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
+        n_topwords_description (int, optional): Number of top-words to use from the extracted topics for description and naming (default is 500).
+        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics. 
+            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
+        compute_vocab_hyperparams (dict, optional): Hyperparameters forwarded to extract_topics for the vocabulary computation.
+        topword_description_method (str, optional): Method to use for top-word extraction for description/naming. 
+            Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity").
+
+    Returns:
+        list[Topic]: List of Topic objects representing the extracted and described topics.
+    """
+
+    # step 1: cluster the documents and build the Topic objects
+    print("Extracting topics...")
+    topics = extract_topics(corpus, document_embeddings, clusterer, vocab_embeddings, n_topwords, topword_extraction_methods, compute_vocab_hyperparams)
+
+    # step 2: let the enhancer name and describe every topic
+    print("Describing topics...")
+    topics = describe_and_name_topics(topics, enhancer, topword_description_method, n_topwords_description)
+
+    return topics
+
+def extract_topics_labels_vocab(corpus: list[str], document_embeddings_hd: np.ndarray, document_embeddings_ld: np.ndarray, labels: np.ndarray, umap_mapper: umap.UMAP, vocab_embeddings: np.ndarray, vocab: list[str] = None, n_topwords: int = 2000, topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"]) -> list[Topic]:
+    """
+    Extracts topics from the given corpus using the provided labels that indicate the topics (no -1 for outliers).
+    The vocabulary is only computed from the corpus when `vocab` is None.
+
+    This is a plain module-level function: a @staticmethod wrapper outside of a class is
+    only callable on Python >= 3.10 and is not needed here.
+
+    Args:
+        corpus (list[str]): List of documents.
+        document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space.
+        document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space.
+        labels (np.ndarray): Labels indicating the topics.
+        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
+        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
+        vocab (list[str], optional): Vocabulary of the corpus (default is None).
+        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
+        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics. 
+            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
+
+    Returns:
+        list[Topic]: List of Topic objects representing the extracted topics. Within each topic the
+            documents, their embeddings and their centroid similarities are sorted by descending
+            similarity to the low-dimensional centroid. Top words of a method that was not requested are None.
+
+    Raises:
+        ValueError: If topword_extraction_methods is empty or contains an unknown method.
+    """
+
+    for elem in topword_extraction_methods:
+        if elem not in ["tfidf", "cosine_similarity"]:
+            raise ValueError("topword_extraction_methods can only contain 'tfidf' and 'cosine_similarity'")
+    if len(topword_extraction_methods) == 0:
+        raise ValueError("topword_extraction_methods cannot be empty")
+
+    use_tfidf = "tfidf" in topword_extraction_methods
+    use_cosine = "cosine_similarity" in topword_extraction_methods
+    
+    extractor = ExtractTopWords()
+    if vocab is None:
+        vocab = extractor.compute_corpus_vocab(corpus)  # compute the vocabulary of the corpus
+    
+    centroid_dict = extractor.extract_centroids(document_embeddings_hd, labels)  # get the centroids of the clusters
+    
+    centroid_arr = np.array(list(centroid_dict.values()))
+    if centroid_arr.ndim == 1:
+        centroid_arr = centroid_arr.reshape(-1, 1)
+    dim_red_centroids = umap_mapper.transform(centroid_arr)  # map the (reshaped) centroids to low dimensional space
+
+    word_topic_mat = extractor.compute_word_topic_mat(corpus, vocab, labels, consider_outliers = False)  # compute the word-topic matrix of the corpus
+
+    dim_red_centroid_dict = dict(zip(centroid_dict.keys(), dim_red_centroids))
+
+    if use_tfidf:
+        tfidf_topwords, tfidf_dict = extractor.extract_topwords_tfidf(word_topic_mat = word_topic_mat, vocab = vocab, labels = labels, top_n_words = n_topwords)  # extract the top-words according to tfidf
+    if use_cosine:
+        cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(word_topic_mat = word_topic_mat, vocab = vocab, vocab_embedding_dict = vocab_embeddings, centroid_dict= dim_red_centroid_dict, umap_mapper = umap_mapper, top_n_words = n_topwords, reduce_vocab_embeddings = True, reduce_centroid_embeddings = False, consider_outliers = False)
+
+    topics = []
+    for label in np.unique(labels):
+        if label < -0.5: # dont include outliers
+            continue
+        topic_idx = f"{label}"
+        in_topic = labels == label
+        documents = [doc for j, doc in enumerate(corpus) if labels[j] == label]
+        embeddings_hd = document_embeddings_hd[in_topic]
+        embeddings_ld = document_embeddings_ld[in_topic]
+        centroid_hd = centroid_dict[label]
+        # look the centroid up by label instead of by array position; the given labels
+        # are not guaranteed to be the consecutive numbers 0..k-1
+        centroid_ld = dim_red_centroid_dict[label]
+        
+        centroid_similarity = np.dot(embeddings_ld, centroid_ld)/(np.linalg.norm(embeddings_ld, axis = 1)*np.linalg.norm(centroid_ld))
+        similarity_sorting = np.argsort(centroid_similarity)[::-1]
+        # apply the same ordering to everything that is stored per document
+        documents = [documents[k] for k in similarity_sorting]
+        embeddings_hd = embeddings_hd[similarity_sorting]
+        embeddings_ld = embeddings_ld[similarity_sorting]
+        centroid_similarity = centroid_similarity[similarity_sorting]
+
+        # only touch the cosine results when they were actually computed
+        if use_cosine and isinstance(cosine_topwords[label], dict):
+            cosine_topwords[label] = cosine_topwords[label][0]
+        top_words = {
+            "tfidf": tfidf_topwords[label] if use_tfidf else None,
+            "cosine_similarity": cosine_topwords[label] if use_cosine else None
+        }
+        top_word_scores = {
+            "tfidf": tfidf_dict[label] if use_tfidf else None,
+            "cosine_similarity": cosine_dict[label] if use_cosine else None
+        }
+
+        topic = Topic(topic_idx = topic_idx,
+                        documents = documents,
+                        words = vocab,
+                        centroid_hd = centroid_hd,
+                        centroid_ld = centroid_ld,
+                        document_embeddings_hd = embeddings_hd,
+                        document_embeddings_ld = embeddings_ld,
+                        document_embedding_similarity = centroid_similarity,
+                        umap_mapper = umap_mapper,
+                        top_words = top_words, 
+                        top_word_scores = top_word_scores
+                        )
+                      
+        topics.append(topic)
+    
+    return topics
+
+def extract_describe_topics_labels_vocab(
+    corpus: list[str],
+    document_embeddings_hd: np.ndarray,
+    document_embeddings_ld: np.ndarray,
+    labels: np.ndarray,
+    umap_mapper: umap.UMAP,
+    vocab_embeddings: np.ndarray,
+    enhancer: TopwordEnhancement,
+    vocab: list[str] = None,
+    n_topwords: int = 2000,
+    n_topwords_description: int = 500,
+    topword_extraction_methods: list[str] = ["tfidf", "cosine_similarity"],
+    topword_description_method: str = "cosine_similarity"
+) -> list[Topic]:
+    """
+    Extracts topics from the given corpus using the provided labels that indicate the topics (no -1 for outliers)
+    and then describes and names the topics with the given enhancer object.
+
+    This is a plain module-level function: a @staticmethod wrapper outside of a class is
+    only callable on Python >= 3.10 and is not needed here.
+
+    Args:
+        corpus (list[str]): List of documents.
+        document_embeddings_hd (np.ndarray): Embeddings of the documents in high-dimensional space.
+        document_embeddings_ld (np.ndarray): Embeddings of the documents in low-dimensional space.
+        labels (np.ndarray): Labels indicating the topics.
+        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
+        vocab_embeddings (np.ndarray): Embeddings of the vocabulary.
+        enhancer (TopwordEnhancement): Enhancer object to enhance the top-words and generate the description.
+        vocab (list[str], optional): Vocabulary of the corpus (default is None).
+        n_topwords (int, optional): Number of top-words to extract from the topics (default is 2000).
+        n_topwords_description (int, optional): Number of top-words to use from the extracted topics for the description and the name (default is 500).
+        topword_extraction_methods (list[str], optional): List of methods to extract top-words from the topics. 
+            Can contain "tfidf" and "cosine_similarity" (default is ["tfidf", "cosine_similarity"]).
+        topword_description_method (str, optional): Method to use for top-word extraction. Can be "tfidf" or "cosine_similarity" (default is "cosine_similarity").
+
+    Returns:
+        list[Topic]: List of Topic objects representing the extracted and described topics.
+    """
+
+    # step 1: build the Topic objects from the given labels
+    extracted_topics = extract_topics_labels_vocab(corpus, document_embeddings_hd, document_embeddings_ld, labels, umap_mapper, vocab_embeddings, vocab, n_topwords, topword_extraction_methods)
+
+    # step 2: let the enhancer name and describe every topic
+    return describe_and_name_topics(extracted_topics, enhancer, topword_description_method, n_topwords_description)
+
+def extract_topic_cos_sim(
+    documents_topic: list[str],
+    document_embeddings_topic: np.ndarray,
+    words_topic: list[str],
+    vocab_embeddings: dict,
+    umap_mapper: umap.UMAP,
+    n_topwords: int = 2000
+) -> Topic:
+    """
+    Create a single Topic object from the given documents and embeddings by computing the centroid and the top-words.
+    Only cosine-similarity is used for top-word extraction.
+
+    Args:
+        documents_topic (list[str]): List of documents in the topic.
+        document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic.
+        words_topic (list[str]): List of words in the topic.
+        vocab_embeddings (dict): Embeddings of the vocabulary (handed to the extractor as ``vocab_embedding_dict``).
+        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
+        n_topwords (int, optional): Number of top-words to extract from the topic (default is 2000).
+
+    Returns:
+        Topic: Topic object representing the extracted topic. Its ``topic_idx`` is None, and ``top_words`` /
+            ``top_word_scores`` only contain the key "cosine_similarity".
+    """
+
+    extractor = ExtractTopWords()
+
+    # Centroid in the original embedding space and its projection into the reduced UMAP space.
+    centroid_hd = extractor.extract_centroid(document_embeddings_topic)
+    centroid_ld = umap_mapper.transform(centroid_hd.reshape(1, -1))[0]
+
+    # All documents belong to the one topic that is being built, so every document gets label 0.
+    labels = np.zeros(len(documents_topic), dtype = int)
+    word_topic_mat = extractor.compute_word_topic_mat(documents_topic, words_topic, labels, consider_outliers = False)
+
+    # The centroid is already reduced (reduce_centroid_embeddings = False), the vocabulary embeddings are not.
+    cosine_topwords, cosine_dict = extractor.extract_topwords_centroid_similarity(
+        word_topic_mat = word_topic_mat,
+        vocab = words_topic,
+        vocab_embedding_dict = vocab_embeddings,
+        centroid_dict = {0: centroid_ld},
+        umap_mapper = umap_mapper,
+        top_n_words = n_topwords,
+        reduce_vocab_embeddings = True,
+        reduce_centroid_embeddings = False,
+        consider_outliers = False
+    )
+
+    # The extractor results are stored as returned, i.e. not indexed by the topic label.
+    top_words = {"cosine_similarity": cosine_topwords}
+    top_word_scores = {"cosine_similarity": cosine_dict}
+
+    document_embeddings_hd = document_embeddings_topic
+    document_embeddings_ld = umap_mapper.transform(document_embeddings_hd)
+
+    # Cosine similarity between every reduced document embedding and the reduced centroid.
+    # NOTE(review): a zero-norm embedding or centroid would produce nan/inf here - confirm this cannot occur upstream.
+    document_norms = np.linalg.norm(document_embeddings_ld, axis = 1)
+    document_embedding_similarity = np.dot(document_embeddings_ld, centroid_ld)/(document_norms*np.linalg.norm(centroid_ld))
+
+    topic = Topic(topic_idx = None,
+                documents = documents_topic,
+                words = words_topic,
+                centroid_hd = centroid_hd,
+                centroid_ld = centroid_ld,
+                document_embeddings_hd = document_embeddings_hd,
+                document_embeddings_ld = document_embeddings_ld,
+                document_embedding_similarity = document_embedding_similarity,
+                umap_mapper = umap_mapper,
+                top_words = top_words,
+                top_word_scores = top_word_scores
+                )
+
+    return topic
+
+def extract_and_describe_topic_cos_sim(
+    documents_topic: list[str],
+    document_embeddings_topic: np.ndarray,
+    words_topic: list[str],
+    vocab_embeddings: dict,
+    umap_mapper: umap.UMAP,
+    enhancer: TopwordEnhancement,
+    n_topwords: int = 2000,
+    n_topwords_description: int = 500
+) -> Topic:
+    """
+    Create a Topic object from the given documents and embeddings and let the enhancer name and describe it.
+    Only cosine-similarity is used for top-word extraction.
+
+    Args:
+        documents_topic (list[str]): List of documents in the topic.
+        document_embeddings_topic (np.ndarray): High-dimensional embeddings of the documents in the topic.
+        words_topic (list[str]): List of words in the topic.
+        vocab_embeddings (dict): Embeddings of the vocabulary.
+        umap_mapper (umap.UMAP): UMAP mapper object to map from high-dimensional space to low-dimensional space.
+        enhancer (TopwordEnhancement): Enhancer object used to generate the topic name and description.
+        n_topwords (int, optional): Number of top-words to extract from the topic (default is 2000).
+        n_topwords_description (int, optional): Number of extracted top-words to use for the description and the name (default is 500).
+
+    Returns:
+        Topic: Topic object representing the extracted and described topic.
+    """
+    topic = extract_topic_cos_sim(documents_topic, document_embeddings_topic, words_topic, vocab_embeddings, umap_mapper, n_topwords)
+    # describe_and_name_topics works on lists, so wrap the single topic and unwrap the result again.
+    return describe_and_name_topics([topic], enhancer, "cosine_similarity", n_topwords_description)[0]
+
+def _call_with_one_retry(func, topic_idx, *args, **kwargs):
+    """Call ``func``; if it raises, report the error and call it exactly once more (a second failure propagates)."""
+    try:
+        return func(*args, **kwargs)
+    except Exception as e:
+        print(f"Error in topic {topic_idx}: {e}")
+        print("Trying again...")
+        return func(*args, **kwargs)
+
+
+def describe_and_name_topics(
+    topics: list[Topic],
+    enhancer: TopwordEnhancement,
+    topword_method: str = "tfidf",
+    n_words: int = 500
+) -> list[Topic]:
+    """
+    Describe and name the topics using the OpenAI API with the given enhancer object.
+
+    The Topic objects are modified in place (name and description are set) and the same list is returned.
+    Each of the two requests per topic is retried once on failure; only the request that failed is repeated.
+
+    Args:
+        topics (list[Topic]): List of Topic objects.
+        enhancer (TopwordEnhancement): Enhancer object used to generate the names and descriptions.
+        topword_method (str, optional): Which extracted top-words to use. Can be "tfidf" or "cosine_similarity" (default is "tfidf").
+        n_words (int, optional): Number of top-words to use for the description and the name (default is 500).
+
+    Returns:
+        list[Topic]: List of Topic objects with the description and name added.
+
+    Raises:
+        ValueError: If topword_method is neither "tfidf" nor "cosine_similarity".
+        Exception: Whatever the enhancer raises when a request also fails on its second attempt.
+    """
+
+    if topword_method not in ["tfidf", "cosine_similarity"]:
+        raise ValueError("topword_method can only be 'tfidf' or 'cosine_similarity'")
+
+    for topic in tqdm(topics):
+        tws = topic.top_words[topword_method]
+
+        # Retry name and description independently so a successful (billed) request is never repeated.
+        topic_name = _call_with_one_retry(enhancer.generate_topic_name_str, topic.topic_idx, tws, n_words = n_words)
+        topic_description = _call_with_one_retry(enhancer.describe_topic_topwords_str, topic.topic_idx, tws, n_words = n_words)
+
+        topic.set_topic_name(topic_name)
+        topic.set_topic_description(topic_description)
+
+    return topics
+
@@ -0,0 +1,306 @@
+import tiktoken
+from openai import OpenAI
+
+
+from typing import Callable
+import numpy as np
+
+# Default system prompt; used as the default of ``basic_model_instruction`` in TopwordEnhancement.
+basic_instruction =  "You are a helpful assistant. You are excellent at inferring topics from top-words extracted via topic-modelling. You make sure that everything you output is strictly based on the provided text."
+
+class TopwordEnhancement:
+    """
+    Name and describe topics by prompting a chat-completion model with the topic's top-words or documents.
+    """
+
+    def __init__(
+        self,
+        client,
+        openai_model: str = "gpt-3.5-turbo",
+        max_context_length: int = 4000,
+        openai_model_temperature: float = 0.5,
+        basic_model_instruction: str = basic_instruction,
+        corpus_instruction: str = "") -> None:
+        """
+        Initialize the TopwordEnhancement object with the specified parameters.
+
+        Args:
+            client: Client object whose ``chat.completions.create`` method is used for the requests.
+            openai_model (str, optional): The OpenAI model to use (default is "gpt-3.5-turbo").
+            max_context_length (int, optional): The maximum length of the context for the OpenAI model (default is 4000).
+            openai_model_temperature (float, optional): The softmax temperature to use for the OpenAI model (default is 0.5).
+            basic_model_instruction (str, optional): The basic instruction for the model (first part of the system message).
+            corpus_instruction (str, optional): The instruction for the corpus. Useful if specific information on the corpus is available.
+
+        Returns:
+            None
+
+        Raises:
+            AssertionError: If openai_model is None, or max_context_length or openai_model_temperature is not positive.
+        """
+
+        # do some checks on the input arguments
+        assert openai_model is not None, "Please provide an openai model"
+        assert max_context_length > 0, "Please provide a positive max_context_length"
+        assert openai_model_temperature > 0, "Please provide a positive openai_model_temperature"
+
+        self.client = client
+        self.openai_model = openai_model
+        self.max_context_length = max_context_length
+        self.openai_model_temperature = openai_model_temperature
+        self.basic_model_instruction = basic_model_instruction
+        # NOTE(review): the prefix is added even when corpus_instruction is empty - confirm this is intended.
+        self.corpus_instruction = f" The following information is available about the corpus used to identify the topics: {corpus_instruction}"
+
+    def __str__(self) -> str:
+        return f"TopwordEnhancement(openai_model = {self.openai_model})"
+
+    def __repr__(self) -> str:
+        return f"TopwordEnhancement(openai_model = {self.openai_model})"
+
+    def count_tokens_api_message(self, messages: list[dict[str, str]]) -> int:
+        """
+        Count the number of tokens in the API messages.
+
+        Only the value stored under the key "content" of each message is counted.
+
+        Args:
+            messages (list[dict[str, str]]): List of messages from the API.
+
+        Returns:
+            int: Number of tokens in the messages.
+        """
+        encoding = tiktoken.encoding_for_model(self.openai_model)
+        return sum(len(encoding.encode(message["content"])) for message in messages if "content" in message)
+
+    def _fit_to_context(self, items, kind: str):
+        """
+        Return the longest prefix of ``items`` that fits into ``self.max_context_length`` tokens.
+
+        The budget covers the system message plus every item followed by ", "; the text that the query
+        function wraps around the items is not counted.
+
+        Args:
+            items: Sequence of strings (list or 1-d array) ordered by priority.
+            kind (str): Plural name of the items, only used in the printed notice (e.g. "topwords").
+
+        Returns:
+            The unchanged ``items`` if everything fits (or if it is empty), otherwise a truncated prefix.
+        """
+        if len(items) == 0:
+            # nothing to measure; also avoids indexing into an empty cumulative sum
+            return items
+
+        # build the encoding once instead of once per item
+        encoding = tiktoken.encoding_for_model(self.openai_model)
+        n_system_tokens = len(encoding.encode(self.basic_model_instruction + " " + self.corpus_instruction))
+        tokens_cumsum = np.cumsum([len(encoding.encode(item + ", ")) for item in items]) + n_system_tokens
+
+        if tokens_cumsum[-1] > self.max_context_length:
+            # index of the first item that exceeds the budget == number of items that still fit
+            n_fit = int(np.argmax(tokens_cumsum > self.max_context_length))
+            print(f"Too many {kind} given. Using only the first part of the {kind} that fits into the context length. Number of {kind} used: ", n_fit)
+            items = items[:n_fit]
+
+        return items
+
+    def describe_topic_topwords_completion_object(self, 
+                               topwords: list[str], 
+                               n_words: int = None,
+                               query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Also describe the various aspects and sub-topics of the topic.") :
+        """
+        Describe the given topic based on its topwords using the OpenAI model.
+
+        Args:
+            topwords (list[str]): List of topwords. A dict is also accepted; then the entry stored under key 0 is used.
+            n_words (int, optional): Number of words to use for the query. If None, all words are used.
+            query_function (Callable, optional): Function to query the model. The function should take a list of topwords and return a string.
+
+        Returns:
+            openai.ChatCompletion: A description of the topics by the model in the form of an OpenAI ChatCompletion object.
+        """
+
+        # unwrap first, so that "all words" below refers to the word list and not to the number of dict keys
+        if isinstance(topwords, dict):
+            topwords = topwords[0]
+
+        if n_words is None:
+            n_words = len(topwords)
+
+        # NOTE(review): the words are handed to query_function as a numpy array, so the default prompts contain
+        # numpy's string form of the array; numpy abbreviates long arrays with "..." - confirm this is acceptable.
+        topwords = np.array(topwords[:n_words])
+
+        # if too many topwords are given, use only the first part of the topwords that fits into the context length
+        topwords = self._fit_to_context(topwords, "topwords")
+
+        completion = self.client.chat.completions.create(
+            model = self.openai_model,
+            messages = [
+                {"role": "system", "content": self.basic_model_instruction + " " + self.corpus_instruction},
+                {"role": "user", "content": query_function(topwords)},
+            ],
+            temperature = self.openai_model_temperature)
+
+        return completion
+
+    def describe_topic_topwords_str(self, 
+                               topwords: list[str], 
+                               n_words: int = None,
+                               query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Also describe the various aspects and sub-topics of the topic. Make sure the descriptions are short and concise! Do not cite more than 5 words per sub-aspect!!!") -> str:
+        """
+        Describe the given topic based on its topwords using the OpenAI model.
+
+        Args:
+            topwords (list[str]): List of topwords.
+            n_words (int, optional): Number of words to use for the query. If None, all words are used.
+            query_function (Callable, optional): Function to query the model. The function should take a list of topwords and return a string.
+
+        Returns:
+            str: A description of the topics by the model in the form of a string.
+        """
+
+        completion = self.describe_topic_topwords_completion_object(topwords, n_words, query_function)
+        return completion.choices[0].message.content
+
+    def generate_topic_name_str(self,
+                            topwords: list[str],
+                            n_words: int = None,
+                            query_function: Callable = lambda tws: f"Please give me the common topic of those words: {tws}. Give me only the title of the topic and nothing else please. Make sure the title is precise and not longer than 5 words, ideally even shorter.") -> str:
+        """
+        Generate a topic name based on the given topwords using the OpenAI model.
+
+        Args:
+            topwords (list[str]): List of topwords.
+            n_words (int, optional): Number of words to use for the query. If None, all words are used.
+            query_function (Callable, optional): Function to query the model. The function should take a list of topwords and return a string.
+
+        Returns:
+            str: A topic name generated by the model in the form of a string.
+        """
+
+        # same request as for a description, only the prompt differs
+        return self.describe_topic_topwords_str(topwords, n_words, query_function)
+
+    def describe_topic_documents_completion_object(self, 
+                                               documents: list[str],
+                                               truncate_doc_thresh=100,
+                                               n_documents: int = None,
+                                               query_function: Callable = lambda docs: f"Please give me the common topic of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic."):
+        """
+        Describe the given topic based on its documents using the OpenAI model.
+
+        Args:
+            documents (list[str]): List of documents.
+            truncate_doc_thresh (int, optional): Threshold for the number of words in a document. If a document has more words than this threshold, it is pruned to this threshold.
+            n_documents (int, optional): Number of documents to use for the query. If None, all documents are used.
+            query_function (Callable, optional): Function to query the model. The function should take a list of documents and return a string.
+
+        Returns:
+            openai.ChatCompletion: A description of the topics by the model in the form of an openai.ChatCompletion object.
+        """
+
+        if n_documents is None:
+            n_documents = len(documents)
+        documents = documents[:n_documents]
+
+        # prune every document to at most truncate_doc_thresh space-separated words
+        documents = [" ".join(doc.split(" ")[:truncate_doc_thresh]) for doc in documents]
+
+        # if too many documents are given, use only the first part of the documents that fits into the context length
+        documents = self._fit_to_context(documents, "documents")
+
+        completion = self.client.chat.completions.create(
+            model = self.openai_model,
+            messages = [
+                {"role": "system", "content": self.basic_model_instruction + " " + self.corpus_instruction},
+                {"role": "user", "content": query_function(documents)},
+            ],
+            temperature = self.openai_model_temperature)
+
+        return completion
+
+
+    @staticmethod
+    def sample_identity(n_docs: int) -> np.ndarray:
+        """
+        Generate an identity array of document indices without changing their order.
+
+        Args:
+            n_docs (int): Number of documents.
+
+        Returns:
+            np.ndarray: An array containing document indices from 0 to (n_docs - 1).
+        """
+
+        return np.arange(n_docs)
+
+
+    @staticmethod
+    def sample_uniform(n_docs: int) -> np.ndarray:
+        """
+        Randomly sample document indices without replacement.
+
+        Args:
+            n_docs (int): Number of documents.
+
+        Returns:
+            np.ndarray: An array containing randomly permuted document indices from 0 to (n_docs - 1).
+        """
+
+        return np.random.permutation(n_docs)
+
+    @staticmethod
+    def sample_poisson(n_docs: int) -> np.ndarray:
+        """
+        Randomly draw document indices from a Poisson distribution with rate 1, favoring documents from the beginning of the list.
+
+        The indices are drawn independently, so the result is not a permutation and the same index can occur several times.
+
+        Args:
+            n_docs (int): Number of documents.
+
+        Returns:
+            np.ndarray: An array of n_docs indices, each between 0 and (n_docs - 1).
+        """
+
+        samples = np.random.poisson(1, n_docs)
+        # a raw draw can exceed the last valid index; clip it so that it can always be used for indexing
+        return np.minimum(samples, max(n_docs - 1, 0))
+
+    def describe_topic_documents_sampling_completion_object(
+        self,
+        documents: list[str],
+        truncate_doc_thresh=100,
+        n_documents: int = None,
+        query_function: Callable = lambda docs: f"Please give me the common topic of the sample of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic.",
+        sampling_strategy: str = None,):
+        """
+        Describe a topic based on a sample of its documents by using the openai model.
+
+        Args:
+            documents (list[str]): List of documents ordered by similarity to the topic's centroid.
+            truncate_doc_thresh (int, optional): Threshold for the number of words in a document. If a document exceeds this threshold, it is truncated. Defaults to 100.
+            n_documents (int, optional): Number of documents to use for the query. If None, all documents are used. Values larger than the number of documents are reduced to it. Defaults to None.
+            query_function (Callable, optional): Function to query the model. Defaults to a lambda function generating a query based on the provided documents.
+            sampling_strategy (Union[Callable, str], optional): Strategy to sample the documents. If None, the first provided documents are used.
+                A string selects a sampler of the class: "topk" and "identity" (sample_identity), "uniform" (sample_uniform) or "poisson" (sample_poisson);
+                the method names themselves (e.g. "sample_uniform") are accepted as well.
+                It can also be a custom sampling function that takes the number of documents and returns the indices to use. Defaults to None.
+
+        Returns:
+            openai.ChatCompletion: A description of the topic by the model in the form of an openai.ChatCompletion object.
+
+        Raises:
+            ValueError: If sampling_strategy is a string that names no known sampler.
+        """
+
+        if n_documents is None:
+            n_documents = len(documents)
+        # the samplers return indices below n_documents, so it must not exceed the number of documents
+        n_documents = min(n_documents, len(documents))
+
+        if sampling_strategy is None:
+            sampler = self.sample_identity
+        elif isinstance(sampling_strategy, str):
+            samplers = {
+                "topk": self.sample_identity,
+                "identity": self.sample_identity,
+                "uniform": self.sample_uniform,
+                "poisson": self.sample_poisson,
+            }
+            strategy_name = sampling_strategy.removeprefix("sample_")
+            if strategy_name not in samplers:
+                raise ValueError(f"Unknown sampling_strategy '{sampling_strategy}'. Use one of {sorted(samplers)} or a callable.")
+            sampler = samplers[strategy_name]
+        else:
+            sampler = sampling_strategy
+
+        new_documents = [documents[i] for i in sampler(n_documents)]
+
+        return self.describe_topic_documents_completion_object(new_documents, truncate_doc_thresh, n_documents, query_function)
+
+    def describe_topic_document_sampling_str(
+        self,
+        documents: list[str],
+        truncate_doc_thresh=100,
+        n_documents: int = None,
+        query_function: Callable = lambda docs: f"Please give me the common topic of the sample of those documents: {docs}. Note that the documents are truncated if they are too long. Also describe the various aspects and sub-topics of the topic.",
+        sampling_strategy: str = None,) -> str:
+        """
+        Describe a topic based on a sample of its documents by using the openai model.
+
+        Args:
+            documents (list[str]): List of documents ordered by similarity to the topic's centroid.
+            truncate_doc_thresh (int, optional): Threshold for the number of words in a document. If a document exceeds this threshold, it is truncated. Defaults to 100.
+            n_documents (int, optional): Number of documents to use for the query. If None, all documents are used. Defaults to None.
+            query_function (Callable, optional): Function to query the model. Defaults to a lambda function generating a query based on the provided documents.
+            sampling_strategy (Union[Callable, str], optional): Strategy to sample the documents. If None, the first provided documents are used.
+                A string selects a sampler of the class ("topk", "identity", "uniform" or "poisson"). It can also be a custom sampling function. Defaults to None.
+
+        Returns:
+            str: A description of the topic by the model in the form of a string.
+
+        Raises:
+            ValueError: If sampling_strategy is a string that names no known sampler.
+        """
+
+        completion = self.describe_topic_documents_sampling_completion_object(documents, truncate_doc_thresh, n_documents, query_function, sampling_strategy)
+        return completion.choices[0].message.content
@@ -0,0 +1 @@
+# Version string, exposed as the module attribute ``__version__``.
+__version__ = '0.0.5'