The LLM-based topic recognition model is complete and adapted to quickly updating Weibo topics.

2025-08-07 11:14:38 +08:00
parent 1e780876c9
commit d88d5edd99
32 changed files with 8352 additions and 1 deletions
@@ -0,0 +1,286 @@
+import numpy as np
+import umap
+import hdbscan
+import matplotlib.pyplot as plt
+import pandas as pd
+import plotly.express as px
+import umap.plot
+from copy import deepcopy
+from sklearn.cluster import AgglomerativeClustering
+
+from typing import Tuple
+
+class Clustering_and_DimRed():
+
+    """
+    Class to perform dimensionality reduction with UMAP followed by clustering with HDBSCAN.
+    """
+    def __init__(self,
+             n_dims_umap: int = 5,
+             n_neighbors_umap: int = 15,
+             min_dist_umap: float = 0,
+             metric_umap: str = "cosine",
+             min_cluster_size_hdbscan: int = 30,
+             metric_hdbscan: str = "euclidean",
+             cluster_selection_method_hdbscan: str = "eom",
+             number_clusters_hdbscan: int = None,
+             random_state: int = 42,
+             verbose: bool = True,
+             UMAP_hyperparams: dict = {},
+             HDBSCAN_hyperparams: dict = {}) -> None:
+        """
+        Initializes the clustering and dimensionality reduction parameters for topic modeling.
+
+        Args:
+            n_dims_umap (int, optional): Number of dimensions to reduce to using UMAP.
+            n_neighbors_umap (int, optional): Number of neighbors for UMAP.
+            min_dist_umap (float, optional): Minimum distance for UMAP.
+            metric_umap (str, optional): Metric for UMAP.
+            min_cluster_size_hdbscan (int, optional): Minimum cluster size for HDBSCAN.
+            metric_hdbscan (str, optional): Metric for HDBSCAN.
+            cluster_selection_method_hdbscan (str, optional): Cluster selection method for HDBSCAN.
+            number_clusters_hdbscan (int, optional): Number of clusters for HDBSCAN. If None, HDBSCAN will determine the number of clusters automatically. Ensure that min_cluster_size is not too large to find enough clusters.
+            random_state (int, optional): Random state for UMAP and HDBSCAN.
+            verbose (bool, optional): Whether to print progress.
+            UMAP_hyperparams (dict, optional): Additional hyperparameters for UMAP.
+            HDBSCAN_hyperparams (dict, optional): Additional hyperparameters for HDBSCAN.
+        """
+
+
+        # do some checks on the input arguments 
+        assert n_dims_umap > 0, "n_dims_umap must be greater than 0"
+        assert n_neighbors_umap > 0, "n_neighbors_umap must be greater than 0"
+        assert min_dist_umap >= 0, "min_dist_umap must be greater than or equal to 0"
+        assert min_cluster_size_hdbscan > 0, "min_cluster_size_hdbscan must be greater than 0"
+        assert number_clusters_hdbscan is None or number_clusters_hdbscan > 0, "number_clusters_hdbscan must be greater than 0 or None"
+        assert random_state is None or random_state >= 0, "random_state must be greater than or equal to 0"
+
+        self.random_state = random_state
+        self.verbose = verbose
+        self.UMAP_hyperparams = UMAP_hyperparams
+        self.HDBSCAN_hyperparams = HDBSCAN_hyperparams
+
+        # update hyperparameters for UMAP
+        self.UMAP_hyperparams["n_components"] = n_dims_umap
+        self.UMAP_hyperparams["n_neighbors"] = n_neighbors_umap
+        self.UMAP_hyperparams["min_dist"] = min_dist_umap
+        self.UMAP_hyperparams["metric"] = metric_umap
+        self.UMAP_hyperparams["random_state"] = random_state
+        self.UMAP_hyperparams["verbose"] = verbose
+        self.umap = umap.UMAP(**self.UMAP_hyperparams)
+
+        self.HDBSCAN_hyperparams["min_cluster_size"] = min_cluster_size_hdbscan
+        self.HDBSCAN_hyperparams["metric"] = metric_hdbscan
+        self.HDBSCAN_hyperparams["cluster_selection_method"] = cluster_selection_method_hdbscan
+        self.number_clusters_hdbscan = number_clusters_hdbscan
+        self.hdbscan = hdbscan.HDBSCAN(**self.HDBSCAN_hyperparams)
+
+    
+    def reduce_dimensions_umap(self, embeddings: np.ndarray) -> Tuple[np.ndarray, umap.UMAP]:
+        """
+        Reduces dimensions of embeddings using UMAP.
+
+        Args:
+            embeddings (np.ndarray): Embeddings to reduce.
+
+        Returns:
+            tuple: A tuple containing two items:
+                - reduced_embeddings (np.ndarray): Reduced embeddings.
+                - umap_mapper (umap.UMAP): UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER)
+        """
+
+        mapper = umap.UMAP(**self.UMAP_hyperparams).fit(embeddings)
+        dim_red_embeddings = mapper.transform(embeddings)
+        dim_red_embeddings = dim_red_embeddings/np.linalg.norm(dim_red_embeddings, axis=1).reshape(-1,1)
+        return dim_red_embeddings, mapper
+    
+    def cluster_hdbscan(self, embeddings: np.ndarray) -> np.ndarray:
+        """
+        Cluster embeddings using HDBSCAN.
+        
+        If self.number_clusters_hdbscan is not None, further clusters the data with AgglomerativeClustering to achieve a fixed number of clusters.
+
+        Args:
+            embeddings (np.ndarray): Embeddings to cluster.
+
+        Returns:
+            np.ndarray: Cluster labels.
+        """
+
+        labels = self.hdbscan.fit_predict(embeddings)
+        outliers = np.where(labels == -1)[0]
+
+        if self.number_clusters_hdbscan is not None:
+            clusterer = AgglomerativeClustering(n_clusters=self.number_clusters_hdbscan)  #one cluster for outliers  
+            labels = clusterer.fit_predict(embeddings)
+            labels[outliers] = -1
+
+        # reindex to make the labels consecutive numbers from -1 to the number of clusters. -1 is reserved for outliers
+        unique_labels = np.unique(labels)
+        unique_labels_no_outliers = unique_labels[unique_labels != -1]
+        map2newlabel = {label: i for i, label in enumerate(unique_labels_no_outliers)}
+        map2newlabel[-1] = -1
+        labels = np.array([map2newlabel[label] for label in labels])
+
+        return labels
+    
+    def cluster_and_reduce(self, embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray, umap.UMAP]:
+        """
+        Cluster embeddings using HDBSCAN and reduce dimensions with UMAP.
+
+        Args:
+            embeddings (np.ndarray): Embeddings to cluster and reduce.
+
+        Returns:
+            tuple: A tuple containing three items:
+                - reduced_embeddings (np.ndarray): Reduced embeddings.
+                - cluster_labels (np.ndarray): Cluster labels.
+                - umap_mapper (umap.UMAP): UMAP mapper for transforming new embeddings, especially embeddings of the vocabulary. (MAKE SURE TO NORMALIZE EMBEDDINGS AFTER USING THE MAPPER)
+        """
+
+        dim_red_embeddings, umap_mapper = self.reduce_dimensions_umap(embeddings)
+        clusters = self.cluster_hdbscan(dim_red_embeddings)
+        return dim_red_embeddings, clusters, umap_mapper
+    
+    def visualize_clusters_static(self, embeddings: np.ndarray, labels: np.ndarray):
+        """
+        Reduce dimensionality with UMAP to two dimensions and plot the clusters.
+
+        Args:
+            embeddings (np.ndarray): Embeddings for which to plot clustering.
+            labels (np.ndarray): Cluster labels.
+        """
+
+
+        # Reduce dimensionality with UMAP
+        reducer = umap.UMAP(n_components=2, random_state = self.random_state, n_neighbors=30, metric="cosine", min_dist=0)
+        embeddings_2d = reducer.fit_transform(embeddings)
+
+
+        # Create a color palette, then map the labels to the colors.
+        # We add one to the number of unique labels to account for the noise points labelled as -1.
+        palette = plt.cm.get_cmap("tab20", len(np.unique(labels)) + 1)
+        
+        # Create a new figure
+        fig, ax = plt.subplots(figsize=(10, 8))
+
+        outlier_shown_in_legend = False
+
+        # Iterate through all unique labels (clusters and outliers)
+        for label in np.unique(labels):
+            # Find the embeddings that are part of this cluster
+            cluster_points = embeddings_2d[labels == label]
+            
+            # If label is -1, these are outliers. We want to display them in grey.
+            if label == -1:
+                color = 'grey'
+                if not outlier_shown_in_legend:
+                    ax.scatter(cluster_points[:, 0], cluster_points[:, 1], c=color, label='outlier', s = 0.1)
+                    outlier_shown_in_legend = True
+                else:
+                    ax.scatter(cluster_points[:, 0], cluster_points[:, 1], c=color, s = 0.1)
+            else:
+                color = palette(label)
+                # Plot the points in this cluster without a label to prevent them from showing up in the legend
+                ax.scatter(cluster_points[:, 0], cluster_points[:, 1], c=color, s = 0.1)
+            
+        # Add a legend
+        ax.legend()
+
+        # Show the plot
+        plt.show()
+
+
+    def visualize_clusters_dynamic(self, embeddings: np.ndarray, labels: np.ndarray, texts: list[str], class_names: list[str] = None):
+        """
+        Visualize clusters using Plotly and enable hovering over clusters to see the beginning of the texts of the documents.
+
+        Args:
+            embeddings (np.ndarray): Embeddings for which to visualize clustering.
+            labels (np.ndarray): Cluster labels.
+            texts (list[str]): Texts of the documents.
+            class_names (list[str], optional): Names of the classes.
+        """
+
+
+        # Reduce dimensionality with UMAP
+        reducer = umap.UMAP(n_components=2, random_state = self.random_state, n_neighbors=30, metric="cosine", min_dist=0)
+        embeddings_2d = reducer.fit_transform(embeddings)
+
+        df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
+        df['text'] = [text[:200] for text in texts] 
+        df["class"] = labels
+
+        if class_names is not None:
+            df["class"] = [class_names[label] for label in labels]
+
+        # Create a color palette, then map the labels to the colors.
+        # Exclude the outlier (-1) label from color palette assignment
+        unique_labels = [label for label in np.unique(labels) if label != -1]
+        palette = plt.cm.get_cmap("tab20", len(unique_labels))
+
+        # Create color map
+        color_discrete_map = {label: 'rgb'+str(tuple(int(val*255) for val in palette(i)[:3])) if label != -1 else 'grey' for i, label in enumerate(unique_labels)}
+        color_discrete_map[-1] = 'grey'
+        
+        # plot data points where the color represents the class
+        fig = px.scatter(df, x='x', y='y', hover_data=['text', 'class'], color='class', color_discrete_map=color_discrete_map)
+        
+        fig.update_traces(mode='markers', marker=dict(size=3))  # Optional: Increase the marker size
+
+        # make plot quadratic
+        fig.update_layout(
+        autosize=False,
+        width=1500,
+        height=1500,
+        margin=dict(
+            l=50,   
+            r=50,
+            b=100,
+            t=100,
+            pad=4
+        )
+        )
+        # set title 
+        fig.update_layout(title_text='UMAP projection of the document embeddings', title_x=0.5)
+
+        
+        # show plot
+        fig.show()
+
+
+    def umap_diagnostics(self, embeddings, hammer_edges = False):
+        """
+        Fit UMAP on the provided embeddings and generate diagnostic plots.
+        
+        Params:
+        ------
+        embeddings : array-like
+            The high-dimensional data for UMAP to reduce and visualize.
+        hammer_edges : bool, default False. Is computationally expensive.
+            
+        """
+        new_hyperparams = deepcopy(self.UMAP_hyperparams)
+        new_hyperparams["n_components"] = 2
+        mapper = umap.UMAP(**new_hyperparams).fit(embeddings)
+
+        # 1. Connectivity plot with points
+        print("UMAP Connectivity Plot with Points")
+        umap.plot.connectivity(mapper, show_points=True)
+        plt.show()
+
+        if hammer_edges:
+            # 2. Connectivity plot with edge bundling
+            print("UMAP Connectivity Plot with Hammer Edge Bundling")
+            umap.plot.connectivity(mapper, edge_bundling='hammer')
+            plt.show()
+
+        # 3. PCA diagnostic plot
+        print("UMAP PCA Diagnostic Plot")
+        umap.plot.diagnostic(mapper, diagnostic_type='pca')
+        plt.show()
+
+        # 4. Local dimension diagnostic plot
+        print("UMAP Local Dimension Diagnostic Plot")
+        umap.plot.diagnostic(mapper, diagnostic_type='local_dim')
+        plt.show()