Add BERTopic.

2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,32 @@
+<svg width="652" height="186" viewBox="0 0 652 186" fill="none" xmlns="http://www.w3.org/2000/svg">
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="22" y="76.8636">Images</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="15.3008" y="41.9697">Embed text, </tspan><tspan x="8.82129" y="53.9697">images or both </tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="390.892" y="55.9697">For each topic, find the best matching images </tspan><tspan x="392.127" y="67.9697">based on the most representative documents</tspan></text>
+<rect x="14.5" y="59.5" width="56" height="27" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="29" y="175.864">Text</tspan></text>
+<rect x="14.5" y="158.5" width="56" height="27" stroke="black"/>
+<line x1="79" y1="123" x2="42" y2="123" stroke="black" stroke-width="2"/>
+<line x1="607" y1="73" x2="79" y2="73" stroke="black" stroke-width="2"/>
+<line x1="43" y1="97" x2="43" y2="122" stroke="black" stroke-width="2"/>
+<line x1="43" y1="123" x2="43" y2="152" stroke="black" stroke-width="2"/>
+<rect x="118" y="90" width="534" height="57" fill="white"/>
+<rect x="118.5" y="104.5" width="88" height="42" fill="white" stroke="black"/>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="134.238" y="158.97">clip-ViT-B-32</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="301" y="158.97">UMAP</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="437.149" y="158.97">HDBSCAN</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="586" y="158.97">c-TF-IDF</tspan></text>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="127" y="128.764">Embeddings</tspan></text>
+<rect x="260.5" y="104.5" width="105" height="42" fill="white" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="274.094" y="123.764">Dimensionality &#10;</tspan><tspan x="289.762" y="137.764">reduction</tspan></text>
+<path d="M244.707 123.707C245.098 123.317 245.098 122.683 244.707 122.293L238.343 115.929C237.953 115.538 237.319 115.538 236.929 115.929C236.538 116.319 236.538 116.953 236.929 117.343L242.586 123L236.929 128.657C236.538 129.047 236.538 129.681 236.929 130.071C237.319 130.462 237.953 130.462 238.343 130.071L244.707 123.707ZM217 124H244V122H217V124Z" fill="black"/>
+<path d="M104.707 123.707C105.098 123.317 105.098 122.683 104.707 122.293L98.3431 115.929C97.9526 115.538 97.3195 115.538 96.9289 115.929C96.5384 116.319 96.5384 116.953 96.9289 117.343L102.586 123L96.9289 128.657C96.5384 129.047 96.5384 129.681 96.9289 130.071C97.3195 130.462 97.9526 130.462 98.3431 130.071L104.707 123.707ZM77 124H104V122H77V124Z" fill="black"/>
+<rect x="413.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="435" y="128.764">Clustering</tspan></text>
+<path d="M403.707 123.707C404.098 123.317 404.098 122.683 403.707 122.293L397.343 115.929C396.953 115.538 396.319 115.538 395.929 115.929C395.538 116.319 395.538 116.953 395.929 117.343L401.586 123L395.929 128.657C395.538 129.047 395.538 129.681 395.929 130.071C396.319 130.462 396.953 130.462 397.343 130.071L403.707 123.707ZM376 124H403V122H376V124Z" fill="black"/>
+<rect x="560.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="590.404" y="120.764">Topic &#10;</tspan><tspan x="568.215" y="134.764">representation</tspan></text>
+<rect x="560.5" y="0.5" width="91" height="42" fill="white" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="565.232" y="16.7637">Representative &#10;</tspan><tspan x="587.785" y="30.7637">images</tspan></text>
+<path d="M544.707 123.707C545.098 123.317 545.098 122.683 544.707 122.293L538.343 115.929C537.953 115.538 537.319 115.538 536.929 115.929C536.538 116.319 536.538 116.953 536.929 117.343L542.586 123L536.929 128.657C536.538 129.047 536.538 129.681 536.929 130.071C537.319 130.462 537.953 130.462 538.343 130.071L544.707 123.707ZM517 124H544V122H517V124Z" fill="black"/>
+<path d="M607.707 51.2929C607.317 50.9024 606.683 50.9024 606.293 51.2929L599.929 57.6569C599.538 58.0474 599.538 58.6805 599.929 59.0711C600.319 59.4616 600.953 59.4616 601.343 59.0711L607 53.4142L612.657 59.0711C613.047 59.4616 613.681 59.4616 614.071 59.0711C614.462 58.6805 614.462 58.0474 614.071 57.6569L607.707 51.2929ZM608 98L608 52L606 52L606 98L608 98Z" fill="black"/>
+</svg>
@@ -0,0 +1,32 @@
+<svg width="803" height="169" viewBox="0 0 803 169" fill="none" xmlns="http://www.w3.org/2000/svg">
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Inter" font-size="12" letter-spacing="0em"><tspan x="21" y="127.864">Images</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="9.67578" y="158.97">Embed images</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="542.892" y="55.9697">For each topic, find the best matching images </tspan><tspan x="544.127" y="67.9697">based on the most representative documents</tspan></text>
+<rect x="13.5" y="104.5" width="60" height="42" stroke="black"/>
+<line x1="757" y1="73" x2="41" y2="73" stroke="black" stroke-width="2"/>
+<line x1="40" y1="94" x2="40" y2="72" stroke="black" stroke-width="2"/>
+<rect x="120" y="90" width="534" height="57" fill="white"/>
+<rect x="120.5" y="104.5" width="88" height="42" fill="white" stroke="black"/>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="136.238" y="158.97">clip-ViT-B-32</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="303" y="158.97">UMAP</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="439.149" y="158.97">HDBSCAN</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="557" y="158.97">vit-gpt2-image-captioning</tspan></text>
+<text fill="#757474" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="738" y="158.97">c-TF-IDF</tspan></text>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="129" y="128.764">Embeddings</tspan></text>
+<rect x="262.5" y="104.5" width="105" height="42" fill="white" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="276.094" y="123.764">Dimensionality &#10;</tspan><tspan x="291.762" y="137.764">reduction</tspan></text>
+<path d="M246.707 123.707C247.098 123.317 247.098 122.683 246.707 122.293L240.343 115.929C239.953 115.538 239.319 115.538 238.929 115.929C238.538 116.319 238.538 116.953 238.929 117.343L244.586 123L238.929 128.657C238.538 129.047 238.538 129.681 238.929 130.071C239.319 130.462 239.953 130.462 240.343 130.071L246.707 123.707ZM219 124H246V122H219V124Z" fill="black"/>
+<path d="M106.707 123.707C107.098 123.317 107.098 122.683 106.707 122.293L100.343 115.929C99.9526 115.538 99.3195 115.538 98.9289 115.929C98.5384 116.319 98.5384 116.953 98.9289 117.343L104.586 123L98.9289 128.657C98.5384 129.047 98.5384 129.681 98.9289 130.071C99.3195 130.462 99.9526 130.462 100.343 130.071L106.707 123.707ZM79 124H106V122H79V124Z" fill="black"/>
+<rect x="415.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="437" y="128.764">Clustering</tspan></text>
+<path d="M405.707 123.707C406.098 123.317 406.098 122.683 405.707 122.293L399.343 115.929C398.953 115.538 398.319 115.538 397.929 115.929C397.538 116.319 397.538 116.953 397.929 117.343L403.586 123L397.929 128.657C397.538 129.047 397.538 129.681 397.929 130.071C398.319 130.462 398.953 130.462 399.343 130.071L405.707 123.707ZM378 124H405V122H378V124Z" fill="black"/>
+<rect x="562.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="10" letter-spacing="0em"><tspan x="590.224" y="116.97">Caption &#10;</tspan><tspan x="574.11" y="128.97">Representative &#10;</tspan><tspan x="592.182" y="140.97">Images</tspan></text>
+<rect x="710.5" y="0.5" width="91" height="42" fill="white" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="715.232" y="16.7637">Representative &#10;</tspan><tspan x="737.785" y="30.7637">images</tspan></text>
+<path d="M546.707 123.707C547.098 123.317 547.098 122.683 546.707 122.293L540.343 115.929C539.953 115.538 539.319 115.538 538.929 115.929C538.538 116.319 538.538 116.953 538.929 117.343L544.586 123L538.929 128.657C538.538 129.047 538.538 129.681 538.929 130.071C539.319 130.462 539.953 130.462 540.343 130.071L546.707 123.707ZM519 124H546V122H519V124Z" fill="black"/>
+<rect x="711.5" y="104.5" width="91" height="42" fill="white" stroke="black"/>
+<text fill="black" xml:space="preserve" style="white-space: pre" font-family="Tahoma" font-size="12" letter-spacing="0em"><tspan x="741.404" y="120.764">Topic &#10;</tspan><tspan x="719.215" y="134.764">representation</tspan></text>
+<path d="M695.707 123.707C696.098 123.317 696.098 122.683 695.707 122.293L689.343 115.929C688.953 115.538 688.319 115.538 687.929 115.929C687.538 116.319 687.538 116.953 687.929 117.343L693.586 123L687.929 128.657C687.538 129.047 687.538 129.681 687.929 130.071C688.319 130.462 688.953 130.462 689.343 130.071L695.707 123.707ZM668 124H695V122H668V124Z" fill="black"/>
+<path d="M757.707 51.2929C757.317 50.9024 756.683 50.9024 756.293 51.2929L749.929 57.6569C749.538 58.0474 749.538 58.6805 749.929 59.0711C750.319 59.4616 750.953 59.4616 751.343 59.0711L757 53.4142L762.657 59.0711C763.047 59.4616 763.681 59.4616 764.071 59.0711C764.462 58.6805 764.462 58.0474 764.071 57.6569L757.707 51.2929ZM758 98L758 52L756 52L756 98L758 98Z" fill="black"/>
+</svg>
@@ -0,0 +1,190 @@
+Documents or text are often accompanied by imagery, or the other way around. For example, social media images come with captions and products come with descriptions. Topic modeling has traditionally focused on creating topics from textual representations. However, as more multimodal representations are created, the need for multimodal topics increases.
+
+BERTopic can perform **multimodal topic modeling** in a number of ways during the `.fit` and `.fit_transform` stages.
+
+## **Text + Images**
+
+The most basic example of multimodal topic modeling in BERTopic is when you have images that accompany your documents. This means that each document is expected to have an image and vice versa. Instagram pictures, for example, almost always come with a description.
+
+<figure markdown>
+  ![Pipeline for topic modeling with text and images](images_and_text.svg)
+  <figcaption></figcaption>
+</figure>
+
+In this example, we are going to use images from `flickr`, each of which has a caption associated with it:
+
+```python
+# NOTE: This requires the `datasets` package which you can
+# install with `pip install datasets`
+from datasets import load_dataset
+
+ds = load_dataset("maderix/flickr_bw_rgb")
+images = ds["train"]["image"]
+docs = ds["train"]["caption"]
+```
+
+The `docs` variable contains the captions for each image in `images`. We can now use these variables to run our multimodal example:
+
+!!! Tip
+    Do note that it is better to pass the paths of the images instead of the images themselves as there is no need to keep all images in memory. When passing the paths of the images, they are only opened temporarily when they are needed.
+
+```python
+from bertopic import BERTopic
+from bertopic.representation import VisualRepresentation
+
+# Additional ways of representing a topic
+visual_model = VisualRepresentation()
+
+# Make sure to add the `visual_model` to a dictionary
+representation_model = {
+   "Visual_Aspect":  visual_model,
+}
+topic_model = BERTopic(representation_model=representation_model, verbose=True)
+```
+
+In this example, we cluster the documents and then look for the images that best match the resulting clusters.
+
+We can now access our image representations for each topic with `topic_model.topic_aspects_["Visual_Aspect"]`.
+If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:
+
+```python
+import base64
+from io import BytesIO
+from IPython.display import HTML
+
+def image_base64(im):
+    if isinstance(im, str):
+        im = get_thumbnail(im)
+    with BytesIO() as buffer:
+        im.save(buffer, 'jpeg')
+        return base64.b64encode(buffer.getvalue()).decode()
+
+
+def image_formatter(im):
+    return f'<img src="data:image/jpeg;base64,{image_base64(im)}">'
+
+# Extract dataframe
+df = topic_model.get_topic_info().drop("Representative_Docs", 1).drop("Name", 1)
+
+# Visualize the images
+HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))
+```
+
+<br><br>
+<img src="images_and_text.jpg">
+<br><br>
+
+!!! Tip
+    In the example above, we are clustering the documents but since you have
+    images, you might want to cluster those or cluster an aggregation of both
+    images and documents. For that, you can use the new `MultiModalBackend`
+    to generate embeddings:
+
+    ```python
+    from bertopic.backend import MultiModalBackend
+    model = MultiModalBackend('clip-ViT-B-32', batch_size=32)
+
+    # Embed documents only
+    doc_embeddings = model.embed_documents(docs)
+
+    # Embed images only
+    image_embeddings = model.embed_images(images)
+
+    # Embed both images and documents, then average them
+    doc_image_embeddings = model.embed(docs, images)
+    ```
+
+## **Images Only**
+
+Traditional topic modeling techniques can only be run on textual data, as is shown in the example above. However, there are plenty of cases where textual data is not available but images are. BERTopic allows topic modeling to be performed using only images as your input data.
+
+<figure markdown>
+  ![Pipeline for topic modeling with images only](images_only.svg)
+  <figcaption></figcaption>
+</figure>
+
+To run BERTopic on images only, we first need to embed our images and then define a model that converts images to text. To do so, we are going to need some images. We will use the same images as above, but this time save them locally and pass their paths instead. As mentioned before, this makes sure that we do not hold too many images in memory when only a small subset is needed:
+
+
+```python
+import os
+import glob
+import zipfile
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+from sentence_transformers import util
+
+# Flickr 8k images
+img_folder = 'photos/'
+caps_folder = 'captions/'
+if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
+    os.makedirs(img_folder, exist_ok=True)
+
+    if not os.path.exists('Flickr8k_Dataset.zip'):   #Download dataset if does not exist
+        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip', 'Flickr8k_Dataset.zip')
+        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip', 'Flickr8k_text.zip')
+
+    for folder, file in [(img_folder, 'Flickr8k_Dataset.zip'), (caps_folder, 'Flickr8k_text.zip')]:
+        with zipfile.ZipFile(file, 'r') as zf:
+            for member in tqdm(zf.infolist(), desc='Extracting'):
+                zf.extract(member, folder)
+images = list(glob.glob('photos/Flicker8k_Dataset/*.jpg'))
+```
+
+Next, we define the models for our pipeline:
+
+
+```python
+from bertopic.representation import KeyBERTInspired, VisualRepresentation
+from bertopic.backend import MultiModalBackend
+
+# Image embedding model
+embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)
+
+# Image to text representation model
+representation_model = {
+    "Visual_Aspect": VisualRepresentation(image_to_text_model="nlpconnect/vit-gpt2-image-captioning")
+}
+
+```
+
+Using these models, we can run our pipeline:
+
+```python
+from bertopic import BERTopic
+
+# Train our model with images only
+topic_model = BERTopic(embedding_model=embedding_model, representation_model=representation_model, min_topic_size=30)
+topics, probs = topic_model.fit_transform(documents=None, images=images)
+```
+
+We can now access our image representations for each topic with `topic_model.topic_aspects_["Visual_Aspect"]`.
+If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:
+
+```python
+import base64
+from io import BytesIO
+from IPython.display import HTML
+
+def image_base64(im):
+    if isinstance(im, str):
+        im = get_thumbnail(im)
+    with BytesIO() as buffer:
+        im.save(buffer, 'jpeg')
+        return base64.b64encode(buffer.getvalue()).decode()
+
+
+def image_formatter(im):
+    return f'<img src="data:image/jpeg;base64,{image_base64(im)}">'
+
+# Extract dataframe
+df = topic_model.get_topic_info().drop("Representative_Docs", 1).drop("Name", 1)
+
+# Visualize the images
+HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))
+```
+
+<br><br>
+<img src="images_only.jpg">
+<br><br>