Add BERTopic.

This commit is contained in:
戒酒的李白
2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,60 @@
from ._base import BaseEmbedder
from ._word_doc import WordDocEmbedder
from ._utils import languages
from bertopic._utils import NotInstalled
# OpenAI Embeddings
try:
from bertopic.backend._openai import OpenAIBackend
except ModuleNotFoundError:
msg = "`pip install openai` \n\n"
OpenAIBackend = NotInstalled("OpenAI", "OpenAI", custom_msg=msg)
# Cohere Embeddings
try:
from bertopic.backend._cohere import CohereBackend
except ModuleNotFoundError:
msg = "`pip install cohere` \n\n"
CohereBackend = NotInstalled("Cohere", "Cohere", custom_msg=msg)
# Multimodal Embeddings
try:
from bertopic.backend._multimodal import MultiModalBackend
except ModuleNotFoundError:
msg = "`pip install bertopic[vision]` \n\n"
MultiModalBackend = NotInstalled("Vision", "Vision", custom_msg=msg)
# Model2Vec Embeddings
try:
from bertopic.backend._model2vec import Model2VecBackend
except ModuleNotFoundError:
msg = "`pip install model2vec` \n\n"
Model2VecBackend = NotInstalled("Model2Vec", "Model2Vec", custom_msg=msg)
# FastEmbed Embeddings
try:
from bertopic.backend._fastembed import FastEmbedBackend
except ModuleNotFoundError:
msg = "`pip install fastembed` \n\n"
FastEmbedBackend = NotInstalled("FastEmbed", "FastEmbed", custom_msg=msg)
# LangChain Embeddings
try:
from bertopic.backend._langchain import LangChainBackend
except ModuleNotFoundError:
msg = "`pip install langchain` \n\n"
LangChainBackend = NotInstalled("LangChain", "LangChain", custom_msg=msg)
__all__ = [
"BaseEmbedder",
"WordDocEmbedder",
"OpenAIBackend",
"CohereBackend",
"Model2VecBackend",
"MultiModalBackend",
"FastEmbedBackend",
"LangChainBackend",
"languages",
]
@@ -0,0 +1,62 @@
import numpy as np
from typing import List
class BaseEmbedder:
    """Common interface shared by all embedding backends.

    Subclasses are expected to override `embed`. The `embed_words` and
    `embed_documents` helpers forward to `embed` unless a subclass
    overrides them as well.

    Arguments:
        embedding_model: The main embedding model to be used for extracting
                         document and word embeddings
        word_embedding_model: The embedding model used for extracting word
                              embeddings only. If this model is selected,
                              then the `embedding_model` is purely used for
                              creating document embeddings.
    """

    def __init__(self, embedding_model=None, word_embedding_model=None):
        self.word_embedding_model = word_embedding_model
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        The base implementation is a placeholder that returns `None`;
        concrete backends provide the actual embedding logic.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        return None

    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words into an n-dimensional
        matrix of embeddings.

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Word embeddings with shape (n, m) with `n` words
            that each have an embeddings size of `m`
        """
        word_embeddings = self.embed(words, verbose)
        return word_embeddings

    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents into an n-dimensional
        matrix of embeddings.

        Arguments:
            document: A list of documents to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document embeddings with shape (n, m) with `n` documents
            that each have an embeddings size of `m`
        """
        document_embeddings = self.embed(document, verbose)
        return document_embeddings
@@ -0,0 +1,94 @@
import time
import numpy as np
from tqdm import tqdm
from typing import Any, List, Mapping
from bertopic.backend import BaseEmbedder
class CohereBackend(BaseEmbedder):
    """Cohere Embedding Model.

    Arguments:
        client: A `cohere` client.
        embedding_model: A Cohere model. Default is "large".
                         For an overview of models see:
                         https://docs.cohere.ai/docs/generation-card
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds between batches.
        batch_size: The size of each batch.
        embed_kwargs: Kwargs passed to `cohere.Client.embed`.
                      Can be used to define additional parameters
                      such as `input_type`. A truthy "model" entry
                      takes precedence over `embedding_model`.

    Examples:
    ```python
    import cohere
    from bertopic.backend import CohereBackend

    client = cohere.Client("APIKEY")
    cohere_model = CohereBackend(client)
    ```

    If you want to specify `input_type`:

    ```python
    cohere_model = CohereBackend(
        client,
        embedding_model="embed-english-v3.0",
        embed_kwargs={"input_type": "clustering"}
    )
    ```
    """

    def __init__(
        self,
        client,
        embedding_model: str = "large",
        delay_in_seconds: float = None,
        batch_size: int = None,
        embed_kwargs: Mapping[str, Any] = None,
    ):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size

        # Keep a private copy: the "model" key is written below, and writing
        # into a shared default (or the caller's own mapping) would leak the
        # model name of one instance into every later instance.
        self.embed_kwargs = dict(embed_kwargs) if embed_kwargs else {}

        if self.embed_kwargs.get("model"):
            self.embedding_model = self.embed_kwargs["model"]
        else:
            self.embed_kwargs["model"] = self.embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Batch-wise embedding extraction
        if self.batch_size is not None:
            embeddings = []
            for batch in tqdm(self._chunks(documents), disable=not verbose):
                response = self.client.embed(texts=batch, **self.embed_kwargs)
                embeddings.extend(response.embeddings)

                # Delay subsequent calls
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)

        # Extract embeddings all at once
        else:
            response = self.client.embed(texts=documents, **self.embed_kwargs)
            embeddings = response.embeddings
        return np.array(embeddings)

    def _chunks(self, documents):
        """Yield consecutive slices of `documents` of at most `batch_size` items."""
        for i in range(0, len(documents), self.batch_size):
            yield documents[i : i + self.batch_size]
@@ -0,0 +1,54 @@
import numpy as np
from typing import List
from fastembed import TextEmbedding
from bertopic.backend import BaseEmbedder
class FastEmbedBackend(BaseEmbedder):
    """FastEmbed embedding model.

    Wraps a FastEmbed `TextEmbedding` model for generating sentence embeddings.

    Arguments:
        embedding_model: Name of a FastEmbed model; it must be a string that
                         appears in `TextEmbedding.list_supported_models()`

    Examples:
    To create a model, you can load in a string pointing to a supported
    FastEmbed model:

    ```python
    from bertopic.backend import FastEmbedBackend

    sentence_model = FastEmbedBackend("BAAI/bge-small-en-v1.5")
    ```
    """

    def __init__(self, embedding_model: str = "BAAI/bge-small-en-v1.5"):
        super().__init__()

        known_models = [entry["model"] for entry in TextEmbedding.list_supported_models()]
        is_supported = isinstance(embedding_model, str) and embedding_model in known_models

        # Reject anything that is not a supported model name up front
        if not is_supported:
            raise ValueError(
                "Please select a correct FasteEmbed model: \n"
                "the model must be a string and must be supported. \n"
                "The supported TextEmbedding model list is here: https://qdrant.github.io/fastembed/examples/Supported_Models/"
            )
        self.embedding_model = TextEmbedding(model_name=embedding_model)

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Forwarded to the model as `show_progress_bar`

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # The model's `embed` result is iterated into a list before stacking
        vectors = list(self.embedding_model.embed(documents, show_progress_bar=verbose))
        return np.array(vectors)
@@ -0,0 +1,78 @@
import numpy as np
from tqdm import tqdm
from typing import Union, List
from flair.data import Sentence
from flair.embeddings import DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings
from bertopic.backend import BaseEmbedder
class FlairBackend(BaseEmbedder):
    """Flair Embedding Model.

    The Flair embedding model used for generating document and
    word embeddings.

    Arguments:
        embedding_model: A Flair embedding model; token embeddings are wrapped
                         in a `DocumentPoolEmbeddings`, document embeddings
                         are used directly

    Examples:
    ```python
    from bertopic.backend import FlairBackend
    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

    # Create a Flair Embedding model
    glove_embedding = WordEmbeddings('crawl')
    document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])

    # Pass the Flair model to create a new backend
    flair_embedder = FlairBackend(document_glove_embeddings)
    ```
    """

    def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]):
        super().__init__()

        # Flair word embeddings: pool them into document-level embeddings
        if isinstance(embedding_model, TokenEmbeddings):
            self.embedding_model = DocumentPoolEmbeddings([embedding_model])
            return

        # Flair document embeddings + disable fine tune to prevent CUDA OOM
        # https://github.com/flairNLP/flair/issues/1719
        if isinstance(embedding_model, DocumentEmbeddings):
            if "fine_tune" in vars(embedding_model):
                embedding_model.fine_tune = False
            self.embedding_model = embedding_model
            return

        raise ValueError(
            "Please select a correct Flair model by either using preparing a token or document "
            "embedding model: \n"
            "`from flair.embeddings import TransformerDocumentEmbeddings` \n"
            "`roberta = TransformerDocumentEmbeddings('roberta-base')`"
        )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Falsy documents, and documents for which embedding raises a
        `RuntimeError`, are embedded as the placeholder text
        "an empty document".

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        placeholder = "an empty document"
        vectors = []
        for document in tqdm(documents, disable=not verbose):
            try:
                sentence = Sentence(document if document else placeholder)
                self.embedding_model.embed(sentence)
            except RuntimeError:
                # Retry with the placeholder so every document gets a vector
                sentence = Sentence(placeholder)
                self.embedding_model.embed(sentence)
            vectors.append(sentence.embedding.detach().cpu().numpy())
        return np.asarray(vectors)
@@ -0,0 +1,69 @@
import numpy as np
from tqdm import tqdm
from typing import List
from bertopic.backend import BaseEmbedder
from gensim.models.keyedvectors import Word2VecKeyedVectors
class GensimBackend(BaseEmbedder):
    """Gensim Embedding Model.

    The Gensim embedding model is typically used for word embeddings with
    GloVe, Word2Vec or FastText.

    Arguments:
        embedding_model: A Gensim embedding model

    Examples:
    ```python
    from bertopic.backend import GensimBackend
    import gensim.downloader as api

    ft = api.load('fasttext-wiki-news-subwords-300')
    ft_embedder = GensimBackend(ft)
    ```
    """

    def __init__(self, embedding_model: Word2VecKeyedVectors):
        super().__init__()

        if isinstance(embedding_model, Word2VecKeyedVectors):
            self.embedding_model = embedding_model
        else:
            raise ValueError(
                "Please select a correct Gensim model: \n"
                "`import gensim.downloader as api` \n"
                "`ft = api.load('fasttext-wiki-news-subwords-300')`"
            )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Each document is split on whitespace and the vectors of the
        in-vocabulary words are averaged. Documents without any known word
        receive an all-zero vector.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        keyed_vectors = self.embedding_model

        # Read the dimensionality from the model instead of copying the whole
        # vocabulary list on every call just to look up its first entry
        empty_vector = np.zeros(keyed_vectors.vector_size)

        # Extract word embeddings and pool to document-level
        embeddings = []
        for doc in tqdm(documents, disable=not verbose, position=0, leave=True):
            word_vectors = [
                keyed_vectors.get_vector(word) for word in doc.split() if word in keyed_vectors.key_to_index
            ]

            if word_vectors:
                embeddings.append(np.mean(word_vectors, axis=0))
            else:
                embeddings.append(empty_vector)

        return np.array(embeddings)
@@ -0,0 +1,104 @@
import numpy as np
from tqdm import tqdm
from typing import List
from torch.utils.data import Dataset
from sklearn.preprocessing import normalize
from transformers.pipelines import Pipeline
from bertopic.backend import BaseEmbedder
class HFTransformerBackend(BaseEmbedder):
    """Hugging Face transformers model.

    This uses the `transformers.pipelines.pipeline` to define and create
    a feature generation pipeline from which embeddings can be extracted.

    Arguments:
        embedding_model: A Hugging Face feature extraction pipeline

    Examples:
    To use a Hugging Face transformers model, load in a pipeline and point
    to any model found on their model hub (https://huggingface.co/models):

    ```python
    from bertopic.backend import HFTransformerBackend
    from transformers.pipelines import pipeline

    hf_model = pipeline("feature-extraction", model="distilbert-base-cased")
    embedding_model = HFTransformerBackend(hf_model)
    ```
    """

    def __init__(self, embedding_model: Pipeline):
        super().__init__()

        if not isinstance(embedding_model, Pipeline):
            raise ValueError(
                "Please select a correct transformers pipeline. For example: "
                "pipeline('feature-extraction', model='distilbert-base-cased', device=0)"
            )
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        dataset = MyDataset(documents)

        # Pair every document with the token features the pipeline yields for it
        paired = zip(documents, self.embedding_model(dataset, truncation=True, padding=True))
        progress = tqdm(paired, total=len(dataset), disable=not verbose)

        pooled = []
        for document, features in progress:
            pooled.append(self._embed(document, features))
        return np.array(pooled)

    def _embed(self, document: str, features: np.ndarray) -> np.ndarray:
        """Mean pooling.

        Averages the token embeddings weighted by the attention mask obtained
        from re-tokenizing `document`, then normalizes the result.

        Arguments:
            document: The document for which to extract the attention mask
            features: The embeddings for each token

        Adopted from:
        https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers
        """
        token_embeddings = np.array(features)

        encoded = self.embedding_model.tokenizer(document, truncation=True, padding=True, return_tensors="np")
        attention_mask = encoded["attention_mask"]

        # Stretch the mask over the embedding dimension so it can weight every value
        mask = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
        summed = np.sum(token_embeddings * mask, 1)

        # Lower-bound the token counts to avoid dividing by zero
        token_counts = mask.sum(1)
        safe_counts = np.clip(token_counts, a_min=1e-9, a_max=token_counts.max())

        return normalize(summed / safe_counts)[0]
class MyDataset(Dataset):
    """Thin dataset wrapper used to feed documents to `transformers.pipelines.pipeline`."""

    def __init__(self, docs):
        # Stored as given; indexing and length are delegated to this object
        self.docs = docs

    def __len__(self):
        """Return the number of wrapped documents."""
        n_docs = len(self.docs)
        return n_docs

    def __getitem__(self, idx):
        """Return the document stored at position `idx`."""
        doc = self.docs[idx]
        return doc
@@ -0,0 +1,43 @@
from typing import List
import numpy as np
from bertopic.backend import BaseEmbedder
from langchain_core.embeddings import Embeddings
class LangChainBackend(BaseEmbedder):
    """LangChain Embedding Model.

    This class uses the LangChain Embedding class to embed the documents.

    Arguments:
        embedding_model: A LangChain Embedding Instance.

    Examples:
    ```python
    from langchain_community.embeddings import HuggingFaceInstructEmbeddings
    from bertopic.backend import LangChainBackend

    hf_embedding = HuggingFaceInstructEmbeddings()
    langchain_embedder = LangChainBackend(hf_embedding)
    ```
    """

    def __init__(self, embedding_model: Embeddings):
        # Initialize the base class like every other backend does so that
        # `word_embedding_model` exists on the instance
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Accepted for API consistency; not used by this backend

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Prepare documents, replacing empty strings with a single space
        prepared_documents = [" " if doc == "" else doc for doc in documents]
        response = self.embedding_model.embed_documents(prepared_documents)
        return np.array(response)
@@ -0,0 +1,129 @@
import numpy as np
from typing import List, Union
from model2vec import StaticModel
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.backend import BaseEmbedder
class Model2VecBackend(BaseEmbedder):
    """Model2Vec embedding model.

    Arguments:
        embedding_model: Either a model2vec model or a
                         string pointing to a model2vec model
        distill: Indicates whether to distill a sentence-transformers compatible model.
                 The distillation will happen during fitting of the topic model.
                 NOTE: Only works if `embedding_model` is a string.
        distill_kwargs: Keyword arguments to pass to the distillation process
                        of `model2vec.distill.distill`
        distill_vectorizer: A CountVectorizer used for creating a custom vocabulary
                            based on the same documents used for topic modeling.
                            NOTE: If "vocabulary" is in `distill_kwargs`, this will be ignored.

    Examples:
    To create a model, you can load in a string pointing to a
    model2vec model:

    ```python
    from bertopic.backend import Model2VecBackend

    sentence_model = Model2VecBackend("minishlab/potion-base-8M")
    ```

    or you can instantiate a model yourself:

    ```python
    from bertopic.backend import Model2VecBackend
    from model2vec import StaticModel

    embedding_model = StaticModel.from_pretrained("minishlab/potion-base-8M")
    sentence_model = Model2VecBackend(embedding_model)
    ```

    If you want to distill a sentence-transformers model with the vocabulary of the documents,
    run the following:

    ```python
    from bertopic.backend import Model2VecBackend

    sentence_model = Model2VecBackend("sentence-transformers/all-MiniLM-L6-v2", distill=True)
    ```
    """

    def __init__(
        self,
        embedding_model: Union[str, StaticModel],
        distill: bool = False,
        distill_kwargs: dict = None,
        distill_vectorizer: CountVectorizer = None,
    ):
        super().__init__()

        self.distill = distill
        # Keep a private copy: `embed` stores the derived vocabulary in this
        # dict, which must not leak into other instances or the caller's dict
        self.distill_kwargs = dict(distill_kwargs) if distill_kwargs else {}
        self.distill_vectorizer = distill_vectorizer
        self._has_distilled = False

        # When we distill, we need a string pointing to a sentence-transformer model
        if self.distill:
            self._check_model2vec_installation()
            if not self.distill_vectorizer:
                self.distill_vectorizer = CountVectorizer()
            if isinstance(embedding_model, str):
                self.embedding_model = embedding_model
            else:
                raise ValueError("Please pass a string pointing to a sentence-transformer model when distilling.")

        # If we don't distill, we can pass a model2vec model directly or load from a string
        elif isinstance(embedding_model, StaticModel):
            self.embedding_model = embedding_model
        elif isinstance(embedding_model, str):
            self.embedding_model = StaticModel.from_pretrained(embedding_model)
        else:
            raise ValueError(
                "Please select a correct Model2Vec model: \n"
                "`from model2vec import StaticModel` \n"
                "`model = StaticModel.from_pretrained('minishlab/potion-base-8M')`"
            )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        When `distill` is enabled, the first call distills the model before
        embedding; later calls reuse the distilled model.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Distill the model
        if self.distill and not self._has_distilled:
            from model2vec.distill import distill

            # Distill with the vocabulary of the documents, most frequent words first
            if not self.distill_kwargs.get("vocabulary"):
                X = self.distill_vectorizer.fit_transform(documents)
                word_counts = np.array(X.sum(axis=0)).flatten()
                words = self.distill_vectorizer.get_feature_names_out()
                vocabulary = [word for word, _ in sorted(zip(words, word_counts), key=lambda x: x[1], reverse=True)]
                self.distill_kwargs["vocabulary"] = vocabulary

            # Distill the model
            self.embedding_model = distill(self.embedding_model, **self.distill_kwargs)

            # Distillation should happen only once and not for every embed call
            # The distillation should only happen the first time on the entire vocabulary
            self._has_distilled = True

        # Embed the documents
        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
        return embeddings

    def _check_model2vec_installation(self):
        """Raise an `ImportError` with install instructions if `model2vec.distill` is unavailable."""
        try:
            from model2vec.distill import distill  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "To distill a model using model2vec, you need to run `pip install model2vec[distill]`"
            ) from err
@@ -0,0 +1,200 @@
import numpy as np
from PIL import Image
from tqdm import tqdm
from typing import List, Union
from sentence_transformers import SentenceTransformer
from bertopic.backend import BaseEmbedder
class MultiModalBackend(BaseEmbedder):
    """Multimodal backend using Sentence-transformers.

    The sentence-transformers embedding model used for
    generating word, document, and image embeddings.

    Arguments:
        embedding_model: A sentence-transformers embedding model that
                         can either embed both images and text or only text.
                         If it only embeds text, then `image_model` needs
                         to be used to embed the images.
        image_model: A sentence-transformers embedding model that is used
                     to embed only images.
        batch_size: The sizes of image batches to pass

    Examples:
    To create a model, you can load in a string pointing to a
    sentence-transformers model:

    ```python
    from bertopic.backend import MultiModalBackend

    sentence_model = MultiModalBackend("clip-ViT-B-32")
    ```

    or you can instantiate a model yourself:

    ```python
    from bertopic.backend import MultiModalBackend
    from sentence_transformers import SentenceTransformer

    embedding_model = SentenceTransformer("clip-ViT-B-32")
    sentence_model = MultiModalBackend(embedding_model)
    ```
    """

    def __init__(
        self,
        embedding_model: Union[str, SentenceTransformer],
        image_model: Union[str, SentenceTransformer] = None,
        batch_size: int = 32,
    ):
        super().__init__()
        self.batch_size = batch_size

        # Text or Text+Image model
        self.embedding_model = self._as_sentence_transformer(embedding_model)

        # Image Model
        self.image_model = None
        if image_model is not None:
            self.image_model = self._as_sentence_transformer(image_model)

        # Prefer the tokenizer of the first module's processor, fall back to
        # the model's own tokenizer, and go without one if the lookup fails
        # for any other reason
        try:
            self.tokenizer = self.embedding_model._first_module().processor.tokenizer
        except AttributeError:
            self.tokenizer = self.embedding_model.tokenizer
        except Exception:
            self.tokenizer = None

    @staticmethod
    def _as_sentence_transformer(model):
        """Return `model` as a `SentenceTransformer`, loading it when given as a string."""
        if isinstance(model, SentenceTransformer):
            return model
        if isinstance(model, str):
            return SentenceTransformer(model)
        raise ValueError(
            "Please select a correct SentenceTransformers model: \n"
            "`from sentence_transformers import SentenceTransformer` \n"
            "`model = SentenceTransformer('clip-ViT-B-32')`"
        )

    def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words or images into an n-dimensional
        matrix of embeddings.

        Either documents, images, or both can be provided. If both are provided,
        then the embeddings are averaged.

        Arguments:
            documents: A list of documents or words to be embedded. Documents
                       are skipped when the list is empty, `None`, or starts
                       with `None`.
            images: A list of image paths (or already loaded images) to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`, or `None` when neither
            documents nor images were embedded
        """
        # Embed documents
        doc_embeddings = None
        if documents is not None and len(documents) > 0 and documents[0] is not None:
            doc_embeddings = self.embed_documents(documents, verbose)

        # Embed images
        image_embeddings = None
        if isinstance(images, list):
            image_embeddings = self.embed_images(images, verbose)

        # Average embeddings
        if doc_embeddings is not None and image_embeddings is not None:
            return np.mean([doc_embeddings, image_embeddings], axis=0)
        if doc_embeddings is not None:
            return doc_embeddings
        return image_embeddings

    def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Documents are truncated with `_truncate_document` before encoding.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        truncated_docs = [self._truncate_document(doc) for doc in documents]
        embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
        return embeddings

    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words into an n-dimensional
        matrix of embeddings.

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        embeddings = self.embedding_model.encode(words, show_progress_bar=verbose)
        return embeddings

    def embed_images(self, images, verbose):
        """Embed images, in batches of `batch_size` when a batch size is set.

        Arguments:
            images: A list of image paths and/or already loaded images
            verbose: Controls whether a progress bar over the batches is shown

        Returns:
            The image embeddings as returned by the image model (or the main
            embedding model when no image model is set); batched results are
            collected into a single array
        """
        if not self.batch_size:
            return self._encode_images(images)

        nr_iterations = int(np.ceil(len(images) / self.batch_size))

        # Embed images per batch
        embeddings = []
        for i in tqdm(range(nr_iterations), disable=not verbose):
            start_index = i * self.batch_size
            batch = images[start_index : start_index + self.batch_size]
            embeddings.extend(self._encode_images(batch).tolist())
        return np.array(embeddings)

    def _encode_images(self, images):
        """Load anything in `images` that is not yet an image, encode, then close what was opened here."""
        opened = []
        try:
            loaded = []
            for image in images:
                if not isinstance(image, Image.Image):
                    image = Image.open(image)
                    opened.append(image)
                loaded.append(image)

            if self.image_model is not None:
                return self.image_model.encode(loaded)
            return self.embedding_model.encode(loaded, show_progress_bar=False)
        finally:
            # Only close files opened by this method; images handed in by the
            # caller stay under the caller's control
            for image in opened:
                image.close()

    def _truncate_document(self, document):
        """Shorten `document` until its tokenization has at most 77 tokens.

        Documents are returned unchanged when no tokenizer is available.
        """
        if self.tokenizer:
            tokens = self.tokenizer.encode(document)

            # NOTE(review): 77 is presumably the context length of the CLIP
            # text encoder — confirm against the model in use
            if len(tokens) > 77:
                # Skip the starting token, only include 75 tokens
                truncated_tokens = tokens[1:76]
                document = self.tokenizer.decode(truncated_tokens)

                # Recursive call here, because the encode(decode()) can have different result
                return self._truncate_document(document)

        return document
@@ -0,0 +1,88 @@
import time
import openai
import numpy as np
from tqdm import tqdm
from typing import List, Mapping, Any
from bertopic.backend import BaseEmbedder
class OpenAIBackend(BaseEmbedder):
    """OpenAI Embedding Model.

    Arguments:
        client: A `openai.OpenAI` client.
        embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
                         For an overview of models see:
                         https://platform.openai.com/docs/models/embeddings
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds between batches.
        batch_size: The size of each batch.
        generator_kwargs: Kwargs passed to `client.embeddings.create`.
                          Can be used to define custom engines or
                          deployment_ids. A truthy "model" entry takes
                          precedence over `embedding_model`.

    Examples:
    ```python
    import openai
    from bertopic.backend import OpenAIBackend

    client = openai.OpenAI(api_key="sk-...")
    openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
    ```
    """

    def __init__(
        self,
        client: openai.OpenAI,
        embedding_model: str = "text-embedding-ada-002",
        delay_in_seconds: float = None,
        batch_size: int = None,
        generator_kwargs: Mapping[str, Any] = None,
    ):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size

        # Keep a private copy: the "model" key may be written below, and
        # writing into a shared default (or the caller's own mapping) would
        # leak the model name of one instance into every later instance.
        self.generator_kwargs = dict(generator_kwargs) if generator_kwargs else {}

        if self.generator_kwargs.get("model"):
            self.embedding_model = self.generator_kwargs["model"]
        elif not self.generator_kwargs.get("engine"):
            # Only fill in the model when no custom engine was requested
            self.generator_kwargs["model"] = self.embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Prepare documents, replacing empty strings with a single space
        prepared_documents = [" " if doc == "" else doc for doc in documents]

        # Batch-wise embedding extraction
        if self.batch_size is not None:
            embeddings = []
            for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
                response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
                embeddings.extend([r.embedding for r in response.data])

                # Delay subsequent calls
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)

        # Extract embeddings all at once
        else:
            response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
            embeddings = [r.embedding for r in response.data]
        return np.array(embeddings)

    def _chunks(self, documents):
        """Yield consecutive slices of `documents` of at most `batch_size` items."""
        for i in range(0, len(documents), self.batch_size):
            yield documents[i : i + self.batch_size]
@@ -0,0 +1,85 @@
import numpy as np
from typing import List, Union
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding
from bertopic.backend import BaseEmbedder
class SentenceTransformerBackend(BaseEmbedder):
    """Sentence-transformers embedding model.

    The sentence-transformers embedding model used for generating document and
    word embeddings.

    Arguments:
        embedding_model: A sentence-transformers embedding model
        model2vec: Indicates whether `embedding_model` is a model2vec model.
                   NOTE: Only works if `embedding_model` is a string.
                   Otherwise, you can pass the model2vec model directly to `embedding_model`.

    Examples:
    To create a model, you can load in a string pointing to a
    sentence-transformers model:

    ```python
    from bertopic.backend import SentenceTransformerBackend

    sentence_model = SentenceTransformerBackend("all-MiniLM-L6-v2")
    ```

    or you can instantiate a model yourself:

    ```python
    from bertopic.backend import SentenceTransformerBackend
    from sentence_transformers import SentenceTransformer

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    sentence_model = SentenceTransformerBackend(embedding_model)
    ```

    If you want to use a model2vec model without having to install model2vec,
    you can pass the model2vec model as a string:

    ```python
    from bertopic.backend import SentenceTransformerBackend

    sentence_model = SentenceTransformerBackend("minishlab/potion-base-8M", model2vec=True)
    ```
    """

    def __init__(self, embedding_model: Union[str, SentenceTransformer], model2vec: bool = False):
        super().__init__()

        # Only set when a regular sentence-transformers model is loaded by name
        self._hf_model = None
        given_as_name = isinstance(embedding_model, str)

        if model2vec and given_as_name:
            # Wrap the model2vec weights in a static-embedding module
            static_module = StaticEmbedding.from_model2vec(embedding_model)
            self.embedding_model = SentenceTransformer(modules=[static_module])
        elif isinstance(embedding_model, SentenceTransformer):
            self.embedding_model = embedding_model
        elif given_as_name:
            self.embedding_model = SentenceTransformer(embedding_model)
            self._hf_model = embedding_model
        else:
            raise ValueError(
                "Please select a correct SentenceTransformers model: \n"
                "`from sentence_transformers import SentenceTransformer` \n"
                "`model = SentenceTransformer('all-MiniLM-L6-v2')`"
            )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Forwarded to the model as `show_progress_bar`

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        return self.embedding_model.encode(documents, show_progress_bar=verbose)
@@ -0,0 +1,68 @@
from bertopic.backend import BaseEmbedder
from sklearn.utils.validation import check_is_fitted, NotFittedError
class SklearnEmbedder(BaseEmbedder):
    """Embedding backend that wraps a scikit-learn pipeline.

    Any scikit-learn pipeline that can `.transform()` raw text can be used to
    produce document and word embeddings.

    Arguments:
        pipe: A scikit-learn pipeline that can `.transform()` text.

    Examples:
    A relatively simple pipeline builds a sparse TF-IDF representation and
    makes it dense by applying SVD:

    ```python
    from bertopic import BERTopic
    from sklearn.pipeline import make_pipeline
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    from bertopic.backend import SklearnEmbedder

    pipe = make_pipeline(
        TfidfVectorizer(),
        TruncatedSVD(100)
    )

    sklearn_embedder = SklearnEmbedder(pipe)
    topic_model = BERTopic(embedding_model=sklearn_embedder)
    ```

    More elaborate pipelines work as well. As long as you construct a
    scikit-learn compatible pipeline, you should be able to pass it to BERTopic.

    !!! Warning
        One caveat to be aware of is that scikit-learn's base `Pipeline` class does
        not support the `.partial_fit()`-API. If you have a pipeline that
        theoretically should be able to support online learning then you might want
        to explore the [scikit-partial](https://github.com/koaning/scikit-partial) project.
    """

    def __init__(self, pipe):
        super().__init__()
        self.pipe = pipe

    def embed(self, documents, verbose=False):
        """Embed documents or words with the wrapped pipeline.

        A pipeline that is already fitted is only applied; an unfitted one is
        fitted on `documents` first.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Unused; kept so the signature matches the other backends.
                     Use the scikit-learn API for feedback on training times.

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        try:
            # Raises NotFittedError when the pipeline has not been fitted yet.
            check_is_fitted(self.pipe)
            return self.pipe.transform(documents)
        except NotFittedError:
            # First use: fit the pipeline on these documents and embed them.
            return self.pipe.fit_transform(documents)
@@ -0,0 +1,94 @@
import numpy as np
from tqdm import tqdm
from typing import List
from bertopic.backend import BaseEmbedder
class SpacyBackend(BaseEmbedder):
    """Spacy embedding model.

    The Spacy embedding model used for generating document and
    word embeddings.

    Arguments:
        embedding_model: A loaded spaCy `nlp` object. Model names given as
                         strings are not accepted by this backend.

    Examples:
    To create a Spacy backend, you need to create an nlp object and
    pass it through this backend:

    ```python
    import spacy
    from bertopic.backend import SpacyBackend

    nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    spacy_model = SpacyBackend(nlp)
    ```

    To load in a transformer model use the following:

    ```python
    import spacy
    from thinc.api import set_gpu_allocator, require_gpu
    from bertopic.backend import SpacyBackend

    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    set_gpu_allocator("pytorch")
    require_gpu(0)
    spacy_model = SpacyBackend(nlp)
    ```

    If you run into gpu/memory-issues, please use:

    ```python
    import spacy
    from bertopic.backend import SpacyBackend

    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    spacy_model = SpacyBackend(nlp)
    ```
    """

    def __init__(self, embedding_model):
        """Store the spaCy pipeline after checking that it is a spaCy object.

        Raises:
            ValueError: If the type name of `embedding_model` does not
                        contain "spacy" (this includes plain strings).
        """
        super().__init__()

        # The model is recognised by its type name so that spaCy itself does
        # not have to be imported by this module.
        if "spacy" in str(type(embedding_model)):
            self.embedding_model = embedding_model
        else:
            # Only loaded pipelines pass the check above, so the message must
            # not suggest that a model name string is accepted.
            raise ValueError(
                "Please select a correct Spacy model by creating a nlp model using: "
                "`nlp = spacy.load('en_core_web_md')` and passing it as `SpacyBackend(nlp)`"
            )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Handle empty documents, spaCy models automatically map
        # empty strings to the zero vector
        empty_document = " "

        # Extract embeddings one document at a time
        embeddings = []
        for doc in tqdm(documents, position=0, leave=True, disable=not verbose):
            embedding = self.embedding_model(doc or empty_document)
            if embedding.has_vector:
                embedding = embedding.vector
            else:
                # No document vector available: fall back to the first row of
                # the last tensor in `trf_data` (presumably the transformer
                # pipeline output -- confirm against the spaCy version in use).
                embedding = embedding._.trf_data.tensors[-1][0]

            if not isinstance(embedding, np.ndarray) and hasattr(embedding, "get"):
                # Convert cupy array to numpy array
                embedding = embedding.get()
            embeddings.append(embedding)

        return np.array(embeddings)
@@ -0,0 +1,55 @@
import numpy as np
from tqdm import tqdm
from typing import List
from bertopic.backend import BaseEmbedder
class USEBackend(BaseEmbedder):
    """Universal Sentence Encoder backend.

    USE encodes text into high-dimensional vectors that
    are used for semantic similarity in BERTopic.

    Arguments:
        embedding_model: An USE embedding model

    Examples:
    ```python
    import tensorflow_hub
    from bertopic.backend import USEBackend

    embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    use_embedder = USEBackend(embedding_model)
    ```
    """

    def __init__(self, embedding_model):
        """Store the model after checking that it can be called on a list of text.

        Raises:
            ValueError: If calling the model on a test sentence raises `TypeError`.
        """
        super().__init__()

        try:
            # Probe the model once with a dummy sentence before accepting it.
            embedding_model(["test sentence"])
        except TypeError:
            raise ValueError(
                "Please select a correct USE model: \n"
                "`import tensorflow_hub` \n"
                "`embedding_model = tensorflow_hub.load(path_to_model)`"
            )
        else:
            self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed documents or words one at a time with the USE model.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        vectors = []
        for text in tqdm(documents, disable=not verbose):
            # The model is called on a single-item batch; keep its only row.
            encoded = self.embedding_model([text])
            vectors.append(encoded.cpu().numpy()[0])
        return np.array(vectors)
@@ -0,0 +1,171 @@
from ._base import BaseEmbedder
# Imports for light-weight variant of BERTopic
from bertopic.backend._sklearn import SklearnEmbedder
from bertopic._utils import MyLogger
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline as ScikitPipeline
# Module-level logger. It is configured with "WARNING" here; `select_backend`
# sets the level again on every call depending on its `verbose` argument.
logger = MyLogger()
logger.configure("WARNING")

# Lower-case language names that `select_backend` accepts for its `language`
# argument. The argument is lower-cased before it is looked up in this list.
languages = [
    "arabic",
    "bulgarian",
    "catalan",
    "czech",
    "danish",
    "german",
    "greek",
    "english",
    "spanish",
    "estonian",
    "persian",
    "finnish",
    "french",
    "canadian french",
    "galician",
    "gujarati",
    "hebrew",
    "hindi",
    "croatian",
    "hungarian",
    "armenian",
    "indonesian",
    "italian",
    "japanese",
    "georgian",
    "korean",
    "kurdish",
    "lithuanian",
    "latvian",
    "macedonian",
    "mongolian",
    "marathi",
    "malay",
    "burmese",
    "norwegian bokmal",
    "dutch",
    "polish",
    "portuguese",
    "brazilian portuguese",
    "romanian",
    "russian",
    "slovak",
    "slovenian",
    "albanian",
    "serbian",
    "swedish",
    "thai",
    "turkish",
    "ukrainian",
    "urdu",
    "vietnamese",
    "chinese (simplified)",
    "chinese (traditional)",
]
def select_backend(embedding_model, language: str = None, verbose: bool = False) -> BaseEmbedder:
    """Select an embedding model based on language or a specific provided model.

    When selecting a language, we choose all-MiniLM-L6-v2 for English and
    paraphrase-multilingual-MiniLM-L12-v2 for all other languages as it support 100+ languages.
    If sentence-transformers is not installed, in the case of a lightweight installation,
    a scikit-learn backend is default.

    Arguments:
        embedding_model: A `BaseEmbedder`, a scikit-learn pipeline, a model name
                         (string), or a model object from one of the supported
                         libraries. Anything else falls through to the
                         language-based selection.
        language: Language name used to pick a default model when
                  `embedding_model` is not recognised. Matched case-insensitively.
        verbose: Sets the module logger to "INFO" instead of "WARNING".

    Returns:
        model: The selected model backend.

    Raises:
        ValueError: If `language` is given but not supported.
        ModuleNotFoundError: If an optional dependency other than
                             `sentence_transformers` itself is missing.
    """
    logger.set_level("INFO" if verbose else "WARNING")

    # BERTopic language backend
    if isinstance(embedding_model, BaseEmbedder):
        return embedding_model

    # Scikit-learn backend
    if isinstance(embedding_model, ScikitPipeline):
        return SklearnEmbedder(embedding_model)

    # Third-party models are recognised by their fully-qualified type name so
    # that the optional dependencies are only imported when actually needed.
    model_type = str(type(embedding_model))

    # Flair word embeddings
    if "flair" in model_type:
        from bertopic.backend._flair import FlairBackend

        return FlairBackend(embedding_model)

    # Spacy embeddings
    if "spacy" in model_type:
        from bertopic.backend._spacy import SpacyBackend

        return SpacyBackend(embedding_model)

    # Gensim embeddings
    if "gensim" in model_type:
        from bertopic.backend._gensim import GensimBackend

        return GensimBackend(embedding_model)

    # USE embeddings
    # Both substrings must be tested explicitly: `"a" and "b" in s` only
    # evaluates `"b" in s` because a non-empty string literal is truthy.
    if "tensorflow" in model_type and "saved_model" in model_type:
        from bertopic.backend._use import USEBackend

        return USEBackend(embedding_model)

    # Sentence Transformer embeddings
    if "sentence_transformers" in model_type or isinstance(embedding_model, str):
        from ._sentencetransformers import SentenceTransformerBackend

        return SentenceTransformerBackend(embedding_model)

    # Hugging Face embeddings (same explicit two-substring test as for USE)
    if "transformers" in model_type and "pipeline" in model_type:
        from ._hftransformers import HFTransformerBackend

        return HFTransformerBackend(embedding_model)

    # Model2Vec embeddings
    if "model2vec" in model_type:
        from ._model2vec import Model2VecBackend

        return Model2VecBackend(embedding_model)

    # FastEmbed word embeddings
    if "fastembed" in model_type:
        from bertopic.backend._fastembed import FastEmbedBackend

        return FastEmbedBackend(embedding_model)

    # Select embedding model based on language
    if language:
        try:
            from ._sentencetransformers import SentenceTransformerBackend

            # Normalise once so every comparison below is case-insensitive.
            language_key = language.lower()
            if language_key in ("english", "en"):
                return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
            elif language_key in languages or language_key == "multilingual":
                return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
            else:
                raise ValueError(
                    f"{language} is currently not supported. However, you can "
                    f"create any embeddings yourself and pass it through fit_transform(docs, embeddings)\n"
                    "Else, please select a language from the following list:\n"
                    f"{languages}"
                )

        # A ModuleNotFoundError might be a lightweight installation
        except ModuleNotFoundError as e:
            if e.name != "sentence_transformers":
                # Error occurred in a downstream module, probably not a lightweight install
                raise e
            # Whole sentence_transformers module is missing, probably a lightweight install
            if verbose:
                logger.info(
                    "Automatically selecting lightweight scikit-learn embedding backend as sentence-transformers appears to not be installed."
                )
            pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(100))
            return SklearnEmbedder(pipe)

    # Nothing matched and no language was given: use the default English model.
    from ._sentencetransformers import SentenceTransformerBackend

    return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
@@ -0,0 +1,43 @@
import numpy as np
from typing import List
from bertopic.backend._base import BaseEmbedder
from bertopic.backend._utils import select_backend
class WordDocEmbedder(BaseEmbedder):
    """Pair a document-level embedder with a separate word-level embedder.

    Both models are resolved to backends through `select_backend`.

    Arguments:
        embedding_model: The model used for embedding documents
        word_embedding_model: The model used for embedding words
    """

    def __init__(self, embedding_model, word_embedding_model):
        super().__init__()

        self.embedding_model = select_backend(embedding_model)
        self.word_embedding_model = select_backend(word_embedding_model)

    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words with the word-level backend.

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Word embeddings with shape (n, m) with `n` words
            that each have an embeddings size of `m`
        """
        word_backend = self.word_embedding_model
        return word_backend.embed(words, verbose)

    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents with the document-level backend.

        Arguments:
            document: A list of documents to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document embeddings with shape (n, m) with `n` documents
            that each have an embeddings size of `m`
        """
        document_backend = self.embedding_model
        return document_backend.embed(document, verbose)