Add BERTopic.

2025-08-12 19:01:20 +08:00
parent e2323d579c
commit c5c530775e
256 changed files with 28666 additions and 0 deletions
@@ -0,0 +1,60 @@
+from ._base import BaseEmbedder
+from ._word_doc import WordDocEmbedder
+from ._utils import languages
+from bertopic._utils import NotInstalled
+
+# OpenAI Embeddings
+# Every optional backend below follows the same pattern: attempt the import and,
+# on ModuleNotFoundError, bind the public name to a `NotInstalled` placeholder
+# built with a `pip install` hint instead of the real class.
+# NOTE(review): presumably `NotInstalled` surfaces `custom_msg` once the
+# placeholder is used -- confirm in `bertopic._utils`.
+try:
+    from bertopic.backend._openai import OpenAIBackend
+except ModuleNotFoundError:
+    msg = "`pip install openai` \n\n"
+    OpenAIBackend = NotInstalled("OpenAI", "OpenAI", custom_msg=msg)
+
+# Cohere Embeddings
+# NOTE(review): the Cohere backend module appears to import no `cohere` package
+# itself (the client is passed in), so this fallback may only trigger for its
+# other imports -- confirm.
+try:
+    from bertopic.backend._cohere import CohereBackend
+except ModuleNotFoundError:
+    msg = "`pip install cohere` \n\n"
+    CohereBackend = NotInstalled("Cohere", "Cohere", custom_msg=msg)
+
+# Multimodal Embeddings
+try:
+    from bertopic.backend._multimodal import MultiModalBackend
+except ModuleNotFoundError:
+    msg = "`pip install bertopic[vision]` \n\n"
+    MultiModalBackend = NotInstalled("Vision", "Vision", custom_msg=msg)
+
+# Model2Vec Embeddings
+try:
+    from bertopic.backend._model2vec import Model2VecBackend
+except ModuleNotFoundError:
+    msg = "`pip install model2vec` \n\n"
+    Model2VecBackend = NotInstalled("Model2Vec", "Model2Vec", custom_msg=msg)
+
+# FastEmbed Embeddings
+try:
+    from bertopic.backend._fastembed import FastEmbedBackend
+except ModuleNotFoundError:
+    msg = "`pip install fastembed` \n\n"
+    FastEmbedBackend = NotInstalled("FastEmbed", "FastEmbed", custom_msg=msg)
+
+
+# LangChain Embeddings
+try:
+    from bertopic.backend._langchain import LangChainBackend
+except ModuleNotFoundError:
+    msg = "`pip install langchain` \n\n"
+    LangChainBackend = NotInstalled("LangChain", "LangChain", custom_msg=msg)
+
+
+# Names exported by `from bertopic.backend import *`; the optional backends are
+# listed whether they resolved to the real class or to a placeholder.
+__all__ = [
+    "BaseEmbedder",
+    "WordDocEmbedder",
+    "OpenAIBackend",
+    "CohereBackend",
+    "Model2VecBackend",
+    "MultiModalBackend",
+    "FastEmbedBackend",
+    "LangChainBackend",
+    "languages",
+]
@@ -0,0 +1,62 @@
+import numpy as np
+from typing import List
+
+
+class BaseEmbedder:
+    """The Base Embedder used for creating embedding models.
+
+    Arguments:
+        embedding_model: The main embedding model to be used for extracting
+                         document and word embedding
+        word_embedding_model: The embedding model used for extracting word
+                              embeddings only. If this model is selected,
+                              then the `embedding_model` is purely used for
+                              creating document embeddings.
+    """
+
+    def __init__(self, embedding_model=None, word_embedding_model=None):
+        self.embedding_model = embedding_model
+        self.word_embedding_model = word_embedding_model
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        pass
+
+    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            words: A list of words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Word embeddings with shape (n, m) with `n` words
+            that each have an embeddings size of `m`
+
+        """
+        return self.embed(words, verbose)
+
+    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            document: A list of documents to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document embeddings with shape (n, m) with `n` documents
+            that each have an embeddings size of `m`
+        """
+        return self.embed(document, verbose)
@@ -0,0 +1,94 @@
+import time
+import numpy as np
+from tqdm import tqdm
+from typing import Any, List, Mapping
+from bertopic.backend import BaseEmbedder
+
+
+class CohereBackend(BaseEmbedder):
+    """Cohere Embedding Model.
+
+    Arguments:
+        client: A `cohere` client.
+        embedding_model: A Cohere model. Default is "large".
+                         For an overview of models see:
+                         https://docs.cohere.ai/docs/generation-card
+        delay_in_seconds: If a `batch_size` is given, use this set
+                          the delay in seconds between batches.
+        batch_size: The size of each batch.
+        embed_kwargs: Kwargs passed to `cohere.Client.embed`.
+                            Can be used to define additional parameters
+                            such as `input_type`
+
+    Examples:
+    ```python
+    import cohere
+    from bertopic.backend import CohereBackend
+
+    client = cohere.Client("APIKEY")
+    cohere_model = CohereBackend(client)
+    ```
+
+    If you want to specify `input_type`:
+
+    ```python
+    cohere_model = CohereBackend(
+        client,
+        embedding_model="embed-english-v3.0",
+        embed_kwargs={"input_type": "clustering"}
+    )
+    ```
+    """
+
+    def __init__(
+        self,
+        client,
+        embedding_model: str = "large",
+        delay_in_seconds: float = None,
+        batch_size: int = None,
+        embed_kwargs: Mapping[str, Any] = {},
+    ):
+        super().__init__()
+        self.client = client
+        self.embedding_model = embedding_model
+        self.delay_in_seconds = delay_in_seconds
+        self.batch_size = batch_size
+        self.embed_kwargs = embed_kwargs
+
+        if self.embed_kwargs.get("model"):
+            self.embedding_model = embed_kwargs.get("model")
+        else:
+            self.embed_kwargs["model"] = self.embedding_model
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        # Batch-wise embedding extraction
+        if self.batch_size is not None:
+            embeddings = []
+            for batch in tqdm(self._chunks(documents), disable=not verbose):
+                response = self.client.embed(texts=batch, **self.embed_kwargs)
+                embeddings.extend(response.embeddings)
+
+                # Delay subsequent calls
+                if self.delay_in_seconds:
+                    time.sleep(self.delay_in_seconds)
+
+        # Extract embeddings all at once
+        else:
+            response = self.client.embed(texts=documents, **self.embed_kwargs)
+            embeddings = response.embeddings
+        return np.array(embeddings)
+
+    def _chunks(self, documents):
+        for i in range(0, len(documents), self.batch_size):
+            yield documents[i : i + self.batch_size]
@@ -0,0 +1,54 @@
+import numpy as np
+from typing import List
+from fastembed import TextEmbedding
+
+from bertopic.backend import BaseEmbedder
+
+
+class FastEmbedBackend(BaseEmbedder):
+    """FastEmbed embedding model.
+
+    The FastEmbed embedding model used for generating sentence embeddings.
+
+    Arguments:
+        embedding_model: A FastEmbed embedding model
+
+    Examples:
+    To create a model, you can load in a string pointing to a supported
+    FastEmbed model:
+
+    ```python
+    from bertopic.backend import FastEmbedBackend
+
+    sentence_model = FastEmbedBackend("BAAI/bge-small-en-v1.5")
+    ```
+    """
+
+    def __init__(self, embedding_model: str = "BAAI/bge-small-en-v1.5"):
+        super().__init__()
+
+        supported_models = [m["model"] for m in TextEmbedding.list_supported_models()]
+
+        if isinstance(embedding_model, str) and embedding_model in supported_models:
+            self.embedding_model = TextEmbedding(model_name=embedding_model)
+        else:
+            raise ValueError(
+                "Please select a correct FasteEmbed model: \n"
+                "the model must be a string and must be supported. \n"
+                "The supported TextEmbedding model list is here: https://qdrant.github.io/fastembed/examples/Supported_Models/"
+            )
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        embeddings = np.array(list(self.embedding_model.embed(documents, show_progress_bar=verbose)))
+        return embeddings
@@ -0,0 +1,78 @@
+import numpy as np
+from tqdm import tqdm
+from typing import Union, List
+from flair.data import Sentence
+from flair.embeddings import DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings
+
+from bertopic.backend import BaseEmbedder
+
+
+class FlairBackend(BaseEmbedder):
+    """Flair Embedding Model.
+
+    The Flair embedding model used for generating document and
+    word embeddings.
+
+    Arguments:
+        embedding_model: A Flair embedding model
+
+    Examples:
+    ```python
+    from bertopic.backend import FlairBackend
+    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
+
+    # Create a Flair Embedding model
+    glove_embedding = WordEmbeddings('crawl')
+    document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])
+
+    # Pass the Flair model to create a new backend
+    flair_embedder = FlairBackend(document_glove_embeddings)
+    ```
+    """
+
+    def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]):
+        super().__init__()
+
+        # Flair word embeddings
+        if isinstance(embedding_model, TokenEmbeddings):
+            self.embedding_model = DocumentPoolEmbeddings([embedding_model])
+
+        # Flair document embeddings + disable fine tune to prevent CUDA OOM
+        # https://github.com/flairNLP/flair/issues/1719
+        elif isinstance(embedding_model, DocumentEmbeddings):
+            if "fine_tune" in embedding_model.__dict__:
+                embedding_model.fine_tune = False
+            self.embedding_model = embedding_model
+
+        else:
+            raise ValueError(
+                "Please select a correct Flair model by either using preparing a token or document "
+                "embedding model: \n"
+                "`from flair.embeddings import TransformerDocumentEmbeddings` \n"
+                "`roberta = TransformerDocumentEmbeddings('roberta-base')`"
+            )
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        embeddings = []
+        for document in tqdm(documents, disable=not verbose):
+            try:
+                sentence = Sentence(document) if document else Sentence("an empty document")
+                self.embedding_model.embed(sentence)
+            except RuntimeError:
+                sentence = Sentence("an empty document")
+                self.embedding_model.embed(sentence)
+            embedding = sentence.embedding.detach().cpu().numpy()
+            embeddings.append(embedding)
+        embeddings = np.asarray(embeddings)
+        return embeddings
@@ -0,0 +1,69 @@
+import numpy as np
+from tqdm import tqdm
+from typing import List
+from bertopic.backend import BaseEmbedder
+from gensim.models.keyedvectors import Word2VecKeyedVectors
+
+
+class GensimBackend(BaseEmbedder):
+    """Gensim Embedding Model.
+
+    The Gensim embedding model is typically used for word embeddings with
+    GloVe, Word2Vec or FastText.
+
+    Arguments:
+        embedding_model: A Gensim embedding model
+
+    Examples:
+    ```python
+    from bertopic.backend import GensimBackend
+    import gensim.downloader as api
+
+    ft = api.load('fasttext-wiki-news-subwords-300')
+    ft_embedder = GensimBackend(ft)
+    ```
+    """
+
+    def __init__(self, embedding_model: Word2VecKeyedVectors):
+        super().__init__()
+
+        if isinstance(embedding_model, Word2VecKeyedVectors):
+            self.embedding_model = embedding_model
+        else:
+            raise ValueError(
+                "Please select a correct Gensim model: \n"
+                "`import gensim.downloader as api` \n"
+                "`ft = api.load('fasttext-wiki-news-subwords-300')`"
+            )
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        vector_shape = self.embedding_model.get_vector(list(self.embedding_model.index_to_key)[0]).shape[0]
+        empty_vector = np.zeros(vector_shape)
+
+        # Extract word embeddings and pool to document-level
+        embeddings = []
+        for doc in tqdm(documents, disable=not verbose, position=0, leave=True):
+            embedding = [
+                self.embedding_model.get_vector(word)
+                for word in doc.split()
+                if word in self.embedding_model.key_to_index
+            ]
+
+            if len(embedding) > 0:
+                embeddings.append(np.mean(embedding, axis=0))
+            else:
+                embeddings.append(empty_vector)
+
+        embeddings = np.array(embeddings)
+        return embeddings
@@ -0,0 +1,104 @@
+import numpy as np
+
+from tqdm import tqdm
+from typing import List
+from torch.utils.data import Dataset
+from sklearn.preprocessing import normalize
+from transformers.pipelines import Pipeline
+
+from bertopic.backend import BaseEmbedder
+
+
+class HFTransformerBackend(BaseEmbedder):
+    """Hugging Face transformers model.
+
+    This uses the `transformers.pipelines.pipeline` to define and create
+    a feature generation pipeline from which embeddings can be extracted.
+
+    Arguments:
+        embedding_model: A Hugging Face feature extraction pipeline
+
+    Examples:
+    To use a Hugging Face transformers model, load in a pipeline and point
+    to any model found on their model hub (https://huggingface.co/models):
+
+    ```python
+    from bertopic.backend import HFTransformerBackend
+    from transformers.pipelines import pipeline
+
+    hf_model = pipeline("feature-extraction", model="distilbert-base-cased")
+    embedding_model = HFTransformerBackend(hf_model)
+    ```
+    """
+
+    def __init__(self, embedding_model: Pipeline):
+        super().__init__()
+
+        if isinstance(embedding_model, Pipeline):
+            self.embedding_model = embedding_model
+        else:
+            raise ValueError(
+                "Please select a correct transformers pipeline. For example: "
+                "pipeline('feature-extraction', model='distilbert-base-cased', device=0)"
+            )
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        dataset = MyDataset(documents)
+
+        embeddings = []
+        for document, features in tqdm(
+            zip(documents, self.embedding_model(dataset, truncation=True, padding=True)),
+            total=len(dataset),
+            disable=not verbose,
+        ):
+            embeddings.append(self._embed(document, features))
+
+        return np.array(embeddings)
+
+    def _embed(self, document: str, features: np.ndarray) -> np.ndarray:
+        """Mean pooling.
+
+        Arguments:
+            document: The document for which to extract the attention mask
+            features: The embeddings for each token
+
+        Adopted from:
+        https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers
+        """
+        token_embeddings = np.array(features)
+        attention_mask = self.embedding_model.tokenizer(document, truncation=True, padding=True, return_tensors="np")[
+            "attention_mask"
+        ]
+        input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
+        sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
+        sum_mask = np.clip(
+            input_mask_expanded.sum(1),
+            a_min=1e-9,
+            a_max=input_mask_expanded.sum(1).max(),
+        )
+        embedding = normalize(sum_embeddings / sum_mask)[0]
+        return embedding
+
+
+class MyDataset(Dataset):
+    """Dataset to pass to `transformers.pipelines.pipeline`."""
+
+    def __init__(self, docs):
+        self.docs = docs
+
+    def __len__(self):
+        return len(self.docs)
+
+    def __getitem__(self, idx):
+        return self.docs[idx]
@@ -0,0 +1,43 @@
+from typing import List
+
+import numpy as np
+from bertopic.backend import BaseEmbedder
+from langchain_core.embeddings import Embeddings
+
+
+class LangChainBackend(BaseEmbedder):
+    """LangChain Embedding Model.
+
+    This class uses the LangChain Embedding class to embed the documents.
+    Argument:
+        embedding_model: A LangChain Embedding Instance.
+
+    Examples:
+    ```python
+    from langchain_community.embeddings import HuggingFaceInstructEmbeddings
+    from bertopic.backend import LangChainBackend
+
+    hf_embedding = HuggingFaceInstructEmbeddings()
+    langchain_embedder = LangChainBackend(hf_embedding)
+    ```
+    """
+
+    def __init__(self, embedding_model: Embeddings):
+        self.embedding_model = embedding_model
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        # Prepare documents, replacing empty strings with a single space
+        prepared_documents = [" " if doc == "" else doc for doc in documents]
+        response = self.embedding_model.embed_documents(prepared_documents)
+        return np.array(response)
@@ -0,0 +1,129 @@
+import numpy as np
+from typing import List, Union
+from model2vec import StaticModel
+from sklearn.feature_extraction.text import CountVectorizer
+
+from bertopic.backend import BaseEmbedder
+
+
+class Model2VecBackend(BaseEmbedder):
+    """Model2Vec embedding model.
+
+    Arguments:
+        embedding_model: Either a model2vec model or a
+                         string pointing to a model2vec model
+        distill: Indicates whether to distill a sentence-transformers compatible model.
+                 The distillation will happen during fitting of the topic model.
+                 NOTE: Only works if `embedding_model` is a string.
+        distill_kwargs: Keyword arguments to pass to the distillation process
+                        of `model2vec.distill.distill`
+        distill_vectorizer: A CountVectorizer used for creating a custom vocabulary
+                            based on the same documents used for topic modeling.
+                            NOTE: If "vocabulary" is in `distill_kwargs`, this will be ignored.
+
+    Examples:
+    To create a model, you can load in a string pointing to a
+    model2vec model:
+
+    ```python
+    from bertopic.backend import Model2VecBackend
+
+    sentence_model = Model2VecBackend("minishlab/potion-base-8M")
+    ```
+
+    or  you can instantiate a model yourself:
+
+    ```python
+    from bertopic.backend import Model2VecBackend
+    from model2vec import StaticModel
+
+    embedding_model = StaticModel.from_pretrained("minishlab/potion-base-8M")
+    sentence_model = Model2VecBackend(embedding_model)
+    ```
+
+    If you want to distill a sentence-transformers model with the vocabulary of the documents,
+    run the following:
+
+    ```python
+    from bertopic.backend import Model2VecBackend
+
+    sentence_model = Model2VecBackend("sentence-transformers/all-MiniLM-L6-v2", distill=True)
+    ```
+    """
+
+    def __init__(
+        self,
+        embedding_model: Union[str, StaticModel],
+        distill: bool = False,
+        distill_kwargs: dict = {},
+        distill_vectorizer: str = None,
+    ):
+        super().__init__()
+
+        self.distill = distill
+        self.distill_kwargs = distill_kwargs
+        self.distill_vectorizer = distill_vectorizer
+        self._has_distilled = False
+
+        # When we distill, we need a string pointing to a sentence-transformer model
+        if self.distill:
+            self._check_model2vec_installation()
+            if not self.distill_vectorizer:
+                self.distill_vectorizer = CountVectorizer()
+            if isinstance(embedding_model, str):
+                self.embedding_model = embedding_model
+            else:
+                raise ValueError("Please pass a string pointing to a sentence-transformer model when distilling.")
+
+        # If we don't distill, we can pass a model2vec model directly or load from a string
+        elif isinstance(embedding_model, StaticModel):
+            self.embedding_model = embedding_model
+        elif isinstance(embedding_model, str):
+            self.embedding_model = StaticModel.from_pretrained(embedding_model)
+        else:
+            raise ValueError(
+                "Please select a correct Model2Vec model: \n"
+                "`from model2vec import StaticModel` \n"
+                "`model = StaticModel.from_pretrained('minishlab/potion-base-8M')`"
+            )
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        # Distill the model
+        if self.distill and not self._has_distilled:
+            from model2vec.distill import distill
+
+            # Distill with the vocabulary of the documents
+            if not self.distill_kwargs.get("vocabulary"):
+                X = self.distill_vectorizer.fit_transform(documents)
+                word_counts = np.array(X.sum(axis=0)).flatten()
+                words = self.distill_vectorizer.get_feature_names_out()
+                vocabulary = [word for word, _ in sorted(zip(words, word_counts), key=lambda x: x[1], reverse=True)]
+                self.distill_kwargs["vocabulary"] = vocabulary
+
+            # Distill the model
+            self.embedding_model = distill(self.embedding_model, **self.distill_kwargs)
+
+            # Distillation should happen only once and not for every embed call
+            # The distillation should only happen the first time on the entire vocabulary
+            self._has_distilled = True
+
+        # Embed the documents
+        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
+        return embeddings
+
+    def _check_model2vec_installation(self):
+        try:
+            from model2vec.distill import distill  # noqa: F401
+        except ImportError:
+            raise ImportError("To distill a model using model2vec, you need to run `pip install model2vec[distill]`")
@@ -0,0 +1,200 @@
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+from typing import List, Union
+from sentence_transformers import SentenceTransformer
+
+from bertopic.backend import BaseEmbedder
+
+
+class MultiModalBackend(BaseEmbedder):
+    """Multimodal backend using Sentence-transformers.
+
+    The sentence-transformers embedding model used for
+    generating word, document, and image embeddings.
+
+    Arguments:
+        embedding_model: A sentence-transformers embedding model that
+                         can either embed both images and text or only text.
+                         If it only embeds text, then `image_model` needs
+                         to be used to embed the images.
+        image_model: A sentence-transformers embedding model that is used
+                     to embed only images.
+        batch_size: The sizes of image batches to pass
+
+    Examples:
+    To create a model, you can load in a string pointing to a
+    sentence-transformers model:
+
+    ```python
+    from bertopic.backend import MultiModalBackend
+
+    sentence_model = MultiModalBackend("clip-ViT-B-32")
+    ```
+
+    or  you can instantiate a model yourself:
+    ```python
+    from bertopic.backend import MultiModalBackend
+    from sentence_transformers import SentenceTransformer
+
+    embedding_model = SentenceTransformer("clip-ViT-B-32")
+    sentence_model = MultiModalBackend(embedding_model)
+    ```
+    """
+
+    def __init__(
+        self,
+        embedding_model: Union[str, SentenceTransformer],
+        image_model: Union[str, SentenceTransformer] = None,
+        batch_size: int = 32,
+    ):
+        super().__init__()
+        self.batch_size = batch_size
+
+        # Text or Text+Image model
+        if isinstance(embedding_model, SentenceTransformer):
+            self.embedding_model = embedding_model
+        elif isinstance(embedding_model, str):
+            self.embedding_model = SentenceTransformer(embedding_model)
+        else:
+            raise ValueError(
+                "Please select a correct SentenceTransformers model: \n"
+                "`from sentence_transformers import SentenceTransformer` \n"
+                "`model = SentenceTransformer('clip-ViT-B-32')`"
+            )
+
+        # Image Model
+        self.image_model = None
+        if image_model is not None:
+            if isinstance(image_model, SentenceTransformer):
+                self.image_model = image_model
+            elif isinstance(image_model, str):
+                self.image_model = SentenceTransformer(image_model)
+            else:
+                raise ValueError(
+                    "Please select a correct SentenceTransformers model: \n"
+                    "`from sentence_transformers import SentenceTransformer` \n"
+                    "`model = SentenceTransformer('clip-ViT-B-32')`"
+                )
+
+        try:
+            self.tokenizer = self.embedding_model._first_module().processor.tokenizer
+        except AttributeError:
+            self.tokenizer = self.embedding_model.tokenizer
+        except:  # noqa: E722
+            self.tokenizer = None
+
+    def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words or images into an n-dimensional
+        matrix of embeddings.
+
+        Either documents, images, or both can be provided. If both are provided,
+        then the embeddings are averaged.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            images: A list of image paths to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        # Embed documents
+        doc_embeddings = None
+        if documents[0] is not None:
+            doc_embeddings = self.embed_documents(documents)
+
+        # Embed images
+        image_embeddings = None
+        if isinstance(images, list):
+            image_embeddings = self.embed_images(images, verbose)
+
+        # Average embeddings
+        averaged_embeddings = None
+        if doc_embeddings is not None and image_embeddings is not None:
+            averaged_embeddings = np.mean([doc_embeddings, image_embeddings], axis=0)
+
+        if averaged_embeddings is not None:
+            return averaged_embeddings
+        elif doc_embeddings is not None:
+            return doc_embeddings
+        elif image_embeddings is not None:
+            return image_embeddings
+
+    def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        truncated_docs = [self._truncate_document(doc) for doc in documents]
+        embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
+        return embeddings
+
+    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            words: A list of words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        embeddings = self.embedding_model.encode(words, show_progress_bar=verbose)
+        return embeddings
+
+    def embed_images(self, images, verbose):
+        if self.batch_size:
+            nr_iterations = int(np.ceil(len(images) / self.batch_size))
+
+            # Embed images per batch
+            embeddings = []
+            for i in tqdm(range(nr_iterations), disable=not verbose):
+                start_index = i * self.batch_size
+                end_index = (i * self.batch_size) + self.batch_size
+
+                images_to_embed = [
+                    Image.open(image) if isinstance(image, str) else image for image in images[start_index:end_index]
+                ]
+                if self.image_model is not None:
+                    img_emb = self.image_model.encode(images_to_embed)
+                else:
+                    img_emb = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
+                embeddings.extend(img_emb.tolist())
+
+                # Close images
+                if isinstance(images[0], str):
+                    for image in images_to_embed:
+                        image.close()
+            embeddings = np.array(embeddings)
+        else:
+            images_to_embed = [Image.open(filepath) for filepath in images]
+            if self.image_model is not None:
+                embeddings = self.image_model.encode(images_to_embed)
+            else:
+                embeddings = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
+        return embeddings
+
+    def _truncate_document(self, document):
+        if self.tokenizer:
+            tokens = self.tokenizer.encode(document)
+
+            if len(tokens) > 77:
+                # Skip the starting token, only include 75 tokens
+                truncated_tokens = tokens[1:76]
+                document = self.tokenizer.decode(truncated_tokens)
+
+                # Recursive call here, because the encode(decode()) can have different result
+                return self._truncate_document(document)
+
+        return document
@@ -0,0 +1,88 @@
+import time
+import openai
+import numpy as np
+from tqdm import tqdm
+from typing import List, Mapping, Any
+from bertopic.backend import BaseEmbedder
+
+
+class OpenAIBackend(BaseEmbedder):
+    """OpenAI Embedding Model.
+
+    Arguments:
+        client: A `openai.OpenAI` client.
+        embedding_model: An OpenAI model. Default is
+                         For an overview of models see:
+                         https://platform.openai.com/docs/models/embeddings
+        delay_in_seconds: If a `batch_size` is given, use this set
+                          the delay in seconds between batches.
+        batch_size: The size of each batch.
+        generator_kwargs: Kwargs passed to `openai.Embedding.create`.
+                          Can be used to define custom engines or
+                          deployment_ids.
+
+    Examples:
+    ```python
+    import openai
+    from bertopic.backend import OpenAIBackend
+
+    client = openai.OpenAI(api_key="sk-...")
+    openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
+    ```
+    """
+
+    def __init__(
+        self,
+        client: openai.OpenAI,
+        embedding_model: str = "text-embedding-ada-002",
+        delay_in_seconds: float = None,
+        batch_size: int = None,
+        generator_kwargs: Mapping[str, Any] = {},
+    ):
+        super().__init__()
+        self.client = client
+        self.embedding_model = embedding_model
+        self.delay_in_seconds = delay_in_seconds
+        self.batch_size = batch_size
+        self.generator_kwargs = generator_kwargs
+
+        if self.generator_kwargs.get("model"):
+            self.embedding_model = generator_kwargs.get("model")
+        elif not self.generator_kwargs.get("engine"):
+            self.generator_kwargs["model"] = self.embedding_model
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        # Prepare documents, replacing empty strings with a single space
+        prepared_documents = [" " if doc == "" else doc for doc in documents]
+
+        # Batch-wise embedding extraction
+        if self.batch_size is not None:
+            embeddings = []
+            for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
+                response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
+                embeddings.extend([r.embedding for r in response.data])
+
+                # Delay subsequent calls
+                if self.delay_in_seconds:
+                    time.sleep(self.delay_in_seconds)
+
+        # Extract embeddings all at once
+        else:
+            response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
+            embeddings = [r.embedding for r in response.data]
+        return np.array(embeddings)
+
+    def _chunks(self, documents):
+        for i in range(0, len(documents), self.batch_size):
+            yield documents[i : i + self.batch_size]
@@ -0,0 +1,85 @@
+import numpy as np
+from typing import List, Union
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.models import StaticEmbedding
+
+from bertopic.backend import BaseEmbedder
+
+
+class SentenceTransformerBackend(BaseEmbedder):
+    """Sentence-transformers embedding model.
+
+    The sentence-transformers embedding model used for generating document and
+    word embeddings.
+
+    Arguments:
+        embedding_model: A sentence-transformers embedding model
+        model2vec: Indicates whether `embedding_model` is a model2vec model.
+                   NOTE: Only works if `embedding_model` is a string.
+                   Otherwise, you can pass the model2vec model directly to `embedding_model`.
+
+    Examples:
+    To create a model, you can load in a string pointing to a
+    sentence-transformers model:
+
+    ```python
+    from bertopic.backend import SentenceTransformerBackend
+
+    sentence_model = SentenceTransformerBackend("all-MiniLM-L6-v2")
+    ```
+
+    or  you can instantiate a model yourself:
+
+    ```python
+    from bertopic.backend import SentenceTransformerBackend
+    from sentence_transformers import SentenceTransformer
+
+    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+    sentence_model = SentenceTransformerBackend(embedding_model)
+    ```
+
+    If you want to use a model2vec model without having to install model2vec,
+    you can pass the model2vec model as a string:
+
+    ```python
+    from bertopic.backend import SentenceTransformerBackend
+    from sentence_transformers import SentenceTransformer
+
+    embedding_model = SentenceTransformer("minishlab/potion-base-8M", model2vec=True)
+    sentence_model = SentenceTransformerBackend(embedding_model)
+    ```
+    """
+
+    def __init__(self, embedding_model: Union[str, SentenceTransformer], model2vec: bool = False):
+        super().__init__()
+
+        self._hf_model = None
+        if model2vec and isinstance(embedding_model, str):
+            static_embedding = StaticEmbedding.from_model2vec(embedding_model)
+            self.embedding_model = SentenceTransformer(modules=[static_embedding])
+        elif isinstance(embedding_model, SentenceTransformer):
+            self.embedding_model = embedding_model
+        elif isinstance(embedding_model, str):
+            self.embedding_model = SentenceTransformer(embedding_model)
+            self._hf_model = embedding_model
+        else:
+            raise ValueError(
+                "Please select a correct SentenceTransformers model: \n"
+                "`from sentence_transformers import SentenceTransformer` \n"
+                "`model = SentenceTransformer('all-MiniLM-L6-v2')`"
+            )
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
+        return embeddings
@@ -0,0 +1,68 @@
+from bertopic.backend import BaseEmbedder
+from sklearn.utils.validation import check_is_fitted, NotFittedError
+
+
+class SklearnEmbedder(BaseEmbedder):
+    """Scikit-Learn based embedding model.
+
+    This component allows the usage of scikit-learn pipelines for generating document and
+    word embeddings.
+
+    Arguments:
+        pipe: A scikit-learn pipeline that can `.transform()` text.
+
+    Examples:
+    Scikit-Learn is very flexible and it allows for many representations.
+    A relatively simple pipeline is shown below.
+
+    ```python
+    from sklearn.pipeline import make_pipeline
+    from sklearn.decomposition import TruncatedSVD
+    from sklearn.feature_extraction.text import TfidfVectorizer
+
+    from bertopic.backend import SklearnEmbedder
+
+    pipe = make_pipeline(
+        TfidfVectorizer(),
+        TruncatedSVD(100)
+    )
+
+    sklearn_embedder = SklearnEmbedder(pipe)
+    topic_model = BERTopic(embedding_model=sklearn_embedder)
+    ```
+
+    This pipeline first constructs a sparse representation based on TF/idf and then
+    makes it dense by applying SVD. Alternatively, you might also construct something
+    more elaborate. As long as you construct a scikit-learn compatible pipeline, you
+    should be able to pass it to Bertopic.
+
+    !!! Warning
+        One caveat to be aware of is that scikit-learns base `Pipeline` class does not
+        support the `.partial_fit()`-API. If you have a pipeline that theoretically should
+        be able to support online learning then you might want to explore
+        the [scikit-partial](https://github.com/koaning/scikit-partial) project.
+    """
+
+    def __init__(self, pipe):
+        super().__init__()
+        self.pipe = pipe
+
+    def embed(self, documents, verbose=False):
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: No-op variable that's kept around to keep the API consistent. If you want to get feedback on training times, you should use the sklearn API.
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        try:
+            check_is_fitted(self.pipe)
+            embeddings = self.pipe.transform(documents)
+        except NotFittedError:
+            embeddings = self.pipe.fit_transform(documents)
+
+        return embeddings
@@ -0,0 +1,94 @@
+import numpy as np
+from tqdm import tqdm
+from typing import List
+from bertopic.backend import BaseEmbedder
+
+
+class SpacyBackend(BaseEmbedder):
+    """Spacy embedding model.
+
+    The Spacy embedding model used for generating document and
+    word embeddings.
+
+    Arguments:
+        embedding_model: A spacy embedding model
+
+    Examples:
+    To create a Spacy backend, you need to create an nlp object and
+    pass it through this backend:
+
+    ```python
+    import spacy
+    from bertopic.backend import SpacyBackend
+
+    nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+    spacy_model = SpacyBackend(nlp)
+    ```
+
+    To load in a transformer model use the following:
+
+    ```python
+    import spacy
+    from thinc.api import set_gpu_allocator, require_gpu
+    from bertopic.backend import SpacyBackend
+
+    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+    set_gpu_allocator("pytorch")
+    require_gpu(0)
+    spacy_model = SpacyBackend(nlp)
+    ```
+
+    If you run into gpu/memory-issues, please use:
+
+    ```python
+    import spacy
+    from bertopic.backend import SpacyBackend
+
+    spacy.prefer_gpu()
+    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+    spacy_model = SpacyBackend(nlp)
+    ```
+    """
+
+    def __init__(self, embedding_model):
+        super().__init__()
+
+        if "spacy" in str(type(embedding_model)):
+            self.embedding_model = embedding_model
+        else:
+            raise ValueError(
+                "Please select a correct Spacy model by either using a string such as 'en_core_web_md' "
+                "or create a nlp model using: `nlp = spacy.load('en_core_web_md')"
+            )
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        # Handle empty documents, spaCy models automatically map
+        # empty strings to the zero vector
+        empty_document = " "
+
+        # Extract embeddings
+        embeddings = []
+        for doc in tqdm(documents, position=0, leave=True, disable=not verbose):
+            embedding = self.embedding_model(doc or empty_document)
+            if embedding.has_vector:
+                embedding = embedding.vector
+            else:
+                embedding = embedding._.trf_data.tensors[-1][0]
+
+            if not isinstance(embedding, np.ndarray) and hasattr(embedding, "get"):
+                # Convert cupy array to numpy array
+                embedding = embedding.get()
+            embeddings.append(embedding)
+
+        return np.array(embeddings)
@@ -0,0 +1,55 @@
+import numpy as np
+from tqdm import tqdm
+from typing import List
+
+from bertopic.backend import BaseEmbedder
+
+
+class USEBackend(BaseEmbedder):
+    """Universal Sentence Encoder.
+
+    USE encodes text into high-dimensional vectors that
+    are used for semantic similarity in BERTopic.
+
+    Arguments:
+        embedding_model: An USE embedding model
+
+    Examples:
+    ```python
+    import tensorflow_hub
+    from bertopic.backend import USEBackend
+
+    embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+    use_embedder = USEBackend(embedding_model)
+    ```
+    """
+
+    def __init__(self, embedding_model):
+        super().__init__()
+
+        try:
+            embedding_model(["test sentence"])
+            self.embedding_model = embedding_model
+        except TypeError:
+            raise ValueError(
+                "Please select a correct USE model: \n"
+                "`import tensorflow_hub` \n"
+                "`embedding_model = tensorflow_hub.load(path_to_model)`"
+            )
+
+    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        embeddings = np.array(
+            [self.embedding_model([doc]).cpu().numpy()[0] for doc in tqdm(documents, disable=not verbose)]
+        )
+        return embeddings
@@ -0,0 +1,171 @@
+from ._base import BaseEmbedder
+
+# Imports for light-weight variant of BERTopic
+from bertopic.backend._sklearn import SklearnEmbedder
+from bertopic._utils import MyLogger
+from sklearn.pipeline import make_pipeline
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import Pipeline as ScikitPipeline
+
+# Module-level logger; starts at WARNING and `select_backend` sets the level on each call
+logger = MyLogger()
+logger.configure("WARNING")
+
+# Lower-case language names recognised by `select_backend`. English is matched
+# there first; every other entry selects the multilingual sentence-transformers model.
+languages = [
+    "arabic",
+    "bulgarian",
+    "catalan",
+    "czech",
+    "danish",
+    "german",
+    "greek",
+    "english",
+    "spanish",
+    "estonian",
+    "persian",
+    "finnish",
+    "french",
+    "canadian french",
+    "galician",
+    "gujarati",
+    "hebrew",
+    "hindi",
+    "croatian",
+    "hungarian",
+    "armenian",
+    "indonesian",
+    "italian",
+    "japanese",
+    "georgian",
+    "korean",
+    "kurdish",
+    "lithuanian",
+    "latvian",
+    "macedonian",
+    "mongolian",
+    "marathi",
+    "malay",
+    "burmese",
+    "norwegian bokmal",
+    "dutch",
+    "polish",
+    "portuguese",
+    "brazilian portuguese",
+    "romanian",
+    "russian",
+    "slovak",
+    "slovenian",
+    "albanian",
+    "serbian",
+    "swedish",
+    "thai",
+    "turkish",
+    "ukrainian",
+    "urdu",
+    "vietnamese",
+    "chinese (simplified)",
+    "chinese (traditional)",
+]
+
+
+def select_backend(embedding_model, language: str = None, verbose: bool = False) -> BaseEmbedder:
+    """Select an embedding model based on language or a specific provided model.
+    When selecting a language, we choose all-MiniLM-L6-v2 for English and
+    paraphrase-multilingual-MiniLM-L12-v2 for all other languages as it support 100+ languages.
+    If sentence-transformers is not installed, in the case of a lightweight installation,
+    a scikit-learn backend is default.
+
+    Returns:
+        model: The selected model backend.
+    """
+    logger.set_level("INFO" if verbose else "WARNING")
+
+    # BERTopic language backend
+    if isinstance(embedding_model, BaseEmbedder):
+        return embedding_model
+
+    # Scikit-learn backend
+    if isinstance(embedding_model, ScikitPipeline):
+        return SklearnEmbedder(embedding_model)
+
+    # Flair word embeddings
+    if "flair" in str(type(embedding_model)):
+        from bertopic.backend._flair import FlairBackend
+
+        return FlairBackend(embedding_model)
+
+    # Spacy embeddings
+    if "spacy" in str(type(embedding_model)):
+        from bertopic.backend._spacy import SpacyBackend
+
+        return SpacyBackend(embedding_model)
+
+    # Gensim embeddings
+    if "gensim" in str(type(embedding_model)):
+        from bertopic.backend._gensim import GensimBackend
+
+        return GensimBackend(embedding_model)
+
+    # USE embeddings
+    if "tensorflow" and "saved_model" in str(type(embedding_model)):
+        from bertopic.backend._use import USEBackend
+
+        return USEBackend(embedding_model)
+
+    # Sentence Transformer embeddings
+    if "sentence_transformers" in str(type(embedding_model)) or isinstance(embedding_model, str):
+        from ._sentencetransformers import SentenceTransformerBackend
+
+        return SentenceTransformerBackend(embedding_model)
+
+    # Hugging Face embeddings
+    if "transformers" and "pipeline" in str(type(embedding_model)):
+        from ._hftransformers import HFTransformerBackend
+
+        return HFTransformerBackend(embedding_model)
+
+    # Model2Vec embeddings
+    if "model2vec" in str(type(embedding_model)):
+        from ._model2vec import Model2VecBackend
+
+        return Model2VecBackend(embedding_model)
+
+    # FastEmbed word embeddings
+    if "fastembed" in str(type(embedding_model)):
+        from bertopic.backend._fastembed import FastEmbedBackend
+
+        return FastEmbedBackend(embedding_model)
+
+    # Select embedding model based on language
+    if language:
+        try:
+            from ._sentencetransformers import SentenceTransformerBackend
+
+            if language.lower() in ["English", "english", "en"]:
+                return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
+            elif language.lower() in languages or language == "multilingual":
+                return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+            else:
+                raise ValueError(
+                    f"{language} is currently not supported. However, you can "
+                    f"create any embeddings yourself and pass it through fit_transform(docs, embeddings)\n"
+                    "Else, please select a language from the following list:\n"
+                    f"{languages}"
+                )
+
+        # A ModuleNotFoundError might be a lightweight installation
+        except ModuleNotFoundError as e:
+            if e.name != "sentence_transformers":
+                # Error occurred in a downstream module, probably not a lightweight install
+                raise e
+            # Whole sentence_transformers module is missing, probably a lightweight install
+            if verbose:
+                logger.info(
+                    "Automatically selecting lightweight scikit-learn embedding backend as sentence-transformers appears to not be installed."
+                )
+            pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(100))
+            return SklearnEmbedder(pipe)
+
+    from ._sentencetransformers import SentenceTransformerBackend
+
+    return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
@@ -0,0 +1,43 @@
+import numpy as np
+from typing import List
+from bertopic.backend._base import BaseEmbedder
+from bertopic.backend._utils import select_backend
+
+
+class WordDocEmbedder(BaseEmbedder):
+    """Combine a document- and word-level embedder."""
+
+    def __init__(self, embedding_model, word_embedding_model):
+        super().__init__()
+
+        self.embedding_model = select_backend(embedding_model)
+        self.word_embedding_model = select_backend(word_embedding_model)
+
+    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            words: A list of words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Word embeddings with shape (n, m) with `n` words
+            that each have an embeddings size of `m`
+
+        """
+        return self.word_embedding_model.embed(words, verbose)
+
+    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n words into an n-dimensional
+        matrix of embeddings.
+
+        Arguments:
+            document: A list of documents to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document embeddings with shape (n, m) with `n` documents
+            that each have an embeddings size of `m`
+        """
+        return self.embedding_model.embed(document, verbose)