Add BERTopic.
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
from ._base import BaseEmbedder
|
||||
from ._word_doc import WordDocEmbedder
|
||||
from ._utils import languages
|
||||
from bertopic._utils import NotInstalled
|
||||
|
||||
# OpenAI Embeddings
|
||||
try:
|
||||
from bertopic.backend._openai import OpenAIBackend
|
||||
except ModuleNotFoundError:
|
||||
msg = "`pip install openai` \n\n"
|
||||
OpenAIBackend = NotInstalled("OpenAI", "OpenAI", custom_msg=msg)
|
||||
|
||||
# Cohere Embeddings
|
||||
try:
|
||||
from bertopic.backend._cohere import CohereBackend
|
||||
except ModuleNotFoundError:
|
||||
msg = "`pip install cohere` \n\n"
|
||||
CohereBackend = NotInstalled("Cohere", "Cohere", custom_msg=msg)
|
||||
|
||||
# Multimodal Embeddings
|
||||
try:
|
||||
from bertopic.backend._multimodal import MultiModalBackend
|
||||
except ModuleNotFoundError:
|
||||
msg = "`pip install bertopic[vision]` \n\n"
|
||||
MultiModalBackend = NotInstalled("Vision", "Vision", custom_msg=msg)
|
||||
|
||||
# Model2Vec Embeddings
|
||||
try:
|
||||
from bertopic.backend._model2vec import Model2VecBackend
|
||||
except ModuleNotFoundError:
|
||||
msg = "`pip install model2vec` \n\n"
|
||||
Model2VecBackend = NotInstalled("Model2Vec", "Model2Vec", custom_msg=msg)
|
||||
|
||||
# FastEmbed Embeddings
|
||||
try:
|
||||
from bertopic.backend._fastembed import FastEmbedBackend
|
||||
except ModuleNotFoundError:
|
||||
msg = "`pip install fastembed` \n\n"
|
||||
FastEmbedBackend = NotInstalled("FastEmbed", "FastEmbed", custom_msg=msg)
|
||||
|
||||
|
||||
# LangChain Embeddings
|
||||
try:
|
||||
from bertopic.backend._langchain import LangChainBackend
|
||||
except ModuleNotFoundError:
|
||||
msg = "`pip install langchain` \n\n"
|
||||
LangChainBackend = NotInstalled("LangChain", "LangChain", custom_msg=msg)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"BaseEmbedder",
|
||||
"WordDocEmbedder",
|
||||
"OpenAIBackend",
|
||||
"CohereBackend",
|
||||
"Model2VecBackend",
|
||||
"MultiModalBackend",
|
||||
"FastEmbedBackend",
|
||||
"LangChainBackend",
|
||||
"languages",
|
||||
]
|
||||
@@ -0,0 +1,62 @@
|
||||
import numpy as np
|
||||
from typing import List
|
||||
|
||||
|
||||
class BaseEmbedder:
    """Base class that embedding backends derive from.

    The class only stores the models it is given; `embed` is a placeholder
    that subclasses override, and `embed_words` / `embed_documents`
    forward to it.

    Arguments:
        embedding_model: The main embedding model, used for extracting both
                         document and word embeddings.
        word_embedding_model: An embedding model used for word embeddings
                              only. When it is supplied, `embedding_model`
                              is meant to be used purely for document
                              embeddings.
    """

    def __init__(self, embedding_model=None, word_embedding_model=None):
        # Both models are stored untouched; no validation happens here.
        self.word_embedding_model = word_embedding_model
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Turn `n` documents/words into an `(n, m)` embedding matrix.

        This base implementation is a no-op that returns `None`; concrete
        backends provide the actual embedding logic.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n`
            documents/words that each have an embeddings size of `m`
        """
        return None

    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
        """Turn `n` words into an `(n, m)` embedding matrix.

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Word embeddings with shape (n, m) with `n` words
            that each have an embeddings size of `m`
        """
        # Arguments are forwarded positionally to whatever `embed` the
        # concrete class defines.
        word_embeddings = self.embed(words, verbose)
        return word_embeddings

    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
        """Turn `n` documents into an `(n, m)` embedding matrix.

        Arguments:
            document: A list of documents to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document embeddings with shape (n, m) with `n` documents
            that each have an embeddings size of `m`
        """
        document_embeddings = self.embed(document, verbose)
        return document_embeddings
|
||||
@@ -0,0 +1,94 @@
|
||||
import time
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from typing import Any, List, Mapping
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class CohereBackend(BaseEmbedder):
    """Cohere Embedding Model.

    Arguments:
        client: A `cohere` client.
        embedding_model: A Cohere model. Default is "large".
                         For an overview of models see:
                         https://docs.cohere.ai/docs/generation-card
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds between batches.
        batch_size: The size of each batch.
        embed_kwargs: Kwargs passed to `cohere.Client.embed`.
                      Can be used to define additional parameters
                      such as `input_type`. A truthy `"model"` entry
                      takes precedence over `embedding_model`. The mapping
                      is copied, so the caller's object is never modified.

    Examples:
    ```python
    import cohere
    from bertopic.backend import CohereBackend

    client = cohere.Client("APIKEY")
    cohere_model = CohereBackend(client)
    ```

    If you want to specify `input_type`:

    ```python
    cohere_model = CohereBackend(
        client,
        embedding_model="embed-english-v3.0",
        embed_kwargs={"input_type": "clustering"}
    )
    ```
    """

    def __init__(
        self,
        client,
        embedding_model: str = "large",
        delay_in_seconds: float = None,
        batch_size: int = None,
        embed_kwargs: Mapping[str, Any] = None,
    ):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size

        # Work on a private copy: "model" is written into this dict below,
        # and writing into a shared default (or into the caller's mapping)
        # would leak one instance's model name into the next instance.
        self.embed_kwargs = dict(embed_kwargs) if embed_kwargs else {}

        # Keep `embedding_model` and embed_kwargs["model"] in sync; an
        # explicit "model" kwarg wins over the `embedding_model` argument.
        if self.embed_kwargs.get("model"):
            self.embedding_model = self.embed_kwargs["model"]
        else:
            self.embed_kwargs["model"] = self.embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Batch-wise embedding extraction
        if self.batch_size is not None:
            embeddings = []
            for batch in tqdm(self._chunks(documents), disable=not verbose):
                response = self.client.embed(texts=batch, **self.embed_kwargs)
                embeddings.extend(response.embeddings)

                # Delay subsequent calls
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)

        # Extract embeddings all at once
        else:
            response = self.client.embed(texts=documents, **self.embed_kwargs)
            embeddings = response.embeddings
        return np.array(embeddings)

    def _chunks(self, documents):
        """Yield consecutive slices of `documents` of at most `batch_size` items."""
        for i in range(0, len(documents), self.batch_size):
            yield documents[i : i + self.batch_size]
|
||||
@@ -0,0 +1,54 @@
|
||||
import numpy as np
|
||||
from typing import List
|
||||
from fastembed import TextEmbedding
|
||||
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class FastEmbedBackend(BaseEmbedder):
    """Embedding backend built on FastEmbed's `TextEmbedding`.

    Arguments:
        embedding_model: Name of a FastEmbed text-embedding model. It must
                         be a string that appears in
                         `TextEmbedding.list_supported_models()`.

    Examples:
    Pass the name of a supported FastEmbed model:

    ```python
    from bertopic.backend import FastEmbedBackend

    sentence_model = FastEmbedBackend("BAAI/bge-small-en-v1.5")
    ```
    """

    def __init__(self, embedding_model: str = "BAAI/bge-small-en-v1.5"):
        super().__init__()

        known_names = [entry["model"] for entry in TextEmbedding.list_supported_models()]

        # Guard clause: anything that is not a supported model name is rejected.
        if not isinstance(embedding_model, str) or embedding_model not in known_names:
            raise ValueError(
                "Please select a correct FasteEmbed model: \n"
                "the model must be a string and must be supported. \n"
                "The supported TextEmbedding model list is here: https://qdrant.github.io/fastembed/examples/Supported_Models/"
            )

        self.embedding_model = TextEmbedding(model_name=embedding_model)

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed `n` documents/words into an `(n, m)` matrix.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Forwarded to FastEmbed as `show_progress_bar`

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # The result of `embed` is materialised into a list before being
        # stacked into a single array.
        vectors = list(self.embedding_model.embed(documents, show_progress_bar=verbose))
        return np.array(vectors)
|
||||
@@ -0,0 +1,78 @@
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from typing import Union, List
|
||||
from flair.data import Sentence
|
||||
from flair.embeddings import DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings
|
||||
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class FlairBackend(BaseEmbedder):
    """Embedding backend built on Flair.

    Produces document and word embeddings with a Flair embedding model.

    Arguments:
        embedding_model: A Flair embedding model. Token embeddings are
                         wrapped in `DocumentPoolEmbeddings`; document
                         embeddings are used directly.

    Examples:
    ```python
    from bertopic.backend import FlairBackend
    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

    # Create a Flair Embedding model
    glove_embedding = WordEmbeddings('crawl')
    document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])

    # Pass the Flair model to create a new backend
    flair_embedder = FlairBackend(document_glove_embeddings)
    ```
    """

    def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]):
        super().__init__()

        # Word-level model: pool the token embeddings into a document embedding.
        if isinstance(embedding_model, TokenEmbeddings):
            self.embedding_model = DocumentPoolEmbeddings([embedding_model])
            return

        if not isinstance(embedding_model, DocumentEmbeddings):
            raise ValueError(
                "Please select a correct Flair model by either using preparing a token or document "
                "embedding model: \n"
                "`from flair.embeddings import TransformerDocumentEmbeddings` \n"
                "`roberta = TransformerDocumentEmbeddings('roberta-base')`"
            )

        # Document-level model: switch fine-tuning off to prevent CUDA OOM,
        # see https://github.com/flairNLP/flair/issues/1719
        if "fine_tune" in embedding_model.__dict__:
            embedding_model.fine_tune = False
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed `n` documents/words into an `(n, m)` matrix.

        Each text is embedded on its own. Falsy texts, and texts for which
        Flair raises `RuntimeError`, are replaced by a fixed placeholder
        sentence so that every input still yields one row.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        placeholder = "an empty document"
        rows = []
        for text in tqdm(documents, disable=not verbose):
            try:
                sentence = Sentence(text if text else placeholder)
                self.embedding_model.embed(sentence)
            except RuntimeError:
                # Retry with the placeholder when this text cannot be embedded.
                sentence = Sentence(placeholder)
                self.embedding_model.embed(sentence)
            rows.append(sentence.embedding.detach().cpu().numpy())
        return np.asarray(rows)
|
||||
@@ -0,0 +1,69 @@
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from typing import List
|
||||
from bertopic.backend import BaseEmbedder
|
||||
from gensim.models.keyedvectors import Word2VecKeyedVectors
|
||||
|
||||
|
||||
class GensimBackend(BaseEmbedder):
    """Embedding backend built on Gensim keyed vectors.

    Gensim models are typically used for word embeddings such as
    GloVe, Word2Vec or FastText.

    Arguments:
        embedding_model: A Gensim `Word2VecKeyedVectors` instance

    Examples:
    ```python
    from bertopic.backend import GensimBackend
    import gensim.downloader as api

    ft = api.load('fasttext-wiki-news-subwords-300')
    ft_embedder = GensimBackend(ft)
    ```
    """

    def __init__(self, embedding_model: Word2VecKeyedVectors):
        super().__init__()

        # Guard clause: only keyed-vector models are accepted.
        if not isinstance(embedding_model, Word2VecKeyedVectors):
            raise ValueError(
                "Please select a correct Gensim model: \n"
                "`import gensim.downloader as api` \n"
                "`ft = api.load('fasttext-wiki-news-subwords-300')`"
            )
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed `n` documents/words into an `(n, m)` matrix.

        A document vector is the mean of the vectors of its
        whitespace-separated tokens that exist in the model's vocabulary.
        A document without any known token gets an all-zero vector.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # The vector size is read off the first vocabulary entry.
        first_key = list(self.embedding_model.index_to_key)[0]
        dimension = self.embedding_model.get_vector(first_key).shape[0]
        zero_vector = np.zeros(dimension)

        # Extract word embeddings and pool them to document level
        pooled = []
        for text in tqdm(documents, disable=not verbose, position=0, leave=True):
            known_vectors = [
                self.embedding_model.get_vector(token)
                for token in text.split()
                if token in self.embedding_model.key_to_index
            ]
            pooled.append(np.mean(known_vectors, axis=0) if known_vectors else zero_vector)

        return np.array(pooled)
|
||||
@@ -0,0 +1,104 @@
|
||||
import numpy as np
|
||||
|
||||
from tqdm import tqdm
|
||||
from typing import List
|
||||
from torch.utils.data import Dataset
|
||||
from sklearn.preprocessing import normalize
|
||||
from transformers.pipelines import Pipeline
|
||||
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class HFTransformerBackend(BaseEmbedder):
    """Embedding backend built on a Hugging Face transformers pipeline.

    A `transformers.pipelines.pipeline` for feature extraction supplies
    per-token features, which are mean-pooled into one vector per document.

    Arguments:
        embedding_model: A Hugging Face feature extraction pipeline

    Examples:
    To use a Hugging Face transformers model, load in a pipeline and point
    to any model found on their model hub (https://huggingface.co/models):

    ```python
    from bertopic.backend import HFTransformerBackend
    from transformers.pipelines import pipeline

    hf_model = pipeline("feature-extraction", model="distilbert-base-cased")
    embedding_model = HFTransformerBackend(hf_model)
    ```
    """

    def __init__(self, embedding_model: Pipeline):
        super().__init__()

        # Guard clause: only `Pipeline` instances are accepted.
        if not isinstance(embedding_model, Pipeline):
            raise ValueError(
                "Please select a correct transformers pipeline. For example: "
                "pipeline('feature-extraction', model='distilbert-base-cased', device=0)"
            )
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed `n` documents/words into an `(n, m)` matrix.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        dataset = MyDataset(documents)

        # Pair every document with the features the pipeline yields for it.
        pipeline_output = self.embedding_model(dataset, truncation=True, padding=True)
        progress = tqdm(
            zip(documents, pipeline_output),
            total=len(dataset),
            disable=not verbose,
        )
        pooled = [self._embed(document, features) for document, features in progress]
        return np.array(pooled)

    def _embed(self, document: str, features: np.ndarray) -> np.ndarray:
        """Mean-pool token features into a single normalised vector.

        Arguments:
            document: The document for which to extract the attention mask
            features: The embeddings for each token

        Returns:
            The first row of `normalize(...)` applied to the masked mean of
            the token embeddings.

        Adopted from:
        https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers
        """
        token_embeddings = np.array(features)

        # The document is tokenised again here to obtain its attention mask.
        encoded = self.embedding_model.tokenizer(document, truncation=True, padding=True, return_tensors="np")
        attention_mask = encoded["attention_mask"]

        # Broadcast the mask over the embedding axis, then sum along axis 1.
        expanded_mask = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
        masked_sum = np.sum(token_embeddings * expanded_mask, 1)

        # The lower clip bound keeps the division below away from zero.
        mask_totals = expanded_mask.sum(1)
        denominator = np.clip(mask_totals, a_min=1e-9, a_max=mask_totals.max())

        return normalize(masked_sum / denominator)[0]
|
||||
|
||||
|
||||
class MyDataset(Dataset):
    """Thin `Dataset` wrapper so documents can be fed to `transformers.pipelines.pipeline`."""

    def __init__(self, docs):
        """Keep a reference to `docs`; nothing is copied or validated."""
        self.docs = docs

    def __len__(self):
        """Return the number of wrapped documents."""
        size = len(self.docs)
        return size

    def __getitem__(self, idx):
        """Return the document stored at position `idx`."""
        item = self.docs[idx]
        return item
|
||||
@@ -0,0 +1,43 @@
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from bertopic.backend import BaseEmbedder
|
||||
from langchain_core.embeddings import Embeddings
|
||||
|
||||
|
||||
class LangChainBackend(BaseEmbedder):
    """LangChain Embedding Model.

    This class uses the LangChain Embedding class to embed the documents.

    Arguments:
        embedding_model: A LangChain Embedding Instance.

    Examples:
    ```python
    from langchain_community.embeddings import HuggingFaceInstructEmbeddings
    from bertopic.backend import LangChainBackend

    hf_embedding = HuggingFaceInstructEmbeddings()
    langchain_embedder = LangChainBackend(hf_embedding)
    ```
    """

    def __init__(self, embedding_model: Embeddings):
        # Run the base initialiser so the attributes it sets (notably
        # `word_embedding_model`) exist on this backend as well.
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process (not used by
                     this backend)

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Prepare documents, replacing empty strings with a single space
        prepared_documents = [" " if doc == "" else doc for doc in documents]
        response = self.embedding_model.embed_documents(prepared_documents)
        return np.array(response)
|
||||
@@ -0,0 +1,129 @@
|
||||
import numpy as np
|
||||
from typing import List, Union
|
||||
from model2vec import StaticModel
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class Model2VecBackend(BaseEmbedder):
    """Model2Vec embedding model.

    Arguments:
        embedding_model: Either a model2vec model or a
                         string pointing to a model2vec model
        distill: Indicates whether to distill a sentence-transformers compatible model.
                 The distillation will happen during fitting of the topic model.
                 NOTE: Only works if `embedding_model` is a string.
        distill_kwargs: Keyword arguments to pass to the distillation process
                        of `model2vec.distill.distill`. The dict is copied, so
                        the caller's object is never modified.
        distill_vectorizer: A CountVectorizer used for creating a custom vocabulary
                            based on the same documents used for topic modeling.
                            NOTE: If "vocabulary" is in `distill_kwargs`, this will be ignored.

    Examples:
    To create a model, you can load in a string pointing to a
    model2vec model:

    ```python
    from bertopic.backend import Model2VecBackend

    sentence_model = Model2VecBackend("minishlab/potion-base-8M")
    ```

    or you can instantiate a model yourself:

    ```python
    from bertopic.backend import Model2VecBackend
    from model2vec import StaticModel

    embedding_model = StaticModel.from_pretrained("minishlab/potion-base-8M")
    sentence_model = Model2VecBackend(embedding_model)
    ```

    If you want to distill a sentence-transformers model with the vocabulary of the documents,
    run the following:

    ```python
    from bertopic.backend import Model2VecBackend

    sentence_model = Model2VecBackend("sentence-transformers/all-MiniLM-L6-v2", distill=True)
    ```
    """

    def __init__(
        self,
        embedding_model: Union[str, StaticModel],
        distill: bool = False,
        distill_kwargs: dict = None,
        distill_vectorizer: CountVectorizer = None,
    ):
        super().__init__()

        self.distill = distill
        # Private copy: `embed` stores the derived "vocabulary" in this dict,
        # and a shared default (or the caller's dict) would carry one
        # instance's vocabulary over to the next.
        self.distill_kwargs = dict(distill_kwargs) if distill_kwargs else {}
        self.distill_vectorizer = distill_vectorizer
        self._has_distilled = False

        # When we distill, we need a string pointing to a sentence-transformer model
        if self.distill:
            self._check_model2vec_installation()
            if not self.distill_vectorizer:
                self.distill_vectorizer = CountVectorizer()
            if isinstance(embedding_model, str):
                self.embedding_model = embedding_model
            else:
                raise ValueError("Please pass a string pointing to a sentence-transformer model when distilling.")

        # If we don't distill, we can pass a model2vec model directly or load from a string
        elif isinstance(embedding_model, StaticModel):
            self.embedding_model = embedding_model
        elif isinstance(embedding_model, str):
            self.embedding_model = StaticModel.from_pretrained(embedding_model)
        else:
            raise ValueError(
                "Please select a correct Model2Vec model: \n"
                "`from model2vec import StaticModel` \n"
                "`model = StaticModel.from_pretrained('minishlab/potion-base-8M')`"
            )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        When `distill` is enabled, the first call distills the model
        (using the vocabulary of `documents` unless one was supplied in
        `distill_kwargs`) before embedding.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Distill the model
        if self.distill and not self._has_distilled:
            from model2vec.distill import distill

            # Distill with the vocabulary of the documents
            if not self.distill_kwargs.get("vocabulary"):
                X = self.distill_vectorizer.fit_transform(documents)
                word_counts = np.array(X.sum(axis=0)).flatten()
                words = self.distill_vectorizer.get_feature_names_out()
                # Most frequent words first
                vocabulary = [word for word, _ in sorted(zip(words, word_counts), key=lambda x: x[1], reverse=True)]
                self.distill_kwargs["vocabulary"] = vocabulary

            # Distill the model
            self.embedding_model = distill(self.embedding_model, **self.distill_kwargs)

            # Distillation should happen only once and not for every embed call
            # The distillation should only happen the first time on the entire vocabulary
            self._has_distilled = True

        # Embed the documents
        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
        return embeddings

    def _check_model2vec_installation(self):
        """Raise a helpful `ImportError` when the distillation extra is missing."""
        try:
            from model2vec.distill import distill  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "To distill a model using model2vec, you need to run `pip install model2vec[distill]`"
            ) from err
|
||||
@@ -0,0 +1,200 @@
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
from typing import List, Union
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class MultiModalBackend(BaseEmbedder):
|
||||
"""Multimodal backend using Sentence-transformers.
|
||||
|
||||
The sentence-transformers embedding model used for
|
||||
generating word, document, and image embeddings.
|
||||
|
||||
Arguments:
|
||||
embedding_model: A sentence-transformers embedding model that
|
||||
can either embed both images and text or only text.
|
||||
If it only embeds text, then `image_model` needs
|
||||
to be used to embed the images.
|
||||
image_model: A sentence-transformers embedding model that is used
|
||||
to embed only images.
|
||||
batch_size: The sizes of image batches to pass
|
||||
|
||||
Examples:
|
||||
To create a model, you can load in a string pointing to a
|
||||
sentence-transformers model:
|
||||
|
||||
```python
|
||||
from bertopic.backend import MultiModalBackend
|
||||
|
||||
sentence_model = MultiModalBackend("clip-ViT-B-32")
|
||||
```
|
||||
|
||||
or you can instantiate a model yourself:
|
||||
```python
|
||||
from bertopic.backend import MultiModalBackend
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
embedding_model = SentenceTransformer("clip-ViT-B-32")
|
||||
sentence_model = MultiModalBackend(embedding_model)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embedding_model: Union[str, SentenceTransformer],
|
||||
image_model: Union[str, SentenceTransformer] = None,
|
||||
batch_size: int = 32,
|
||||
):
|
||||
super().__init__()
|
||||
self.batch_size = batch_size
|
||||
|
||||
# Text or Text+Image model
|
||||
if isinstance(embedding_model, SentenceTransformer):
|
||||
self.embedding_model = embedding_model
|
||||
elif isinstance(embedding_model, str):
|
||||
self.embedding_model = SentenceTransformer(embedding_model)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Please select a correct SentenceTransformers model: \n"
|
||||
"`from sentence_transformers import SentenceTransformer` \n"
|
||||
"`model = SentenceTransformer('clip-ViT-B-32')`"
|
||||
)
|
||||
|
||||
# Image Model
|
||||
self.image_model = None
|
||||
if image_model is not None:
|
||||
if isinstance(image_model, SentenceTransformer):
|
||||
self.image_model = image_model
|
||||
elif isinstance(image_model, str):
|
||||
self.image_model = SentenceTransformer(image_model)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Please select a correct SentenceTransformers model: \n"
|
||||
"`from sentence_transformers import SentenceTransformer` \n"
|
||||
"`model = SentenceTransformer('clip-ViT-B-32')`"
|
||||
)
|
||||
|
||||
try:
|
||||
self.tokenizer = self.embedding_model._first_module().processor.tokenizer
|
||||
except AttributeError:
|
||||
self.tokenizer = self.embedding_model.tokenizer
|
||||
except: # noqa: E722
|
||||
self.tokenizer = None
|
||||
|
||||
def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray:
|
||||
"""Embed a list of n documents/words or images into an n-dimensional
|
||||
matrix of embeddings.
|
||||
|
||||
Either documents, images, or both can be provided. If both are provided,
|
||||
then the embeddings are averaged.
|
||||
|
||||
Arguments:
|
||||
documents: A list of documents or words to be embedded
|
||||
images: A list of image paths to be embedded
|
||||
verbose: Controls the verbosity of the process
|
||||
|
||||
Returns:
|
||||
Document/words embeddings with shape (n, m) with `n` documents/words
|
||||
that each have an embeddings size of `m`
|
||||
"""
|
||||
# Embed documents
|
||||
doc_embeddings = None
|
||||
if documents[0] is not None:
|
||||
doc_embeddings = self.embed_documents(documents)
|
||||
|
||||
# Embed images
|
||||
image_embeddings = None
|
||||
if isinstance(images, list):
|
||||
image_embeddings = self.embed_images(images, verbose)
|
||||
|
||||
# Average embeddings
|
||||
averaged_embeddings = None
|
||||
if doc_embeddings is not None and image_embeddings is not None:
|
||||
averaged_embeddings = np.mean([doc_embeddings, image_embeddings], axis=0)
|
||||
|
||||
if averaged_embeddings is not None:
|
||||
return averaged_embeddings
|
||||
elif doc_embeddings is not None:
|
||||
return doc_embeddings
|
||||
elif image_embeddings is not None:
|
||||
return image_embeddings
|
||||
|
||||
def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
|
||||
"""Embed a list of n documents/words into an n-dimensional
|
||||
matrix of embeddings.
|
||||
|
||||
Arguments:
|
||||
documents: A list of documents or words to be embedded
|
||||
verbose: Controls the verbosity of the process
|
||||
|
||||
Returns:
|
||||
Document/words embeddings with shape (n, m) with `n` documents/words
|
||||
that each have an embeddings size of `m`
|
||||
"""
|
||||
truncated_docs = [self._truncate_document(doc) for doc in documents]
|
||||
embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
|
||||
return embeddings
|
||||
|
||||
def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
|
||||
"""Embed a list of n words into an n-dimensional
|
||||
matrix of embeddings.
|
||||
|
||||
Arguments:
|
||||
words: A list of words to be embedded
|
||||
verbose: Controls the verbosity of the process
|
||||
|
||||
Returns:
|
||||
Document/words embeddings with shape (n, m) with `n` documents/words
|
||||
that each have an embeddings size of `m`
|
||||
"""
|
||||
embeddings = self.embedding_model.encode(words, show_progress_bar=verbose)
|
||||
return embeddings
|
||||
|
||||
def embed_images(self, images, verbose):
|
||||
if self.batch_size:
|
||||
nr_iterations = int(np.ceil(len(images) / self.batch_size))
|
||||
|
||||
# Embed images per batch
|
||||
embeddings = []
|
||||
for i in tqdm(range(nr_iterations), disable=not verbose):
|
||||
start_index = i * self.batch_size
|
||||
end_index = (i * self.batch_size) + self.batch_size
|
||||
|
||||
images_to_embed = [
|
||||
Image.open(image) if isinstance(image, str) else image for image in images[start_index:end_index]
|
||||
]
|
||||
if self.image_model is not None:
|
||||
img_emb = self.image_model.encode(images_to_embed)
|
||||
else:
|
||||
img_emb = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
|
||||
embeddings.extend(img_emb.tolist())
|
||||
|
||||
# Close images
|
||||
if isinstance(images[0], str):
|
||||
for image in images_to_embed:
|
||||
image.close()
|
||||
embeddings = np.array(embeddings)
|
||||
else:
|
||||
images_to_embed = [Image.open(filepath) for filepath in images]
|
||||
if self.image_model is not None:
|
||||
embeddings = self.image_model.encode(images_to_embed)
|
||||
else:
|
||||
embeddings = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
|
||||
return embeddings
|
||||
|
||||
def _truncate_document(self, document):
    """Shorten `document` until its tokenized form has at most 77 tokens.

    Without a tokenizer the document is returned untouched.

    Arguments:
        document: The text to truncate

    Returns:
        The original document if it is short enough (or no tokenizer is set),
        otherwise the decoded, truncated text
    """
    if not self.tokenizer:
        return document

    # Re-encode after every cut: encode(decode(tokens)) does not have to
    # reproduce `tokens`, so one truncation may still leave the text too long
    while True:
        token_ids = self.tokenizer.encode(document)
        if len(token_ids) <= 77:
            return document

        # Skip the starting token and keep the 75 tokens that follow it
        document = self.tokenizer.decode(token_ids[1:76])
|
||||
@@ -0,0 +1,88 @@
|
||||
import time
|
||||
import openai
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from typing import List, Mapping, Any
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class OpenAIBackend(BaseEmbedder):
    """OpenAI Embedding Model.

    Arguments:
        client: A `openai.OpenAI` client.
        embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
                         For an overview of models see:
                         https://platform.openai.com/docs/models/embeddings
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds between batches.
        batch_size: The size of each batch.
        generator_kwargs: Kwargs passed to `client.embeddings.create`.
                          Can be used to define custom engines or
                          deployment_ids. A "model" entry takes precedence
                          over `embedding_model`. The mapping is copied, so
                          the object passed in is never modified.

    Examples:
    ```python
    import openai
    from bertopic.backend import OpenAIBackend

    client = openai.OpenAI(api_key="sk-...")
    openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
    ```
    """

    def __init__(
        self,
        client: openai.OpenAI,
        embedding_model: str = "text-embedding-ada-002",
        delay_in_seconds: float = None,
        batch_size: int = None,
        generator_kwargs: Mapping[str, Any] = None,
    ):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size

        # Keep a private copy: the "model" entry is written below, and a shared
        # default (or the caller's own dict) would carry that entry over to
        # other instances and silently override their `embedding_model`
        self.generator_kwargs = dict(generator_kwargs) if generator_kwargs else {}

        if self.generator_kwargs.get("model"):
            # An explicit model in the kwargs wins over `embedding_model`
            self.embedding_model = self.generator_kwargs.get("model")
        elif not self.generator_kwargs.get("engine"):
            # No engine given either, so request the configured model
            self.generator_kwargs["model"] = self.embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Prepare documents, replacing empty strings with a single space
        prepared_documents = [" " if doc == "" else doc for doc in documents]

        # Batch-wise embedding extraction
        if self.batch_size is not None:
            embeddings = []
            for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
                response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
                embeddings.extend([r.embedding for r in response.data])

                # Delay subsequent calls
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)

        # Extract embeddings all at once
        else:
            response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
            embeddings = [r.embedding for r in response.data]
        return np.array(embeddings)

    def _chunks(self, documents):
        """Yield consecutive slices of `documents` of at most `self.batch_size` items."""
        for i in range(0, len(documents), self.batch_size):
            yield documents[i : i + self.batch_size]
|
||||
@@ -0,0 +1,85 @@
|
||||
import numpy as np
|
||||
from typing import List, Union
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sentence_transformers.models import StaticEmbedding
|
||||
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class SentenceTransformerBackend(BaseEmbedder):
    """Sentence-transformers embedding model.

    The sentence-transformers embedding model used for generating document and
    word embeddings.

    Arguments:
        embedding_model: A sentence-transformers embedding model, either an
                         instantiated `SentenceTransformer` or a string with
                         the name of the model to load
        model2vec: Indicates whether `embedding_model` is a model2vec model.
                   NOTE: Only works if `embedding_model` is a string.
                   Otherwise, you can pass the model2vec model directly to `embedding_model`.

    Examples:
    To create a model, you can load in a string pointing to a
    sentence-transformers model:

    ```python
    from bertopic.backend import SentenceTransformerBackend

    sentence_model = SentenceTransformerBackend("all-MiniLM-L6-v2")
    ```

    or you can instantiate a model yourself:

    ```python
    from bertopic.backend import SentenceTransformerBackend
    from sentence_transformers import SentenceTransformer

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    sentence_model = SentenceTransformerBackend(embedding_model)
    ```

    If you want to use a model2vec model without having to install model2vec,
    you can pass the model2vec model as a string:

    ```python
    from bertopic.backend import SentenceTransformerBackend

    sentence_model = SentenceTransformerBackend("minishlab/potion-base-8M", model2vec=True)
    ```
    """

    def __init__(self, embedding_model: Union[str, SentenceTransformer], model2vec: bool = False):
        super().__init__()

        # Only set when a regular sentence-transformers model is loaded by name
        self._hf_model = None
        if model2vec and isinstance(embedding_model, str):
            # Wrap the model2vec weights in a static-embedding module
            static_embedding = StaticEmbedding.from_model2vec(embedding_model)
            self.embedding_model = SentenceTransformer(modules=[static_embedding])
        elif isinstance(embedding_model, SentenceTransformer):
            self.embedding_model = embedding_model
        elif isinstance(embedding_model, str):
            self.embedding_model = SentenceTransformer(embedding_model)
            self._hf_model = embedding_model
        else:
            raise ValueError(
                "Please select a correct SentenceTransformers model: \n"
                "`from sentence_transformers import SentenceTransformer` \n"
                "`model = SentenceTransformer('all-MiniLM-L6-v2')`"
            )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
        return embeddings
|
||||
@@ -0,0 +1,68 @@
|
||||
from bertopic.backend import BaseEmbedder
|
||||
from sklearn.utils.validation import check_is_fitted, NotFittedError
|
||||
|
||||
|
||||
class SklearnEmbedder(BaseEmbedder):
    """Embedding backend built on a scikit-learn pipeline.

    Any scikit-learn pipeline that can `.transform()` text can be used to
    generate document and word embeddings.

    Arguments:
        pipe: A scikit-learn pipeline that can `.transform()` text.

    Examples:
    Scikit-Learn is very flexible and it allows for many representations.
    A relatively simple pipeline is shown below.

    ```python
    from sklearn.pipeline import make_pipeline
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer

    from bertopic import BERTopic
    from bertopic.backend import SklearnEmbedder

    pipe = make_pipeline(
        TfidfVectorizer(),
        TruncatedSVD(100)
    )

    sklearn_embedder = SklearnEmbedder(pipe)
    topic_model = BERTopic(embedding_model=sklearn_embedder)
    ```

    This pipeline first constructs a sparse representation based on TF-IDF and then
    makes it dense by applying SVD. Alternatively, you might also construct something
    more elaborate. As long as you construct a scikit-learn compatible pipeline, you
    should be able to pass it to BERTopic.

    !!! Warning
        One caveat to be aware of is that scikit-learn's base `Pipeline` class does not
        support the `.partial_fit()`-API. If you have a pipeline that theoretically should
        be able to support online learning then you might want to explore
        the [scikit-partial](https://github.com/koaning/scikit-partial) project.
    """

    def __init__(self, pipe):
        super().__init__()
        self.pipe = pipe

    def embed(self, documents, verbose=False):
        """Turn n documents/words into a matrix of embeddings.

        A pipeline that has not been fitted yet is fitted on `documents`
        as part of this call.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: No-op variable that's kept around to keep the API consistent.
                     If you want to get feedback on training times, you should
                     use the sklearn API.

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        try:
            check_is_fitted(self.pipe)
            return self.pipe.transform(documents)
        except NotFittedError:
            # Not fitted yet: fit and transform in one go
            return self.pipe.fit_transform(documents)
|
||||
@@ -0,0 +1,94 @@
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from typing import List
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class SpacyBackend(BaseEmbedder):
    """Spacy embedding model.

    The Spacy embedding model used for generating document and
    word embeddings.

    Arguments:
        embedding_model: A loaded spacy `nlp` object. Model names given as
                         strings are not accepted here.

    Examples:
    To create a Spacy backend, you need to create an nlp object and
    pass it through this backend:

    ```python
    import spacy
    from bertopic.backend import SpacyBackend

    nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    spacy_model = SpacyBackend(nlp)
    ```

    To load in a transformer model use the following:

    ```python
    import spacy
    from thinc.api import set_gpu_allocator, require_gpu
    from bertopic.backend import SpacyBackend

    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    set_gpu_allocator("pytorch")
    require_gpu(0)
    spacy_model = SpacyBackend(nlp)
    ```

    If you run into gpu/memory-issues, please use:

    ```python
    import spacy
    from bertopic.backend import SpacyBackend

    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
    spacy_model = SpacyBackend(nlp)
    ```
    """

    def __init__(self, embedding_model):
        super().__init__()

        # The model is recognised by its type name, so spacy itself is not imported here
        if "spacy" in str(type(embedding_model)):
            self.embedding_model = embedding_model
        else:
            # A plain string never passes the check above, so the message must
            # not suggest passing a model name
            raise ValueError(
                "Please select a correct Spacy model by creating a nlp model using: "
                "`nlp = spacy.load('en_core_web_md')`"
            )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Handle empty documents, spaCy models automatically map
        # empty strings to the zero vector
        empty_document = " "

        # Extract embeddings
        embeddings = []
        for doc in tqdm(documents, position=0, leave=True, disable=not verbose):
            embedding = self.embedding_model(doc or empty_document)
            if embedding.has_vector:
                embedding = embedding.vector
            else:
                # No document vector available; read the transformer output instead
                embedding = embedding._.trf_data.tensors[-1][0]

            if not isinstance(embedding, np.ndarray) and hasattr(embedding, "get"):
                # Convert cupy array to numpy array
                embedding = embedding.get()
            embeddings.append(embedding)

        return np.array(embeddings)
|
||||
@@ -0,0 +1,55 @@
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from typing import List
|
||||
|
||||
from bertopic.backend import BaseEmbedder
|
||||
|
||||
|
||||
class USEBackend(BaseEmbedder):
    """Universal Sentence Encoder.

    USE encodes text into high-dimensional vectors that
    are used for semantic similarity in BERTopic.

    Arguments:
        embedding_model: An USE embedding model

    Examples:
    ```python
    import tensorflow_hub
    from bertopic.backend import USEBackend

    embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    use_embedder = USEBackend(embedding_model)
    ```
    """

    def __init__(self, embedding_model):
        super().__init__()

        # Probe the model with one sentence: anything that cannot be
        # called like a USE model is rejected right away
        try:
            embedding_model(["test sentence"])
        except TypeError:
            raise ValueError(
                "Please select a correct USE model: \n"
                "`import tensorflow_hub` \n"
                "`embedding_model = tensorflow_hub.load(path_to_model)`"
            )
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Turn n documents/words into a matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        vectors = []
        for doc in tqdm(documents, disable=not verbose):
            # The model is called on a one-item batch; keep its only row
            encoded = self.embedding_model([doc])
            vectors.append(encoded.cpu().numpy()[0])
        return np.array(vectors)
|
||||
@@ -0,0 +1,171 @@
|
||||
from ._base import BaseEmbedder
|
||||
|
||||
# Imports for light-weight variant of BERTopic
|
||||
from bertopic.backend._sklearn import SklearnEmbedder
|
||||
from bertopic._utils import MyLogger
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.pipeline import Pipeline as ScikitPipeline
|
||||
|
||||
# Module-level logger, configured at WARNING; `select_backend` changes the
# level on each call depending on its `verbose` argument
logger = MyLogger()
logger.configure("WARNING")
|
||||
|
||||
# Language names (lower-case) that `select_backend` maps to the multilingual model
languages = [
    "arabic", "bulgarian", "catalan", "czech", "danish", "german",
    "greek", "english", "spanish", "estonian", "persian", "finnish",
    "french", "canadian french", "galician", "gujarati", "hebrew", "hindi",
    "croatian", "hungarian", "armenian", "indonesian", "italian", "japanese",
    "georgian", "korean", "kurdish", "lithuanian", "latvian", "macedonian",
    "mongolian", "marathi", "malay", "burmese", "norwegian bokmal", "dutch",
    "polish", "portuguese", "brazilian portuguese", "romanian", "russian", "slovak",
    "slovenian", "albanian", "serbian", "swedish", "thai", "turkish",
    "ukrainian", "urdu", "vietnamese", "chinese (simplified)", "chinese (traditional)",
]
|
||||
|
||||
|
||||
def select_backend(embedding_model, language: str = None, verbose: bool = False) -> BaseEmbedder:
    """Select an embedding model based on language or a specific provided model.

    When selecting a language, we choose all-MiniLM-L6-v2 for English and
    paraphrase-multilingual-MiniLM-L12-v2 for all other languages as it supports 100+ languages.
    If sentence-transformers is not installed, in the case of a lightweight installation,
    a scikit-learn backend is default.

    Arguments:
        embedding_model: A `BaseEmbedder` (returned unchanged), a scikit-learn
                         pipeline, a model object of one of the supported
                         libraries, or a string naming a sentence-transformers
                         model. Any other value (e.g. `None`) falls through to
                         the language-based selection
        language: Name of the language, matched case-insensitively; "english"/"en",
                  "multilingual", or an entry of `languages`
        verbose: Sets the logger to INFO instead of WARNING

    Returns:
        model: The selected model backend.

    Raises:
        ValueError: If `language` is given but not supported.
    """
    logger.set_level("INFO" if verbose else "WARNING")

    # BERTopic language backend
    if isinstance(embedding_model, BaseEmbedder):
        return embedding_model

    # Scikit-learn backend
    if isinstance(embedding_model, ScikitPipeline):
        return SklearnEmbedder(embedding_model)

    # Optional third-party models are recognised by their type name, so the
    # corresponding package is only imported once a model of that kind is passed
    model_type = str(type(embedding_model))

    # Flair word embeddings
    if "flair" in model_type:
        from bertopic.backend._flair import FlairBackend

        return FlairBackend(embedding_model)

    # Spacy embeddings
    if "spacy" in model_type:
        from bertopic.backend._spacy import SpacyBackend

        return SpacyBackend(embedding_model)

    # Gensim embeddings
    if "gensim" in model_type:
        from bertopic.backend._gensim import GensimBackend

        return GensimBackend(embedding_model)

    # USE embeddings; both markers have to be tested explicitly because
    # `"a" and "b" in s` only evaluates `"b" in s`
    if "tensorflow" in model_type and "saved_model" in model_type:
        from bertopic.backend._use import USEBackend

        return USEBackend(embedding_model)

    # Sentence Transformer embeddings
    if "sentence_transformers" in model_type or isinstance(embedding_model, str):
        from ._sentencetransformers import SentenceTransformerBackend

        return SentenceTransformerBackend(embedding_model)

    # Hugging Face embeddings; both markers are tested explicitly (see USE above)
    if "transformers" in model_type and "pipeline" in model_type:
        from ._hftransformers import HFTransformerBackend

        return HFTransformerBackend(embedding_model)

    # Model2Vec embeddings
    if "model2vec" in model_type:
        from ._model2vec import Model2VecBackend

        return Model2VecBackend(embedding_model)

    # FastEmbed word embeddings
    if "fastembed" in model_type:
        from bertopic.backend._fastembed import FastEmbedBackend

        return FastEmbedBackend(embedding_model)

    # Select embedding model based on language
    if language:
        try:
            from ._sentencetransformers import SentenceTransformerBackend

            # Compare case-insensitively throughout
            requested = language.lower()
            if requested in ("english", "en"):
                return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
            elif requested in languages or requested == "multilingual":
                return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
            else:
                raise ValueError(
                    f"{language} is currently not supported. However, you can "
                    f"create any embeddings yourself and pass it through fit_transform(docs, embeddings)\n"
                    "Else, please select a language from the following list:\n"
                    f"{languages}"
                )

        # A ModuleNotFoundError might be a lightweight installation
        except ModuleNotFoundError as e:
            if e.name != "sentence_transformers":
                # Error occurred in a downstream module, probably not a lightweight install
                raise
            # Whole sentence_transformers module is missing, probably a lightweight install
            if verbose:
                logger.info(
                    "Automatically selecting lightweight scikit-learn embedding backend as sentence-transformers appears to not be installed."
                )
            pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(100))
            return SklearnEmbedder(pipe)

    # Nothing matched and no language given: default English model
    from ._sentencetransformers import SentenceTransformerBackend

    return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
|
||||
@@ -0,0 +1,43 @@
|
||||
import numpy as np
|
||||
from typing import List
|
||||
from bertopic.backend._base import BaseEmbedder
|
||||
from bertopic.backend._utils import select_backend
|
||||
|
||||
|
||||
class WordDocEmbedder(BaseEmbedder):
    """Combine a document- and word-level embedder.

    Both models are resolved to backends with `select_backend`; documents are
    embedded with the first one and words with the second one.

    Arguments:
        embedding_model: The model used for embedding documents
        word_embedding_model: The model used for embedding words
    """

    def __init__(self, embedding_model, word_embedding_model):
        super().__init__()

        self.embedding_model = select_backend(embedding_model)
        self.word_embedding_model = select_backend(word_embedding_model)

    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words with the word-level backend.

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Word embeddings with shape (n, m) with `n` words
            that each have an embeddings size of `m`
        """
        word_backend = self.word_embedding_model
        return word_backend.embed(words, verbose)

    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents with the document-level backend.

        Arguments:
            document: A list of documents to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document embeddings with shape (n, m) with `n` documents
            that each have an embeddings size of `m`
        """
        document_backend = self.embedding_model
        return document_backend.embed(document, verbose)
|
||||
Reference in New Issue
Block a user