6. Embedding Functions
Embedding functions convert text documents into vector embeddings for similarity search. pyseekdb supports both built-in and custom embedding functions.
6.1 Default Embedding Function
The DefaultEmbeddingFunction uses all-MiniLM-L6-v2 and is the default embedding function if none is specified.
from pyseekdb import DefaultEmbeddingFunction
# Use default model (all-MiniLM-L6-v2, 384 dimensions)
ef = DefaultEmbeddingFunction()
# Use custom model (pass any sentence-transformers model name)
ef = DefaultEmbeddingFunction(model_name='all-MiniLM-L6-v2')
# Get embedding dimension (must match the collection's configured dimension)
print(f"Dimension: {ef.dimension}")  # 384
# Generate embeddings: call the function with a list of documents
embeddings = ef(["Hello world", "How are you?"])
print(f"Generated {len(embeddings)} embeddings, each with {len(embeddings[0])} dimensions")
6.2 Creating Custom Embedding Functions
You can create custom embedding functions by implementing the EmbeddingFunction protocol. The function must:
Implement a __call__ method that accepts Documents (str or List[str]) and returns Embeddings (List[List[float]]).
Optionally, implement a dimension property that returns the vector dimension.
Example: Sentence-Transformer Custom Embedding Function
from typing import List, Union
from pyseekdb import EmbeddingFunction
# Type aliases used by the embedding-function protocol
Documents = Union[str, List[str]]  # a single document or a list of documents
Embeddings = List[List[float]]     # one embedding vector per input document
Embedding = List[float]            # a single embedding vector
class SentenceTransformerCustomEmbeddingFunction(EmbeddingFunction[Documents]):
    """
    A custom embedding function using sentence-transformers with a specific model.

    The model is loaded lazily on first use, so constructing an instance does
    not require sentence-transformers to be importable.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", device: str = "cpu"):
        """
        Initialize the sentence-transformer embedding function.

        Args:
            model_name: Name of the sentence-transformers model to use
            device: Device to run the model on ('cpu' or 'cuda')
        """
        self.model_name = model_name
        self.device = device
        self._model = None      # loaded lazily by _ensure_model_loaded()
        self._dimension = None  # discovered from the model on first load

    def _ensure_model_loaded(self) -> None:
        """Lazy load the embedding model and cache its output dimension."""
        if self._model is None:
            # Keep the try block narrow: only the import can raise the
            # "not installed" ImportError we want to translate.
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError as exc:
                raise ImportError(
                    "sentence-transformers is not installed. "
                    "Please install it with: pip install sentence-transformers"
                ) from exc
            self._model = SentenceTransformer(self.model_name, device=self.device)
            # Probe the model once to learn the embedding dimension
            test_embedding = self._model.encode(["test"], convert_to_numpy=True)
            self._dimension = len(test_embedding[0])

    @property
    def dimension(self) -> int:
        """Get the dimension of embeddings produced by this function"""
        self._ensure_model_loaded()
        return self._dimension

    def __call__(self, input: Documents) -> Embeddings:
        """
        Generate embeddings for the given documents.

        Args:
            input: Single document (str) or list of documents (List[str])

        Returns:
            List of embedding vectors (List[List[float]])
        """
        self._ensure_model_loaded()
        # Handle single string input: normalize to a one-element list
        if isinstance(input, str):
            input = [input]
        # Handle empty input
        if not input:
            return []
        # Generate embeddings
        embeddings = self._model.encode(
            input,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        # Convert numpy arrays to plain Python lists
        return [embedding.tolist() for embedding in embeddings]
# Use the custom embedding function
from pyseekdb import Configuration, HNSWConfiguration
ef = SentenceTransformerCustomEmbeddingFunction(
    model_name='all-MiniLM-L6-v2',
    device='cpu'
)
# dimension must match the model's output (all-MiniLM-L6-v2 produces 384)
collection = client.create_collection(
    name="my_collection",
    configuration=Configuration(
        hnsw=HNSWConfiguration(dimension=384, distance='cosine')
    ),
    embedding_function=ef
)
Example: OpenAI Embedding Function
from typing import List, Union
import os
import openai
from pyseekdb import EmbeddingFunction
# Type aliases used by the embedding-function protocol
Documents = Union[str, List[str]]  # a single document or a list of documents
Embeddings = List[List[float]]     # one embedding vector per input document
Embedding = List[float]            # a single embedding vector
class OpenAIEmbeddingFunction(EmbeddingFunction[Documents]):
    """
    A custom embedding function using OpenAI's embedding API (openai>=1.0).
    """

    def __init__(self, model_name: str = "text-embedding-ada-002", api_key: str = None):
        """
        Initialize the OpenAI embedding function.

        Args:
            model_name: Name of the OpenAI embedding model
            api_key: OpenAI API key (if not provided, uses OPENAI_API_KEY env var)

        Raises:
            ValueError: If no API key is provided or found in the environment
        """
        self.model_name = model_name
        self.api_key = api_key or os.environ.get('OPENAI_API_KEY')
        if not self.api_key:
            raise ValueError("OpenAI API key is required")
        # The v1 SDK takes the key on a client object, not per request call.
        self._client = openai.OpenAI(api_key=self.api_key)
        # Dimension for text-embedding-ada-002 is 1536
        self._dimension = 1536 if "ada-002" in model_name else None

    @property
    def dimension(self) -> int:
        """Get the dimension of embeddings produced by this function"""
        if self._dimension is None:
            # Dimension is only known for recognized models
            raise ValueError("Dimension not set for this model")
        return self._dimension

    def __call__(self, input: Documents) -> Embeddings:
        """
        Generate embeddings using OpenAI API.

        Args:
            input: Single document (str) or list of documents (List[str])

        Returns:
            List of embedding vectors (List[List[float]])
        """
        # Handle single string input: normalize to a one-element list
        if isinstance(input, str):
            input = [input]
        # Handle empty input
        if not input:
            return []
        # Call OpenAI API (v1 SDK: no api_key kwarg on the create call)
        response = self._client.embeddings.create(
            model=self.model_name,
            input=input,
        )
        # v1 responses are objects, not dicts: use attribute access
        return [item.embedding for item in response.data]
# Use the custom embedding function
from pyseekdb import Configuration, HNSWConfiguration
ef = OpenAIEmbeddingFunction(
    model_name='text-embedding-ada-002',
    api_key='your-api-key'
)
# dimension must match the model's output (text-embedding-ada-002 produces 1536)
collection = client.create_collection(
    name="my_collection",
    configuration=Configuration(
        hnsw=HNSWConfiguration(dimension=1536, distance='cosine')
    ),
    embedding_function=ef
)
6.3 Embedding Function Requirements
When creating a custom embedding function, ensure:
Implement a __call__ method:
- Accepts: str or List[str] (a single document or a list of documents)
- Returns: List[List[float]] (a list of embeddings)
- Every embedding vector in the output must have the same dimension
Implement a dimension property (recommended):
- Returns: int (the dimension of embeddings produced by this function)
- This helps validate dimension consistency when creating collections
Handle edge cases:
Single string input should be converted to list
Empty input should return empty list
All embeddings in the output must have the same dimension
6.4 Using Custom Embedding Functions
Once you’ve created a custom embedding function, use it when creating or getting collections:
from pyseekdb import Configuration, HNSWConfiguration
# Create collection with custom embedding function
ef = MyCustomEmbeddingFunction()
# Using ef.dimension keeps the HNSW index dimension in sync with the model
collection = client.create_collection(
    name="my_collection",
    configuration=Configuration(
        hnsw=HNSWConfiguration(dimension=ef.dimension, distance='cosine')
    ),
    embedding_function=ef
)
# Get collection with custom embedding function
# (pass the same ef so queries are embedded consistently with stored data)
collection = client.get_collection("my_collection", embedding_function=ef)
# Use the collection - documents will be automatically embedded
collection.add(
    ids=["doc1", "doc2"],
    documents=["Document 1", "Document 2"],  # Embeddings auto-generated
    metadatas=[{"tag": "A"}, {"tag": "B"}]
)
# Query with texts - query embeddings auto-generated
results = collection.query(
    query_texts=["my query"],
    n_results=10
)