📚 References
EmbedAnything: A high-performance, multimodal embedding pipeline.
This module provides functions and classes for embedding queries, files, and directories using different embedding models. It supports text, images, audio, PDFs, and other media types with various embedding backends (Candle, ONNX, Cloud).
Main Functions:
- embed_query: Embeds text queries and returns a list of EmbedData objects.
- embed_file: Embeds a single file and returns a list of EmbedData objects.
- embed_directory: Embeds all files in a directory and returns a list of EmbedData objects.
- embed_image_directory: Embeds all images in a directory.
- embed_audio_file: Embeds audio files using Whisper for transcription.
- embed_webpage: Embeds content from a webpage URL.
Main Classes:
- EmbeddingModel: Main class for loading and using embedding models.
- EmbedData: Represents embedded data with text, embedding vector, and metadata.
- TextEmbedConfig: Configuration for text embedding (chunking, batching, etc.).
- ColpaliModel: Specialized model for document/image-text embedding.
- ColbertModel: Model for late-interaction embeddings.
- Reranker: Model for re-ranking search results.
- AudioDecoderModel: Model for audio transcription (Whisper).
Usage Examples:
Text Embedding
from embed_anything import EmbeddingModel, WhichModel, TextEmbedConfig
import embed_anything
# Load a text embedding model
model = EmbeddingModel.from_pretrained_local(
    WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L12-v2"
)
# Configure embedding parameters
config = TextEmbedConfig(
    chunk_size=1000,               # Characters per chunk
    batch_size=32,                 # Process 32 chunks at once
    splitting_strategy="sentence"  # Split by sentences
)
# Embed a PDF file
data = embed_anything.embed_file("test_files/document.pdf", embedder=model, config=config)
# Access results
for item in data:
    print(f"Text: {item.text[:100]}...")
    print(f"Embedding dimension: {len(item.embedding)}")
    print(f"Metadata: {item.metadata}")
Image Embedding
import embed_anything
import numpy as np
from embed_anything import EmbedData, EmbeddingModel, WhichModel
# Load CLIP model for image-text embeddings
model = EmbeddingModel.from_pretrained_local(
    WhichModel.Clip,
    model_id="openai/clip-vit-base-patch16"
)
# Embed all images in a directory
data: list[EmbedData] = embed_anything.embed_image_directory(
    "test_files",
    embedder=model
)
# Convert to numpy array for similarity search
embeddings = np.array([item.embedding for item in data])
# Embed a text query
query = ["Photo of a monkey?"]
query_embedding = np.array(
    embed_anything.embed_query(query, embedder=model)[0].embedding
)
# Dot-product similarity (equals cosine similarity when embeddings are normalized)
similarities = np.dot(embeddings, query_embedding)
most_similar_idx = np.argmax(similarities)
print(f"Most similar image: {data[most_similar_idx].text}")
Audio Embedding
from embed_anything import (
    AudioDecoderModel,
    EmbeddingModel,
    embed_audio_file,
    TextEmbedConfig,
    WhichModel,
)
import embed_anything
# Load Whisper model for audio transcription
# Choose from: https://huggingface.co/distil-whisper or
# https://huggingface.co/collections/openai/whisper-release-6501bba2cf999715fd953013
audio_decoder = AudioDecoderModel.from_pretrained_hf(
    "openai/whisper-tiny.en",
    revision="main",
    model_type="tiny-en",
    quantized=False
)
# Load text embedding model for transcribed text
embedder = EmbeddingModel.from_pretrained_hf(
    WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
# Configure text embedding
config = TextEmbedConfig(chunk_size=200, batch_size=32)
# Embed audio file (transcribes then embeds)
data = embed_anything.embed_audio_file(
    "test_files/audio/samples_hp0.wav",
    audio_decoder=audio_decoder,
    embedder=embedder,
    text_embed_config=config,
)
# Access transcribed and embedded segments
for item in data:
    print(f"Transcribed text: {item.text}")
    print(f"Metadata: {item.metadata}")
Vector Database Integration
Store embeddings directly to a vector database without keeping them in memory:
import embed_anything
import os
from embed_anything import EmbeddingModel, WhichModel, TextEmbedConfig
from embed_anything.vectordb import PineconeAdapter
# Initialize Pinecone adapter
api_key = os.environ.get("PINECONE_API_KEY")
pinecone_adapter = PineconeAdapter(api_key)
# Create or use existing index
try:
    pinecone_adapter.delete_index("my-index")
except Exception:
    pass
pinecone_adapter.create_index(
    dimension=512,    # Embedding dimension
    metric="cosine",  # Similarity metric
    index_name="my-index"
)
# Load embedding model
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Clip,
    "openai/clip-vit-base-patch16"
)
# Embed images and stream directly to Pinecone
data = embed_anything.embed_image_directory(
    "test_files",
    embedder=model,
    adapter=pinecone_adapter,  # Streams to database
)
# Embeddings are now in Pinecone, not in memory
print("Embeddings stored in Pinecone!")
ONNX Models (Faster Inference)
from embed_anything import EmbeddingModel, WhichModel, ONNXModel, Dtype
# Load a pre-configured ONNX model (faster, lower memory)
model = EmbeddingModel.from_pretrained_onnx(
    WhichModel.Bert,
    model_name=ONNXModel.BGESmallENV15Q,  # Quantized BGE model
    dtype=Dtype.Q4F16
)
# Use like any other model
data = embed_anything.embed_file("test_files/document.pdf", embedder=model)
Semantic Chunking
from embed_anything import EmbeddingModel, WhichModel, TextEmbedConfig
import embed_anything
# Main embedding model
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L12-v2"
)
# Semantic encoder for chunk boundaries
semantic_encoder = EmbeddingModel.from_pretrained_hf(
    WhichModel.Jina,
    model_id="jinaai/jina-embeddings-v2-small-en"
)
# Configure semantic chunking
config = TextEmbedConfig(
    chunk_size=1000,
    batch_size=32,
    splitting_strategy="semantic",
    semantic_encoder=semantic_encoder
)
# Embed with semantic chunking
data = embed_anything.embed_file("test_files/document.pdf", embedder=model, config=config)
Supported Embedding Models:
- Text Models: BERT, Jina, Qwen3, Splade, ColBERT, Model2Vec
- Image Models: CLIP, SigLip
- Audio Models: Whisper, DistilWhisper
- Document Models: ColPali
- Rerankers: Jina Reranker, BGE Reranker, Qwen3 Reranker
- Cloud Models: OpenAI, Cohere, Gemini
For more examples and detailed documentation, visit: https://embed-anything.com
Adapter
Bases: ABC
__init__(api_key)
Initializes the Adapter object.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| api_key | str | The API key for accessing the adapter. | required |
convert(embeddings)
abstractmethod
Converts the embeddings to a list of dictionaries.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| embeddings | List[List[EmbedData]] | The list of embeddings. | required |
Returns:
| Type | Description |
|---|---|
| List[Dict] | A list of dictionaries. |
delete_index(index_name)
abstractmethod
Deletes an index.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| index_name | str | The name of the index to delete. | required |
upsert(data)
abstractmethod
Upserts the data into the index.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| data | List[Dict] | The list of data to upsert. | required |
AudioDecoderModel
Represents an audio decoder model.
Attributes:
| Name | Type | Description |
|---|---|---|
| model_id | str | The ID of the audio decoder model. |
| revision | str | The revision of the audio decoder model. |
| model_type | str | The type of the audio decoder model. |
| quantized | bool | A flag indicating whether the audio decoder model is quantized. |
Example:
model = embed_anything.AudioDecoderModel.from_pretrained_hf(
    model_id="openai/whisper-tiny.en",
    revision="main",
    model_type="tiny-en",
    quantized=False
)
ColbertModel
Represents the Colbert model.
__init__(hf_model_id=None, revision=None, path_in_repo=None)
embed(text_batch, batch_size=None, is_doc=True)
Embeds the given text and returns a list of EmbedData objects.
from_pretrained_onnx(hf_model_id=None, revision=None, path_in_repo=None)
Loads a pre-trained Colbert model from the Hugging Face model hub.
Attributes:
| Name | Description |
|---|---|
| hf_model_id | The ID of the model from Hugging Face. |
| revision | The revision of the model. |
| path_in_repo | The path to the model in the repository. |
Returns:
| Type | Description |
|---|---|
| ColbertModel | A ColbertModel object. |
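A brief usage sketch based on the signatures above; the Hugging Face repository ID and ONNX file path are illustrative assumptions:
from embed_anything import ColbertModel

model = ColbertModel.from_pretrained_onnx(
    hf_model_id="answerdotai/answerai-colbert-small-v1",  # assumed ONNX checkpoint
    path_in_repo="onnx/model.onnx",                       # assumed file path
)
# Late-interaction embeddings: one vector per token
doc_embeddings = model.embed(["ColBERT produces one vector per token."], is_doc=True)
query_embeddings = model.embed(["what is colbert?"], is_doc=False)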
ColpaliModel
Represents the Colpali model.
__init__(model_id, revision=None)
Initializes the ColpaliModel object.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| model_id | str | The ID of the model from Hugging Face. | required |
| revision | str \| None | The revision of the model. | None |
embed_file(file_path, batch_size=1)
Embeds the given PDF file and returns a list of EmbedData objects, one for each page in the file. The PDF is first converted to images, and each page image is then embedded.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file_path | str | The path to the PDF file to embed. | required |
| batch_size | int \| None | The batch size for processing the embeddings. Default is 1. | 1 |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects, one for each page in the file. |
embed_query(query)
Embeds the given query and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| query | str | The query to embed. | required |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
from_pretrained(model_id, revision=None)
Loads a pre-trained Colpali model from the Hugging Face model hub.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| model_id | str | The ID of the model from Hugging Face. | required |
| revision | str \| None | The revision of the model. | None |
Returns:
| Type | Description |
|---|---|
| ColpaliModel | A ColpaliModel object. |
from_pretrained_onnx(model_id, revision=None)
Loads a pre-trained Colpali model in ONNX format from the Hugging Face model hub.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| model_id | str | The ID of the model from Hugging Face. | required |
| revision | str \| None | The revision of the model. | None |
Returns:
| Type | Description |
|---|---|
| ColpaliModel | A ColpaliModel object. |
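A short usage sketch combining the methods above; the model ID is an assumed ColPali checkpoint:
from embed_anything import ColpaliModel

model = ColpaliModel.from_pretrained("vidore/colpali-v1.2-merged")  # assumed checkpoint
pages = model.embed_file("test_files/document.pdf", batch_size=1)   # one EmbedData per page
query = model.embed_query("What is the total revenue?")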
DocumentRank
Represents the rank of a document.
Attributes:
| Name | Type | Description |
|---|---|---|
| document | str | The document to rank. |
| relevance_score | float | The relevance score of the document. |
| rank | int | The rank of the document. |
Dtype
Bases: Enum
Represents the data type of the model.
EmbedData
Represents the data of an embedded file.
Attributes:
| Name | Type | Description |
|---|---|---|
| embedding | list[float] | The embedding of the file. |
| text | str | The text for which the embedding is generated. |
| metadata | dict[str, str] | Additional metadata associated with the embedding. |
EmbeddingModel
Represents an embedding model.
embed_audio_file(audio_file, audio_decoder, config=None)
Embeds the given audio file and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| audio_file | str | The path to the audio file to embed. | required |
| audio_decoder | AudioDecoderModel | The audio decoder for the audio file. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
embed_directory(directory, config=None, adapter=None)
Embeds the given directory and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| directory | str | The path to the directory to embed. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding. | None |
| adapter | Adapter \| None | The adapter for the embedding. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
embed_directory_stream(directory, config=None, adapter=None)
Embeds the given directory and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| directory | str | The path to the directory to embed. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding. | None |
| adapter | Adapter \| None | The adapter for the embedding. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
embed_file(file_path, config=None, adapter=None)
Embeds the given file and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file_path | str | The path to the file to embed. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding. | None |
| adapter | Adapter \| None | The adapter for the embedding. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
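These methods take no separate embedder argument, so the sketch below assumes they are called on a loaded EmbeddingModel instance:
from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel

model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
)
config = TextEmbedConfig(chunk_size=1000, batch_size=32)
# Embed a file directly from the model instance
data = model.embed_file("test_files/document.pdf", config=config)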
embed_files_batch(files, config=None, adapter=None)
Embeds the given files and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| files | list[str] | The list of files to embed. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding. | None |
| adapter | Adapter \| None | The adapter for the embedding. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
embed_query(query, config=None)
Embeds the given list of queries and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| query | list[str] | The list of queries to embed. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
embed_webpage(url, config=None, adapter=None)
Embeds the given webpage and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| url | str | The URL of the webpage to embed. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding. | None |
| adapter | Adapter \| None | The adapter for the embedding. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
from_pretrained_cloud(model, model_id, api_key=None)
Loads an embedding model from a cloud-based service.
Attributes:
| Name | Type | Description |
|---|---|---|
| model | WhichModel | The cloud service to use. Currently supports WhichModel.OpenAI and WhichModel.Cohere. |
| model_id | str | The ID of the model to use. |
| api_key | str \| None | The API key for accessing the model. If not provided, it is read from the corresponding environment variable. |
Returns:
| Name | Type | Description |
|---|---|---|
| EmbeddingModel | EmbeddingModel | An initialized EmbeddingModel object. |
Raises:
| Type | Description |
|---|---|
| ValueError | If an unsupported model is specified. |
Example:
# Using Cohere
model = EmbeddingModel.from_pretrained_cloud(
    model=WhichModel.Cohere,
    model_id="embed-english-v3.0"
)
# Using OpenAI
model = EmbeddingModel.from_pretrained_cloud(
    model=WhichModel.OpenAI,
    model_id="text-embedding-3-small"
)
from_pretrained_hf(model_id, revision=None, token=None, dtype=None)
Loads an embedding model from the Hugging Face model hub.
Attributes:
| Name | Description |
|---|---|
| model_id | The ID of the model. |
| revision | The revision of the model. |
| token | The Hugging Face token. |
| dtype | The dtype of the model. |
Returns: An EmbeddingModel object.
Example:
model = EmbeddingModel.from_pretrained_hf(
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main"
)
from_pretrained_onnx(model, model_name=None, hf_model_id=None, revision=None, dtype=None, path_in_repo=None)
Loads an ONNX embedding model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| model | WhichModel | The architecture of the embedding model to use. | required |
| model_name | ONNXModel \| None | The name of the model. Defaults to None. | None |
| hf_model_id | str \| None | The ID of the model from Hugging Face. Defaults to None. | None |
| revision | str \| None | The revision of the model. Defaults to None. | None |
| dtype | Dtype \| None | The dtype of the model. Defaults to None. | None |
| path_in_repo | str \| None | The path to the model in the repository. Defaults to None. | None |
Returns: An initialized EmbeddingModel object.
At least one of the following arguments must be provided:
- model_name
- hf_model_id
If hf_model_id is provided, dtype is ignored and path_in_repo must be provided, pointing to the model file in the repository. If model_name is provided, dtype is used to determine which model file to load.
Example:
model = EmbeddingModel.from_pretrained_onnx(
    model=WhichModel.Bert,
    model_name=ONNXModel.BGESmallENV15Q,
    dtype=Dtype.Q4F16
)
model = EmbeddingModel.from_pretrained_onnx(
    model=WhichModel.Bert,
    hf_model_id="jinaai/jina-embeddings-v3",
    path_in_repo="onnx/model_fp16.onnx"
)
Note: This method loads a pre-trained model in ONNX format, which can offer improved inference speed compared to standard PyTorch models. ONNX models are particularly useful for deployment scenarios where performance is critical.
ImageEmbedConfig
Represents the configuration for the Image Embedding model.
Attributes:
| Name | Type | Description |
|---|---|---|
| buffer_size | int \| None | The buffer size for the Image Embedding model. Default is 100. |
| batch_size | int \| None | The batch size for processing the embeddings. Default is 32. Increase or decrease it based on available memory. |
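A minimal sketch of passing this configuration to embed_image_directory, assuming the constructor accepts the attributes above as keyword arguments:
import embed_anything
from embed_anything import EmbeddingModel, ImageEmbedConfig, WhichModel

model = EmbeddingModel.from_pretrained_hf(WhichModel.Clip, "openai/clip-vit-base-patch16")
config = ImageEmbedConfig(buffer_size=100, batch_size=32)
data = embed_anything.embed_image_directory("test_files", embedder=model, config=config)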
ONNXModel
Bases: Enum
Enum representing various ONNX models.
| Enum Variant | Description |
|----------------------------------|--------------------------------------------------|
| `AllMiniLML6V2` | sentence-transformers/all-MiniLM-L6-v2 |
| `AllMiniLML6V2Q` | Quantized sentence-transformers/all-MiniLM-L6-v2 |
| `AllMiniLML12V2` | sentence-transformers/all-MiniLM-L12-v2 |
| `AllMiniLML12V2Q` | Quantized sentence-transformers/all-MiniLM-L12-v2|
| `ModernBERTBase` | nomic-ai/modernbert-embed-base |
| `ModernBERTLarge` | nomic-ai/modernbert-embed-large |
| `BGEBaseENV15` | BAAI/bge-base-en-v1.5 |
| `BGEBaseENV15Q` | Quantized BAAI/bge-base-en-v1.5 |
| `BGELargeENV15` | BAAI/bge-large-en-v1.5 |
| `BGELargeENV15Q` | Quantized BAAI/bge-large-en-v1.5 |
| `BGESmallENV15` | BAAI/bge-small-en-v1.5 - Default |
| `BGESmallENV15Q` | Quantized BAAI/bge-small-en-v1.5 |
| `NomicEmbedTextV1` | nomic-ai/nomic-embed-text-v1 |
| `NomicEmbedTextV15` | nomic-ai/nomic-embed-text-v1.5 |
| `NomicEmbedTextV15Q` | Quantized nomic-ai/nomic-embed-text-v1.5 |
| `ParaphraseMLMiniLML12V2` | sentence-transformers/paraphrase-MiniLM-L6-v2 |
| `ParaphraseMLMiniLML12V2Q` | Quantized sentence-transformers/paraphrase-MiniLM-L6-v2 |
| `ParaphraseMLMpnetBaseV2` | sentence-transformers/paraphrase-mpnet-base-v2 |
| `BGESmallZHV15` | BAAI/bge-small-zh-v1.5 |
| `MultilingualE5Small` | intfloat/multilingual-e5-small |
| `MultilingualE5Base` | intfloat/multilingual-e5-base |
| `MultilingualE5Large` | intfloat/multilingual-e5-large |
| `MxbaiEmbedLargeV1` | mixedbread-ai/mxbai-embed-large-v1 |
| `MxbaiEmbedLargeV1Q` | Quantized mixedbread-ai/mxbai-embed-large-v1 |
| `GTEBaseENV15` | Alibaba-NLP/gte-base-en-v1.5 |
| `GTEBaseENV15Q` | Quantized Alibaba-NLP/gte-base-en-v1.5 |
| `GTELargeENV15` | Alibaba-NLP/gte-large-en-v1.5 |
| `GTELargeENV15Q` | Quantized Alibaba-NLP/gte-large-en-v1.5 |
| `JINAV2SMALLEN` | jinaai/jina-embeddings-v2-small-en |
| `JINAV2BASEEN` | jinaai/jina-embeddings-v2-base-en |
| `JINAV3` | jinaai/jina-embeddings-v3 |
| `SPLADEPPENV1` | prithivida/Splade_PP_en_v1 |
| `SPLADEPPENV2` | prithivida/Splade_PP_en_v2 |
Reranker
Represents the Reranker model.
__init__(model_id, revision=None, dtype=None, path_in_repo=None)
compute_scores(query, documents, batch_size)
Computes the scores for the given query and documents.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| query | list[str] | The query to compute the scores for. | required |
| documents | list[str] | The list of documents to compute the scores for. | required |
| batch_size | int | The batch size for processing the scores. | required |
Returns:
| Type | Description |
|---|---|
| list[list[float]] | A list of scores for the given query and documents. |
from_pretrained(model_id, revision=None, dtype=None, path_in_repo=None)
Loads a pre-trained Reranker model from the Hugging Face model hub.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| model_id | str | The ID of the model from Hugging Face. | required |
| revision | str \| None | The revision of the model. | None |
| dtype | Dtype \| None | The dtype of the model. | None |
| path_in_repo | str \| None | The path to the model in the repository. | None |
rerank(query, documents, batch_size)
Reranks the given documents for the query and returns a list of RerankerResult objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| query | list[str] | The list of queries to rerank the documents against. | required |
| documents | list[str] | The list of documents to rerank. | required |
| batch_size | int | The number of documents to process per batch. | required |
Returns:
| Type | Description |
|---|---|
| list[RerankerResult] | A list of RerankerResult objects. |
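A combined usage sketch for the methods above; the model ID is an assumed reranker checkpoint, and Dtype.F16 is an assumed enum variant:
from embed_anything import Dtype, Reranker

reranker = Reranker.from_pretrained("jinaai/jina-reranker-v1-turbo-en", dtype=Dtype.F16)
results = reranker.rerank(
    ["What is Rust?"],
    ["Rust is a systems programming language.", "Paris is the capital of France."],
    batch_size=2,
)
# Each result carries the query plus ranked DocumentRank entries
for result in results:
    for doc in result.documents:
        print(doc.rank, doc.relevance_score, doc.document)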
RerankerResult
Represents the result of the reranking process.
Attributes:
| Name | Type | Description |
|---|---|---|
| query | str | The query the documents were reranked against. |
| documents | list[DocumentRank] | The reranked documents. |
TextEmbedConfig
Represents the configuration for the Text Embedding model.
Attributes:
| Name | Type | Description |
|---|---|---|
| chunk_size | int \| None | The chunk size for the Text Embedding model. Default is 1000 characters. |
| batch_size | int \| None | The batch size for processing the embeddings. Default is 32. Increase or decrease it based on available memory. |
| buffer_size | int \| None | The buffer size for the Text Embedding model. Default is 100. |
| late_chunking | bool \| None | Whether to use late chunking, which increases the context taken into account for each chunk. Default is False. |
| splitting_strategy | str \| None | The strategy to use for splitting the text into chunks. Default is "sentence". If "semantic" splitting is used, semantic_encoder is required. |
| semantic_encoder | EmbeddingModel \| None | The semantic encoder for the Text Embedding model. Default is None. |
| use_ocr | bool \| None | Whether to use OCR for text extraction. Default is False. |
| tesseract_path | str \| None | The path to the Tesseract OCR executable. Default is None, which uses the system path. |
| pdf_backend | str \| None | The backend to use for PDF text extraction. |
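A configuration sketch exercising the OCR-related attributes above, assuming the constructor accepts them as keyword arguments:
from embed_anything import TextEmbedConfig

config = TextEmbedConfig(
    chunk_size=1000,
    batch_size=32,
    splitting_strategy="sentence",
    use_ocr=True,         # extract text from scanned PDFs via OCR
    tesseract_path=None,  # None falls back to the system Tesseract
)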
embed_audio_file(file_path, audio_decoder, embedder, text_embed_config=TextEmbedConfig(chunk_size=1000, batch_size=32))
Embeds the given audio file and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file_path | str | The path to the audio file to embed. | required |
| audio_decoder | AudioDecoderModel | The audio decoder model to use. | required |
| embedder | EmbeddingModel | The embedding model to use. | required |
| text_embed_config | TextEmbedConfig \| None | The configuration for the embedding model. | TextEmbedConfig(chunk_size=1000, batch_size=32) |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
Example:
import embed_anything
audio_decoder = embed_anything.AudioDecoderModel.from_pretrained_hf(
    "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False
)
embedder = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
config = embed_anything.TextEmbedConfig(chunk_size=1000, batch_size=32)
data = embed_anything.embed_audio_file(
    "test_files/audio/samples_hp0.wav",
    audio_decoder=audio_decoder,
    embedder=embedder,
    text_embed_config=config,
)
embed_directory(file_path, embedder, extensions, config=None, adapter=None)
Embeds the files in the given directory and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file_path | str | The path to the directory containing the files to embed. | required |
| embedder | EmbeddingModel | The embedding model to use. | required |
| extensions | list[str] | The list of file extensions to consider for embedding. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding model. | None |
| adapter | Adapter \| None | The adapter to use for storing the embeddings in a vector database. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
Example:
import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
data = embed_anything.embed_directory("test_files", embedder=model, extensions=[".pdf"])
embed_file(file_path, embedder, config=None, adapter=None)
Embeds the given file and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file_path | str | The path to the file to embed. | required |
| embedder | EmbeddingModel | The embedding model to use. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding model. | None |
| adapter | Adapter \| None | The adapter to use for storing the embeddings in a vector database. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
Example:
import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
data = embed_anything.embed_file("test_files/test.pdf", embedder=model)
embed_files_batch(files, embedder, config=None, adapter=None)
Embeds the given files and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| files | list[str] | The list of files to embed. | required |
| embedder | EmbeddingModel | The embedding model to use. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding model. | None |
| adapter | Adapter \| None | The adapter to use for storing the embeddings in a vector database. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
Example:
import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
data = embed_anything.embed_files_batch(
    ["test_files/test.pdf", "test_files/test.txt"],
    embedder=model,
    config=embed_anything.TextEmbedConfig(),
    adapter=None,
)
embed_html(file_name, embedder, origin=None, config=None, adapter=None)
Embeds the given HTML file and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file_name | str | The path to the HTML file to embed. | required |
| embedder | EmbeddingModel | The embedding model to use. | required |
| origin | str \| None | The origin of the HTML file. | None |
| config | TextEmbedConfig \| None | The configuration for the embedding model. | None |
| adapter | Adapter \| None | The adapter to use for storing the embeddings. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
Example:
import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
data = embed_anything.embed_html(
    "test_files/test.html", embedder=model, origin="https://www.akshaymakes.com/"
)
embed_image_directory(file_path, embedder, config=None, adapter=None)
Embeds the images in the given directory and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file_path | str | The path to the directory containing the images to embed. | required |
| embedder | EmbeddingModel | The embedding model to use. | required |
| config | ImageEmbedConfig \| None | The configuration for the embedding model. | None |
| adapter | Adapter \| None | The adapter to use for storing the embeddings in a vector database. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
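Example (adapted from the image-embedding walkthrough above):
import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Clip,
    "openai/clip-vit-base-patch16",
)
data = embed_anything.embed_image_directory("test_files", embedder=model)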
embed_query(query, embedder, config=None)
Embeds the given list of queries and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| query | list[str] | The list of queries to embed. | required |
| embedder | EmbeddingModel | The embedding model to use. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding model. | None |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] | A list of EmbedData objects. |
Example:
import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_hf(
    embed_anything.WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    revision="main",
)
data = embed_anything.embed_query(["Hello world"], embedder=model)
embed_webpage(url, embedder, config, adapter)
Embeds the webpage at the given URL and returns a list of EmbedData objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| url | str | The URL of the webpage to embed. | required |
| embedder | EmbeddingModel | The embedding model to use. | required |
| config | TextEmbedConfig \| None | The configuration for the embedding model. | required |
| adapter | Adapter \| None | The adapter to use for storing the embeddings. | required |
Returns:
| Type | Description |
|---|---|
| list[EmbedData] \| None | A list of EmbedData objects. |
Example:
import embed_anything
model = embed_anything.EmbeddingModel.from_pretrained_cloud(
    model=embed_anything.WhichModel.OpenAI, model_id="text-embedding-3-small"
)
config = embed_anything.TextEmbedConfig(chunk_size=1000, batch_size=32)
data = embed_anything.embed_webpage(
    "https://www.akshaymakes.com/", embedder=model, config=config, adapter=None
)