Pinecone-Compatible Vector Search Demo

Overview

This tutorial demonstrates how to use MatrixOne Python SDK's Pinecone-compatible API for vector operations. You'll learn how to:

Create a table with 16-dimensional vectors
Build an IVF vector index for efficient similarity search
Use the Pinecone-compatible interface for queries and operations
Perform metadata filtering with Pinecone-style operators
Upsert and delete vectors using familiar Pinecone methods

Key Advantage: MatrixOne provides a Pinecone-compatible wrapper over its native vector capabilities, allowing easy migration from Pinecone while leveraging MatrixOne's enterprise database features.

MatrixOne Python SDK Documentation

For complete API reference and advanced features, please refer to:

Before You Start

Prerequisites

MatrixOne database installed and running (Installation Guide)
Python 3.8 or higher installed
MySQL client for verification (optional)

Install MatrixOne Python SDK

pip3 install matrixone-python-sdk

Complete Working Example

Below is a complete, runnable example demonstrating all Pinecone-compatible features:

from matrixone import Client
from matrixone.config import get_connection_params
from matrixone.sqlalchemy_ext import create_vector_column
from matrixone.orm import declarative_base
from sqlalchemy import BigInteger, Column, String, Float
import numpy as np

# Set random seed for reproducible results
np.random.seed(42)

print("="* 70)
print("MatrixOne Pinecone Index Compatibility Demo")
print("="* 70)

# Step 1: Connect to database
print("\nStep 1: Connect to MatrixOne Database")
print("-"* 70)

host, port, user, password, database = get_connection_params(database='demo')
client = Client()
client.connect(host=host, port=port, user=user, password=password, database=database)
print(f"Successfully connected to database: {host}:{port}/{database}")

# Step 2: Define table structure
print("\nStep 2: Define Table Structure (16-dimensional vectors)")
print("-"* 70)

Base = declarative_base()

class DocumentTable(Base):
    """Document table with 16-dimensional vectors"""
    __tablename__ = "pinecone_demo_docs"

    id = Column(BigInteger, primary_key=True)
    title = Column(String(200))
    category = Column(String(100))
    content = Column(String(1000))
    score = Column(Float)
    embedding = create_vector_column(16, "f32")  # 16-dimensional vector

print(f"Defined table: {DocumentTable.__tablename__}")
print(f"- Primary key: id (BigInteger)")
print(f"- Fields: title, category, content, score")
print(f"- Vector: embedding (16-dimensional, float32)")

# Step 3: Create table
print("\nStep 3: Create Table")
print("-"* 70)

client.drop_table(DocumentTable)  # Drop old table if exists
client.create_table(DocumentTable)
print("Table created successfully")

# Step 4: Prepare and insert initial records
print("\nStep 4: Prepare 3 Initial Records")
print("-"* 70)

initial_documents = [
    {
        "id": 1,
        "title": "Python Programming Basics",
        "category": "Programming",
        "content": "Python is an easy-to-learn high-level programming language...",
        "score": 4.5,
        "embedding": np.random.rand(16).astype(np.float32).tolist()
    },
    {
        "id": 2,
        "title": "Machine Learning Fundamentals",
        "category": "AI",
        "content": "Machine learning is a core technology of artificial intelligence...",
        "score": 4.8,
        "embedding": np.random.rand(16).astype(np.float32).tolist()
    },
    {
        "id": 3,
        "title": "Database Design Principles",
        "category": "Database",
        "content": "Good database design is the foundation for building efficient applications...",
        "score": 4.2,
        "embedding": np.random.rand(16).astype(np.float32).tolist()
    }
]

client.batch_insert(DocumentTable, initial_documents)
print(f"Successfully inserted {len(initial_documents)} initial records")

# Step 5: Create IVF vector index
print("\nStep 5: Enable IVF and Create Vector Index")
print("-"* 70)

client.vector_ops.create_ivf(
    "pinecone_demo_docs",  # Table name
    "idx_embedding_ivf",   # Index name
    "embedding",           # Column name
    lists=2,               # Number of IVF lists
    op_type="vector_l2_ops"  # Use L2 distance
)
print("IVF index created successfully")

# Step 6: Get Pinecone-compatible index object
print("\nStep 6: Get Pinecone-Compatible Index Object")
print("-"* 70)

pinecone_index = client.get_pinecone_index("pinecone_demo_docs", "embedding")
print("Successfully obtained Pinecone-compatible index object")

Key Steps Explained

1. Define Table with ORM

class DocumentTable(Base):
    __tablename__ = "pinecone_demo_docs"
    id = Column(BigInteger, primary_key=True)
    title = Column(String(200))
    category = Column(String(100))
    score = Column(Float)
    embedding = create_vector_column(16, "f32")  # 16-dim vector

2. Create IVF Index

client.vector_ops.create_ivf(
    table_name,    # Your table name
    index_name,    # Index name
    column_name,   # Vector column name
    lists=2,       # Number of clusters (adjust based on data size)
    op_type="vector_l2_ops"  # Distance metric
)

3. Get Pinecone Interface

pinecone_index = client.get_pinecone_index(table_name, vector_column)

This returns a Pinecone-compatible object with methods like query(), upsert(), delete(), etc.

Pinecone API Usage Examples

Vector Similarity Query

Perform basic vector similarity search:

# Step 7: Perform vector similarity queries
query_vector = initial_documents[0]["embedding"]

# Basic query
results = pinecone_index.query(
    vector=query_vector,
    top_k=3,
    include_metadata=True
)

print(f"Query returned {len(results.matches)} results:")
for i, match in enumerate(results.matches, 1):
    print(f"{i}. ID: {match.id}")
    print(f"Similarity score: {match.score:.4f}")
    print(f"Title: {match.metadata['title']}")
    print(f"Category: {match.metadata['category']}")

Query with Metadata Filters

Use Pinecone-style filters to narrow search results:

Simple Equality Filter

# Filter documents with category 'AI'
results_ai = pinecone_index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True,
    filter={"category": "AI"}
)
print(f"Found {len(results_ai.matches)} AI documents")

Range Filter

# Filter documents with score >= 4.5
results_high_score = pinecone_index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True,
    filter={"score": {"$gte": 4.5}}
)
print(f"Found {len(results_high_score.matches)} high-score documents")

AND Filter (Multiple Conditions)

# Filter: category 'AI' AND score > 4.0
results_combined = pinecone_index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True,
    filter={
        "$and": [
            {"category": "AI"},
            {"score": {"$gt": 4.0}}
        ]
    }
)

$in Operator

# Query documents with category in ['AI', 'Programming']
results_in = pinecone_index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True,
    filter={"category": {"$in": ["AI", "Programming"]}}
)

$or Operator

# Query: category 'AI' OR score < 4.3
results_or = pinecone_index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True,
    filter={
        "$or": [
            {"category": "AI"},
            {"score": {"$lt": 4.3}}
        ]
    }
)

Upsert Operations

Upsert (insert or update) vectors:

# Prepare data for upsert
upsert_data = [
    {
        "id": 2,  # Update existing record
        "title": "Advanced Deep Learning",
        "category": "AI",
        "content": "Deep learning uses multi-layer neural networks...",
        "score": 4.9,
        "embedding": np.random.rand(16).astype(np.float32).tolist()
    },
    {
        "id": 4,  # New record
        "title": "Cloud Native Architecture",
        "category": "Architecture",
        "content": "Cloud-native leverages containers and microservices...",
        "score": 4.6,
        "embedding": np.random.rand(16).astype(np.float32).tolist()
    }
]

# Perform upsert
pinecone_index.upsert(upsert_data)
print(f"Successfully upserted {len(upsert_data)} records")

Delete Operations

Delete vectors by ID:

# Delete document with ID 5
delete_ids = [5]
pinecone_index.delete(delete_ids)
print(f"Successfully deleted document(s) with ID: {delete_ids}")

Get Index Statistics

Retrieve index information:

stats = pinecone_index.describe_index_stats()
print(f"Index statistics:")
print(f"- Total vector count: {stats.get('total_vector_count', 'N/A')}")
print(f"- Dimension: {stats.get('dimension', 'N/A')}")

Query with Vector Values

Include vector values in query results:

results_with_values = pinecone_index.query(
    vector=query_vector,
    top_k=2,
    include_metadata=True,
    include_values=True  # Include vector values
)

for match in results_with_values.matches:
    print(f"ID: {match.id}, Title: {match.metadata['title']}")
    if match.values:
        print(f"Vector (first 5 dims): {match.values[:5]}")

Full Working Script

Save this as pinecone_demo.py and run with python3 pinecone_demo.py:

from matrixone import Client
from matrixone.config import get_connection_params
from matrixone.sqlalchemy_ext import create_vector_column
from matrixone.orm import declarative_base
from sqlalchemy import BigInteger, Column, String, Float
import numpy as np

np.random.seed(42)

print("="* 70)
print("MatrixOne Pinecone Index Compatibility Demo")
print("="* 70)

# Connect to database
host, port, user, password, database = get_connection_params()
client = Client()
client.connect(host=host, port=port, user=user, password=password, database=database)

# Define table structure
Base = declarative_base()

class DocumentTable(Base):
    __tablename__ = "pinecone_demo_docs"
    id = Column(BigInteger, primary_key=True)
    title = Column(String(200))
    category = Column(String(100))
    content = Column(String(1000))
    score = Column(Float)
    embedding = create_vector_column(16, "f32")

# Create table
client.drop_table(DocumentTable)
client.create_table(DocumentTable)

# Insert initial data
initial_documents = [
    {
        "id": 1,
        "title": "Python Programming Basics",
        "category": "Programming",
        "content": "Python is an easy-to-learn high-level programming language...",
        "score": 4.5,
        "embedding": np.random.rand(16).astype(np.float32).tolist()
    },
    {
        "id": 2,
        "title": "Machine Learning Fundamentals",
        "category": "AI",
        "content": "Machine learning is a core technology...",
        "score": 4.8,
        "embedding": np.random.rand(16).astype(np.float32).tolist()
    },
    {
        "id": 3,
        "title": "Database Design Principles",
        "category": "Database",
        "content": "Good database design is the foundation...",
        "score": 4.2,
        "embedding": np.random.rand(16).astype(np.float32).tolist()
    }
]

client.batch_insert(DocumentTable, initial_documents)

# Create IVF index
client.vector_ops.create_ivf(
    "pinecone_demo_docs",
    "idx_embedding_ivf",
    "embedding",
    lists=2,
    op_type="vector_l2_ops"
)

# Get Pinecone-compatible index
pinecone_index = client.get_pinecone_index("pinecone_demo_docs", "embedding")

# Basic query
query_vector = initial_documents[0]["embedding"]
results = pinecone_index.query(
    vector=query_vector,
    top_k=3,
    include_metadata=True
)

print(f"\n Query returned {len(results.matches)} results")

# Query with filter
results_ai = pinecone_index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True,
    filter={"category": "AI"}
)

print(f"Found {len(results_ai.matches)} AI documents")

# Upsert new data
upsert_data = [
    {
        "id": 4,
        "title": "Cloud Native Architecture",
        "category": "Architecture",
        "content": "Cloud-native leverages containers...",
        "score": 4.6,
        "embedding": np.random.rand(16).astype(np.float32).tolist()
    }
]

pinecone_index.upsert(upsert_data)
print(f"Upserted {len(upsert_data)} records")

# Get statistics
stats = pinecone_index.describe_index_stats()
print(f"Total vectors: {stats.get('total_vector_count', 'N/A')}")

# Cleanup
client.disconnect()
print("\n Demo completed!")

Key Features Demonstrated

✨ Pinecone-Compatible API Capabilities

Get Pinecone Index Interface

pinecone_index = client.get_pinecone_index(table_name, vector_column)

Wraps existing MatrixOne table with Pinecone API
Works with any table that has an IVF vector index
Supports 16-dimensional (or any dimension) float32 vectors
Query with Similarity Search

results = pinecone_index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True,
    include_values=False,
    filter={"category": "AI"}
)

Returns results.matches list with id, score, metadata, values
Supports metadata filtering
Configurable result format
Metadata Filtering
Support for Pinecone filter operators: $eq, $ne, $lt, $gt, $lte, $gte, $in, $nin
Logical operators: $and, $or
Filter by any metadata field during query
Upsert Operations

pinecone_index.upsert([
    {"id": 1, "title": "...", "embedding": [...]}
])

Batch insert/update with single API call
Automatic conflict resolution (update if ID exists)
All table columns mapped to metadata
Delete Operations

pinecone_index.delete([1, 2, 3])

Delete specific vectors by ID
Supports batch deletion
Index Statistics

stats = pinecone_index.describe_index_stats()

Get total vector count
Get index dimension
Check index health

Pinecone Filter Operators

MatrixOne's Pinecone API supports standard filter operators:

# Equality
filter={'category': {'$eq': 'Electronics'}}

# Inequality
filter={'price': {'$ne': 0}}

# Less than / Greater than
filter={'price': {'$lt': 500}}
filter={'price': {'$gte': 100}}

# In list
filter={'category': {'$in': ['Electronics', 'Furniture']}}

# Not in list
filter={'status': {'$nin': ['discontinued']}}

# Complex AND condition
filter={
    '$and': [
        {'category': {'$eq': 'Electronics'}},
        {'price': {'$lt': 1000}}
    ]
}

# Complex OR condition
filter={
    '$or': [
        {'category': {'$eq': 'Electronics'}},
        {'price': {'$lt': 50}}
    ]
}

Distance Metrics

MatrixOne supports multiple distance metrics via IVF index creation:

# L2 (Euclidean) Distance
client.vector_ops.create_ivf(
    table_name, index_name, column_name,
    lists=2,
    op_type="vector_l2_ops"  # L2 distance
)

# Cosine Similarity
client.vector_ops.create_ivf(
    table_name, index_name, column_name,
    lists=2,
    op_type="vector_cosine_ops"  # Cosine similarity
)

# Inner Product
client.vector_ops.create_ivf(
    table_name, index_name, column_name,
    lists=2,
    op_type="vector_ip_ops"  # Inner product
)

Best Practices

1. Batch Upsert for Better Performance

# Good: Batch upsert multiple vectors at once
vectors = [
    {'id': f'vec-{i}', 'values': generate_embedding(), 'metadata': {...}}
    for i in range(1000)
]
index.upsert(vectors=vectors)

# Avoid: Individual upserts in loop (slower)
for i in range(1000):
    index.upsert(vectors=[{'id': f'vec-{i}', 'values': generate_embedding()}])

2. Use Filters to Reduce Search Space

# Efficient: Filter before vector search
results = index.query(
    vector=query_vec,
    top_k=5,
    filter={'category': {'$eq': 'Electronics'}}  # Narrows search space
)

# Less efficient: Filter after fetching all results
all_results = index.query(vector=query_vec, top_k=100)
filtered = [r for r in all_results['matches'] if r['metadata']['category'] == 'Electronics'][:5]

3. Choose Appropriate Metric

# For semantic similarity (text embeddings)
metric='cosine'  # Normalized, range [-1, 1]

# For geometric distance
metric='euclidean'  # L2 distance

# For raw similarity scores
metric='dotproduct'  # Inner product

4. Include Only Needed Data

# Efficient: Don't fetch vectors if not needed
results = index.query(
    vector=query_vec,
    top_k=10,
    include_values=False,  # Saves bandwidth
    include_metadata=True
)

# Less efficient: Always fetching vectors
results = index.query(
    vector=query_vec,
    top_k=10,
    include_values=True  # Larger response size
)

Troubleshooting

Issue: "Vector dimension mismatch"

Solution: Ensure all vectors have the same dimension as the index

# Index created with 16 dimensions
index = PineconeIndex(client=client, index_name='idx', dimension=16)

# All upserted vectors must be 16-dimensional
vector_correct = np.random.rand(16).tolist()  # ✅ Correct
vector_wrong = np.random.rand(32).tolist()    # ❌ Wrong - will error

Issue: "Filter not working as expected"

Solution: Use correct Pinecone filter syntax

# Correct: Use Pinecone operators
filter={'category': {'$eq': 'Electronics'}}  # ✅ Correct

# Wrong: Direct comparison (not supported)
filter={'category': 'Electronics'}  # ❌ Wrong

Issue: "No results returned from query"

Solution: Check if vectors exist and filter is not too restrictive

# Check index stats
stats = index.describe_index_stats()
print(f"Total vectors: {stats['total_vector_count']}")

# Try query without filter first
results = index.query(vector=query_vec, top_k=5, filter=None)

# Then add filter incrementally
results = index.query(vector=query_vec, top_k=5, filter={'category': {'$eq': 'Electronics'}})

Issue: "Metadata not returned in query results"

Solution: Set include_metadata=True in query

# Correct: Include metadata flag
results = index.query(
    vector=query_vec,
    top_k=5,
    include_metadata=True  # ✅ Must set to True
)

# Wrong: Default is False
results = index.query(vector=query_vec, top_k=5)  # metadata will be None

Issue: "Table or index not found"

Solution: Ensure table and IVF index exist before getting Pinecone interface

# Verify table exists
try:
    client.create_table(DocumentTable)
except:
    print("Table already exists")

# Verify IVF index exists
try:
    client.vector_ops.create_ivf(table_name, index_name, column_name, lists=2)
except:
    print("Index already exists")

# Then get Pinecone interface
pinecone_index = client.get_pinecone_index(table_name, vector_column)

Issue: "IVF index required"

Solution: Create IVF index before using Pinecone API

# Must create IVF index first
client.vector_ops.create_ivf(
    "my_table",
    "idx_embedding",
    "embedding",
    lists=2,
    op_type="vector_l2_ops"
)

# Then can use Pinecone API
pinecone_index = client.get_pinecone_index("my_table", "embedding")