Compare and evaluate embedding models for semantic search
Benchmark embedding models on your actual data to find the optimal one for semantic search. Use when the default model underperforms or you need to evaluate tradeoffs between speed, quality, and specialized capabilities like code search or Q&A.
/plugin marketplace add mindmorass/reflex
/plugin install reflex@mindmorass-reflex

This skill inherits all available tools. When active, it can use any tool Claude has access to.
Evaluate and compare different embedding models on your actual data.
The default all-MiniLM-L6-v2 model is a good starting point, but it may not be optimal for your specific content. Whether an alternative is worth switching to depends on several factors:
| Factor | Impact |
|---|---|
| Domain vocabulary | Technical jargon may need specialized models |
| Document length | Some models handle long text better |
| Query style | Conversational vs keyword queries |
| Speed requirements | Larger models = better quality but slower |
| Memory constraints | Some models need significant RAM |
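Before running a full benchmark, a quick spot-check can show whether the default model copes with your domain vocabulary. This is a minimal sketch, assuming sentence-transformers is installed; the query and passages are placeholders, not project data:

from sentence_transformers import SentenceTransformer, util

# Placeholder domain-specific query and candidate passages
query = "How do I rotate the service account credentials?"
passages = [
    "Credential rotation is handled by the auth sidecar every 24 hours.",
    "The dashboard shows request latency per endpoint.",
]

model = SentenceTransformer("all-MiniLM-L6-v2")
q_emb = model.encode(query, convert_to_tensor=True)
p_emb = model.encode(passages, convert_to_tensor=True)

# The relevant passage should clearly outrank the irrelevant one;
# if it doesn't, benchmarking alternative models is worth the effort.
print(util.cos_sim(q_emb, p_emb))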
| Model | Dimensions | Speed | Quality | Size |
|---|---|---|---|---|
| all-MiniLM-L6-v2 | 384 | Fast | Good | 80MB |
| all-MiniLM-L12-v2 | 384 | Medium | Better | 120MB |
| all-mpnet-base-v2 | 768 | Slow | Best | 420MB |
| Model | Best For | Dimensions |
|---|---|---|
| multi-qa-MiniLM-L6-cos-v1 | Question answering | 384 |
| msmarco-MiniLM-L6-cos-v5 | Search/retrieval | 384 |
| paraphrase-MiniLM-L6-v2 | Semantic similarity | 384 |
| Model | Best For | Source |
|---|---|---|
| krlvi/sentence-t5-base-nlpl-code_search_net | Code search | HuggingFace |
| flax-sentence-embeddings/st-codesearch-distilroberta-base | Code + docs | HuggingFace |
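These code-tuned models use the same SentenceTransformer API as the general-purpose ones. A rough sketch, assuming the HuggingFace model above loads through sentence-transformers; the query and snippets are illustrative:

from sentence_transformers import SentenceTransformer, util

# Code-search models embed natural-language queries and code into a shared space
model = SentenceTransformer("flax-sentence-embeddings/st-codesearch-distilroberta-base")

query = "read a JSON file and return a dict"
snippets = [
    "def load_config(path):\n    with open(path) as f:\n        return json.load(f)",
    "def start_server(port):\n    app.run(port=port)",
]

query_emb = model.encode(query, convert_to_tensor=True)
code_emb = model.encode(snippets, convert_to_tensor=True)

# The first snippet should score noticeably higher for this query
print(util.cos_sim(query_emb, code_emb))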
#!/usr/bin/env python3
"""Create a test dataset for embedding comparison."""
from typing import List, Dict
import json
def create_test_dataset(
documents: List[str],
queries: List[str],
relevance: Dict[str, List[int]]
) -> Dict:
"""
Create a test dataset.
Args:
documents: List of documents to search
queries: List of test queries
relevance: Dict mapping query index to relevant document indices
Returns:
Test dataset dict
"""
return {
"documents": documents,
"queries": queries,
"relevance": relevance
}
# Example: Create test dataset from your actual content
def create_from_qdrant(collection_name: str, sample_size: int = 50) -> Dict:
"""Create test dataset from existing Qdrant collection."""
from qdrant_client import QdrantClient
client = QdrantClient(url="http://localhost:6333")
# Scroll through collection to get samples
results = client.scroll(
collection_name=collection_name,
limit=sample_size,
with_payload=True
)
documents = [p.payload.get("content", "") for p in results[0]]
# You'll need to manually create queries and mark relevance
# This is the ground truth the benchmark scores models against
return {
"documents": documents,
"queries": [], # Fill in manually
"relevance": {} # Fill in manually
}
# Example test dataset
EXAMPLE_DATASET = {
"documents": [
"Python is a high-level programming language known for readability.",
"FastAPI is a modern web framework for building APIs with Python.",
"Qdrant is a vector database for AI applications.",
"Docker containers provide isolated runtime environments.",
"REST APIs use HTTP methods for client-server communication.",
],
"queries": [
"How do I build a web API?",
"What is a vector database?",
"How do I containerize my application?",
],
"relevance": {
"0": [1, 4], # Query 0 is relevant to docs 1 and 4
"1": [2], # Query 1 is relevant to doc 2
"2": [3], # Query 2 is relevant to doc 3
}
}
if __name__ == "__main__":
with open("test_dataset.json", "w") as f:
json.dump(EXAMPLE_DATASET, f, indent=2)
print("Created test_dataset.json")
#!/usr/bin/env python3
"""Benchmark embedding models on test dataset."""
import json
import time
from typing import Dict, List
import numpy as np
from sentence_transformers import SentenceTransformer
# Models to compare
MODELS = [
"all-MiniLM-L6-v2",
"all-MiniLM-L12-v2",
"all-mpnet-base-v2",
"multi-qa-MiniLM-L6-cos-v1",
"msmarco-MiniLM-L6-cos-v5",
]
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
"""Compute cosine similarity between two vectors."""
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
def compute_metrics(
model: SentenceTransformer,
documents: List[str],
queries: List[str],
relevance: Dict[str, List[int]],
k: int = 3
) -> Dict:
"""
Compute retrieval metrics for a model.
Metrics:
- Precision@k: Fraction of top-k results that are relevant
- Recall@k: Fraction of relevant docs found in top-k
- MRR: Mean Reciprocal Rank
"""
# Encode documents
doc_embeddings = model.encode(documents)
precisions = []
recalls = []
reciprocal_ranks = []
for q_idx, query in enumerate(queries):
q_key = str(q_idx)
if q_key not in relevance:
continue
relevant_docs = set(relevance[q_key])
# Encode query and compute similarities
q_embedding = model.encode([query])[0]
similarities = [
cosine_similarity(q_embedding, doc_emb)
for doc_emb in doc_embeddings
]
# Get top-k results
top_k_indices = np.argsort(similarities)[-k:][::-1]
# Precision@k
hits = len(set(top_k_indices) & relevant_docs)
precisions.append(hits / k)
# Recall@k
recalls.append(hits / len(relevant_docs))
# MRR (reciprocal rank of first relevant result)
for rank, idx in enumerate(top_k_indices, 1):
if idx in relevant_docs:
reciprocal_ranks.append(1 / rank)
break
else:
reciprocal_ranks.append(0)
return {
"precision_at_k": np.mean(precisions),
"recall_at_k": np.mean(recalls),
"mrr": np.mean(reciprocal_ranks)
}
def benchmark_model(model_name: str, dataset: Dict) -> Dict:
"""Benchmark a single model."""
print(f"\nBenchmarking: {model_name}")
# Load model (time it)
load_start = time.perf_counter()
model = SentenceTransformer(model_name)
load_time = time.perf_counter() - load_start
# Time encoding
encode_start = time.perf_counter()
_ = model.encode(dataset["documents"])
encode_time = time.perf_counter() - encode_start
# Compute retrieval metrics
metrics = compute_metrics(
model,
dataset["documents"],
dataset["queries"],
dataset["relevance"]
)
# Get model info
test_embedding = model.encode(["test"])[0]
return {
"model": model_name,
"dimensions": len(test_embedding),
"load_time_s": round(load_time, 2),
"encode_time_s": round(encode_time, 3),
"encode_per_doc_ms": round(encode_time / len(dataset["documents"]) * 1000, 2),
**{k: round(v, 3) for k, v in metrics.items()}
}
def run_benchmark(dataset_path: str = "test_dataset.json") -> List[Dict]:
"""Run full benchmark."""
with open(dataset_path) as f:
dataset = json.load(f)
print(f"Dataset: {len(dataset['documents'])} docs, {len(dataset['queries'])} queries")
results = []
for model_name in MODELS:
try:
result = benchmark_model(model_name, dataset)
results.append(result)
print(f" P@3: {result['precision_at_k']:.3f}, MRR: {result['mrr']:.3f}")
except Exception as e:
print(f" Error: {e}")
return results
def print_results_table(results: List[Dict]):
"""Print results as formatted table."""
print("\n" + "=" * 80)
print("BENCHMARK RESULTS")
print("=" * 80)
# Header
print(f"{'Model':<35} {'Dim':>5} {'P@3':>6} {'R@3':>6} {'MRR':>6} {'ms/doc':>8}")
print("-" * 80)
# Sort by MRR (or your preferred metric)
for r in sorted(results, key=lambda x: -x['mrr']):
print(f"{r['model']:<35} {r['dimensions']:>5} {r['precision_at_k']:>6.3f} "
f"{r['recall_at_k']:>6.3f} {r['mrr']:>6.3f} {r['encode_per_doc_ms']:>8.2f}")
print("=" * 80)
if __name__ == "__main__":
results = run_benchmark()
print_results_table(results)
# Save results
with open("benchmark_results.json", "w") as f:
json.dump(results, f, indent=2)
print("\nResults saved to benchmark_results.json")
all-MiniLM-L6-v2 (default)
├── Fast enough for real-time
├── Good general quality
├── Low memory footprint
└── Use when: Starting out, general content
all-MiniLM-L12-v2
├── Slightly better quality
├── Still reasonably fast
└── Use when: Quality matters more than speed
all-mpnet-base-v2
├── Best quality
├── Significantly slower
├── Higher memory usage
└── Use when: Accuracy is critical, batch processing OK
multi-qa-MiniLM-L6-cos-v1
├── Optimized for Q&A
├── Better with question-form queries
└── Use when: Building Q&A system, FAQ retrieval
msmarco-MiniLM-L6-cos-v5
├── Optimized for search
├── Better with keyword-style queries
└── Use when: Building search engine, keyword queries
| Your Content | Recommended Model |
|---|---|
| General documentation | all-MiniLM-L6-v2 |
| Technical/code docs | msmarco-MiniLM-L6-cos-v5 |
| Q&A / FAQ | multi-qa-MiniLM-L6-cos-v1 |
| High-stakes retrieval | all-mpnet-base-v2 |
| Mixed content | Run benchmark on your data |
After deciding on a model:
# 1. Update environment
export EMBEDDING_MODEL=all-mpnet-base-v2
# 2. Re-embed all collections (embeddings aren't portable between models!)
python scripts/reembed_collections.py
# 3. Rebuild router embeddings
python scripts/rebuild_router.py
Important: Different models produce embeddings with different dimensions. You cannot mix embeddings from different models in the same collection!
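One way to catch a mismatch before any writes is to compare the collection's configured vector size with the model's output dimension. A sketch, assuming a single unnamed vector per collection; "my_collection" is a placeholder name:

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

client = QdrantClient(url="http://localhost:6333")
model = SentenceTransformer("all-mpnet-base-v2")

# Assumes a single unnamed vector config; named vectors need different handling
info = client.get_collection("my_collection")  # placeholder collection name
collection_dim = info.config.params.vectors.size
model_dim = model.get_sentence_embedding_dimension()

if collection_dim != model_dim:
    print(f"Mismatch: collection stores {collection_dim}-d vectors, "
          f"model outputs {model_dim}-d. Re-embed before switching.")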
#!/usr/bin/env python3
"""Re-embed all collections with a new model."""
import os
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer
NEW_MODEL = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
def reembed_collection(collection_name: str, model: SentenceTransformer, client: QdrantClient):
"""Re-embed a single collection."""
# Get existing data
results = client.scroll(
collection_name=collection_name,
limit=10000,
with_payload=True
)
points = results[0]
if not points:
print(f" {collection_name}: empty, skipping")
return
# Extract documents
documents = [p.payload.get("content", "") for p in points]
# Re-embed
new_embeddings = model.encode(documents).tolist()
vector_size = len(new_embeddings[0])
# Delete and recreate collection
client.delete_collection(collection_name)
client.create_collection(
collection_name=collection_name,
vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
)
# Re-add points
new_points = [
PointStruct(
id=p.id,
vector=new_embeddings[i],
payload=p.payload
)
for i, p in enumerate(points)
]
client.upsert(collection_name=collection_name, points=new_points)
print(f" {collection_name}: re-embedded {len(points)} documents")
def main():
print(f"Re-embedding with model: {NEW_MODEL}")
model = SentenceTransformer(NEW_MODEL)
client = QdrantClient(url=QDRANT_URL)
collections = client.get_collections().collections
print(f"Found {len(collections)} collections")
for coll in collections:
reembed_collection(coll.name, model, client)
print("✅ Re-embedding complete!")
if __name__ == "__main__":
main()
Track findings from your benchmarks.
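One lightweight way to do this is to append a dated entry per run alongside benchmark_results.json; the file name and fields below are illustrative, not a required schema:

import json
from datetime import date

entry = {
    "date": str(date.today()),
    "dataset": "test_dataset.json",
    "chosen_model": "all-MiniLM-L6-v2",  # example outcome, not a recommendation
    "notes": "Keyword-style queries; default model held up on this dataset.",
}

with open("benchmark_findings.jsonl", "a") as f:
    f.write(json.dumps(entry) + "\n")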
This skill should be used when the user asks to "create a slash command", "add a command", "write a custom command", "define command arguments", "use command frontmatter", "organize commands", "create command with file references", "interactive command", "use AskUserQuestion in command", or needs guidance on slash command structure, YAML frontmatter fields, dynamic arguments, bash execution in commands, user interaction patterns, or command development best practices for Claude Code.
This skill should be used when the user asks to "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "agent tools", "agent colors", "autonomous agent", or needs guidance on agent structure, system prompts, triggering conditions, or agent development best practices for Claude Code plugins.
This skill should be used when the user asks to "create a hook", "add a PreToolUse/PostToolUse/Stop hook", "validate tool use", "implement prompt-based hooks", "use ${CLAUDE_PLUGIN_ROOT}", "set up event-driven automation", "block dangerous commands", or mentions hook events (PreToolUse, PostToolUse, Stop, SubagentStop, SessionStart, SessionEnd, UserPromptSubmit, PreCompact, Notification). Provides comprehensive guidance for creating and implementing Claude Code plugin hooks with focus on advanced prompt-based hooks API.