Semantic Search Implementation

Build powerful search systems that understand meaning rather than just matching keywords. Semantic search uses AI embeddings to find relevant content based on intent and context.

Overview

Semantic search enables:
  • Meaning-based search - Find content by intent, not just keywords
  • Contextual understanding - Understand synonyms and related concepts
  • Cross-language search - Search across different languages
  • Fuzzy matching - Find relevant content even with imprecise queries
  • Intelligent ranking - Rank results by semantic relevance

Vector Embeddings

Convert text to numerical vectors that capture meaning

Similarity Search

Find similar content using vector mathematics

Hybrid Search

Combine semantic and keyword search for best results

Real-time Indexing

Index new content automatically for immediate search

Quick Start Implementation

import requests
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json

class SemanticSearch:
    """In-memory semantic search over documents embedded through AnyAPI.

    Every indexed document is stored in ``self.documents`` under its id as a
    dict with ``content``, ``metadata`` and ``embedding`` keys. Searching
    embeds the query and ranks the stored documents by cosine similarity.
    """

    # Endpoint and model used for every embedding request.
    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    EMBEDDING_MODEL = "text-embedding-3-large"
    # Seconds before a request is abandoned; without a timeout ``requests``
    # waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 60

    def __init__(self, api_key):
        """Create an empty index that authenticates with *api_key*."""
        self.api_key = api_key
        self.documents = {}
        # Not read by this class (embeddings live inside ``self.documents``);
        # kept so existing code that touches the attribute keeps working.
        self.embeddings = {}

    def _request_embeddings(self, inputs):
        """Return the embedding vectors for *inputs*, in response order.

        *inputs* is sent as the request's ``input`` field and may be a single
        string or a list of strings.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        response = requests.post(
            self.EMBEDDINGS_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": self.EMBEDDING_MODEL,
                "input": inputs
            },
            timeout=self.REQUEST_TIMEOUT
        )
        # Fail with the HTTP error itself rather than a KeyError on the
        # missing "data" field of an error payload.
        response.raise_for_status()

        return [item["embedding"] for item in response.json()["data"]]

    def add_document(self, doc_id, content, metadata=None):
        """Embed *content* and store it in the index under *doc_id*.

        An existing entry with the same id is overwritten. *metadata*
        defaults to an empty dict.
        """
        embedding = self.get_embedding(content)

        self.documents[doc_id] = {
            "content": content,
            "metadata": metadata or {},
            "embedding": embedding
        }

        print(f"Added document {doc_id} to search index")

    def get_embedding(self, text):
        """Generate the embedding vector for a single *text* using AnyAPI."""
        return self._request_embeddings(text)[0]

    def search(self, query, top_k=5, min_similarity=0.3):
        """Return up to *top_k* documents most similar to *query*.

        Documents scoring below *min_similarity* are dropped. Each result is
        a dict with ``doc_id``, ``similarity``, ``content`` and ``metadata``,
        ordered from most to least similar. An empty index yields ``[]``
        without calling the API.
        """
        if not self.documents:
            return []

        query_embedding = self.get_embedding(query)

        # Score all documents with one pairwise call instead of one call per
        # document; row 0 holds the query's similarity to each document.
        doc_ids = list(self.documents)
        scores = cosine_similarity(
            [query_embedding],
            [self.documents[doc_id]["embedding"] for doc_id in doc_ids]
        )[0]

        matches = [
            {
                "doc_id": doc_id,
                "similarity": score,
                "content": self.documents[doc_id]["content"],
                "metadata": self.documents[doc_id]["metadata"]
            }
            for doc_id, score in zip(doc_ids, scores)
            if score >= min_similarity
        ]
        matches.sort(key=lambda match: match["similarity"], reverse=True)

        return matches[:top_k]

    def batch_add_documents(self, documents):
        """Embed and index several documents with a single API request.

        Each item of *documents* is a dict with ``id`` and ``content`` keys
        and an optional ``metadata`` key. An empty input indexes nothing and
        sends no request.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            ValueError: If the API returns a different number of embeddings
                than documents sent; nothing is stored in that case.
        """
        documents = list(documents)
        if not documents:
            print("Added 0 documents to search index")
            return

        embeddings = self._request_embeddings(
            [doc["content"] for doc in documents]
        )

        # zip() would silently drop the unmatched tail, leaving documents
        # unindexed without any signal.
        if len(embeddings) != len(documents):
            raise ValueError(
                f"Expected {len(documents)} embeddings, got {len(embeddings)}"
            )

        for doc, embedding in zip(documents, embeddings):
            self.documents[doc["id"]] = {
                "content": doc["content"],
                # "or {}" also covers an explicit None, matching add_document.
                "metadata": doc.get("metadata") or {},
                "embedding": embedding
            }

        print(f"Added {len(documents)} documents to search index")

# Usage example
search_engine = SemanticSearch("YOUR_API_KEY")

# Sample corpus as (id, content, category, author) rows
_sample_rows = [
    (
        "doc1",
        "Machine learning is a subset of artificial intelligence that enables computers to learn from data",
        "AI",
        "John Doe",
    ),
    (
        "doc2",
        "Python is a popular programming language for data science and web development",
        "Programming",
        "Jane Smith",
    ),
    (
        "doc3",
        "Deep learning uses neural networks to solve complex problems like image recognition",
        "AI",
        "Bob Johnson",
    ),
    (
        "doc4",
        "JavaScript is the primary language for building interactive web applications",
        "Programming",
        "Alice Wilson",
    ),
]

# Expand the rows into the dict shape batch_add_documents expects
documents = [
    {
        "id": row_id,
        "content": row_content,
        "metadata": {"category": row_category, "author": row_author},
    }
    for row_id, row_content, row_category, row_author in _sample_rows
]

# Add documents to search index
search_engine.batch_add_documents(documents)

# Search examples: run each query and print its heading followed by the hits
for _query, _heading in (
    ("AI and computers learning", "\nSearch results for 'AI and computers learning':"),
    ("coding languages for websites", "\nSearch results for 'coding languages for websites':"),
):
    results = search_engine.search(_query, top_k=3)
    print(_heading)
    for result in results:
        print(f"Score: {result['similarity']:.3f} - {result['content']}")

Advanced Search Implementations

Vector Database Integration

import pinecone
import requests

class PineconeSemanticSearch:
    """Semantic search that stores AnyAPI embeddings in a Pinecone index."""

    # Endpoint and model used for every embedding request.
    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    EMBEDDING_MODEL = "text-embedding-3-large"
    # Seconds before a request is abandoned; without a timeout ``requests``
    # waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 60

    def __init__(self, api_key, pinecone_key, index_name,
                 environment="us-west1-gcp"):
        """Connect to the Pinecone index *index_name*.

        Args:
            api_key: AnyAPI key used for embedding requests.
            pinecone_key: Pinecone API key.
            index_name: Name of the Pinecone index to read and write.
            environment: Pinecone environment passed to ``pinecone.init``.
        """
        self.api_key = api_key

        # NOTE(review): pinecone.init() / pinecone.Index() are module-level
        # calls; confirm the installed pinecone client still provides them.
        pinecone.init(api_key=pinecone_key, environment=environment)
        self.index = pinecone.Index(index_name)

    def _request_embeddings(self, inputs):
        """Return the embedding vectors for *inputs*, in response order.

        *inputs* is sent as the request's ``input`` field and may be a single
        string or a list of strings.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        response = requests.post(
            self.EMBEDDINGS_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": self.EMBEDDING_MODEL,
                "input": inputs
            },
            timeout=self.REQUEST_TIMEOUT
        )
        # Fail with the HTTP error itself rather than a KeyError on the
        # missing "data" field of an error payload.
        response.raise_for_status()

        return [item["embedding"] for item in response.json()["data"]]

    def index_documents(self, documents, namespace="default", batch_size=100):
        """Embed *documents* and upsert them into Pinecone with metadata.

        Documents are embedded and upserted *batch_size* at a time so that
        neither the embedding request nor the upsert grows without bound.
        Each document is a dict with ``id`` and ``content`` keys and optional
        ``title``, ``category``, ``author`` and ``url`` keys.

        Raises:
            ValueError: If *batch_size* is smaller than 1, or the API returns
                a different number of embeddings than documents sent.
            requests.HTTPError: If the embeddings API answers with an error.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        documents = list(documents)
        indexed = 0

        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            embeddings = self._request_embeddings(
                [doc["content"] for doc in batch]
            )

            # zip() would silently drop the unmatched tail.
            if len(embeddings) != len(batch):
                raise ValueError(
                    f"Expected {len(batch)} embeddings, got {len(embeddings)}"
                )

            vectors = [
                {
                    "id": doc["id"],
                    "values": embedding,
                    "metadata": {
                        "content": doc["content"][:1000],  # Truncate for metadata
                        "title": doc.get("title", ""),
                        "category": doc.get("category", ""),
                        "author": doc.get("author", ""),
                        "url": doc.get("url", "")
                    }
                }
                for doc, embedding in zip(batch, embeddings)
            ]

            self.index.upsert(vectors=vectors, namespace=namespace)
            indexed += len(vectors)

        print(f"Indexed {indexed} documents in Pinecone")

    def search(self, query, top_k=10, filters=None, namespace="default"):
        """Return the Pinecone matches for *query*.

        *filters* is forwarded unchanged as the ``filter`` argument of the
        index query, and matches include their stored metadata.
        """
        query_embedding = self._request_embeddings(query)[0]

        search_results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=filters,
            namespace=namespace
        )

        return search_results["matches"]

    def hybrid_search(self, query, keyword_filter=None, top_k=10,
                      filters=None, namespace="default"):
        """Combine semantic search with a case-insensitive keyword filter.

        Twice *top_k* semantic matches are fetched so that enough candidates
        remain after filtering. With *keyword_filter* set, only matches whose
        stored ``content`` metadata contains it are kept. *filters* and
        *namespace* are forwarded to :meth:`search`.
        """
        candidates = self.search(
            query,
            top_k=top_k * 2,
            filters=filters,
            namespace=namespace
        )

        if keyword_filter:
            needle = keyword_filter.lower()
            candidates = [
                match for match in candidates
                if needle in match["metadata"].get("content", "").lower()
            ]

        return candidates[:top_k]

# Usage
pinecone_search = PineconeSemanticSearch(
    api_key="YOUR_API_KEY",
    pinecone_key="YOUR_PINECONE_KEY",
    index_name="semantic-search-index",
)

# Index documents
documents = [
    dict(
        id="article_1",
        content="The future of artificial intelligence in healthcare looks promising...",
        title="AI in Healthcare",
        category="Technology",
        author="Dr. Sarah Johnson",
    ),
    # Add more documents...
]

pinecone_search.index_documents(documents)

# Search with filters: only matches whose category metadata equals "Technology"
_technology_only = {"category": {"$eq": "Technology"}}
results = pinecone_search.search(
    "machine learning medical applications",
    top_k=5,
    filters=_technology_only,
)

for result in results:
    print(f"Score: {result['score']:.3f} - {result['metadata']['title']}")

E-commerce Search Implementation

class EcommerceSemanticSearch:
    """In-memory semantic product search and recommendations.

    Products are stored in ``self.products`` under their id as a dict with
    ``data`` (the raw product dict), ``searchable_text`` and ``embedding``.
    """

    # Endpoint and model used for every embedding request.
    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    EMBEDDING_MODEL = "text-embedding-3-large"
    # Seconds before a request is abandoned; without a timeout ``requests``
    # waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 60

    # Product fields that make up the searchable text, in output order.
    _TEXT_FIELDS = (
        "title", "description", "brand", "category",
        "tags", "features", "color", "material", "size",
    )

    def __init__(self, api_key):
        """Create an empty product index that authenticates with *api_key*."""
        self.api_key = api_key
        self.products = {}

    def index_product(self, product_id, product_data):
        """Embed a product's searchable text and store it under *product_id*."""
        searchable_text = self.create_product_text(product_data)
        embedding = self.get_embedding(searchable_text)

        self.products[product_id] = {
            "data": product_data,
            "searchable_text": searchable_text,
            "embedding": embedding
        }

    @staticmethod
    def _field_text(value):
        """Render one product field as text; missing values become ``""``."""
        if value is None:
            return ""
        if isinstance(value, (list, tuple, set)):
            return " ".join(
                str(item) for item in value if item is not None and item != ""
            )
        # Plain strings pass through; numbers (e.g. a numeric size) are
        # stringified so that the final join cannot fail on them.
        return str(value)

    def create_product_text(self, product_data):
        """Build one space-separated searchable string from *product_data*.

        Uses title, description, brand, category, tags, features, color,
        material and size, in that order. Missing, ``None`` and empty fields
        are skipped, list fields are joined with spaces, and non-string
        values are converted with ``str``.
        """
        parts = (
            self._field_text(product_data.get(field))
            for field in self._TEXT_FIELDS
        )
        return " ".join(part for part in parts if part)

    def get_embedding(self, text):
        """Generate the embedding vector for *text*.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        response = requests.post(
            self.EMBEDDINGS_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": self.EMBEDDING_MODEL,
                "input": text
            },
            timeout=self.REQUEST_TIMEOUT
        )
        # Fail with the HTTP error itself rather than a KeyError on the
        # missing "data" field of an error payload.
        response.raise_for_status()

        return response.json()["data"][0]["embedding"]

    def _rank(self, reference_embedding, candidates, top_k):
        """Rank ``(product_id, product_info)`` pairs against an embedding.

        Returns up to *top_k* dicts with ``product_id``, ``similarity`` and
        ``product`` keys, most similar first.
        """
        if not candidates:
            return []

        # One pairwise call for all candidates; row 0 holds the reference
        # embedding's similarity to each of them.
        scores = cosine_similarity(
            [reference_embedding],
            [info["embedding"] for _, info in candidates]
        )[0]

        ranked = [
            {
                "product_id": product_id,
                "similarity": score,
                "product": info["data"]
            }
            for (product_id, info), score in zip(candidates, scores)
        ]
        ranked.sort(key=lambda entry: entry["similarity"], reverse=True)

        return ranked[:top_k]

    def search_products(self, query, filters=None, top_k=20):
        """Return up to *top_k* products most similar to *query*.

        With *filters* set, only products accepted by :meth:`matches_filters`
        are ranked. When no product qualifies, ``[]`` is returned without
        calling the API.
        """
        candidates = [
            (product_id, info)
            for product_id, info in self.products.items()
            if not filters or self.matches_filters(info["data"], filters)
        ]
        if not candidates:
            return []

        return self._rank(self.get_embedding(query), candidates, top_k)

    def matches_filters(self, product_data, filters):
        """Return True when *product_data* satisfies every entry of *filters*.

        Supported keys:
            price_range: ``(low, high)``, inclusive; a product without a
                price is treated as priced at 0.
            categories: collection of accepted ``category`` values.
            brands: collection of accepted ``brand`` values.
            in_stock: required ``in_stock`` value (missing counts as False).

        Any other key is ignored.
        """
        for key, expected in filters.items():
            if key == "price_range":
                low, high = expected[0], expected[1]
                if not (low <= product_data.get("price", 0) <= high):
                    return False
            elif key == "categories":
                if product_data.get("category") not in expected:
                    return False
            elif key == "brands":
                if product_data.get("brand") not in expected:
                    return False
            elif key == "in_stock":
                if product_data.get("in_stock", False) != expected:
                    return False

        return True

    def get_recommendations(self, product_id, top_k=5):
        """Return up to *top_k* products most similar to *product_id*.

        The product itself is excluded. An unknown *product_id* yields ``[]``.
        """
        if product_id not in self.products:
            return []

        others = [
            (pid, info)
            for pid, info in self.products.items()
            if pid != product_id  # Skip self
        ]

        return self._rank(self.products[product_id]["embedding"], others, top_k)

# Usage example
ecommerce_search = EcommerceSemanticSearch("YOUR_API_KEY")

# Sample product records
_headphones = {
    "title": "Wireless Noise-Cancelling Headphones",
    "description": "Premium over-ear headphones with active noise cancellation and 30-hour battery life",
    "brand": "AudioTech",
    "category": "Electronics",
    "price": 299.99,
    "tags": ["wireless", "bluetooth", "noise-cancelling", "premium"],
    "features": ["Active noise cancellation", "30-hour battery", "Quick charge"],
    "color": "Black",
    "in_stock": True,
}
_earbuds = {
    "title": "Bluetooth Sport Earbuds",
    "description": "Waterproof wireless earbuds perfect for workouts and running",
    "brand": "FitSound",
    "category": "Electronics",
    "price": 89.99,
    "tags": ["wireless", "sport", "waterproof", "earbuds"],
    "features": ["IPX7 waterproof", "Secure fit", "8-hour battery"],
    "color": "Blue",
    "in_stock": True,
}

# Index sample products
products = [
    {"id": "prod_1", "data": _headphones},
    {"id": "prod_2", "data": _earbuds},
]

# Index products
for product in products:
    ecommerce_search.index_product(
        product_id=product["id"],
        product_data=product["data"],
    )

# Search examples
results = ecommerce_search.search_products(
    "noise cancelling headphones for music", top_k=5
)

print("Search results:")
for result in results:
    print(f"Score: {result['similarity']:.3f} - {result['product']['title']}")

# Search with filters: in-stock electronics priced between 50 and 200
_audio_filters = {
    "price_range": [50, 200],
    "categories": ["Electronics"],
    "in_stock": True,
}
filtered_results = ecommerce_search.search_products(
    "wireless audio", filters=_audio_filters, top_k=5
)

print("\nFiltered search results:")
for result in filtered_results:
    print(f"Score: {result['similarity']:.3f} - {result['product']['title']} - ${result['product']['price']}")

# Get recommendations
recommendations = ecommerce_search.get_recommendations("prod_1", top_k=3)
print("\nRecommendations for product prod_1:")
for rec in recommendations:
    print(f"Score: {rec['similarity']:.3f} - {rec['product']['title']}")
class KnowledgeBaseSearch:
    """Chunked knowledge-base search with question answering.

    Documents are split into overlapping word chunks. Each chunk is stored in
    ``self.documents`` under ``"<doc_id>_chunk_<n>"`` together with its
    embedding, so a search matches individual passages, not whole documents.
    """

    # Endpoints and models used for API requests.
    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    CHAT_URL = "https://api.anyapi.ai/v1/chat/completions"
    EMBEDDING_MODEL = "text-embedding-3-large"
    CHAT_MODEL = "gpt-4o"
    # Seconds before a request is abandoned; without a timeout ``requests``
    # waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 60

    def __init__(self, api_key):
        """Create an empty knowledge base that authenticates with *api_key*."""
        self.api_key = api_key
        self.documents = {}

    def _post(self, url, payload):
        """POST *payload* as JSON to *url* and return the decoded response.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        response = requests.post(
            url,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            },
            json=payload,
            timeout=self.REQUEST_TIMEOUT
        )
        # Fail with the HTTP error itself rather than a KeyError on a
        # missing field of an error payload.
        response.raise_for_status()
        return response.json()

    def index_document(self, doc_id, title, content, metadata=None):
        """Split *content* into chunks, embed each one and index them.

        The title is prepended to every chunk before embedding. Re-indexing
        an existing *doc_id* replaces all of its chunks, and the index is only
        modified after every chunk has been embedded successfully.
        """
        chunks = self.chunk_content(content, chunk_size=500, overlap=50)

        new_entries = {}
        for i, chunk in enumerate(chunks):
            # Embed the title with the chunk so every passage carries the
            # document's topic.
            searchable_text = f"{title}\n\n{chunk}"

            new_entries[f"{doc_id}_chunk_{i}"] = {
                "parent_doc_id": doc_id,
                "title": title,
                "content": chunk,
                "chunk_index": i,
                "searchable_text": searchable_text,
                "embedding": self.get_embedding(searchable_text),
                "metadata": metadata or {}
            }

        # Remove chunks left over from an earlier version of this document;
        # otherwise a shorter re-index keeps outdated passages searchable.
        stale_ids = [
            chunk_id
            for chunk_id, chunk_data in self.documents.items()
            if chunk_data["parent_doc_id"] == doc_id
        ]
        for chunk_id in stale_ids:
            del self.documents[chunk_id]

        self.documents.update(new_entries)

    def chunk_content(self, content, chunk_size=500, overlap=50):
        """Split *content* into chunks of whitespace-separated words.

        Each chunk holds up to *chunk_size* words and starts
        ``chunk_size - overlap`` words after the previous one. Empty content
        yields ``[]``.

        Raises:
            ValueError: If *chunk_size* is not positive or *overlap* is not
                in ``range(chunk_size)``. Such values would otherwise produce
                no chunks at all or skip words between chunks.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        words = content.split()
        step = chunk_size - overlap
        chunks = []

        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size]))

            # This chunk already reaches the last word; a further chunk would
            # only repeat the overlapping tail.
            if start + chunk_size >= len(words):
                break

        return chunks

    def get_embedding(self, text):
        """Generate the embedding vector for *text*."""
        payload = {"model": self.EMBEDDING_MODEL, "input": text}
        return self._post(self.EMBEDDINGS_URL, payload)["data"][0]["embedding"]

    def search(self, query, top_k=10, min_similarity=0.3, max_chunks_per_doc=2):
        """Return up to *top_k* chunks most similar to *query*.

        Chunks scoring below *min_similarity* are dropped and at most
        *max_chunks_per_doc* chunks are kept per document. Each result is a
        dict with ``chunk_id``, ``doc_id``, ``title``, ``content``,
        ``similarity`` and ``metadata``. An empty index yields ``[]`` without
        calling the API.
        """
        if not self.documents:
            return []

        query_embedding = self.get_embedding(query)

        # One pairwise call for all chunks; row 0 holds the query's
        # similarity to each chunk.
        chunk_ids = list(self.documents)
        scores = cosine_similarity(
            [query_embedding],
            [self.documents[chunk_id]["embedding"] for chunk_id in chunk_ids]
        )[0]

        matches = []
        for chunk_id, score in zip(chunk_ids, scores):
            if score < min_similarity:
                continue
            chunk_data = self.documents[chunk_id]
            matches.append({
                "chunk_id": chunk_id,
                "doc_id": chunk_data["parent_doc_id"],
                "title": chunk_data["title"],
                "content": chunk_data["content"],
                "similarity": score,
                "metadata": chunk_data["metadata"]
            })

        matches.sort(key=lambda match: match["similarity"], reverse=True)

        # Group by document; each group stays sorted best-first.
        by_document = {}
        for match in matches:
            by_document.setdefault(match["doc_id"], []).append(match)

        # Keep only the best chunks of each document so one long document
        # cannot crowd out all others.
        final_results = []
        for doc_matches in by_document.values():
            final_results.extend(doc_matches[:max_chunks_per_doc])

        final_results.sort(key=lambda match: match["similarity"], reverse=True)
        return final_results[:top_k]

    def answer_question(self, question, context_chunks=3):
        """Answer *question* from the most relevant indexed chunks.

        Returns:
            A dict with ``answer`` (the model's reply) and ``sources`` (a
            list of ``title`` / ``relevance`` / ``doc_id`` dicts) when
            relevant chunks exist. When the search finds nothing, a plain
            message string is returned instead, so callers must handle both
            return shapes.
        """
        search_results = self.search(question, top_k=context_chunks)

        if not search_results:
            return "I couldn't find relevant information to answer your question."

        # Label every chunk with its document title so the model can tell
        # the sources apart.
        context = "\n\n".join(
            f"From '{result['title']}':\n{result['content']}"
            for result in search_results
        )

        reply = self._post(self.CHAT_URL, {
            "model": self.CHAT_MODEL,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant. Answer the user's question based on the provided context. If the context doesn't contain enough information, say so clearly."
                },
                {
                    "role": "user",
                    "content": f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
                }
            ]
        })

        answer = reply["choices"][0]["message"]["content"]

        return {
            "answer": answer,
            "sources": [
                {
                    "title": result["title"],
                    "relevance": result["similarity"],
                    "doc_id": result["doc_id"]
                }
                for result in search_results
            ]
        }

# Usage
kb_search = KnowledgeBaseSearch("YOUR_API_KEY")

# Index documents
documents = [
    {
        "id": "doc_ai_basics",
        "title": "Introduction to Artificial Intelligence",
        "content": """
        Artificial Intelligence (AI) is a branch of computer science that aims to create 
        machines capable of intelligent behavior. AI systems can perform tasks that typically 
        require human intelligence, such as visual perception, speech recognition, 
        decision-making, and language translation.
        
        There are several types of AI: narrow AI (designed for specific tasks), 
        general AI (human-level intelligence across domains), and superintelligence 
        (exceeding human intelligence). Currently, most AI systems are narrow AI.
        
        Machine learning is a subset of AI that enables systems to learn and improve 
        from experience without being explicitly programmed. Deep learning, a subset 
        of machine learning, uses neural networks with multiple layers to model 
        complex patterns in data.
        """,
        "metadata": {"category": "Technology", "difficulty": "Beginner"}
    }
]

for doc in documents:
    # metadata is optional for index_document, so tolerate documents without it
    kb_search.index_document(doc["id"], doc["title"], doc["content"], doc.get("metadata"))

# Search and answer questions
result = kb_search.answer_question("What is the difference between narrow AI and general AI?")

if isinstance(result, str):
    # answer_question returns a plain message string, not a dict, when the
    # search finds no relevant chunk; indexing it with ["answer"] would raise.
    print("Answer:", result)
else:
    print("Answer:", result["answer"])
    print("\nSources:")
    for source in result["sources"]:
        print(f"- {source['title']} (relevance: {source['relevance']:.3f})")

Performance Optimization

Caching and Indexing

import redis
import pickle
import hashlib

class CachedSemanticSearch:
    """Embedding client with optional Redis caching and batch processing."""

    # Endpoint and model used for every embedding request.
    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    EMBEDDING_MODEL = "text-embedding-3-large"
    # Seconds before a request is abandoned; without a timeout ``requests``
    # waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 60

    def __init__(self, api_key, redis_url=None):
        """Store the API key and connect to Redis when *redis_url* is given.

        Without *redis_url* caching is disabled and every embedding is
        requested from the API.
        """
        self.api_key = api_key
        self.redis_client = redis.from_url(redis_url) if redis_url else None
        self.cache_ttl = 86400  # 24 hours

    def _request_embeddings(self, inputs):
        """Return the embedding vectors for *inputs*, in response order.

        *inputs* is sent as the request's ``input`` field and may be a single
        string or a list of strings.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        response = requests.post(
            self.EMBEDDINGS_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": self.EMBEDDING_MODEL,
                "input": inputs
            },
            timeout=self.REQUEST_TIMEOUT
        )
        # Fail with the HTTP error itself rather than a KeyError on the
        # missing "data" field of an error payload.
        response.raise_for_status()

        return [item["embedding"] for item in response.json()["data"]]

    def get_embedding_cached(self, text):
        """Return the embedding for *text*, using the Redis cache if enabled.

        Cache entries are keyed by the MD5 digest of the text and expire
        after ``self.cache_ttl`` seconds.
        """
        if not self.redis_client:
            return self.get_embedding(text)

        # MD5 only shortens the text into a fixed-size key here; it is not
        # used for anything security-related.
        cache_key = f"embedding:{hashlib.md5(text.encode()).hexdigest()}"

        cached_embedding = self.redis_client.get(cache_key)
        if cached_embedding:
            # SECURITY: pickle.loads can execute arbitrary code from the data
            # it reads. This is only safe while the Redis instance is trusted
            # and not writable by others; a data-only format such as JSON
            # would remove the risk.
            return pickle.loads(cached_embedding)

        embedding = self.get_embedding(text)

        self.redis_client.setex(
            cache_key,
            self.cache_ttl,
            pickle.dumps(embedding)
        )

        return embedding

    def get_embedding(self, text):
        """Generate the embedding vector for *text* using the API (no cache)."""
        return self._request_embeddings(text)[0]

    def batch_process_documents(self, documents, batch_size=100):
        """Attach an ``embedding`` to every document, *batch_size* at a time.

        Returns a new list of dicts, each a copy of the input document with
        an added ``embedding`` key; the inputs are not modified. Progress is
        printed after every batch.

        Raises:
            ValueError: If *batch_size* is smaller than 1 (a negative value
                would otherwise process nothing and return ``[]``), or the
                API returns a different number of embeddings than documents
                sent.
            requests.HTTPError: If the API answers with an error status.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        processed_docs = []
        total_batches = (len(documents) - 1) // batch_size + 1

        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            embeddings = self._request_embeddings(
                [doc["content"] for doc in batch]
            )

            # zip() would silently drop the unmatched tail.
            if len(embeddings) != len(batch):
                raise ValueError(
                    f"Expected {len(batch)} embeddings, got {len(embeddings)}"
                )

            processed_docs.extend(
                {**doc, "embedding": embedding}
                for doc, embedding in zip(batch, embeddings)
            )

            print(f"Processed batch {start // batch_size + 1}/{total_batches}")

        return processed_docs

Best Practices

1. Embedding Model Selection

  • text-embedding-3-large: Best quality, higher cost
  • text-embedding-3-small: Good balance of quality and speed
  • Cohere embeddings: Strong for specific domains

2. Content Preparation

  • Clean text: Remove unnecessary formatting and noise
  • Chunk appropriately: Split long documents for better search
  • Include metadata: Store searchable metadata separately
  • Normalize content: Consistent formatting and structure

3. Search Optimization

  • Hybrid search: Combine semantic and keyword search
  • Result reranking: Use AI to rerank semantic results
  • Query expansion: Enhance queries with synonyms
  • Personalization: Adapt results to user preferences

4. Performance Tuning

  • Batch processing: Generate embeddings in batches
  • Caching: Cache embeddings and search results
  • Indexing: Use vector databases for large datasets
  • Monitoring: Track search quality and performance

Common Use Cases

Enterprise Search

Internal knowledge bases, document repositories, team wikis

E-commerce

Product discovery, recommendation systems, catalog search

Customer Support

FAQ systems, help desk automation, knowledge retrieval

Content Discovery

Media libraries, educational content, research databases

Legal Research

Case law search, document analysis, contract review

Medical Information

Clinical decision support, research literature, patient records

Financial Services

Research reports, compliance documents, market analysis

Academic Research

Paper discovery, literature review, citation analysis

Getting Started