Semantic Search Implementation
Build powerful search systems that understand meaning rather than just matching keywords. Semantic search uses AI embeddings to find relevant content based on intent and context.
Overview
Semantic search enables:
- Meaning-based search - Find content by intent, not just keywords
- Contextual understanding - Understand synonyms and related concepts
- Cross-language search - Search across different languages
- Fuzzy matching - Find relevant content even with imprecise queries
- Intelligent ranking - Rank results by semantic relevance
Vector Embeddings
Convert text to numerical vectors that capture meaning
Similarity Search
Find similar content using vector mathematics
Hybrid Search
Combine semantic and keyword search for best results
Real-time Indexing
Index new content automatically for immediate search
Quick Start Implementation
Copy
import requests
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json
class SemanticSearch:
    """In-memory semantic search index backed by the AnyAPI embeddings endpoint.

    Documents live in ``self.documents`` keyed by document id. Each entry is a
    dict with the keys ``"content"``, ``"metadata"`` and ``"embedding"``.
    """

    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    EMBEDDING_MODEL = "text-embedding-3-large"
    # requests waits forever by default; bound every call explicitly.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key):
        """Create an empty index.

        Args:
            api_key: API key sent as a Bearer token with every request.
        """
        self.api_key = api_key
        self.documents = {}
        # Kept for backward compatibility only: embeddings are stored inside
        # the ``self.documents`` entries and nothing in this class reads this.
        self.embeddings = {}

    def _embed(self, inputs):
        """Request embeddings for ``inputs`` and return them as a list of vectors.

        Args:
            inputs: A single string or a list of strings, passed to the API
                unchanged as the ``input`` field.

        Returns:
            One embedding (list of floats) per returned item.

        Raises:
            requests.HTTPError: If the API responds with an error status.
            ValueError: If a list was sent and the number of embeddings
                returned does not match the number of inputs.
        """
        response = requests.post(
            self.EMBEDDINGS_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={"model": self.EMBEDDING_MODEL, "input": inputs},
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on 4xx/5xx instead of a confusing KeyError on "data".
        response.raise_for_status()
        items = response.json()["data"]
        # Align vectors with inputs via the per-item "index" when present;
        # the sort is stable, so items without it keep the response order.
        items = sorted(items, key=lambda item: item.get("index", 0))
        vectors = [item["embedding"] for item in items]
        if isinstance(inputs, list) and len(vectors) != len(inputs):
            raise ValueError(
                f"Expected {len(inputs)} embeddings, got {len(vectors)}"
            )
        return vectors

    def add_document(self, doc_id, content, metadata=None):
        """Embed ``content`` and store it under ``doc_id`` (replacing any previous entry)."""
        embedding = self.get_embedding(content)
        self.documents[doc_id] = {
            "content": content,
            "metadata": metadata or {},
            "embedding": embedding,
        }
        print(f"Added document {doc_id} to search index")

    def get_embedding(self, text):
        """Return the embedding vector for a single piece of text."""
        return self._embed(text)[0]

    def search(self, query, top_k=5, min_similarity=0.3):
        """Return the stored documents most similar to ``query``.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results to return.
            min_similarity: Results with a cosine similarity below this value
                are dropped.

        Returns:
            A list of dicts with ``doc_id``, ``similarity``, ``content`` and
            ``metadata``, ordered by descending similarity. Empty when the
            index is empty or ``top_k`` is not positive (no API call is made
            in those cases).
        """
        if not self.documents or top_k <= 0:
            return []

        query_vector = np.asarray(self.get_embedding(query), dtype=float)
        doc_ids = list(self.documents)
        matrix = np.asarray(
            [self.documents[doc_id]["embedding"] for doc_id in doc_ids],
            dtype=float,
        )

        # Cosine similarity of the query against every document in one
        # matrix operation. Zero-length vectors score 0 instead of NaN.
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vector)
        scores = np.divide(
            matrix @ query_vector,
            norms,
            out=np.zeros(len(doc_ids)),
            where=norms > 0,
        )

        hits = [
            {
                "doc_id": doc_id,
                "similarity": float(score),
                "content": self.documents[doc_id]["content"],
                "metadata": self.documents[doc_id]["metadata"],
            }
            for doc_id, score in zip(doc_ids, scores)
            if score >= min_similarity
        ]
        hits.sort(key=lambda hit: hit["similarity"], reverse=True)
        return hits[:top_k]

    def batch_add_documents(self, documents):
        """Embed and store many documents with a single embeddings request.

        Args:
            documents: Iterable of dicts with ``"id"`` and ``"content"`` keys
                and an optional ``"metadata"`` dict.
        """
        documents = list(documents)
        # Skip the request for an empty batch; there is nothing to embed.
        if documents:
            vectors = self._embed([doc["content"] for doc in documents])
            for doc, vector in zip(documents, vectors):
                self.documents[doc["id"]] = {
                    "content": doc["content"],
                    "metadata": doc.get("metadata", {}),
                    "embedding": vector,
                }
        print(f"Added {len(documents)} documents to search index")
# --- Example: index a few documents and query them ---
search_engine = SemanticSearch("YOUR_API_KEY")

# Sample corpus: every entry carries an id, the text to embed and metadata.
documents = [
    {
        "id": "doc1",
        "content": "Machine learning is a subset of artificial intelligence that enables computers to learn from data",
        "metadata": {"category": "AI", "author": "John Doe"},
    },
    {
        "id": "doc2",
        "content": "Python is a popular programming language for data science and web development",
        "metadata": {"category": "Programming", "author": "Jane Smith"},
    },
    {
        "id": "doc3",
        "content": "Deep learning uses neural networks to solve complex problems like image recognition",
        "metadata": {"category": "AI", "author": "Bob Johnson"},
    },
    {
        "id": "doc4",
        "content": "JavaScript is the primary language for building interactive web applications",
        "metadata": {"category": "Programming", "author": "Alice Wilson"},
    },
]

# Embed and store the whole corpus.
search_engine.batch_add_documents(documents)

# First query: AI-related wording.
results = search_engine.search("AI and computers learning", top_k=3)
print("\nSearch results for 'AI and computers learning':")
for result in results:
    print(f"Score: {result['similarity']:.3f} - {result['content']}")

# Second query: programming-related wording.
results = search_engine.search("coding languages for websites", top_k=3)
print("\nSearch results for 'coding languages for websites':")
for result in results:
    print(f"Score: {result['similarity']:.3f} - {result['content']}")
Advanced Search Implementations
Vector Database Integration
Copy
import pinecone
import requests
class PineconeSemanticSearch:
    """Semantic search over a Pinecone index using AnyAPI embeddings."""

    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    EMBEDDING_MODEL = "text-embedding-3-large"
    # requests waits forever by default; bound every call explicitly.
    REQUEST_TIMEOUT = 30
    # Documents are embedded and upserted in slices of this size so a large
    # corpus never turns into one oversized request.
    UPSERT_BATCH_SIZE = 100
    # Only this many leading characters of the content are stored as metadata.
    METADATA_CONTENT_LIMIT = 1000

    def __init__(self, api_key, pinecone_key, index_name, environment="us-west1-gcp"):
        """Connect to an existing Pinecone index.

        Args:
            api_key: Embeddings API key, sent as a Bearer token.
            pinecone_key: Pinecone API key.
            index_name: Name of the Pinecone index to use.
            environment: Pinecone environment; defaults to the value that was
                previously hard-coded.
        """
        self.api_key = api_key
        # NOTE(review): pinecone.init()/pinecone.Index() looks like the legacy
        # module-level client API — confirm against the installed version.
        pinecone.init(api_key=pinecone_key, environment=environment)
        self.index = pinecone.Index(index_name)

    def _embed(self, inputs):
        """Request embeddings for ``inputs`` (a string or list of strings).

        Returns:
            A list of embedding vectors, one per returned item.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        response = requests.post(
            self.EMBEDDINGS_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={"model": self.EMBEDDING_MODEL, "input": inputs},
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on 4xx/5xx instead of a confusing KeyError on "data".
        response.raise_for_status()
        items = response.json()["data"]
        # Align vectors with inputs via the per-item "index" when present;
        # the sort is stable, so items without it keep the response order.
        items = sorted(items, key=lambda item: item.get("index", 0))
        return [item["embedding"] for item in items]

    def index_documents(self, documents, namespace="default"):
        """Embed documents and upsert them, with metadata, into Pinecone.

        Args:
            documents: Iterable of dicts with ``"id"`` and ``"content"`` and
                optional ``"title"``, ``"category"``, ``"author"``, ``"url"``.
            namespace: Pinecone namespace to write to.
        """
        documents = list(documents)
        limit = self.METADATA_CONTENT_LIMIT
        for start in range(0, len(documents), self.UPSERT_BATCH_SIZE):
            batch = documents[start:start + self.UPSERT_BATCH_SIZE]
            embeddings = self._embed([doc["content"] for doc in batch])
            vectors = [
                {
                    "id": doc["id"],
                    "values": embedding,
                    "metadata": {
                        "content": doc["content"][:limit],  # Truncate for metadata
                        "title": doc.get("title", ""),
                        "category": doc.get("category", ""),
                        "author": doc.get("author", ""),
                        "url": doc.get("url", ""),
                    },
                }
                for doc, embedding in zip(batch, embeddings)
            ]
            self.index.upsert(vectors=vectors, namespace=namespace)
        print(f"Indexed {len(documents)} documents in Pinecone")

    def search(self, query, top_k=10, filters=None, namespace="default"):
        """Run a vector query, optionally restricted by a metadata filter.

        Args:
            query: Free-text search query.
            top_k: Number of matches to request.
            filters: Metadata filter passed through to the index query.
            namespace: Pinecone namespace to search.

        Returns:
            The ``"matches"`` entry of the query response.
        """
        query_embedding = self._embed(query)[0]
        search_results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter=filters,
            namespace=namespace,
        )
        return search_results["matches"]

    def hybrid_search(self, query, keyword_filter=None, top_k=10,
                      filters=None, namespace="default"):
        """Semantic search followed by an optional substring keyword filter.

        Args:
            query: Free-text search query.
            keyword_filter: If given, only matches whose stored content
                contains this text (case-insensitive) are kept. The check
                only sees the truncated content stored as metadata.
            top_k: Maximum number of results to return.
            filters: Metadata filter forwarded to :meth:`search`.
            namespace: Pinecone namespace forwarded to :meth:`search`.
        """
        # Over-fetch so the keyword filter still has enough candidates.
        matches = self.search(
            query, top_k=top_k * 2, filters=filters, namespace=namespace
        )
        if keyword_filter:
            needle = keyword_filter.lower()
            matches = [
                match
                for match in matches
                if needle in match["metadata"].get("content", "").lower()
            ]
        return matches[:top_k]
# --- Example: index into Pinecone and run a filtered query ---
pinecone_search = PineconeSemanticSearch(
    "YOUR_API_KEY",
    "YOUR_PINECONE_KEY",
    "semantic-search-index",
)

# Documents to index; title/category/author are stored as metadata.
documents = [
    {
        "id": "article_1",
        "content": "The future of artificial intelligence in healthcare looks promising...",
        "title": "AI in Healthcare",
        "category": "Technology",
        "author": "Dr. Sarah Johnson",
    }
    # Add more documents...
]
pinecone_search.index_documents(documents)

# Restrict the query to the "Technology" category via a metadata filter.
results = pinecone_search.search(
    "machine learning medical applications",
    filters={"category": {"$eq": "Technology"}},
    top_k=5,
)
for result in results:
    print(f"Score: {result['score']:.3f} - {result['metadata']['title']}")
E-commerce Search Implementation
Copy
class EcommerceSemanticSearch:
    """In-memory semantic product search with attribute filters and recommendations.

    Products live in ``self.products`` keyed by product id. Each entry holds
    the raw product ``"data"``, the derived ``"searchable_text"`` and its
    ``"embedding"``.
    """

    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    EMBEDDING_MODEL = "text-embedding-3-large"
    # requests waits forever by default; bound every call explicitly.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key):
        """Create an empty product index; ``api_key`` is sent as a Bearer token."""
        self.api_key = api_key
        self.products = {}

    def index_product(self, product_id, product_data):
        """Embed a product's searchable text and store it under ``product_id``."""
        searchable_text = self.create_product_text(product_data)
        embedding = self.get_embedding(searchable_text)
        self.products[product_id] = {
            "data": product_data,
            "searchable_text": searchable_text,
            "embedding": embedding,
        }

    def create_product_text(self, product_data):
        """Join the descriptive product attributes into one searchable string.

        Empty or missing attributes are skipped. Values are converted with
        ``str`` so non-string attributes (e.g. a numeric size) do not break
        the join.
        """
        text_parts = [
            product_data.get("title", ""),
            product_data.get("description", ""),
            product_data.get("brand", ""),
            product_data.get("category", ""),
            " ".join(str(tag) for tag in product_data.get("tags", [])),
            " ".join(str(feature) for feature in product_data.get("features", [])),
            product_data.get("color", ""),
            product_data.get("material", ""),
            product_data.get("size", ""),
        ]
        return " ".join(str(part) for part in text_parts if part)

    def get_embedding(self, text):
        """Return the embedding vector for ``text``.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        response = requests.post(
            self.EMBEDDINGS_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={"model": self.EMBEDDING_MODEL, "input": text},
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on 4xx/5xx instead of a confusing KeyError on "data".
        response.raise_for_status()
        return response.json()["data"][0]["embedding"]

    @staticmethod
    def _rank(reference_embedding, candidates, top_k):
        """Score ``(product_id, product_info)`` pairs against a reference vector.

        Returns the ``top_k`` best entries, ordered by descending similarity.
        """
        if not candidates:
            return []
        # One batched call instead of one cosine_similarity call per product.
        scores = cosine_similarity(
            [reference_embedding],
            [info["embedding"] for _, info in candidates],
        )[0]
        ranked = [
            {
                "product_id": product_id,
                "similarity": score,
                "product": info["data"],
            }
            for (product_id, info), score in zip(candidates, scores)
        ]
        ranked.sort(key=lambda item: item["similarity"], reverse=True)
        return ranked[:top_k]

    def search_products(self, query, filters=None, top_k=20):
        """Return the products most similar to ``query``.

        Args:
            query: Free-text search query.
            filters: Optional dict understood by :meth:`matches_filters`.
            top_k: Maximum number of results.

        Returns:
            A list of dicts with ``product_id``, ``similarity`` and
            ``product``, ordered by descending similarity.
        """
        # Filter first so no embedding request is made when nothing can match.
        candidates = [
            (product_id, info)
            for product_id, info in self.products.items()
            if not filters or self.matches_filters(info["data"], filters)
        ]
        if not candidates:
            return []
        return self._rank(self.get_embedding(query), candidates, top_k)

    def matches_filters(self, product_data, filters):
        """Return True when ``product_data`` satisfies every recognised filter.

        Recognised keys: ``price_range`` (inclusive ``[low, high]``),
        ``categories`` and ``brands`` (containers of allowed values) and
        ``in_stock`` (bool). Unrecognised keys are ignored.
        """
        for key, value in filters.items():
            if key == "price_range":
                price = product_data.get("price", 0)
                # An explicit None price cannot be compared; treat as no match.
                if price is None or not (value[0] <= price <= value[1]):
                    return False
            elif key == "categories":
                if product_data.get("category") not in value:
                    return False
            elif key == "brands":
                if product_data.get("brand") not in value:
                    return False
            elif key == "in_stock":
                if product_data.get("in_stock", False) != value:
                    return False
        return True

    def get_recommendations(self, product_id, top_k=5):
        """Return the indexed products most similar to ``product_id``.

        Returns an empty list when ``product_id`` is not indexed. The product
        itself is never part of the result.
        """
        if product_id not in self.products:
            return []
        others = [
            (pid, info) for pid, info in self.products.items() if pid != product_id
        ]
        return self._rank(self.products[product_id]["embedding"], others, top_k)
# --- Example: index two products, then search, filter and recommend ---
ecommerce_search = EcommerceSemanticSearch("YOUR_API_KEY")

# Sample catalogue: each entry pairs a product id with its attribute dict.
products = [
    {
        "id": "prod_1",
        "data": {
            "title": "Wireless Noise-Cancelling Headphones",
            "description": "Premium over-ear headphones with active noise cancellation and 30-hour battery life",
            "brand": "AudioTech",
            "category": "Electronics",
            "price": 299.99,
            "tags": ["wireless", "bluetooth", "noise-cancelling", "premium"],
            "features": ["Active noise cancellation", "30-hour battery", "Quick charge"],
            "color": "Black",
            "in_stock": True,
        },
    },
    {
        "id": "prod_2",
        "data": {
            "title": "Bluetooth Sport Earbuds",
            "description": "Waterproof wireless earbuds perfect for workouts and running",
            "brand": "FitSound",
            "category": "Electronics",
            "price": 89.99,
            "tags": ["wireless", "sport", "waterproof", "earbuds"],
            "features": ["IPX7 waterproof", "Secure fit", "8-hour battery"],
            "color": "Blue",
            "in_stock": True,
        },
    },
]

# Index every product in the catalogue.
for product in products:
    ecommerce_search.index_product(product["id"], product["data"])

# Plain semantic search, no filters.
results = ecommerce_search.search_products(
    "noise cancelling headphones for music",
    top_k=5,
)
print("Search results:")
for result in results:
    print(f"Score: {result['similarity']:.3f} - {result['product']['title']}")

# Same kind of search, narrowed by price, category and stock status.
filtered_results = ecommerce_search.search_products(
    "wireless audio",
    filters={
        "price_range": [50, 200],
        "categories": ["Electronics"],
        "in_stock": True,
    },
    top_k=5,
)
print("\nFiltered search results:")
for result in filtered_results:
    print(f"Score: {result['similarity']:.3f} - {result['product']['title']} - ${result['product']['price']}")

# Products similar to prod_1.
recommendations = ecommerce_search.get_recommendations("prod_1", top_k=3)
print("\nRecommendations for product prod_1:")
for rec in recommendations:
    print(f"Score: {rec['similarity']:.3f} - {rec['product']['title']}")
Document and Knowledge Base Search
Copy
class KnowledgeBaseSearch:
    """Chunk-level semantic search over documents, with question answering.

    ``self.documents`` maps a chunk id (``"<doc_id>_chunk_<n>"``) to a dict
    holding the chunk text, its parent document id, title, embedding and
    metadata.
    """

    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    CHAT_URL = "https://api.anyapi.ai/v1/chat/completions"
    EMBEDDING_MODEL = "text-embedding-3-large"
    CHAT_MODEL = "gpt-4o"
    # requests waits forever by default; bound every call explicitly.
    EMBEDDING_TIMEOUT = 30
    CHAT_TIMEOUT = 60
    # At most this many chunks of one document appear in a result list.
    MAX_CHUNKS_PER_DOC = 2
    NO_RESULT_MESSAGE = "I couldn't find relevant information to answer your question."

    def __init__(self, api_key):
        """Create an empty knowledge base; ``api_key`` is sent as a Bearer token."""
        self.api_key = api_key
        self.documents = {}

    def index_document(self, doc_id, title, content, metadata=None):
        """Split ``content`` into overlapping chunks and index each one.

        The title is prepended to every chunk before embedding.
        """
        chunks = self.chunk_content(content, chunk_size=500, overlap=50)
        for i, chunk in enumerate(chunks):
            chunk_id = f"{doc_id}_chunk_{i}"
            searchable_text = f"{title}\n\n{chunk}"
            embedding = self.get_embedding(searchable_text)
            self.documents[chunk_id] = {
                "parent_doc_id": doc_id,
                "title": title,
                "content": chunk,
                "chunk_index": i,
                "searchable_text": searchable_text,
                "embedding": embedding,
                "metadata": metadata or {},
            }

    def chunk_content(self, content, chunk_size=500, overlap=50):
        """Split ``content`` into chunks of whitespace-separated words.

        Args:
            content: Text to split.
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by consecutive chunks.

        Returns:
            A list of chunk strings; empty when ``content`` has no words.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not in ``[0, chunk_size)`` (the window would not advance).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        words = content.split()
        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size]))
            # Stop once this window reached the last word; a further window
            # would only repeat the overlapping tail.
            if start + chunk_size >= len(words):
                break
        return chunks

    def get_embedding(self, text):
        """Return the embedding vector for ``text``.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        response = requests.post(
            self.EMBEDDINGS_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={"model": self.EMBEDDING_MODEL, "input": text},
            timeout=self.EMBEDDING_TIMEOUT,
        )
        # Fail loudly on 4xx/5xx instead of a confusing KeyError on "data".
        response.raise_for_status()
        return response.json()["data"][0]["embedding"]

    def search(self, query, top_k=10, min_similarity=0.3):
        """Return the best-matching chunks across all indexed documents.

        Args:
            query: Free-text search query.
            top_k: Maximum number of chunks to return.
            min_similarity: Chunks scoring below this are dropped.

        Returns:
            A list of dicts (``chunk_id``, ``doc_id``, ``title``, ``content``,
            ``similarity``, ``metadata``) ordered by descending similarity,
            with at most two chunks per parent document. Empty when nothing
            is indexed or ``top_k`` is not positive (no API call is made).
        """
        if not self.documents or top_k <= 0:
            return []

        query_embedding = self.get_embedding(query)
        chunk_ids = list(self.documents)
        # One batched call instead of one cosine_similarity call per chunk.
        scores = cosine_similarity(
            [query_embedding],
            [self.documents[chunk_id]["embedding"] for chunk_id in chunk_ids],
        )[0]

        scored = []
        for chunk_id, score in zip(chunk_ids, scores):
            if score < min_similarity:
                continue
            chunk_data = self.documents[chunk_id]
            scored.append({
                "chunk_id": chunk_id,
                "doc_id": chunk_data["parent_doc_id"],
                "title": chunk_data["title"],
                "content": chunk_data["content"],
                "similarity": score,
                "metadata": chunk_data["metadata"],
            })
        scored.sort(key=lambda hit: hit["similarity"], reverse=True)

        # Walk the ranked list once, capping how many chunks each document
        # may contribute, and stop as soon as top_k results are collected.
        taken_per_doc = {}
        results = []
        for hit in scored:
            taken = taken_per_doc.get(hit["doc_id"], 0)
            if taken >= self.MAX_CHUNKS_PER_DOC:
                continue
            taken_per_doc[hit["doc_id"]] = taken + 1
            results.append(hit)
            if len(results) == top_k:
                break
        return results

    def answer_question(self, question, context_chunks=3):
        """Answer ``question`` from the most relevant indexed chunks.

        Args:
            question: The user's question.
            context_chunks: Number of chunks to pass to the chat model.

        Returns:
            A dict with ``"answer"`` (str) and ``"sources"`` (list of dicts
            with ``title``, ``relevance`` and ``doc_id``). When no relevant
            chunk is found the answer is a fixed fallback message and
            ``sources`` is empty, so callers can always index the result.

        Raises:
            requests.HTTPError: If an API call responds with an error status.
        """
        search_results = self.search(question, top_k=context_chunks)
        if not search_results:
            return {"answer": self.NO_RESULT_MESSAGE, "sources": []}

        context = "\n\n".join(
            f"From '{result['title']}':\n{result['content']}"
            for result in search_results
        )
        response = requests.post(
            self.CHAT_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.CHAT_MODEL,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant. Answer the user's question based on the provided context. If the context doesn't contain enough information, say so clearly.",
                    },
                    {
                        "role": "user",
                        "content": f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:",
                    },
                ],
            },
            timeout=self.CHAT_TIMEOUT,
        )
        response.raise_for_status()
        answer = response.json()["choices"][0]["message"]["content"]
        return {
            "answer": answer,
            "sources": [
                {
                    "title": result["title"],
                    "relevance": result["similarity"],
                    "doc_id": result["doc_id"],
                }
                for result in search_results
            ],
        }
# --- Example: index one article and ask a question about it ---
kb_search = KnowledgeBaseSearch("YOUR_API_KEY")

# Documents to index: id, title, full text and metadata.
documents = [
    {
        "id": "doc_ai_basics",
        "title": "Introduction to Artificial Intelligence",
        "content": """
Artificial Intelligence (AI) is a branch of computer science that aims to create
machines capable of intelligent behavior. AI systems can perform tasks that typically
require human intelligence, such as visual perception, speech recognition,
decision-making, and language translation.
There are several types of AI: narrow AI (designed for specific tasks),
general AI (human-level intelligence across domains), and superintelligence
(exceeding human intelligence). Currently, most AI systems are narrow AI.
Machine learning is a subset of AI that enables systems to learn and improve
from experience without being explicitly programmed. Deep learning, a subset
of machine learning, uses neural networks with multiple layers to model
complex patterns in data.
""",
        "metadata": {"category": "Technology", "difficulty": "Beginner"},
    }
]
for doc in documents:
    kb_search.index_document(doc["id"], doc["title"], doc["content"], doc["metadata"])

# Ask a question; the result carries the answer and the chunks it drew on.
result = kb_search.answer_question("What is the difference between narrow AI and general AI?")
print("Answer:", result["answer"])
print("\nSources:")
for source in result["sources"]:
    print(f"- {source['title']} (relevance: {source['relevance']:.3f})")
Performance Optimization
Caching and Indexing
Copy
import redis
import pickle
import hashlib
class CachedSemanticSearch:
    """Embedding helper with optional Redis caching and batched processing."""

    EMBEDDINGS_URL = "https://api.anyapi.ai/v1/embeddings"
    EMBEDDING_MODEL = "text-embedding-3-large"
    # requests waits forever by default; bound every call explicitly.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key, redis_url=None):
        """Set up the client.

        Args:
            api_key: Embeddings API key, sent as a Bearer token.
            redis_url: Redis connection URL; caching is disabled when omitted.
        """
        self.api_key = api_key
        self.redis_client = redis.from_url(redis_url) if redis_url else None
        self.cache_ttl = 86400  # 24 hours

    def _embed(self, inputs):
        """Request embeddings for ``inputs`` (a string or list of strings).

        Returns:
            A list of embedding vectors, one per returned item.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        response = requests.post(
            self.EMBEDDINGS_URL,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={"model": self.EMBEDDING_MODEL, "input": inputs},
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on 4xx/5xx instead of a confusing KeyError on "data".
        response.raise_for_status()
        items = response.json()["data"]
        # Align vectors with inputs via the per-item "index" when present;
        # the sort is stable, so items without it keep the response order.
        items = sorted(items, key=lambda item: item.get("index", 0))
        return [item["embedding"] for item in items]

    def get_embedding_cached(self, text):
        """Return the embedding for ``text``, using the Redis cache if configured.

        On a cache miss the embedding is generated and stored for
        ``self.cache_ttl`` seconds.
        """
        if self.redis_client is None:
            return self.get_embedding(text)

        # The MD5 digest only shortens the key; it is not a security measure.
        cache_key = f"embedding:{hashlib.md5(text.encode('utf-8')).hexdigest()}"

        cached_embedding = self.redis_client.get(cache_key)
        if cached_embedding is not None:
            # SECURITY NOTE: pickle.loads executes arbitrary code if the
            # cache contents can be written by an untrusted party. Only use
            # this with a Redis instance you fully control. The format is
            # kept as pickle so existing cache entries stay readable.
            return pickle.loads(cached_embedding)

        embedding = self.get_embedding(text)
        self.redis_client.setex(cache_key, self.cache_ttl, pickle.dumps(embedding))
        return embedding

    def get_embedding(self, text):
        """Generate the embedding for ``text`` via the API (no caching)."""
        return self._embed(text)[0]

    def batch_process_documents(self, documents, batch_size=100):
        """Attach an ``"embedding"`` to every document, one request per batch.

        Args:
            documents: Sequence of dicts with a ``"content"`` key.
            batch_size: Number of documents embedded per request.

        Returns:
            New dicts (copies of the inputs plus ``"embedding"``), in input
            order. This method does not use the Redis cache.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be positive")

        processed_docs = []
        total_batches = (len(documents) - 1) // batch_size + 1
        for batch_number, start in enumerate(range(0, len(documents), batch_size), 1):
            batch = documents[start:start + batch_size]
            embeddings = self._embed([doc["content"] for doc in batch])
            processed_docs.extend(
                {**doc, "embedding": embedding}
                for doc, embedding in zip(batch, embeddings)
            )
            print(f"Processed batch {batch_number}/{total_batches}")
        return processed_docs
Best Practices
1. Embedding Model Selection
- text-embedding-3-large: Best quality, higher cost
- text-embedding-3-small: Good balance of quality and speed
- Cohere embeddings: Strong for specific domains
2. Content Preparation
- Clean text: Remove unnecessary formatting and noise
- Chunk appropriately: Split long documents for better search
- Include metadata: Store searchable metadata separately
- Normalize content: Consistent formatting and structure
3. Search Optimization
- Hybrid search: Combine semantic and keyword search
- Result reranking: Use AI to rerank semantic results
- Query expansion: Enhance queries with synonyms
- Personalization: Adapt results to user preferences
4. Performance Tuning
- Batch processing: Generate embeddings in batches
- Caching: Cache embeddings and search results
- Indexing: Use vector databases for large datasets
- Monitoring: Track search quality and performance
Common Use Cases
Enterprise Search
Internal knowledge bases, document repositories, team wikis
E-commerce
Product discovery, recommendation systems, catalog search
Customer Support
FAQ systems, help desk automation, knowledge retrieval
Content Discovery
Media libraries, educational content, research databases
Legal Research
Case law search, document analysis, contract review
Medical Information
Clinical decision support, research literature, patient records
Financial Services
Research reports, compliance documents, market analysis
Academic Research
Paper discovery, literature review, citation analysis