Build intelligent search systems that match on meaning and context rather than exact keywords, using AI embeddings
import requests
from sklearn.metrics.pairwise import cosine_similarity
class SemanticSearch:
def __init__(self, api_key):
self.api_key = api_key
        self.documents = {}
def add_document(self, doc_id, content, metadata=None):
"""Add a document to the search index"""
# Generate embedding for the document
embedding = self.get_embedding(content)
# Store document and embedding
self.documents[doc_id] = {
"content": content,
"metadata": metadata or {},
"embedding": embedding
}
print(f"Added document {doc_id} to search index")
def get_embedding(self, text):
"""Generate embedding for text using AnyAPI"""
response = requests.post(
"https://api.anyapi.ai/v1/embeddings",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
},
json={
"model": "text-embedding-3-large",
"input": text
}
        )
        response.raise_for_status()  # surface HTTP errors instead of a KeyError below
        return response.json()["data"][0]["embedding"]
def search(self, query, top_k=5, min_similarity=0.3):
"""Search for similar documents"""
# Get query embedding
query_embedding = self.get_embedding(query)
# Calculate similarities
similarities = []
for doc_id, doc_data in self.documents.items():
similarity = cosine_similarity(
[query_embedding],
[doc_data["embedding"]]
)[0][0]
if similarity >= min_similarity:
similarities.append({
"doc_id": doc_id,
"similarity": similarity,
"content": doc_data["content"],
"metadata": doc_data["metadata"]
})
# Sort by similarity
similarities.sort(key=lambda x: x["similarity"], reverse=True)
return similarities[:top_k]
def batch_add_documents(self, documents):
"""Add multiple documents efficiently"""
# Extract texts for batch embedding
texts = [doc["content"] for doc in documents]
# Get embeddings in batch
response = requests.post(
"https://api.anyapi.ai/v1/embeddings",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
},
json={
"model": "text-embedding-3-large",
"input": texts
}
        )
        response.raise_for_status()
        embeddings = response.json()["data"]  # one embedding per input, in input order
# Store documents with embeddings
for doc, embedding in zip(documents, embeddings):
self.documents[doc["id"]] = {
"content": doc["content"],
"metadata": doc.get("metadata", {}),
"embedding": embedding["embedding"]
}
print(f"Added {len(documents)} documents to search index")
# Usage example
search_engine = SemanticSearch("YOUR_API_KEY")
# Add sample documents
documents = [
{
"id": "doc1",
"content": "Machine learning is a subset of artificial intelligence that enables computers to learn from data",
"metadata": {"category": "AI", "author": "John Doe"}
},
{
"id": "doc2",
"content": "Python is a popular programming language for data science and web development",
"metadata": {"category": "Programming", "author": "Jane Smith"}
},
{
"id": "doc3",
"content": "Deep learning uses neural networks to solve complex problems like image recognition",
"metadata": {"category": "AI", "author": "Bob Johnson"}
},
{
"id": "doc4",
"content": "JavaScript is the primary language for building interactive web applications",
"metadata": {"category": "Programming", "author": "Alice Wilson"}
}
]
# Add documents to search index
search_engine.batch_add_documents(documents)
# Search examples
results = search_engine.search("AI and computers learning", top_k=3)
print("\nSearch results for 'AI and computers learning':")
for result in results:
print(f"Score: {result['similarity']:.3f} - {result['content']}")
results = search_engine.search("coding languages for websites", top_k=3)
print("\nSearch results for 'coding languages for websites':")
for result in results:
print(f"Score: {result['similarity']:.3f} - {result['content']}")
import pinecone
import requests
class PineconeSemanticSearch:
def __init__(self, api_key, pinecone_key, index_name):
self.api_key = api_key
# Initialize Pinecone
pinecone.init(api_key=pinecone_key, environment="us-west1-gcp")
self.index = pinecone.Index(index_name)
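        # Note: this uses the legacy pinecone-client (v2) interface. Newer
        # releases (v3+) replace init() with a client object, roughly:
        #   from pinecone import Pinecone
        #   self.index = Pinecone(api_key=pinecone_key).Index(index_name)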
def index_documents(self, documents, namespace="default"):
"""Index documents in Pinecone with metadata"""
# Get embeddings for all documents
texts = [doc["content"] for doc in documents]
response = requests.post(
"https://api.anyapi.ai/v1/embeddings",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
},
json={
"model": "text-embedding-3-large",
"input": texts
}
)
embeddings = response.json()["data"]
# Prepare vectors for Pinecone
vectors = []
for doc, embedding in zip(documents, embeddings):
vectors.append({
"id": doc["id"],
"values": embedding["embedding"],
"metadata": {
"content": doc["content"][:1000], # Truncate for metadata
"title": doc.get("title", ""),
"category": doc.get("category", ""),
"author": doc.get("author", ""),
"url": doc.get("url", "")
}
})
# Upsert to Pinecone
self.index.upsert(vectors=vectors, namespace=namespace)
print(f"Indexed {len(vectors)} documents in Pinecone")
def search(self, query, top_k=10, filters=None, namespace="default"):
"""Search with optional metadata filtering"""
# Get query embedding
response = requests.post(
"https://api.anyapi.ai/v1/embeddings",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
},
json={
"model": "text-embedding-3-large",
"input": query
}
)
query_embedding = response.json()["data"][0]["embedding"]
# Search in Pinecone
search_results = self.index.query(
vector=query_embedding,
top_k=top_k,
include_metadata=True,
filter=filters,
namespace=namespace
)
return search_results["matches"]
def hybrid_search(self, query, keyword_filter=None, top_k=10):
"""Combine semantic search with keyword filtering"""
        # Over-fetch semantic results so keyword filtering still leaves enough
        semantic_results = self.search(query, top_k=top_k * 2)
# Apply keyword filtering if specified
if keyword_filter:
filtered_results = []
for result in semantic_results:
content = result["metadata"].get("content", "").lower()
if keyword_filter.lower() in content:
filtered_results.append(result)
semantic_results = filtered_results
return semantic_results[:top_k]
# Usage
pinecone_search = PineconeSemanticSearch(
"YOUR_API_KEY",
"YOUR_PINECONE_KEY",
"semantic-search-index"
)
# Index documents
documents = [
{
"id": "article_1",
"content": "The future of artificial intelligence in healthcare looks promising...",
"title": "AI in Healthcare",
"category": "Technology",
"author": "Dr. Sarah Johnson"
}
# Add more documents...
]
pinecone_search.index_documents(documents)
# Search with filters
results = pinecone_search.search(
"machine learning medical applications",
filters={"category": {"$eq": "Technology"}},
top_k=5
)
for result in results:
print(f"Score: {result['score']:.3f} - {result['metadata']['title']}")
import requests
from sklearn.metrics.pairwise import cosine_similarity
class EcommerceSemanticSearch:
def __init__(self, api_key):
self.api_key = api_key
self.products = {}
def index_product(self, product_id, product_data):
"""Index an e-commerce product for semantic search"""
# Create searchable text from product data
searchable_text = self.create_product_text(product_data)
# Get embedding
embedding = self.get_embedding(searchable_text)
# Store product with embedding
self.products[product_id] = {
"data": product_data,
"searchable_text": searchable_text,
"embedding": embedding
}
def create_product_text(self, product_data):
"""Create comprehensive searchable text from product data"""
text_parts = [
product_data.get("title", ""),
product_data.get("description", ""),
product_data.get("brand", ""),
product_data.get("category", ""),
" ".join(product_data.get("tags", [])),
" ".join(product_data.get("features", [])),
product_data.get("color", ""),
product_data.get("material", ""),
product_data.get("size", "")
]
return " ".join(filter(None, text_parts))
def get_embedding(self, text):
"""Generate embedding for text"""
response = requests.post(
"https://api.anyapi.ai/v1/embeddings",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
},
json={
"model": "text-embedding-3-large",
"input": text
}
)
return response.json()["data"][0]["embedding"]
def search_products(self, query, filters=None, top_k=20):
"""Search products with optional filters"""
query_embedding = self.get_embedding(query)
# Calculate similarities
similarities = []
for product_id, product_info in self.products.items():
# Check filters first
if filters and not self.matches_filters(product_info["data"], filters):
continue
similarity = cosine_similarity(
[query_embedding],
[product_info["embedding"]]
)[0][0]
similarities.append({
"product_id": product_id,
"similarity": similarity,
"product": product_info["data"]
})
# Sort by similarity
similarities.sort(key=lambda x: x["similarity"], reverse=True)
return similarities[:top_k]
def matches_filters(self, product_data, filters):
"""Check if product matches specified filters"""
for key, value in filters.items():
if key == "price_range":
price = product_data.get("price", 0)
if not (value[0] <= price <= value[1]):
return False
elif key == "categories":
if product_data.get("category") not in value:
return False
elif key == "brands":
if product_data.get("brand") not in value:
return False
elif key == "in_stock":
if product_data.get("in_stock", False) != value:
return False
return True
def get_recommendations(self, product_id, top_k=5):
"""Get similar products (recommendations)"""
if product_id not in self.products:
return []
target_embedding = self.products[product_id]["embedding"]
similarities = []
for pid, product_info in self.products.items():
if pid == product_id: # Skip self
continue
similarity = cosine_similarity(
[target_embedding],
[product_info["embedding"]]
)[0][0]
similarities.append({
"product_id": pid,
"similarity": similarity,
"product": product_info["data"]
})
similarities.sort(key=lambda x: x["similarity"], reverse=True)
return similarities[:top_k]
# Usage example
ecommerce_search = EcommerceSemanticSearch("YOUR_API_KEY")
# Index sample products
products = [
{
"id": "prod_1",
"data": {
"title": "Wireless Noise-Cancelling Headphones",
"description": "Premium over-ear headphones with active noise cancellation and 30-hour battery life",
"brand": "AudioTech",
"category": "Electronics",
"price": 299.99,
"tags": ["wireless", "bluetooth", "noise-cancelling", "premium"],
"features": ["Active noise cancellation", "30-hour battery", "Quick charge"],
"color": "Black",
"in_stock": True
}
},
{
"id": "prod_2",
"data": {
"title": "Bluetooth Sport Earbuds",
"description": "Waterproof wireless earbuds perfect for workouts and running",
"brand": "FitSound",
"category": "Electronics",
"price": 89.99,
"tags": ["wireless", "sport", "waterproof", "earbuds"],
"features": ["IPX7 waterproof", "Secure fit", "8-hour battery"],
"color": "Blue",
"in_stock": True
}
}
]
# Index products
for product in products:
ecommerce_search.index_product(product["id"], product["data"])
# Search examples
results = ecommerce_search.search_products(
"noise cancelling headphones for music",
top_k=5
)
print("Search results:")
for result in results:
print(f"Score: {result['similarity']:.3f} - {result['product']['title']}")
# Search with filters
filtered_results = ecommerce_search.search_products(
"wireless audio",
filters={
"price_range": [50, 200],
"categories": ["Electronics"],
"in_stock": True
},
top_k=5
)
print("\nFiltered search results:")
for result in filtered_results:
print(f"Score: {result['similarity']:.3f} - {result['product']['title']} - ${result['product']['price']}")
# Get recommendations
recommendations = ecommerce_search.get_recommendations("prod_1", top_k=3)
print("\nRecommendations for product prod_1:")
for rec in recommendations:
print(f"Score: {rec['similarity']:.3f} - {rec['product']['title']}")
import requests
from sklearn.metrics.pairwise import cosine_similarity
class KnowledgeBaseSearch:
def __init__(self, api_key):
self.api_key = api_key
self.documents = {}
def index_document(self, doc_id, title, content, metadata=None):
"""Index a document with hierarchical content structure"""
# Split content into chunks for better search
chunks = self.chunk_content(content, chunk_size=500, overlap=50)
# Process each chunk
for i, chunk in enumerate(chunks):
chunk_id = f"{doc_id}_chunk_{i}"
# Create comprehensive text for embedding
searchable_text = f"{title}\n\n{chunk}"
# Get embedding
embedding = self.get_embedding(searchable_text)
# Store chunk with metadata
self.documents[chunk_id] = {
"parent_doc_id": doc_id,
"title": title,
"content": chunk,
"chunk_index": i,
"searchable_text": searchable_text,
"embedding": embedding,
"metadata": metadata or {}
}
def chunk_content(self, content, chunk_size=500, overlap=50):
"""Split content into overlapping chunks"""
words = content.split()
chunks = []
for i in range(0, len(words), chunk_size - overlap):
chunk_words = words[i:i + chunk_size]
chunk_text = " ".join(chunk_words)
chunks.append(chunk_text)
# Break if we've covered all words
if i + chunk_size >= len(words):
break
return chunks
def get_embedding(self, text):
"""Generate embedding for text"""
response = requests.post(
"https://api.anyapi.ai/v1/embeddings",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
},
json={
"model": "text-embedding-3-large",
"input": text
}
)
return response.json()["data"][0]["embedding"]
def search(self, query, top_k=10, min_similarity=0.3):
"""Search across all document chunks"""
query_embedding = self.get_embedding(query)
similarities = []
for chunk_id, chunk_data in self.documents.items():
similarity = cosine_similarity(
[query_embedding],
[chunk_data["embedding"]]
)[0][0]
if similarity >= min_similarity:
similarities.append({
"chunk_id": chunk_id,
"doc_id": chunk_data["parent_doc_id"],
"title": chunk_data["title"],
"content": chunk_data["content"],
"similarity": similarity,
"metadata": chunk_data["metadata"]
})
# Sort by similarity
similarities.sort(key=lambda x: x["similarity"], reverse=True)
# Group by document and take best chunks
doc_results = {}
for result in similarities:
doc_id = result["doc_id"]
if doc_id not in doc_results:
doc_results[doc_id] = []
doc_results[doc_id].append(result)
# Return top results with max 2 chunks per document
final_results = []
for doc_id, chunks in doc_results.items():
final_results.extend(chunks[:2]) # Max 2 chunks per doc
return sorted(final_results, key=lambda x: x["similarity"], reverse=True)[:top_k]
def answer_question(self, question, context_chunks=3):
"""Answer a question using relevant document chunks"""
# Search for relevant chunks
search_results = self.search(question, top_k=context_chunks)
        if not search_results:
            return {
                "answer": "I couldn't find relevant information to answer your question.",
                "sources": []
            }
# Create context from top chunks
context = "\n\n".join([
f"From '{result['title']}':\n{result['content']}"
for result in search_results
])
# Generate answer using AI
response = requests.post(
"https://api.anyapi.ai/v1/chat/completions",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
},
json={
"model": "gpt-4o",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant. Answer the user's question based on the provided context. If the context doesn't contain enough information, say so clearly."
},
{
"role": "user",
"content": f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
}
]
}
)
answer = response.json()["choices"][0]["message"]["content"]
return {
"answer": answer,
"sources": [
{
"title": result["title"],
"relevance": result["similarity"],
"doc_id": result["doc_id"]
}
for result in search_results
]
}
# Usage
kb_search = KnowledgeBaseSearch("YOUR_API_KEY")
# Index documents
documents = [
{
"id": "doc_ai_basics",
"title": "Introduction to Artificial Intelligence",
"content": """
Artificial Intelligence (AI) is a branch of computer science that aims to create
machines capable of intelligent behavior. AI systems can perform tasks that typically
require human intelligence, such as visual perception, speech recognition,
decision-making, and language translation.
There are several types of AI: narrow AI (designed for specific tasks),
general AI (human-level intelligence across domains), and superintelligence
(exceeding human intelligence). Currently, most AI systems are narrow AI.
Machine learning is a subset of AI that enables systems to learn and improve
from experience without being explicitly programmed. Deep learning, a subset
of machine learning, uses neural networks with multiple layers to model
complex patterns in data.
""",
"metadata": {"category": "Technology", "difficulty": "Beginner"}
}
]
for doc in documents:
kb_search.index_document(doc["id"], doc["title"], doc["content"], doc["metadata"])
# Search and answer questions
result = kb_search.answer_question("What is the difference between narrow AI and general AI?")
print("Answer:", result["answer"])
print("\nSources:")
for source in result["sources"]:
print(f"- {source['title']} (relevance: {source['relevance']:.3f})")
import requests
import redis
import pickle
import hashlib
class CachedSemanticSearch:
def __init__(self, api_key, redis_url=None):
self.api_key = api_key
self.redis_client = redis.from_url(redis_url) if redis_url else None
self.cache_ttl = 86400 # 24 hours
def get_embedding_cached(self, text):
"""Get embedding with Redis caching"""
if not self.redis_client:
return self.get_embedding(text)
        # Hash the text to form a stable cache key (md5 is fine here: it is
        # a cache key, not a security measure)
        cache_key = f"embedding:{hashlib.md5(text.encode()).hexdigest()}"
# Try to get from cache
cached_embedding = self.redis_client.get(cache_key)
if cached_embedding:
return pickle.loads(cached_embedding)
# Generate new embedding
embedding = self.get_embedding(text)
# Cache the result
self.redis_client.setex(
cache_key,
self.cache_ttl,
pickle.dumps(embedding)
)
return embedding
def get_embedding(self, text):
"""Generate embedding using API"""
response = requests.post(
"https://api.anyapi.ai/v1/embeddings",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
},
json={
"model": "text-embedding-3-large",
"input": text
}
)
return response.json()["data"][0]["embedding"]
def batch_process_documents(self, documents, batch_size=100):
"""Process documents in batches for efficiency"""
processed_docs = []
for i in range(0, len(documents), batch_size):
batch = documents[i:i + batch_size]
# Extract texts
texts = [doc["content"] for doc in batch]
# Get embeddings in batch
response = requests.post(
"https://api.anyapi.ai/v1/embeddings",
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
},
json={
"model": "text-embedding-3-large",
"input": texts
}
)
embeddings = response.json()["data"]
# Combine documents with embeddings
for doc, embedding in zip(batch, embeddings):
processed_docs.append({
**doc,
"embedding": embedding["embedding"]
})
print(f"Processed batch {i//batch_size + 1}/{(len(documents)-1)//batch_size + 1}")
return processed_docs
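A short usage sketch, assuming a Redis instance is reachable at the given URL (without one, the class falls back to uncached API calls):
# Usage
cached_search = CachedSemanticSearch(
    "YOUR_API_KEY",
    redis_url="redis://localhost:6379/0"
)
# First call hits the API; repeats within 24 hours are served from Redis
embedding = cached_search.get_embedding_cached("What is semantic search?")
# Embed a large corpus in batches of up to 100 documents per API call
processed = cached_search.batch_process_documents([
    {"id": "doc1", "content": "Semantic search matches on meaning, not keywords."},
    {"id": "doc2", "content": "Embeddings map text to dense numeric vectors."}
])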