Create Embeddings
Transform text into high-dimensional vector representations for semantic search, similarity matching, content classification, and AI-powered applications. Perfect for search engines, recommendation systems, knowledge bases, and content analysis.
Overview
Embeddings enable:
- Semantic search - Find content by meaning, not just keywords
- Similarity matching - Compare and group related content
- Content classification - Categorize and organize information automatically
- Knowledge retrieval - Build intelligent search and Q&A systems
- Recommendation engines - Suggest relevant content to users
Key Capabilities
- Multi-Language Support - Generate embeddings for text in multiple languages
- Batch Processing - Process multiple texts efficiently in a single request
- High Precision - Latest embedding models for superior accuracy
- Flexible Integration - Easy integration with vector databases and search systems
Quick Start
Copy
import requests
import numpy as np
from typing import List, Dict, Any
class EmbeddingGenerator:
    """Minimal HTTP client for the ``/embeddings`` endpoint.

    Every request is POSTed to ``{base_url}/embeddings`` with a bearer token
    and the configured model name.

    Args:
        api_key: Token sent in the ``Authorization: Bearer`` header.
        model: Model name included in every request payload.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the caller indefinitely.
    """

    # Output size and input limit for the models this client knows about.
    _MODEL_INFO = {
        "text-embedding-3-small": {"dimensions": 1536, "max_tokens": 8191},
        "text-embedding-3-large": {"dimensions": 3072, "max_tokens": 8191},
        "text-embedding-ada-002": {"dimensions": 1536, "max_tokens": 8191},
    }

    def __init__(self, api_key: str, model: str = "text-embedding-3-large",
                 timeout: float = 30.0):
        self.api_key = api_key
        self.model = model
        self.base_url = "https://api.anyapi.ai/v1"
        self.timeout = timeout

    def _request_embeddings(self, payload_input: Any,
                            extra: Dict[str, Any]) -> List[Dict[str, Any]]:
        """POST one embeddings request and return the response's ``data`` list.

        Raises:
            Exception: If the server answers with a non-200 status code.
                ``requests`` may also raise its own exceptions (for example
                on timeout or connection failure).
        """
        response = requests.post(
            f"{self.base_url}/embeddings",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            # Extra keyword arguments are merged last, as before, so callers
            # can add or override payload fields.
            json={"model": self.model, "input": payload_input, **extra},
            timeout=self.timeout,
        )
        if response.status_code != 200:
            raise Exception(f"API Error: {response.status_code} - {response.text}")
        return response.json()["data"]

    def create_embedding(self, text: str, **kwargs) -> List[float]:
        """Generate the embedding for a single text.

        Args:
            text: Text to embed.
            **kwargs: Additional fields merged into the JSON payload.

        Returns:
            The embedding vector of the first item in the response.
        """
        return self._request_embeddings(text, kwargs)[0]["embedding"]

    def create_embeddings_batch(self, texts: List[str], **kwargs) -> List[List[float]]:
        """Generate embeddings for several texts with one request.

        Args:
            texts: Texts to embed.
            **kwargs: Additional fields merged into the JSON payload.

        Returns:
            One embedding per response item, in response order. An empty
            ``texts`` returns ``[]`` without contacting the API.
        """
        if not texts:
            # Nothing to embed; skip the round trip.
            return []
        return [item["embedding"] for item in self._request_embeddings(texts, kwargs)]

    def get_embedding_info(self) -> Dict[str, Any]:
        """Return dimensions and token limit for the configured model.

        Unknown models yield ``"unknown"`` for both fields. A fresh dict is
        returned each time so callers cannot alter the shared table.
        """
        known = self._MODEL_INFO.get(self.model)
        if known is None:
            return {"dimensions": "unknown", "max_tokens": "unknown"}
        return dict(known)
# Basic usage: a single client instance serves every call below.
embedding_gen = EmbeddingGenerator("YOUR_API_KEY")

# Embed one sentence and inspect the returned vector.
text = "Artificial intelligence is transforming how we work and live."
embedding = embedding_gen.create_embedding(text)
print(f"Embedding dimension: {len(embedding)}")
print(f"First 5 values: {embedding[:5]}")

# Embed several sentences with one batch request.
texts = [
    "Machine learning enables computers to learn without explicit programming.",
    "Deep learning uses neural networks with multiple layers.",
    "Natural language processing helps computers understand human language.",
    "Computer vision allows machines to interpret visual information.",
]
embeddings = embedding_gen.create_embeddings_batch(texts)
print(f"Generated {len(embeddings)} embeddings")
print(f"Each embedding has {len(embeddings[0])} dimensions")

# Look up the static metadata recorded for the configured model.
info = embedding_gen.get_embedding_info()
print(f"Model info: {info}")
Advanced Embedding Operations
Similarity Calculator
Copy
class SimilarityCalculator:
    """Vector similarity, ranking and clustering helpers.

    Args:
        embedding_generator: Object providing ``create_embedding(text)`` and
            ``create_embeddings_batch(texts)``. The annotation is a forward
            reference so this class can be defined before the generator.
    """

    def __init__(self, embedding_generator: "EmbeddingGenerator"):
        self.embedding_gen = embedding_generator

    def cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Return the cosine similarity of two vectors.

        Returns ``0.0`` when either vector has zero magnitude, instead of
        dividing by zero.
        """
        a = np.asarray(vec1)
        b = np.asarray(vec2)
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a, b) / (norm_a * norm_b))

    def euclidean_distance(self, vec1: List[float], vec2: List[float]) -> float:
        """Return the Euclidean distance between two vectors."""
        return float(np.linalg.norm(np.asarray(vec1) - np.asarray(vec2)))

    def find_most_similar(self, query_text: str, candidate_texts: List[str], top_k: int = 5) -> List[Dict]:
        """Rank candidate texts by cosine similarity to a query.

        Args:
            query_text: Text to compare against.
            candidate_texts: Texts to rank.
            top_k: Maximum number of results.

        Returns:
            Up to ``top_k`` dicts with ``text``, ``similarity`` and ``index``
            (position in ``candidate_texts``), most similar first. Returns
            ``[]`` without any API call when there are no candidates or
            ``top_k`` is not positive.
        """
        if not candidate_texts or top_k <= 0:
            return []

        query_embedding = self.embedding_gen.create_embedding(query_text)
        candidate_embeddings = self.embedding_gen.create_embeddings_batch(candidate_texts)

        scored = [
            {
                "text": candidate_texts[index],
                "similarity": self.cosine_similarity(query_embedding, candidate_embedding),
                "index": index,
            }
            for index, candidate_embedding in enumerate(candidate_embeddings)
        ]
        # The sort is stable, so equally similar candidates keep input order.
        scored.sort(key=lambda entry: entry["similarity"], reverse=True)
        return scored[:top_k]

    def cluster_texts(self, texts: List[str], n_clusters: int = 3) -> Dict[str, List[str]]:
        """Group texts into clusters with K-means over their embeddings.

        Args:
            texts: Texts to cluster.
            n_clusters: Number of clusters passed to ``KMeans``.

        Returns:
            Mapping of ``"cluster_<label>"`` to the texts assigned to that
            label. An empty ``texts`` returns ``{}`` without any API call.
        """
        if not texts:
            return {}

        # Imported lazily so scikit-learn is only required for clustering.
        from sklearn.cluster import KMeans

        embeddings = self.embedding_gen.create_embeddings_batch(texts)

        # A fixed random_state keeps cluster assignment reproducible.
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(embeddings)

        clusters = {}
        for text, label in zip(texts, cluster_labels):
            clusters.setdefault(f"cluster_{label}", []).append(text)
        return clusters
# Usage: wire a similarity calculator to an embedding client.
embedding_gen = EmbeddingGenerator("YOUR_API_KEY")
similarity_calc = SimilarityCalculator(embedding_gen)

# Rank a mixed bag of sentences against a machine-learning query.
query = "machine learning algorithms"
candidates = [
    "Neural networks are a type of machine learning model",
    "Supervised learning requires labeled training data",
    "The weather is nice today",
    "Deep learning is a subset of machine learning",
    "I enjoy cooking pasta for dinner",
    "Unsupervised learning finds patterns in unlabeled data",
]
similar_texts = similarity_calc.find_most_similar(query, candidates, top_k=3)
for result in similar_texts:
    print(f"Similarity: {result['similarity']:.3f} - {result['text']}")

# Split topic labels into three clusters and list each cluster's members.
tech_texts = [
    "Artificial intelligence and machine learning",
    "Software development and programming",
    "Cooking recipes and food preparation",
    "Travel destinations and vacation planning",
    "Data science and analytics",
    "Restaurant reviews and dining experiences",
    "Computer vision and image recognition",
    "Adventure sports and outdoor activities",
]
clusters = similarity_calc.cluster_texts(tech_texts, n_clusters=3)
for cluster_name, cluster_texts in clusters.items():
    print(f"\n{cluster_name}:")
    for text in cluster_texts:
        print(f" - {text}")
Vector Database Integration
Copy
class VectorDatabase:
    """In-memory document store searchable by embedding similarity.

    Documents, embeddings and metadata live in three parallel lists; a
    document's ID is its index in those lists.

    NOTE: ``delete_document`` removes the entry from the lists, so the IDs of
    all later documents shift down by one.

    Args:
        embedding_generator: Object providing ``create_embedding(text)`` and
            ``create_embeddings_batch(texts)``. The annotation is a forward
            reference so this class can be defined before the generator.
    """

    def __init__(self, embedding_generator: "EmbeddingGenerator"):
        self.embedding_gen = embedding_generator
        self.documents = []
        self.embeddings = []
        self.metadata = []

    def add_document(self, text: str, metadata: Dict = None):
        """Embed and store one document.

        Args:
            text: Document text.
            metadata: Optional metadata. A shallow copy is stored so later
                updates here never touch the caller's dict.

        Returns:
            The new document's ID.
        """
        embedding = self.embedding_gen.create_embedding(text)
        self.documents.append(text)
        self.embeddings.append(embedding)
        self.metadata.append(dict(metadata) if metadata else {})
        return len(self.documents) - 1

    def add_documents_batch(self, texts: List[str], metadata_list: List[Dict] = None):
        """Embed and store several documents with one embedding request.

        Args:
            texts: Document texts.
            metadata_list: Optional metadata, one dict per text.

        Returns:
            The IDs assigned to the new documents, in input order.

        Raises:
            ValueError: If ``metadata_list`` and ``texts`` differ in length,
                which would misalign the parallel lists.
        """
        if metadata_list is not None and len(metadata_list) != len(texts):
            raise ValueError(
                f"metadata_list has {len(metadata_list)} entries but texts has {len(texts)}"
            )
        if not texts:
            return []

        embeddings = self.embedding_gen.create_embeddings_batch(texts)

        # Each document gets its own dict. ``[{}] * n`` would make all of
        # them share one object, so updating one would change them all.
        if metadata_list is None:
            new_metadata = [{} for _ in texts]
        else:
            new_metadata = [dict(meta) if meta else {} for meta in metadata_list]

        start_id = len(self.documents)
        self.documents.extend(texts)
        self.embeddings.extend(embeddings)
        self.metadata.extend(new_metadata)
        return list(range(start_id, len(self.documents)))

    def search(self, query: str, top_k: int = 5, metadata_filter: Dict = None) -> List[Dict]:
        """Return the stored documents most similar to a query.

        Args:
            query: Query text.
            top_k: Maximum number of results.
            metadata_filter: If given, only documents whose metadata equals
                every key/value pair in the filter are considered.

        Returns:
            Up to ``top_k`` dicts with ``id``, ``text``, ``metadata`` and
            ``similarity``, most similar first. Returns ``[]`` without any
            API call when the store is empty or ``top_k`` is not positive.
        """
        if top_k <= 0 or not self.documents:
            return []

        query_embedding = self.embedding_gen.create_embedding(query)
        results = []
        for doc_id, doc_embedding in enumerate(self.embeddings):
            meta = self.metadata[doc_id]
            if metadata_filter and not all(
                meta.get(key) == value for key, value in metadata_filter.items()
            ):
                continue
            results.append({
                "id": doc_id,
                "text": self.documents[doc_id],
                "metadata": meta,
                "similarity": self._cosine_similarity(query_embedding, doc_embedding),
            })

        results.sort(key=lambda hit: hit["similarity"], reverse=True)
        return results[:top_k]

    def get_document(self, doc_id: int) -> Dict:
        """Return the document with ``doc_id``, or ``None`` if out of range."""
        if 0 <= doc_id < len(self.documents):
            return {
                "id": doc_id,
                "text": self.documents[doc_id],
                "metadata": self.metadata[doc_id],
                "embedding": self.embeddings[doc_id],
            }
        return None

    def update_document(self, doc_id: int, new_text: str = None, new_metadata: Dict = None):
        """Replace a document's text and/or merge new metadata into it.

        Args:
            doc_id: ID of the document to change.
            new_text: Replacement text; the embedding is recomputed.
            new_metadata: Keys to merge into the existing metadata.

        Returns:
            ``True`` if ``doc_id`` exists, ``False`` otherwise.
        """
        if not 0 <= doc_id < len(self.documents):
            return False
        if new_text:
            # Embed first: if the request fails, the old text and embedding
            # remain stored together instead of going out of sync.
            new_embedding = self.embedding_gen.create_embedding(new_text)
            self.documents[doc_id] = new_text
            self.embeddings[doc_id] = new_embedding
        if new_metadata:
            self.metadata[doc_id].update(new_metadata)
        return True

    def delete_document(self, doc_id: int):
        """Remove a document; IDs of later documents shift down by one.

        Returns:
            ``True`` if ``doc_id`` existed, ``False`` otherwise.
        """
        if not 0 <= doc_id < len(self.documents):
            return False
        del self.documents[doc_id]
        del self.embeddings[doc_id]
        del self.metadata[doc_id]
        return True

    def get_stats(self) -> Dict:
        """Return document count, embedding size and all metadata keys in use."""
        metadata_keys = set()
        for meta in self.metadata:
            metadata_keys.update(meta.keys())
        return {
            "total_documents": len(self.documents),
            "embedding_dimension": len(self.embeddings[0]) if self.embeddings else 0,
            "unique_metadata_keys": list(metadata_keys),
        }

    def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Return cosine similarity, or ``0.0`` if either vector is all zeros."""
        a = np.asarray(vec1)
        b = np.asarray(vec2)
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a, b) / (norm_a * norm_b))

    def export_to_json(self, filename: str):
        """Write documents, embeddings and metadata to a JSON file."""
        import json
        data = {
            "documents": self.documents,
            "embeddings": self.embeddings,
            "metadata": self.metadata,
        }
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def import_from_json(self, filename: str):
        """Replace the store's contents with data from a JSON file.

        Raises:
            ValueError: If the three lists in the file differ in length. The
                current contents are left untouched in that case.
        """
        import json
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
        documents = data["documents"]
        embeddings = data["embeddings"]
        metadata = data["metadata"]
        if not (len(documents) == len(embeddings) == len(metadata)):
            raise ValueError(
                "documents, embeddings and metadata must have the same length"
            )
        self.documents = documents
        self.embeddings = embeddings
        self.metadata = metadata
# Usage: an in-memory vector store backed by the embedding client.
embedding_gen = EmbeddingGenerator("YOUR_API_KEY")
vector_db = VectorDatabase(embedding_gen)

# Each entry pairs a document's text with its metadata.
docs_and_metadata = [
    ("Python is a versatile programming language", {"category": "programming", "difficulty": "beginner"}),
    ("Machine learning algorithms can solve complex problems", {"category": "AI", "difficulty": "advanced"}),
    ("Cooking pasta requires boiling water first", {"category": "cooking", "difficulty": "beginner"}),
    ("Deep neural networks have multiple hidden layers", {"category": "AI", "difficulty": "advanced"}),
    ("JavaScript is essential for web development", {"category": "programming", "difficulty": "intermediate"}),
]
for text, metadata in docs_and_metadata:
    vector_db.add_document(text, metadata)

# Unfiltered search across every stored document.
search_results = vector_db.search("programming languages", top_k=3)
print("Search results for 'programming languages':")
for result in search_results:
    print(f" {result['similarity']:.3f}: {result['text']} (Category: {result['metadata']['category']})")

# Same kind of search, restricted to documents tagged as AI.
ai_results = vector_db.search("algorithms", metadata_filter={"category": "AI"})
print("\nAI-specific results for 'algorithms':")
for result in ai_results:
    print(f" {result['similarity']:.3f}: {result['text']}")

# Summary of what the store currently holds.
stats = vector_db.get_stats()
print(f"\nDatabase stats: {stats}")
Content Classification System
Copy
class ContentClassifier:
    """Nearest-centroid text classifier over embeddings.

    Each category is represented by the mean embedding of its example texts;
    a text is assigned to the category whose centroid is most similar.

    Args:
        embedding_generator: Object providing ``create_embedding(text)`` and
            ``create_embeddings_batch(texts)``. The annotation is a forward
            reference so this class can be defined before the generator.
    """

    def __init__(self, embedding_generator: "EmbeddingGenerator"):
        self.embedding_gen = embedding_generator
        self.categories = {}
        self.is_trained = False

    def add_category(self, category_name: str, example_texts: List[str]):
        """Register a category from example texts.

        Raises:
            ValueError: If ``example_texts`` is empty (the centroid of zero
                vectors is undefined).
        """
        if not example_texts:
            raise ValueError(f"Category '{category_name}' needs at least one example text")

        embeddings = self.embedding_gen.create_embeddings_batch(example_texts)
        # The centroid is the element-wise mean of the example embeddings.
        centroid = np.mean(embeddings, axis=0).tolist()
        self.categories[category_name] = {
            "examples": example_texts,
            "embeddings": embeddings,
            "centroid": centroid,
        }

    def train(self):
        """Mark the classifier as ready; requires at least two categories.

        Raises:
            ValueError: If fewer than two categories have been added.
        """
        if len(self.categories) < 2:
            raise ValueError("Need at least 2 categories to train classifier")
        self.is_trained = True

    def _predict_from_embedding(self, text: str, text_embedding: List[float],
                                confidence_threshold: float) -> Dict:
        """Score one already-embedded text against every category centroid."""
        similarities = {
            category: self._cosine_similarity(text_embedding, data["centroid"])
            for category, data in self.categories.items()
        }
        best_category = max(similarities, key=similarities.get)
        best_score = similarities[best_category]
        return {
            "text": text,
            # Below the threshold the best match is not trusted.
            "predicted_category": best_category if best_score >= confidence_threshold else "unknown",
            "confidence": best_score,
            "all_scores": similarities,
        }

    def classify(self, text: str, confidence_threshold: float = 0.5) -> Dict:
        """Classify one text.

        Returns:
            Dict with ``text``, ``predicted_category`` (``"unknown"`` when
            the best score is below ``confidence_threshold``),
            ``confidence`` and ``all_scores``.

        Raises:
            ValueError: If ``train()`` has not been called.
        """
        if not self.is_trained:
            raise ValueError("Classifier must be trained before classification")
        text_embedding = self.embedding_gen.create_embedding(text)
        return self._predict_from_embedding(text, text_embedding, confidence_threshold)

    def classify_batch(self, texts: List[str], confidence_threshold: float = 0.5) -> List[Dict]:
        """Classify several texts using a single batch embedding request.

        Returns:
            One prediction dict per text, in input order; ``[]`` for empty
            input.

        Raises:
            ValueError: If ``texts`` is non-empty and ``train()`` has not
                been called.
        """
        texts = list(texts)
        if not texts:
            return []
        if not self.is_trained:
            raise ValueError("Classifier must be trained before classification")
        embeddings = self.embedding_gen.create_embeddings_batch(texts)
        return [
            self._predict_from_embedding(text, embedding, confidence_threshold)
            for text, embedding in zip(texts, embeddings)
        ]

    def evaluate_accuracy(self, test_texts: List[str], true_labels: List[str]) -> Dict:
        """Measure accuracy and build a confusion matrix on labelled texts.

        Returns:
            Dict with ``accuracy``, ``correct_predictions``,
            ``total_predictions`` and ``confusion_matrix``, indexed as
            ``confusion_matrix[true_label][predicted_label]``.

        Raises:
            ValueError: If the two lists differ in length; pairing them
                would otherwise silently drop items and skew the accuracy.
        """
        true_labels = list(true_labels)
        if len(test_texts) != len(true_labels):
            raise ValueError("test_texts and true_labels must have the same length")

        predictions = [
            prediction["predicted_category"]
            for prediction in self.classify_batch(test_texts)
        ]
        correct = sum(1 for pred, true in zip(predictions, true_labels) if pred == true)
        accuracy = correct / len(true_labels) if true_labels else 0

        # Rows are true labels, columns are predicted labels.
        unique_labels = list(set(true_labels + predictions))
        confusion_matrix = {
            row: {column: 0 for column in unique_labels} for row in unique_labels
        }
        for true_label, pred_label in zip(true_labels, predictions):
            confusion_matrix[true_label][pred_label] += 1

        return {
            "accuracy": accuracy,
            "correct_predictions": correct,
            "total_predictions": len(true_labels),
            "confusion_matrix": confusion_matrix,
        }

    def get_category_info(self) -> Dict:
        """Return the example count and up to three sample examples per category."""
        return {
            category: {
                "example_count": len(data["examples"]),
                "sample_examples": data["examples"][:3],
            }
            for category, data in self.categories.items()
        }

    def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Return cosine similarity, or ``0.0`` if either vector is all zeros."""
        a = np.asarray(vec1)
        b = np.asarray(vec2)
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a, b) / (norm_a * norm_b))
# Usage: build a three-category classifier from a handful of examples.
embedding_gen = EmbeddingGenerator("YOUR_API_KEY")
classifier = ContentClassifier(embedding_gen)

# Example sentences that define each category.
technology_examples = [
    "New smartphone features artificial intelligence capabilities",
    "Software update improves system performance significantly",
    "Cloud computing revolutionizes data storage solutions",
    "Machine learning algorithms optimize user experience",
]
sports_examples = [
    "Football team wins championship after intense season",
    "Basketball player breaks scoring record tonight",
    "Olympic swimming competition features world records",
    "Tennis tournament showcases incredible athletic skill",
]
food_examples = [
    "Delicious pasta recipe with fresh ingredients",
    "Restaurant review highlights amazing flavor combinations",
    "Cooking techniques for perfect grilled vegetables",
    "Traditional bread baking methods explained",
]

# Register the categories in the same order as before, then train.
for category_name, category_examples in (
    ("technology", technology_examples),
    ("sports", sports_examples),
    ("food", food_examples),
):
    classifier.add_category(category_name, category_examples)
classifier.train()

# Classify unseen sentences one at a time and report each prediction.
test_texts = [
    "Latest AI breakthrough improves natural language processing",
    "Soccer world cup final draws millions of viewers",
    "Homemade pizza dough recipe for crispy crust",
    "Virtual reality headset offers immersive gaming experience",
    "Marathon runner completes race in record time",
]
for text in test_texts:
    result = classifier.classify(text)
    print(f"Text: {text}")
    print(f"Category: {result['predicted_category']} (Confidence: {result['confidence']:.3f})")
    print()

# Compare predictions with the expected labels for the same sentences.
test_labels = ["technology", "sports", "food", "technology", "sports"]
evaluation = classifier.evaluate_accuracy(test_texts, test_labels)
print(f"Accuracy: {evaluation['accuracy']:.2%}")
Specialized Embedding Applications
Semantic Search Engine
Copy
def create_semantic_search_engine(api_key: str, documents: List[Dict]):
    """Build a semantic search engine preloaded with the given documents.

    Args:
        api_key: API key passed to ``EmbeddingGenerator``.
        documents: Dicts read with ``.get`` for ``content`` (the text that is
            embedded), ``title``, ``author``, ``date``, ``url`` and ``tags``;
            missing keys fall back to empty values.

    Returns:
        A ``SemanticSearchEngine`` wrapping a ``VectorDatabase`` that holds
        every document.
    """
    embedding_gen = EmbeddingGenerator(api_key)
    vector_db = VectorDatabase(embedding_gen)

    for doc in documents:
        vector_db.add_document(
            text=doc.get("content", ""),
            metadata={
                "title": doc.get("title", ""),
                "author": doc.get("author", ""),
                "date": doc.get("date", ""),
                "url": doc.get("url", ""),
                "tags": doc.get("tags", []),
            },
        )

    class SemanticSearchEngine:
        """Display-oriented search facade over a vector database."""

        # Maximum number of characters shown in a result preview.
        PREVIEW_LENGTH = 200

        def __init__(self, vector_database):
            self.db = vector_database

        def _format_results(self, results):
            """Convert raw database hits into display-ready dicts."""
            formatted_results = []
            for result in results:
                text = result["text"]
                metadata = result["metadata"]
                preview = text[:self.PREVIEW_LENGTH]
                if len(text) > self.PREVIEW_LENGTH:
                    # Only mark the preview as cut off when it really is.
                    preview += "..."
                formatted_results.append({
                    "title": metadata.get("title", "Untitled"),
                    "content_preview": preview,
                    "author": metadata.get("author", "Unknown"),
                    "relevance_score": result["similarity"],
                    "url": metadata.get("url", ""),
                    "tags": metadata.get("tags", []),
                })
            return formatted_results

        def search(self, query: str, filters: Dict = None, top_k: int = 10):
            """Search documents semantically.

            Args:
                query: Query text.
                filters: Optional exact-match metadata filter.
                top_k: Maximum number of results.

            Returns:
                Dicts with ``title``, ``content_preview``, ``author``,
                ``relevance_score``, ``url`` and ``tags``, best match first.
            """
            results = self.db.search(query, top_k=top_k, metadata_filter=filters)
            return self._format_results(results)

        def get_recommendations(self, document_id: int, top_k: int = 5):
            """Return up to ``top_k`` documents similar to a stored document.

            Returns ``[]`` when ``document_id`` does not exist.
            """
            doc = self.db.get_document(document_id)
            if not doc:
                return []
            # Ask for one extra hit, then remove the source document by ID.
            # Dropping the first hit by position is wrong whenever another
            # document scores at least as high (duplicates, ties).
            hits = self.db.search(doc["text"], top_k=top_k + 1)
            others = [hit for hit in hits if hit["id"] != document_id]
            return self._format_results(others[:top_k])

    return SemanticSearchEngine(vector_db)
# Example usage: index three short articles, then query them.
documents = [
    {
        "title": "Introduction to Machine Learning",
        "content": "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed.",
        "author": "Dr. Smith",
        "date": "2024-01-15",
        "tags": ["AI", "ML", "programming"],
    },
    {
        "title": "Deep Learning Fundamentals",
        "content": "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data, revolutionizing fields like computer vision and natural language processing.",
        "author": "Prof. Johnson",
        "date": "2024-02-20",
        "tags": ["deep learning", "neural networks", "AI"],
    },
    {
        "title": "Cooking Perfect Pasta",
        "content": "The secret to perfect pasta lies in using plenty of salted water, timing the cooking precisely, and choosing the right sauce pairing for each pasta shape.",
        "author": "Chef Maria",
        "date": "2024-03-10",
        "tags": ["cooking", "pasta", "recipes"],
    },
]
search_engine = create_semantic_search_engine("YOUR_API_KEY", documents)

# Run one query and print each hit's title, score and preview.
results = search_engine.search("artificial intelligence and neural networks")
for result in results:
    print(f"Title: {result['title']}")
    print(f"Relevance: {result['relevance_score']:.3f}")
    print(f"Preview: {result['content_preview']}")
    print()
Content Recommendation System
Copy
def create_recommendation_system(api_key: str):
    """Build an embedding-based content recommendation system.

    Args:
        api_key: API key passed to ``EmbeddingGenerator``.

    Returns:
        A ``RecommendationSystem`` with no content or user profiles yet.
    """
    embedding_gen = EmbeddingGenerator(api_key)

    class RecommendationSystem:
        """Recommends content by comparing it with a per-user preference vector."""

        # Factor applied to the rating for each interaction type; any other
        # type leaves the rating unchanged.
        _INTERACTION_MULTIPLIERS = {"like": 1.5, "share": 2.0, "dislike": -0.5}

        def __init__(self, embedding_generator):
            self.embedding_gen = embedding_generator
            self.user_profiles = {}
            self.content_embeddings = {}
            self.content_metadata = {}

        def add_content(self, content_id: str, text: str, metadata: Dict = None):
            """Embed a piece of content and store it under ``content_id``."""
            embedding = self.embedding_gen.create_embedding(text)
            self.content_embeddings[content_id] = embedding
            self.content_metadata[content_id] = metadata or {}

        def update_user_profile(self, user_id: str, interactions: List[Dict]):
            """Append interactions and recompute the user's preference vector.

            Args:
                user_id: User whose profile is updated (created if missing).
                interactions: Dicts of the form
                    ``{"content_id": str, "rating": float, "interaction_type": str}``.

            The preference vector is the weighted average of the embeddings
            of every known content item in the user's full history, where
            the weight is the rating times the interaction-type factor.
            Interactions with unknown content are skipped. If no interaction
            is usable, or all weights are zero, the stored preferences are
            left unchanged.
            """
            profile = self.user_profiles.setdefault(
                user_id, {"preferences": None, "interaction_history": []}
            )
            profile["interaction_history"].extend(interactions)

            weighted_sum = None
            total_weight = 0.0
            for interaction in profile["interaction_history"]:
                content_id = interaction["content_id"]
                rating = interaction["rating"]
                if content_id not in self.content_embeddings:
                    continue
                factor = self._INTERACTION_MULTIPLIERS.get(interaction["interaction_type"], 1.0)
                weight = rating * factor
                contribution = np.array(self.content_embeddings[content_id]) * weight
                weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution
                # Normalise by magnitude so negative (dislike) weights
                # subtract from the sum without shrinking the denominator.
                total_weight += abs(weight)

            # A zero total (for example all ratings are 0) would divide by
            # zero and store a NaN vector, so it is treated as "no signal".
            if weighted_sum is not None and total_weight > 0:
                profile["preferences"] = (weighted_sum / total_weight).tolist()

        def get_recommendations(self, user_id: str, exclude_seen: bool = True, top_k: int = 10) -> List[Dict]:
            """Return the content most similar to the user's preferences.

            Args:
                user_id: User to recommend for.
                exclude_seen: Skip content the user has already interacted with.
                top_k: Maximum number of results.

            Returns:
                Dicts with ``content_id``, ``similarity_score`` and
                ``metadata``, best match first. Users without a preference
                vector get the ``_get_popular_content`` fallback instead.
            """
            profile = self.user_profiles.get(user_id)
            if profile is None or profile["preferences"] is None:
                return self._get_popular_content(top_k)

            user_preferences = profile["preferences"]
            seen_content = {
                interaction["content_id"] for interaction in profile["interaction_history"]
            }

            recommendations = []
            for content_id, content_embedding in self.content_embeddings.items():
                if exclude_seen and content_id in seen_content:
                    continue
                recommendations.append({
                    "content_id": content_id,
                    "similarity_score": self._cosine_similarity(user_preferences, content_embedding),
                    "metadata": self.content_metadata[content_id],
                })

            recommendations.sort(key=lambda rec: rec["similarity_score"], reverse=True)
            return recommendations[:top_k]

        def get_similar_content(self, content_id: str, top_k: int = 5) -> List[Dict]:
            """Return the content most similar to a given item, excluding itself.

            Returns ``[]`` when ``content_id`` is unknown.
            """
            if content_id not in self.content_embeddings:
                return []
            target_embedding = self.content_embeddings[content_id]

            similarities = [
                {
                    "content_id": other_id,
                    "similarity_score": self._cosine_similarity(target_embedding, other_embedding),
                    "metadata": self.content_metadata[other_id],
                }
                for other_id, other_embedding in self.content_embeddings.items()
                if other_id != content_id
            ]
            similarities.sort(key=lambda item: item["similarity_score"], reverse=True)
            return similarities[:top_k]

        def _get_popular_content(self, top_k: int) -> List[Dict]:
            """Fallback for users without preferences.

            Placeholder: returns a random sample of content with a score of
            ``0.0`` rather than real popularity data.
            """
            import random
            content_ids = list(self.content_embeddings.keys())
            selected = random.sample(content_ids, min(top_k, len(content_ids)))
            return [
                {
                    "content_id": content_id,
                    "similarity_score": 0.0,
                    "metadata": self.content_metadata[content_id],
                }
                for content_id in selected
            ]

        def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
            """Return cosine similarity, or ``0.0`` if either vector is all zeros."""
            a = np.asarray(vec1)
            b = np.asarray(vec2)
            norm_a = np.linalg.norm(a)
            norm_b = np.linalg.norm(b)
            if norm_a == 0 or norm_b == 0:
                return 0.0
            return float(np.dot(a, b) / (norm_a * norm_b))

    return RecommendationSystem(embedding_gen)
# Usage: build the recommender and load a small catalogue.
rec_system = create_recommendation_system("YOUR_API_KEY")

# Each entry is (content id, text to embed, metadata).
contents = [
    ("article_1", "Latest advances in artificial intelligence and machine learning", {"category": "technology", "author": "Tech Writer"}),
    ("article_2", "Deep learning applications in computer vision", {"category": "AI", "author": "AI Researcher"}),
    ("article_3", "Delicious pasta recipes for home cooking", {"category": "food", "author": "Chef"}),
    ("article_4", "Understanding neural network architectures", {"category": "AI", "author": "Data Scientist"}),
]
for content_id, text, metadata in contents:
    rec_system.add_content(content_id, text, metadata)

# Record how one user reacted to three of the articles.
user_interactions = [
    {"content_id": "article_1", "rating": 4.5, "interaction_type": "like"},
    {"content_id": "article_2", "rating": 5.0, "interaction_type": "share"},
    {"content_id": "article_3", "rating": 2.0, "interaction_type": "dislike"},
]
rec_system.update_user_profile("user_123", user_interactions)

# Recommend content for that user.
recommendations = rec_system.get_recommendations("user_123", top_k=3)
print("Recommendations for user_123:")
for rec in recommendations:
    print(f" Content: {rec['content_id']}")
    print(f" Score: {rec['similarity_score']:.3f}")
    print(f" Category: {rec['metadata']['category']}")
    print()

# List the items closest to one specific article.
similar = rec_system.get_similar_content("article_1", top_k=2)
print("Similar to article_1:")
for item in similar:
    print(f" {item['content_id']}: {item['similarity_score']:.3f}")
Best Practices
1. Text Preprocessing
- Clean text: Remove excessive whitespace, special characters
- Normalize content: Consistent formatting and encoding
- Chunk long documents: Split into manageable segments
- Handle multilingual content: Specify language when needed
2. Embedding Management
- Batch processing: Use batch API for multiple texts
- Caching: Store embeddings to avoid recomputation
- Version control: Track embedding model versions
- Dimension consistency: Ensure same model across pipeline
3. Similarity Calculations
- Choose right metric: Cosine similarity for semantic similarity
- Normalize vectors: Consider L2 normalization
- Threshold tuning: Adjust similarity thresholds per use case
- Performance optimization: Use approximate nearest neighbor for large datasets
4. Production Deployment
- Error handling: Robust API error management
- Rate limiting: Respect API rate limits
- Monitoring: Track embedding quality and performance
- Scaling: Plan for increased embedding volume
Common Use Cases
- Semantic Search - Search documents by meaning, not just keywords
- Content Recommendation - Suggest related articles, products, or media
- Document Classification - Automatically categorize content by topic
- Duplicate Detection - Find similar or duplicate content efficiently
- Knowledge Base Q&A - Build intelligent question-answering systems
- Content Clustering - Group related content automatically
- Personalization - Create user preference profiles for recommendations
- Language Translation Support - Cross-language similarity and matching