Skip to main content

Create Embeddings

Transform text into high-dimensional vector representations for semantic search, similarity matching, content classification, and AI-powered applications. Embeddings are well suited to search engines, recommendation systems, knowledge bases, and content analysis.

Overview

Embeddings enable:
  • Semantic search - Find content by meaning, not just keywords
  • Similarity matching - Compare and group related content
  • Content classification - Categorize and organize information automatically
  • Knowledge retrieval - Build intelligent search and Q&A systems
  • Recommendation engines - Suggest relevant content to users

Key Capabilities

Multi-Language Support
Generate embeddings for text in multiple languages
Batch Processing
Process multiple texts efficiently in single requests
High Precision
Latest embedding models for superior accuracy
Flexible Integration
Easy integration with vector databases and search systems

Quick Start

import requests
import numpy as np
from typing import List, Dict, Any

class EmbeddingGenerator:
    """Thin client for the ``/embeddings`` endpoint at ``https://api.anyapi.ai/v1``.

    Args:
        api_key: Bearer token sent in the ``Authorization`` header.
        model: Embedding model name sent with every request.
        timeout: Seconds to wait for the HTTP response. ``requests`` waits
            forever when no timeout is given, so a finite default is used;
            pass ``None`` to restore the unbounded wait.
    """

    # Known output dimensions and input token limits, keyed by model name.
    _MODEL_INFO = {
        "text-embedding-3-small": {"dimensions": 1536, "max_tokens": 8191},
        "text-embedding-3-large": {"dimensions": 3072, "max_tokens": 8191},
        "text-embedding-ada-002": {"dimensions": 1536, "max_tokens": 8191},
    }

    def __init__(self, api_key: str, model: str = "text-embedding-3-large",
                 timeout: float = 30.0):
        self.api_key = api_key
        self.model = model
        self.base_url = "https://api.anyapi.ai/v1"
        self.timeout = timeout

    def _request_embeddings(self, payload_input: Any, **kwargs) -> List[List[float]]:
        """POST ``payload_input`` to the embeddings endpoint and return the vectors.

        Extra keyword arguments are merged into the JSON body after ``model``
        and ``input``, so they may add or override request fields.

        Raises:
            Exception: If the API answers with a status code other than 200.
        """
        response = requests.post(
            f"{self.base_url}/embeddings",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "input": payload_input,
                **kwargs,
            },
            timeout=self.timeout,
        )

        if response.status_code != 200:
            raise Exception(f"API Error: {response.status_code} - {response.text}")

        items = response.json()["data"]
        # NOTE(review): assumes an OpenAI-compatible payload in which each item
        # may carry an "index" field. The sort is stable, so items without the
        # field keep the order in which the API returned them.
        items = sorted(items, key=lambda item: item.get("index", 0))
        return [item["embedding"] for item in items]

    def create_embedding(self, text: str, **kwargs) -> List[float]:
        """Generate the embedding for a single text.

        Args:
            text: Text to embed.
            **kwargs: Extra fields forwarded in the JSON request body.

        Returns:
            The embedding vector of ``text``.

        Raises:
            Exception: If the API answers with a non-200 status code.
        """
        return self._request_embeddings(text, **kwargs)[0]

    def create_embeddings_batch(self, texts: List[str], **kwargs) -> List[List[float]]:
        """Generate embeddings for multiple texts in one request.

        Args:
            texts: Texts to embed.
            **kwargs: Extra fields forwarded in the JSON request body.

        Returns:
            One embedding per input text; an empty list when ``texts`` is
            empty (no request is sent in that case).

        Raises:
            Exception: If the API answers with a non-200 status code.
        """
        if not texts:
            # Nothing to embed: skip the network round trip entirely.
            return []
        return self._request_embeddings(texts, **kwargs)

    def get_embedding_info(self) -> Dict[str, Any]:
        """Return the known dimensions and token limit for ``self.model``.

        Returns:
            A fresh dict with ``"dimensions"`` and ``"max_tokens"``; both are
            the string ``"unknown"`` for models missing from the table.
        """
        unknown = {"dimensions": "unknown", "max_tokens": "unknown"}
        # Copy so callers cannot mutate the shared class-level table.
        return dict(self._MODEL_INFO.get(self.model, unknown))

# Basic usage: exercises each public method of EmbeddingGenerator once.
# NOTE: "YOUR_API_KEY" is a placeholder; the embedding calls below send real
# HTTP requests and raise on any non-200 response.
embedding_gen = EmbeddingGenerator("YOUR_API_KEY")

# Single text embedding: one request, one vector back.
text = "Artificial intelligence is transforming how we work and live."
embedding = embedding_gen.create_embedding(text)
print(f"Embedding dimension: {len(embedding)}")
print(f"First 5 values: {embedding[:5]}")

# Batch processing: all four texts are sent in a single request.
texts = [
    "Machine learning enables computers to learn without explicit programming.",
    "Deep learning uses neural networks with multiple layers.",
    "Natural language processing helps computers understand human language.",
    "Computer vision allows machines to interpret visual information."
]

embeddings = embedding_gen.create_embeddings_batch(texts)
print(f"Generated {len(embeddings)} embeddings")
print(f"Each embedding has {len(embeddings[0])} dimensions")

# Model information: a local table lookup, no network call involved.
info = embedding_gen.get_embedding_info()
print(f"Model info: {info}")

Advanced Embedding Operations

Similarity Calculator

class SimilarityCalculator:
    """Vector similarity helpers built on top of an embedding generator.

    Args:
        embedding_generator: Object providing ``create_embedding(text)`` and
            ``create_embeddings_batch(texts)``.
    """

    # The annotation is a string so the class can be defined before (or
    # without) EmbeddingGenerator being in scope.
    def __init__(self, embedding_generator: "EmbeddingGenerator"):
        self.embedding_gen = embedding_generator

    def cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Return the cosine similarity of two vectors.

        Returns:
            A built-in ``float``; ``0.0`` when either vector has zero length.
        """
        a = np.asarray(vec1)
        b = np.asarray(vec2)

        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            # The cosine is undefined for a zero vector; report "no similarity".
            return 0.0

        # Cast so callers get the annotated built-in float, not a NumPy scalar.
        return float(np.dot(a, b) / (norm_a * norm_b))

    def euclidean_distance(self, vec1: List[float], vec2: List[float]) -> float:
        """Return the Euclidean distance between two vectors as a ``float``."""
        return float(np.linalg.norm(np.asarray(vec1) - np.asarray(vec2)))

    def find_most_similar(self, query_text: str, candidate_texts: List[str], top_k: int = 5) -> List[Dict]:
        """Rank candidate texts by cosine similarity to a query.

        Args:
            query_text: Text to compare the candidates against.
            candidate_texts: Texts to rank.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` dicts with keys ``"text"``, ``"similarity"`` and
            ``"index"`` (position in ``candidate_texts``), best match first.
            Empty when there are no candidates or ``top_k`` is not positive;
            no embeddings are requested in that case.
        """
        if not candidate_texts or top_k <= 0:
            return []

        query_embedding = self.embedding_gen.create_embedding(query_text)
        candidate_embeddings = self.embedding_gen.create_embeddings_batch(candidate_texts)

        ranked = [
            {
                "text": candidate_texts[position],
                "similarity": self.cosine_similarity(query_embedding, candidate_embedding),
                "index": position,
            }
            for position, candidate_embedding in enumerate(candidate_embeddings)
        ]

        # list.sort is stable, so ties keep their original candidate order.
        ranked.sort(key=lambda entry: entry["similarity"], reverse=True)
        return ranked[:top_k]

    def cluster_texts(self, texts: List[str], n_clusters: int = 3) -> Dict[str, List[str]]:
        """Group texts with k-means over their embeddings.

        Args:
            texts: Texts to cluster.
            n_clusters: Requested number of clusters; capped at ``len(texts)``
                because k-means cannot form more clusters than samples.

        Returns:
            Mapping ``"cluster_<label>"`` -> texts assigned to that cluster.
            Empty when ``texts`` is empty.
        """
        if not texts:
            return {}

        # Imported lazily so scikit-learn is only needed when clustering.
        from sklearn.cluster import KMeans

        embeddings = self.embedding_gen.create_embeddings_batch(texts)

        kmeans = KMeans(n_clusters=min(n_clusters, len(texts)), random_state=42)
        cluster_labels = kmeans.fit_predict(embeddings)

        clusters = {}
        for text, label in zip(texts, cluster_labels):
            clusters.setdefault(f"cluster_{label}", []).append(text)

        return clusters

# Usage: ranks candidate sentences against a query, then clusters a mixed
# list of topics. "YOUR_API_KEY" is a placeholder; both steps request
# embeddings over the network.
embedding_gen = EmbeddingGenerator("YOUR_API_KEY")
similarity_calc = SimilarityCalculator(embedding_gen)

# Find similar texts: the candidates mix on-topic and unrelated sentences.
query = "machine learning algorithms"
candidates = [
    "Neural networks are a type of machine learning model",
    "Supervised learning requires labeled training data",
    "The weather is nice today",
    "Deep learning is a subset of machine learning",
    "I enjoy cooking pasta for dinner",
    "Unsupervised learning finds patterns in unlabeled data"
]

# Only the three highest-scoring candidates are returned, best first.
similar_texts = similarity_calc.find_most_similar(query, candidates, top_k=3)
for result in similar_texts:
    print(f"Similarity: {result['similarity']:.3f} - {result['text']}")

# Cluster texts: eight short topic descriptions grouped into three clusters.
tech_texts = [
    "Artificial intelligence and machine learning",
    "Software development and programming",
    "Cooking recipes and food preparation",
    "Travel destinations and vacation planning",
    "Data science and analytics",
    "Restaurant reviews and dining experiences",
    "Computer vision and image recognition",
    "Adventure sports and outdoor activities"
]

# cluster_texts imports scikit-learn's KMeans, so scikit-learn must be installed.
clusters = similarity_calc.cluster_texts(tech_texts, n_clusters=3)
for cluster_name, cluster_texts in clusters.items():
    print(f"\n{cluster_name}:")
    for text in cluster_texts:
        print(f"  - {text}")

Vector Database Integration

class VectorDatabase:
    """In-memory store of documents, their embeddings and metadata.

    The three parallel lists ``documents``, ``embeddings`` and ``metadata``
    share one index, which doubles as the document ID. Deleting a document
    shifts the IDs of every document stored after it.

    Args:
        embedding_generator: Object providing ``create_embedding(text)`` and
            ``create_embeddings_batch(texts)``.
    """

    # The annotation is a string so the class can be defined before (or
    # without) EmbeddingGenerator being in scope.
    def __init__(self, embedding_generator: "EmbeddingGenerator"):
        self.embedding_gen = embedding_generator
        self.documents = []
        self.embeddings = []
        self.metadata = []

    def add_document(self, text: str, metadata: Dict = None):
        """Embed ``text`` and append it to the database.

        Returns:
            The ID (list index) of the new document.
        """
        embedding = self.embedding_gen.create_embedding(text)

        self.documents.append(text)
        self.embeddings.append(embedding)
        self.metadata.append(metadata or {})

        return len(self.documents) - 1

    def add_documents_batch(self, texts: List[str], metadata_list: List[Dict] = None):
        """Embed several texts with one batch request and append them.

        Args:
            texts: Documents to add.
            metadata_list: One metadata dict per text; defaults to a separate
                empty dict for every document.

        Returns:
            The IDs assigned to the new documents, in input order. Empty when
            ``texts`` is empty (no embeddings are requested in that case).

        Raises:
            ValueError: If ``metadata_list`` and ``texts`` differ in length.
        """
        texts = list(texts)

        if metadata_list is None:
            # One fresh dict per document. ``[{}] * n`` would alias a single
            # dict, so updating one document's metadata would change them all.
            metadata_list = [{} for _ in texts]
        elif len(metadata_list) != len(texts):
            raise ValueError(
                f"metadata_list has {len(metadata_list)} entries but "
                f"{len(texts)} texts were given"
            )

        if not texts:
            return []

        embeddings = self.embedding_gen.create_embeddings_batch(texts)

        start_id = len(self.documents)

        self.documents.extend(texts)
        self.embeddings.extend(embeddings)
        self.metadata.extend(metadata_list)

        return list(range(start_id, len(self.documents)))

    def search(self, query: str, top_k: int = 5, metadata_filter: Dict = None) -> List[Dict]:
        """Return the stored documents most similar to ``query``.

        Args:
            query: Text to embed and compare against every stored embedding.
            top_k: Maximum number of results.
            metadata_filter: When given, only documents whose metadata equals
                the filter on every listed key are considered.

        Returns:
            Dicts with keys ``"id"``, ``"text"``, ``"metadata"`` and
            ``"similarity"``, ordered by descending cosine similarity.
        """
        query_embedding = self.embedding_gen.create_embedding(query)

        results = []
        rows = zip(self.documents, self.embeddings, self.metadata)
        for doc_id, (text, doc_embedding, meta) in enumerate(rows):
            if metadata_filter and not all(
                meta.get(key) == value for key, value in metadata_filter.items()
            ):
                continue

            results.append({
                "id": doc_id,
                "text": text,
                "metadata": meta,
                "similarity": self._cosine_similarity(query_embedding, doc_embedding),
            })

        results.sort(key=lambda hit: hit["similarity"], reverse=True)
        return results[:top_k]

    def get_document(self, doc_id: int) -> Dict:
        """Return the document stored under ``doc_id``, or ``None`` if out of range."""
        if 0 <= doc_id < len(self.documents):
            return {
                "id": doc_id,
                "text": self.documents[doc_id],
                "metadata": self.metadata[doc_id],
                "embedding": self.embeddings[doc_id],
            }
        return None

    def update_document(self, doc_id: int, new_text: str = None, new_metadata: Dict = None):
        """Replace a document's text and/or merge new metadata into it.

        A truthy ``new_text`` is stored and re-embedded; ``None`` or an empty
        string leaves the text untouched. ``new_metadata`` is merged into the
        existing metadata dict rather than replacing it.

        Returns:
            ``True`` if ``doc_id`` exists, ``False`` otherwise.
        """
        if not 0 <= doc_id < len(self.documents):
            return False

        if new_text:
            self.documents[doc_id] = new_text
            self.embeddings[doc_id] = self.embedding_gen.create_embedding(new_text)

        if new_metadata:
            self.metadata[doc_id].update(new_metadata)

        return True

    def delete_document(self, doc_id: int):
        """Remove a document; later documents move down by one ID.

        Returns:
            ``True`` if ``doc_id`` existed, ``False`` otherwise.
        """
        if not 0 <= doc_id < len(self.documents):
            return False

        del self.documents[doc_id]
        del self.embeddings[doc_id]
        del self.metadata[doc_id]
        return True

    def get_stats(self) -> Dict:
        """Return document count, embedding dimension and all metadata keys in use."""
        return {
            "total_documents": len(self.documents),
            "embedding_dimension": len(self.embeddings[0]) if self.embeddings else 0,
            "unique_metadata_keys": list(set().union(*(meta.keys() for meta in self.metadata))),
        }

    def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Return the cosine similarity as a ``float``; ``0.0`` for a zero vector."""
        a = np.asarray(vec1)
        b = np.asarray(vec2)

        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0

        # Cast so results hold built-in floats rather than NumPy scalars.
        return float(np.dot(a, b) / (norm_a * norm_b))

    def export_to_json(self, filename: str):
        """Write documents, embeddings and metadata to ``filename`` as JSON."""
        import json

        data = {
            "documents": self.documents,
            "embeddings": self.embeddings,
            "metadata": self.metadata,
        }

        # Explicit encoding keeps the file portable across platform defaults.
        with open(filename, 'w', encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def import_from_json(self, filename: str):
        """Replace the database contents with those stored in ``filename``.

        Raises:
            KeyError: If one of the three expected top-level keys is missing.
            ValueError: If the three stored lists differ in length.
        """
        import json

        with open(filename, 'r', encoding="utf-8") as f:
            data = json.load(f)

        documents = data["documents"]
        embeddings = data["embeddings"]
        metadata = data["metadata"]

        # The lists are indexed in lockstep; reject files that would break that.
        if not (len(documents) == len(embeddings) == len(metadata)):
            raise ValueError(
                "documents, embeddings and metadata must have the same length"
            )

        self.documents = documents
        self.embeddings = embeddings
        self.metadata = metadata

# Usage: fills a VectorDatabase with five tagged sentences, then runs an
# unfiltered and a metadata-filtered search. "YOUR_API_KEY" is a placeholder;
# every add_document and search call requests an embedding over the network.
embedding_gen = EmbeddingGenerator("YOUR_API_KEY")
vector_db = VectorDatabase(embedding_gen)

# Add documents with metadata: each entry is a (text, metadata) pair.
docs_and_metadata = [
    ("Python is a versatile programming language", {"category": "programming", "difficulty": "beginner"}),
    ("Machine learning algorithms can solve complex problems", {"category": "AI", "difficulty": "advanced"}),
    ("Cooking pasta requires boiling water first", {"category": "cooking", "difficulty": "beginner"}),
    ("Deep neural networks have multiple hidden layers", {"category": "AI", "difficulty": "advanced"}),
    ("JavaScript is essential for web development", {"category": "programming", "difficulty": "intermediate"})
]

# One embedding request per document.
for text, metadata in docs_and_metadata:
    vector_db.add_document(text, metadata)

# Search with different queries: unfiltered, limited to the three best hits.
search_results = vector_db.search("programming languages", top_k=3)
print("Search results for 'programming languages':")
for result in search_results:
    print(f"  {result['similarity']:.3f}: {result['text']} (Category: {result['metadata']['category']})")

# Search with metadata filter: only documents whose "category" equals "AI"
# are scored.
ai_results = vector_db.search("algorithms", metadata_filter={"category": "AI"})
print("\nAI-specific results for 'algorithms':")
for result in ai_results:
    print(f"  {result['similarity']:.3f}: {result['text']}")

# Database statistics: document count, embedding dimension, metadata keys.
stats = vector_db.get_stats()
print(f"\nDatabase stats: {stats}")

Content Classification System

class ContentClassifier:
    """Nearest-centroid text classifier over embeddings.

    Each category is represented by the mean embedding (centroid) of its
    example texts. A text is assigned to the category whose centroid has the
    highest cosine similarity to the text's embedding.

    Args:
        embedding_generator: Object providing ``create_embedding(text)`` and
            ``create_embeddings_batch(texts)``.
    """

    # The annotation is a string so the class can be defined before (or
    # without) EmbeddingGenerator being in scope.
    def __init__(self, embedding_generator: "EmbeddingGenerator"):
        self.embedding_gen = embedding_generator
        self.categories = {}
        self.is_trained = False

    def add_category(self, category_name: str, example_texts: List[str]):
        """Register a category from example texts.

        Raises:
            ValueError: If ``example_texts`` is empty, since the mean of zero
                embeddings is undefined and would produce a NaN centroid.
        """
        if not example_texts:
            raise ValueError(f"Category '{category_name}' needs at least one example text")

        embeddings = self.embedding_gen.create_embeddings_batch(example_texts)

        self.categories[category_name] = {
            "examples": example_texts,
            "embeddings": embeddings,
            # Centroid = element-wise mean of the example embeddings.
            "centroid": np.mean(embeddings, axis=0).tolist(),
        }

    def train(self):
        """Mark the classifier as ready once at least two categories exist.

        Raises:
            ValueError: If fewer than two categories have been added.
        """
        if len(self.categories) < 2:
            raise ValueError("Need at least 2 categories to train classifier")
        self.is_trained = True

    def _classify_embedding(self, text: str, embedding: List[float], confidence_threshold: float) -> Dict:
        """Build the prediction dict for ``text`` from its precomputed embedding."""
        scores = {
            category: self._cosine_similarity(embedding, data["centroid"])
            for category, data in self.categories.items()
        }

        best_category = max(scores, key=scores.get)
        best_score = scores[best_category]

        return {
            "text": text,
            # Below the threshold the best match is reported as "unknown".
            "predicted_category": best_category if best_score >= confidence_threshold else "unknown",
            "confidence": best_score,
            "all_scores": scores,
        }

    def classify(self, text: str, confidence_threshold: float = 0.5) -> Dict:
        """Classify a single text.

        Returns:
            Dict with keys ``"text"``, ``"predicted_category"`` (``"unknown"``
            when the best score is below ``confidence_threshold``),
            ``"confidence"`` and ``"all_scores"``.

        Raises:
            ValueError: If ``train()`` has not been called successfully.
        """
        if not self.is_trained:
            raise ValueError("Classifier must be trained before classification")

        embedding = self.embedding_gen.create_embedding(text)
        return self._classify_embedding(text, embedding, confidence_threshold)

    def classify_batch(self, texts: List[str], confidence_threshold: float = 0.5) -> List[Dict]:
        """Classify several texts using a single batch embedding request.

        Returns:
            One prediction dict per text, in input order; empty for no texts.

        Raises:
            ValueError: If texts are given before the classifier is trained.
        """
        if not texts:
            return []
        if not self.is_trained:
            raise ValueError("Classifier must be trained before classification")

        # One request for all texts instead of one request per text.
        embeddings = self.embedding_gen.create_embeddings_batch(texts)
        return [
            self._classify_embedding(text, embedding, confidence_threshold)
            for text, embedding in zip(texts, embeddings)
        ]

    def evaluate_accuracy(self, test_texts: List[str], true_labels: List[str]) -> Dict:
        """Measure accuracy and build a confusion matrix on labeled texts.

        Predictions use the default confidence threshold, so ``"unknown"`` may
        appear as a predicted label.

        Returns:
            Dict with ``"accuracy"``, ``"correct_predictions"``,
            ``"total_predictions"`` and ``"confusion_matrix"``, where
            ``confusion_matrix[true][predicted]`` is a count.

        Raises:
            ValueError: If ``test_texts`` and ``true_labels`` differ in length;
                pairing them would otherwise silently drop the extras.
        """
        if len(test_texts) != len(true_labels):
            raise ValueError(
                f"Got {len(test_texts)} test texts but {len(true_labels)} labels"
            )

        predictions = [
            prediction["predicted_category"]
            for prediction in self.classify_batch(test_texts)
        ]

        correct = sum(1 for pred, true in zip(predictions, true_labels) if pred == true)
        accuracy = correct / len(true_labels) if true_labels else 0

        # Sorted so the matrix layout is the same on every run.
        unique_labels = sorted(set(true_labels) | set(predictions))
        confusion_matrix = {
            label: {other: 0 for other in unique_labels} for label in unique_labels
        }

        for true_label, pred_label in zip(true_labels, predictions):
            confusion_matrix[true_label][pred_label] += 1

        return {
            "accuracy": accuracy,
            "correct_predictions": correct,
            "total_predictions": len(true_labels),
            "confusion_matrix": confusion_matrix,
        }

    def get_category_info(self) -> Dict:
        """Return the example count and up to three sample examples per category."""
        return {
            category: {
                "example_count": len(data["examples"]),
                "sample_examples": data["examples"][:3],
            }
            for category, data in self.categories.items()
        }

    def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Return the cosine similarity as a ``float``; ``0.0`` for a zero vector."""
        a = np.asarray(vec1)
        b = np.asarray(vec2)

        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0

        # Cast so scores are built-in floats rather than NumPy scalars.
        return float(np.dot(a, b) / (norm_a * norm_b))

# Usage: builds a three-category classifier from example sentences, classifies
# five unseen sentences and reports accuracy. "YOUR_API_KEY" is a placeholder;
# add_category and classify request embeddings over the network.
embedding_gen = EmbeddingGenerator("YOUR_API_KEY")
classifier = ContentClassifier(embedding_gen)

# Define categories with examples: four example sentences per category.
technology_examples = [
    "New smartphone features artificial intelligence capabilities",
    "Software update improves system performance significantly",
    "Cloud computing revolutionizes data storage solutions",
    "Machine learning algorithms optimize user experience"
]

sports_examples = [
    "Football team wins championship after intense season",
    "Basketball player breaks scoring record tonight",
    "Olympic swimming competition features world records",
    "Tennis tournament showcases incredible athletic skill"
]

food_examples = [
    "Delicious pasta recipe with fresh ingredients",
    "Restaurant review highlights amazing flavor combinations",
    "Cooking techniques for perfect grilled vegetables",
    "Traditional bread baking methods explained"
]

# Add categories: each call embeds its examples and stores their centroid.
classifier.add_category("technology", technology_examples)
classifier.add_category("sports", sports_examples)
classifier.add_category("food", food_examples)

# Train classifier: only validates that at least two categories exist and
# flips the is_trained flag; no model fitting happens here.
classifier.train()

# Test classification: sentences that do not appear among the examples.
test_texts = [
    "Latest AI breakthrough improves natural language processing",
    "Soccer world cup final draws millions of viewers",
    "Homemade pizza dough recipe for crispy crust",
    "Virtual reality headset offers immersive gaming experience",
    "Marathon runner completes race in record time"
]

for text in test_texts:
    result = classifier.classify(text)
    print(f"Text: {text}")
    print(f"Category: {result['predicted_category']} (Confidence: {result['confidence']:.3f})")
    print()

# Evaluate on test data: test_labels[i] is the expected category of
# test_texts[i].
test_labels = ["technology", "sports", "food", "technology", "sports"]
evaluation = classifier.evaluate_accuracy(test_texts, test_labels)
print(f"Accuracy: {evaluation['accuracy']:.2%}")

Specialized Embedding Applications

Semantic Search Engine

def create_semantic_search_engine(api_key: str, documents: List[Dict]):
    """Build a semantic search engine over ``documents``.

    Args:
        api_key: API key handed to ``EmbeddingGenerator``.
        documents: Dicts read with ``.get`` for the keys ``"content"`` (the
            text that is embedded), ``"title"``, ``"author"``, ``"date"``,
            ``"url"`` and ``"tags"``; missing keys fall back to empty values.

    Returns:
        A ``SemanticSearchEngine`` whose database holds one entry per input
        document, with IDs following the order of ``documents``.
    """

    embedding_gen = EmbeddingGenerator(api_key)
    vector_db = VectorDatabase(embedding_gen)

    # One embedding request per document; the database ID equals the position
    # of the document in ``documents``.
    for doc in documents:
        vector_db.add_document(
            text=doc.get("content", ""),
            metadata={
                "title": doc.get("title", ""),
                "author": doc.get("author", ""),
                "date": doc.get("date", ""),
                "url": doc.get("url", ""),
                "tags": doc.get("tags", []),
            },
        )

    class SemanticSearchEngine:
        """Search facade that formats raw database hits for display."""

        # Longest preview shown before the text is cut and "..." is appended.
        PREVIEW_LENGTH = 200

        def __init__(self, vector_database):
            self.db = vector_database

        def _format_hit(self, hit: Dict) -> Dict:
            """Convert one raw database hit into a display-ready dict."""
            text = hit["text"]
            metadata = hit["metadata"]
            # Only mark the preview with an ellipsis when text was really cut.
            if len(text) > self.PREVIEW_LENGTH:
                preview = text[:self.PREVIEW_LENGTH] + "..."
            else:
                preview = text
            return {
                "title": metadata.get("title", "Untitled"),
                "content_preview": preview,
                "author": metadata.get("author", "Unknown"),
                "relevance_score": hit["similarity"],
                "url": metadata.get("url", ""),
                "tags": metadata.get("tags", []),
            }

        def search(self, query: str, filters: Dict = None, top_k: int = 10):
            """Search documents semantically.

            Args:
                query: Free-text query.
                filters: Optional exact-match metadata filter.
                top_k: Maximum number of results.

            Returns:
                Formatted result dicts, most relevant first.
            """
            hits = self.db.search(query, top_k=top_k, metadata_filter=filters)
            return [self._format_hit(hit) for hit in hits]

        def get_recommendations(self, document_id: int, top_k: int = 5):
            """Return up to ``top_k`` documents similar to a stored document.

            Returns:
                Formatted result dicts, or an empty list when ``document_id``
                does not exist.
            """
            doc = self.db.get_document(document_id)
            if not doc:
                return []

            # Fetch one extra hit, then drop the source document by ID. Simply
            # discarding the first hit would remove the wrong entry whenever a
            # duplicate or tied document ranks ahead of the source.
            hits = self.db.search(doc["text"], top_k=top_k + 1)
            others = [hit for hit in hits if hit["id"] != document_id]
            return [self._format_hit(hit) for hit in others[:top_k]]

    return SemanticSearchEngine(vector_db)

# Example usage: indexes three short articles and runs one semantic query.
# None of the sample documents sets "url", so that metadata field falls back
# to an empty string.
documents = [
    {
        "title": "Introduction to Machine Learning",
        "content": "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed.",
        "author": "Dr. Smith",
        "date": "2024-01-15",
        "tags": ["AI", "ML", "programming"]
    },
    {
        "title": "Deep Learning Fundamentals",
        "content": "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data, revolutionizing fields like computer vision and natural language processing.",
        "author": "Prof. Johnson",
        "date": "2024-02-20",
        "tags": ["deep learning", "neural networks", "AI"]
    },
    {
        "title": "Cooking Perfect Pasta",
        "content": "The secret to perfect pasta lies in using plenty of salted water, timing the cooking precisely, and choosing the right sauce pairing for each pasta shape.",
        "author": "Chef Maria",
        "date": "2024-03-10",
        "tags": ["cooking", "pasta", "recipes"]
    }
]

# "YOUR_API_KEY" is a placeholder; building the engine embeds every document.
search_engine = create_semantic_search_engine("YOUR_API_KEY", documents)

# Search examples: results come back ordered by relevance score.
results = search_engine.search("artificial intelligence and neural networks")
for result in results:
    print(f"Title: {result['title']}")
    print(f"Relevance: {result['relevance_score']:.3f}")
    print(f"Preview: {result['content_preview']}")
    print()

Content Recommendation System

def create_recommendation_system(api_key: str):
    """Create an embedding-based content recommendation system.

    Args:
        api_key: API key handed to ``EmbeddingGenerator``.

    Returns:
        An empty ``RecommendationSystem`` bound to a new embedding generator.
    """

    embedding_gen = EmbeddingGenerator(api_key)

    class RecommendationSystem:
        """Recommend content by comparing it with a per-user preference vector."""

        # Factor applied to the rating for each interaction type. Types not
        # listed here leave the rating unchanged.
        _TYPE_MULTIPLIERS = {"like": 1.5, "share": 2.0, "dislike": -0.5}

        def __init__(self, embedding_generator):
            self.embedding_gen = embedding_generator
            self.user_profiles = {}
            self.content_embeddings = {}
            self.content_metadata = {}

        def add_content(self, content_id: str, text: str, metadata: Dict = None):
            """Embed ``text`` and register it under ``content_id``."""
            self.content_embeddings[content_id] = self.embedding_gen.create_embedding(text)
            self.content_metadata[content_id] = metadata or {}

        def update_user_profile(self, user_id: str, interactions: List[Dict]):
            """Record interactions and recompute the user's preference vector.

            Args:
                user_id: Profile to update; created on first use.
                interactions: Dicts with ``"content_id"``, ``"rating"`` and
                    optionally ``"interaction_type"``.

            The preference vector is the sum of the embeddings of all content
            the user interacted with, each scaled by its signed weight
            (rating times type multiplier), divided by the sum of absolute
            weights. Interactions with unregistered content are ignored. When
            no usable interaction exists, or all weights are zero, the stored
            preferences are left unchanged.
            """
            profile = self.user_profiles.setdefault(
                user_id,
                {"preferences": None, "interaction_history": []},
            )
            profile["interaction_history"].extend(interactions)

            weighted_sum = None
            total_weight = 0.0

            for interaction in profile["interaction_history"]:
                embedding = self.content_embeddings.get(interaction["content_id"])
                if embedding is None:
                    continue

                multiplier = self._TYPE_MULTIPLIERS.get(
                    interaction.get("interaction_type"), 1.0
                )
                weight = interaction["rating"] * multiplier

                contribution = np.asarray(embedding, dtype=float) * weight
                weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution
                total_weight += abs(weight)

            if weighted_sum is None or total_weight == 0:
                # Dividing by a zero total weight would store NaN preferences.
                return

            profile["preferences"] = (weighted_sum / total_weight).tolist()

        def get_recommendations(self, user_id: str, exclude_seen: bool = True, top_k: int = 10) -> List[Dict]:
            """Return the content most similar to the user's preferences.

            Args:
                user_id: User to recommend for.
                exclude_seen: Skip content found in the user's history.
                top_k: Maximum number of recommendations.

            Returns:
                Dicts with ``"content_id"``, ``"similarity_score"`` and
                ``"metadata"``, best first. Users without a preference vector
                get the fallback from ``_get_popular_content``.
            """
            profile = self.user_profiles.get(user_id)
            if profile is None or profile["preferences"] is None:
                return self._get_popular_content(top_k)

            preferences = profile["preferences"]
            seen = {entry["content_id"] for entry in profile["interaction_history"]}

            recommendations = [
                {
                    "content_id": content_id,
                    "similarity_score": self._cosine_similarity(preferences, embedding),
                    "metadata": self.content_metadata[content_id],
                }
                for content_id, embedding in self.content_embeddings.items()
                if not (exclude_seen and content_id in seen)
            ]

            recommendations.sort(key=lambda rec: rec["similarity_score"], reverse=True)
            return recommendations[:top_k]

        def get_similar_content(self, content_id: str, top_k: int = 5) -> List[Dict]:
            """Return the content most similar to ``content_id``, excluding itself.

            Returns:
                Up to ``top_k`` dicts, best first; empty when ``content_id`` is
                not registered.
            """
            if content_id not in self.content_embeddings:
                return []

            target = self.content_embeddings[content_id]
            similar = [
                {
                    "content_id": other_id,
                    "similarity_score": self._cosine_similarity(target, other_embedding),
                    "metadata": self.content_metadata[other_id],
                }
                for other_id, other_embedding in self.content_embeddings.items()
                if other_id != content_id
            ]

            similar.sort(key=lambda rec: rec["similarity_score"], reverse=True)
            return similar[:top_k]

        def _get_popular_content(self, top_k: int) -> List[Dict]:
            """Fallback for users without preferences.

            Placeholder: returns a random sample of registered content with a
            similarity score of 0.0; no popularity data is tracked.
            """
            import random

            content_ids = list(self.content_embeddings.keys())
            selected = random.sample(content_ids, min(top_k, len(content_ids)))

            return [{
                "content_id": content_id,
                "similarity_score": 0.0,
                "metadata": self.content_metadata[content_id],
            } for content_id in selected]

        def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
            """Return the cosine similarity as a ``float``; ``0.0`` for a zero vector."""
            a = np.asarray(vec1)
            b = np.asarray(vec2)

            norm_a = np.linalg.norm(a)
            norm_b = np.linalg.norm(b)
            if norm_a == 0 or norm_b == 0:
                return 0.0

            # Cast so scores are built-in floats rather than NumPy scalars.
            return float(np.dot(a, b) / (norm_a * norm_b))

    return RecommendationSystem(embedding_gen)

# Usage: registers four articles, records one user's interactions and prints
# recommendations. "YOUR_API_KEY" is a placeholder; each add_content call
# requests an embedding over the network.
rec_system = create_recommendation_system("YOUR_API_KEY")

# Add content: each entry is (content_id, text, metadata).
contents = [
    ("article_1", "Latest advances in artificial intelligence and machine learning", {"category": "technology", "author": "Tech Writer"}),
    ("article_2", "Deep learning applications in computer vision", {"category": "AI", "author": "AI Researcher"}),
    ("article_3", "Delicious pasta recipes for home cooking", {"category": "food", "author": "Chef"}),
    ("article_4", "Understanding neural network architectures", {"category": "AI", "author": "Data Scientist"})
]

for content_id, text, metadata in contents:
    rec_system.add_content(content_id, text, metadata)

# Simulate user interactions: "like" and "share" raise an item's weight,
# while "dislike" makes it negative.
user_interactions = [
    {"content_id": "article_1", "rating": 4.5, "interaction_type": "like"},
    {"content_id": "article_2", "rating": 5.0, "interaction_type": "share"},
    {"content_id": "article_3", "rating": 2.0, "interaction_type": "dislike"}
]

rec_system.update_user_profile("user_123", user_interactions)

# Get recommendations: exclude_seen defaults to True, so the three articles
# the user interacted with are skipped and only article_4 can be returned.
recommendations = rec_system.get_recommendations("user_123", top_k=3)
print("Recommendations for user_123:")
for rec in recommendations:
    print(f"  Content: {rec['content_id']}")
    print(f"  Score: {rec['similarity_score']:.3f}")
    print(f"  Category: {rec['metadata']['category']}")
    print()

# Get similar content: compares stored embeddings only, no new API request.
similar = rec_system.get_similar_content("article_1", top_k=2)
print("Similar to article_1:")
for item in similar:
    print(f"  {item['content_id']}: {item['similarity_score']:.3f}")

Best Practices

1. Text Preprocessing

  • Clean text: Remove excessive whitespace, special characters
  • Normalize content: Consistent formatting and encoding
  • Chunk long documents: Split into manageable segments
  • Handle multilingual content: Specify language when needed

2. Embedding Management

  • Batch processing: Use batch API for multiple texts
  • Caching: Store embeddings to avoid recomputation
  • Version control: Track embedding model versions
  • Dimension consistency: Ensure same model across pipeline

3. Similarity Calculations

  • Choose the right metric: Use cosine similarity when comparing semantic meaning
  • Normalize vectors: Consider L2 normalization
  • Threshold tuning: Adjust similarity thresholds per use case
  • Performance optimization: Use approximate nearest neighbor for large datasets

4. Production Deployment

  • Error handling: Robust API error management
  • Rate limiting: Respect API rate limits
  • Monitoring: Track embedding quality and performance
  • Scaling: Plan for increased embedding volume

Common Use Cases

Semantic Search
Search documents by meaning, not just keywords
Content Recommendation
Suggest related articles, products, or media
Document Classification
Automatically categorize content by topic
Duplicate Detection
Find similar or duplicate content efficiently
Knowledge Base Q&A
Build intelligent question-answering systems
Content Clustering
Group related content automatically
Personalization
Create user preference profiles for recommendations
Language Translation Support
Cross-language similarity and matching
I