Vision Models Overview

Analyze images, extract text, understand visual content, and perform computer vision tasks with state-of-the-art AI vision models.

Available Models

GPT-4 Vision Models (OpenAI)

  • GPT-4o: Latest multimodal model with advanced vision capabilities
  • GPT-4 Vision: Specialized for image understanding and analysis
  • GPT-4o-mini: Fast and cost-effective vision processing

Claude Vision Models (Anthropic)

  • Claude 3.5 Sonnet: Excellent at detailed image analysis and reasoning
  • Claude 3 Opus: Advanced vision understanding with high accuracy
  • Claude 3 Haiku: Fast vision processing for simple tasks

Google Vision Models

  • Gemini 2.5 Pro: Advanced multimodal with vision capabilities
  • Gemini 1.5 Flash: Fast image processing and understanding

Specialized Vision Models

  • LLaVA: Open-source large language and vision assistant
  • BLIP-2: Image captioning and visual question answering
  • Florence-2: Microsoft’s computer vision foundation model

Model Capabilities

Image Analysis

Describe and analyze image content in detail

OCR (Text Extraction)

Extract and read text from images and documents

Visual Q&A

Answer questions about image content

Object Detection

Identify and locate objects within images

Vision API

Analyze images with text prompts:
POST /v1/chat/completions

Basic Image Analysis

curl -X POST "https://api.anyapi.ai/v1/chat/completions" \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4o",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "What is in this image?"
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://example.com/image.jpg"
            }
          }
        ]
      }
    ]
  }'
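
The same request can be made from Python with the requests library. This is a minimal sketch; YOUR_API_KEY and the image URL are placeholders:

import requests

# Equivalent of the curl example above, using a publicly accessible image URL.
response = requests.post(
    "https://api.anyapi.ai/v1/chat/completions",
    headers={
        "Authorization": "Bearer YOUR_API_KEY",
        "Content-Type": "application/json"
    },
    json={
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this image?"},
                    {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
                ]
            }
        ]
    }
)
print(response.json()["choices"][0]["message"]["content"])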

Advanced Vision Tasks
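
The Python examples in this section assume the requests library and a small helper that base64-encodes a local image file. A minimal sketch of that helper (its name matches the encode_image calls used below):

import base64
import requests  # used by the request examples that follow

def encode_image(image_path):
    """Read a local image file and return its contents as a base64-encoded string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")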

OCR (Optical Character Recognition)

def extract_text_from_image(image_path):
    """Extract text from image using OCR"""
    base64_image = encode_image(image_path)
    
    response = requests.post(
        "https://api.anyapi.ai/v1/chat/completions",
        headers={
            "Authorization": "Bearer YOUR_API_KEY",
            "Content-Type": "application/json"
        },
        json={
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract all text from this image. Provide the text exactly as it appears, maintaining formatting and structure."
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        }
    )
    
    return response.json()["choices"][0]["message"]["content"]

# Usage
extracted_text = extract_text_from_image("document.jpg")
print(extracted_text)

Document Analysis

def analyze_document(image_path):
    """Analyze document structure and content"""
    base64_image = encode_image(image_path)
    
    response = requests.post(
        "https://api.anyapi.ai/v1/chat/completions",
        headers={
            "Authorization": "Bearer YOUR_API_KEY",
            "Content-Type": "application/json"
        },
        json={
            "model": "claude-3-5-sonnet",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """
                            Analyze this document and provide:
                            1. Document type (invoice, receipt, contract, etc.)
                            2. Key information (dates, amounts, names, etc.)
                            3. Document structure and layout
                            4. Any important details or anomalies
                            
                            Format the response as structured JSON.
                            """
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        }
    )
    
    return response.json()["choices"][0]["message"]["content"]

Object Detection and Counting

def detect_and_count_objects(image_path, object_type="cars"):
    """Detect and count specific objects in image"""
    base64_image = encode_image(image_path)
    
    response = requests.post(
        "https://api.anyapi.ai/v1/chat/completions",
        headers={
            "Authorization": "Bearer YOUR_API_KEY",
            "Content-Type": "application/json"
        },
        json={
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"""
                            Count the number of {object_type} in this image.
                            Provide:
                            1. Total count
                            2. Description of each {object_type.rstrip('s')} you can see
                            3. Their approximate locations in the image
                            4. Any notable characteristics
                            """
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        }
    )
    
    return response.json()["choices"][0]["message"]["content"]

# Usage examples
car_count = detect_and_count_objects("traffic.jpg", "cars")
people_count = detect_and_count_objects("crowd.jpg", "people")

Visual Question Answering

def visual_qa(image_path, question):
    """Answer specific questions about an image"""
    base64_image = encode_image(image_path)
    
    response = requests.post(
        "https://api.anyapi.ai/v1/chat/completions",
        headers={
            "Authorization": "Bearer YOUR_API_KEY",
            "Content-Type": "application/json"
        },
        json={
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": question
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        }
    )
    
    return response.json()["choices"][0]["message"]["content"]

# Usage examples
answer = visual_qa("restaurant.jpg", "What type of cuisine does this restaurant serve?")
safety_check = visual_qa("construction.jpg", "Are there any safety violations visible in this construction site?")
medical_analysis = visual_qa("xray.jpg", "What abnormalities can you identify in this X-ray?")

Model Comparison

Model              | Strengths                 | Best For              | Price/1K tokens
GPT-4o             | General vision, reasoning | Complex analysis      | $0.005
Claude 3.5 Sonnet  | Detail accuracy           | Document analysis     | $0.003
Gemini 2.5 Pro     | Speed, multilingual       | Real-time processing  | $0.0015
GPT-4o-mini        | Cost-effective            | Simple vision tasks   | $0.0015

Advanced Features

Multi-Image Analysis

Compare and analyze multiple images:
def compare_images(image_paths, comparison_task):
    """Compare multiple images"""
    content = [{"type": "text", "text": comparison_task}]
    
    for image_path in image_paths:
        base64_image = encode_image(image_path)
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
            }
        })
    
    response = requests.post(
        "https://api.anyapi.ai/v1/chat/completions",
        headers={
            "Authorization": "Bearer YOUR_API_KEY",
            "Content-Type": "application/json"
        },
        json={
            "model": "gpt-4o",
            "messages": [{"role": "user", "content": content}]
        }
    )
    
    return response.json()["choices"][0]["message"]["content"]

# Usage
comparison = compare_images(
    ["before.jpg", "after.jpg"],
    "Compare these before and after images. What changes do you notice?"
)

Image Quality Assessment

def assess_image_quality(image_path):
    """Assess technical quality of an image"""
    base64_image = encode_image(image_path)
    
    response = requests.post(
        "https://api.anyapi.ai/v1/chat/completions",
        headers={
            "Authorization": "Bearer YOUR_API_KEY",
            "Content-Type": "application/json"
        },
        json={
            "model": "claude-3-5-sonnet",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """
                            Assess the technical quality of this image:
                            1. Sharpness and focus
                            2. Lighting and exposure
                            3. Color balance and saturation
                            4. Composition and framing
                            5. Any technical issues (noise, artifacts, etc.)
                            6. Overall quality score (1-10)
                            7. Suggestions for improvement
                            """
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        }
    )
    
    return response.json()["choices"][0]["message"]["content"]

Content Moderation

def moderate_image_content(image_path):
    """Check image for inappropriate content"""
    base64_image = encode_image(image_path)
    
    response = requests.post(
        "https://api.anyapi.ai/v1/chat/completions",
        headers={
            "Authorization": "Bearer YOUR_API_KEY",
            "Content-Type": "application/json"
        },
        json={
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """
                            Analyze this image for content moderation:
                            1. Is there any inappropriate, violent, or harmful content?
                            2. Are there any safety concerns?
                            3. Is the content suitable for all audiences?
                            4. Provide a safety rating: SAFE, CAUTION, or UNSAFE
                            5. Explain your reasoning
                            
                            Be thorough but objective in your assessment.
                            """
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        }
    )
    
    return response.json()["choices"][0]["message"]["content"]

Supported Image Formats

Input Formats

  • JPEG/JPG: Most common format
  • PNG: Supports transparency
  • GIF: Static images only (no animation)
  • WebP: Modern web format
  • BMP: Basic bitmap format
  • TIFF: High-quality format

Size Limits

  • Maximum file size: 20MB
  • Maximum resolution: 8192x8192 pixels
  • Minimum resolution: 32x32 pixels
  • Recommended: 1024x1024 for best results
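
A minimal pre-flight check against these limits, assuming Pillow is installed for reading image dimensions (the thresholds mirror the list above):

import os
from PIL import Image  # Pillow, assumed to be available

ALLOWED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}
MAX_FILE_SIZE = 20 * 1024 * 1024  # 20MB
MAX_DIMENSION = 8192
MIN_DIMENSION = 32

def validate_image(image_path):
    """Raise ValueError if the file violates the documented format or size limits."""
    ext = os.path.splitext(image_path)[1].lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise ValueError(f"Unsupported format: {ext}")
    if os.path.getsize(image_path) > MAX_FILE_SIZE:
        raise ValueError("File exceeds the 20MB size limit")
    with Image.open(image_path) as img:
        width, height = img.size
    if max(width, height) > MAX_DIMENSION or min(width, height) < MIN_DIMENSION:
        raise ValueError(f"Resolution {width}x{height} is outside the 32-8192 pixel range")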

Image Quality Tips

  • Use high-resolution images for better OCR results
  • Ensure good lighting and contrast
  • Minimize blur and noise
  • Crop to focus on relevant content

Pricing

Vision models are priced per input token (including image processing):
Model              | Price/1K tokens | Image tokens
GPT-4o             | $0.005          | ~1,000 per image
Claude 3.5 Sonnet  | $0.003          | ~800 per image
Gemini 2.5 Pro     | $0.0015         | ~600 per image
GPT-4o-mini        | $0.0015         | ~400 per image

Image token calculation varies by size and detail level.
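
As a rough estimate using the figures above, a simple cost calculation looks like this (a sketch only; the model identifiers and per-image token counts are approximations, and actual counts depend on image size and detail level):

# Approximate image-input tokens and prices per 1K tokens from the table above.
IMAGE_TOKENS = {"gpt-4o": 1000, "claude-3-5-sonnet": 800, "gemini-2.5-pro": 600, "gpt-4o-mini": 400}
PRICE_PER_1K = {"gpt-4o": 0.005, "claude-3-5-sonnet": 0.003, "gemini-2.5-pro": 0.0015, "gpt-4o-mini": 0.0015}

def estimate_image_cost(model, num_images=1):
    """Estimate the image-input cost in USD for a batch of images."""
    return IMAGE_TOKENS[model] / 1000 * PRICE_PER_1K[model] * num_images

# Example: 100 images through GPT-4o is roughly $0.50 of image input tokens.
print(estimate_image_cost("gpt-4o", 100))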

Rate Limits

Vision model limits by plan:
Plan        | Requests/Min | Images/Hour | Daily Limit
Free        | 10           | 100         | 500 images
Pro         | 100          | 1,000       | 5,000 images
Enterprise  | Custom       | Custom      | Custom

Common Use Cases

Document Processing

Invoice processing, receipt scanning, form digitization

Quality Control

Product inspection, defect detection, compliance checking

Content Moderation

Image safety, policy compliance, automated review

Medical Imaging

Diagnostic assistance, image analysis, report generation

E-commerce

Product categorization, description generation, quality assessment

Security & Surveillance

Threat detection, activity monitoring, incident analysis

Accessibility

Image descriptions, visual assistance, content accessibility

Education

Homework help, diagram analysis, learning assistance

Best Practices

Image Preparation

  • Use clear, well-lit images
  • Ensure text is readable if OCR is needed
  • Crop to focus on relevant areas
  • Use appropriate resolution for the task

Prompt Engineering

  • Be specific about what you want to analyze
  • Ask follow-up questions for more detail
  • Use structured prompts for consistent results (see the sketch after this list)
  • Provide context when necessary
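
For example, a reusable prompt template that requests a fixed JSON structure tends to produce more consistent results than free-form questions. The field names below are illustrative:

STRUCTURED_IMAGE_PROMPT = """
Analyze the attached image and respond with JSON only, using exactly these keys:
{
  "summary": "one-sentence description of the image",
  "objects": ["list of visible objects"],
  "text": "any readable text in the image, or null",
  "concerns": ["anything unusual or noteworthy"]
}
"""

# Pass the template as the question, e.g. with the visual_qa helper defined earlier:
# result = visual_qa("warehouse.jpg", STRUCTURED_IMAGE_PROMPT)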

Error Handling

  • Implement retry logic for failed requests
  • Handle rate limits gracefully
  • Validate image formats before processing
  • Check file sizes against limits
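
A minimal sketch of retry and rate-limit handling around the chat completions endpoint (the retry count and backoff schedule are illustrative); image format and size can be checked beforehand, for example with the validate_image helper sketched earlier:

import time
import requests

def post_with_retries(payload, api_key, max_retries=3):
    """POST to the chat completions endpoint, backing off on rate limits and transient errors."""
    for attempt in range(max_retries):
        response = requests.post(
            "https://api.anyapi.ai/v1/chat/completions",
            headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
            json=payload,
            timeout=60
        )
        if response.status_code == 429 or response.status_code >= 500:
            time.sleep(2 ** attempt)  # exponential backoff before retrying
            continue
        response.raise_for_status()
        return response.json()
    raise RuntimeError(f"Request failed after {max_retries} attempts")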

Privacy and Security

  • Never send sensitive personal information
  • Use secure image storage and transmission
  • Implement proper access controls
  • Follow data retention policies

Integration Examples

React Component for Image Analysis

import React, { useState } from 'react';

function ImageAnalyzer() {
  const [image, setImage] = useState(null);
  const [analysis, setAnalysis] = useState('');
  const [loading, setLoading] = useState(false);

  const analyzeImage = async (file) => {
    setLoading(true);
    
    const base64 = await new Promise((resolve) => {
      const reader = new FileReader();
      reader.onload = () => resolve(reader.result.split(',')[1]);
      reader.readAsDataURL(file);
    });

    try {
      const response = await fetch('/api/analyze-image', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          image: base64,
          prompt: 'Describe this image in detail'
        })
      });

      const result = await response.json();
      setAnalysis(result.analysis);
    } catch (error) {
      console.error('Analysis failed:', error);
    } finally {
      setLoading(false);
    }
  };

  return (
    <div>
      <input
        type="file"
        accept="image/*"
        onChange={(e) => {
          const file = e.target.files[0];
          setImage(file);
          if (file) analyzeImage(file);
        }}
      />
      
      {loading && <p>Analyzing image...</p>}
      {analysis && <div><h3>Analysis:</h3><p>{analysis}</p></div>}
    </div>
  );
}
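
The component posts to a local /api/analyze-image route instead of calling the API directly, which keeps the API key on the server. A minimal sketch of such a route using Flask (the route path matches the component; the framework choice and the ANYAPI_API_KEY environment variable name are assumptions):

import os
import requests
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/api/analyze-image", methods=["POST"])
def analyze_image():
    """Proxy the browser's request to the vision API so the API key stays server-side."""
    data = request.get_json()
    response = requests.post(
        "https://api.anyapi.ai/v1/chat/completions",
        headers={"Authorization": f"Bearer {os.environ['ANYAPI_API_KEY']}"},
        json={
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": data["prompt"]},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{data['image']}"}
                        }
                    ]
                }
            ]
        }
    )
    analysis = response.json()["choices"][0]["message"]["content"]
    return jsonify({"analysis": analysis})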

Python Batch Processing

import os
import json
from concurrent.futures import ThreadPoolExecutor

def process_image_batch(image_directory, output_file):
    """Process multiple images in parallel"""
    
    def analyze_single_image(image_path):
        try:
            result = visual_qa(image_path, "Describe this image and identify any objects or text present.")
            return {
                "image": os.path.basename(image_path),
                "analysis": result,
                "status": "success"
            }
        except Exception as e:
            return {
                "image": os.path.basename(image_path),
                "error": str(e),
                "status": "failed"
            }
    
    # Get all image files
    image_files = [
        os.path.join(image_directory, f) 
        for f in os.listdir(image_directory)
        if f.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp'))
    ]
    
    # Process images in parallel
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(analyze_single_image, image_files))
    
    # Save results
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    
    return results

# Usage
results = process_image_batch('./images/', 'analysis_results.json')

Getting Started
