Amazon Bedrock supports multimodal AI capabilities including image generation and vision understanding, enabling applications that work with both text and images.
## Multimodal Capabilities

| Capability | Models | Use Cases |
|---|---|---|
| Image Generation | Stable Diffusion, Titan Image | Creative content, marketing |
| Vision Understanding | Claude 3 | Image analysis, OCR |
| Multimodal Embeddings | Titan Multimodal | Image search |
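To see which image-capable models are enabled in your account, you can filter the model catalog by output modality. A minimal sketch using the Bedrock control-plane client (the runtime client used later is separate):

```python
import boto3

# The control-plane client ('bedrock') lists models; 'bedrock-runtime' invokes them.
bedrock = boto3.client('bedrock')
image_models = bedrock.list_foundation_models(byOutputModality='IMAGE')
for model in image_models['modelSummaries']:
    print(model['modelId'])
```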
## Image Generation with Stable Diffusion

### Text-to-Image
```python
import boto3
import json
import base64

client = boto3.client('bedrock-runtime')

response = client.invoke_model(
    modelId='stability.stable-diffusion-xl-v1',
    body=json.dumps({
        "text_prompts": [
            {"text": "A serene mountain landscape at sunset, digital art", "weight": 1.0}
        ],
        "cfg_scale": 7,
        "steps": 50,
        "seed": 42,
        "width": 1024,
        "height": 1024
    })
)

result = json.loads(response['body'].read())
image_data = base64.b64decode(result['artifacts'][0]['base64'])

with open('generated_image.png', 'wb') as f:
    f.write(image_data)
```
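Each artifact in the SDXL response also carries a finish reason, which is worth checking before using the image. A minimal sketch, reusing `result` from above:

```python
# finishReason is typically SUCCESS, ERROR, or CONTENT_FILTERED.
artifact = result['artifacts'][0]
if artifact.get('finishReason') == 'CONTENT_FILTERED':
    print("Output was filtered by content moderation; adjust the prompt.")
elif artifact.get('finishReason') == 'ERROR':
    raise RuntimeError("Image generation failed.")
```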
### Negative Prompts

```python
# A prompt with negative weight steers the model away from those concepts.
# Pass this body to invoke_model exactly as in the previous example.
body = {
    "text_prompts": [
        {"text": "Professional headshot portrait, studio lighting", "weight": 1.0},
        {"text": "blurry, low quality, distorted", "weight": -1.0}
    ],
    "cfg_scale": 10,
    "steps": 50
}
```
### Stable Diffusion Parameters

| Parameter | Description | Range |
|---|---|---|
| cfg_scale | Prompt adherence (higher = stricter) | 1-35 |
| steps | Generation iterations | 10-150 |
| seed | Reproducibility | Integer |
| width/height | Image dimensions | 512-1024 |
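To get a feel for how these settings trade off, a small sweep over `cfg_scale` with a fixed seed is a cheap experiment. A sketch reusing the client from above (the output file names are illustrative):

```python
# Same prompt and seed at several guidance strengths for side-by-side comparison.
for cfg in (5, 10, 20):
    response = client.invoke_model(
        modelId='stability.stable-diffusion-xl-v1',
        body=json.dumps({
            "text_prompts": [{"text": "A serene mountain landscape at sunset", "weight": 1.0}],
            "cfg_scale": cfg,
            "steps": 50,
            "seed": 42,  # fixed seed isolates the effect of cfg_scale
            "width": 1024,
            "height": 1024
        })
    )
    result = json.loads(response['body'].read())
    with open(f'landscape_cfg_{cfg}.png', 'wb') as f:
        f.write(base64.b64decode(result['artifacts'][0]['base64']))
```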
## Image Generation with Titan

### Text-to-Image
```python
response = client.invoke_model(
    modelId='amazon.titan-image-generator-v1',
    body=json.dumps({
        "taskType": "TEXT_IMAGE",
        "textToImageParams": {
            "text": "A modern office building with glass facade"
        },
        "imageGenerationConfig": {
            "numberOfImages": 1,
            "height": 1024,
            "width": 1024,
            "cfgScale": 8.0
        }
    })
)

result = json.loads(response['body'].read())
image_data = base64.b64decode(result['images'][0])
```
### Image Variation
```python
with open('input_image.png', 'rb') as f:
    input_image = base64.b64encode(f.read()).decode()

response = client.invoke_model(
    modelId='amazon.titan-image-generator-v1',
    body=json.dumps({
        "taskType": "IMAGE_VARIATION",
        "imageVariationParams": {
            "text": "Same scene but at night",
            "images": [input_image],
            "similarityStrength": 0.7
        },
        "imageGenerationConfig": {
            "numberOfImages": 3,
            "height": 1024,
            "width": 1024
        }
    })
)
```
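Since `numberOfImages` is 3 here, the response carries multiple base64-encoded images. A short sketch that saves them all (the file names are illustrative):

```python
result = json.loads(response['body'].read())
for i, encoded in enumerate(result['images']):
    with open(f'variation_{i}.png', 'wb') as f:
        f.write(base64.b64decode(encoded))
```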
### Inpainting

```python
# Titan inpainting takes either a maskPrompt (a described region) or a
# maskImage (an explicit black-and-white mask), not both in one request.
response = client.invoke_model(
    modelId='amazon.titan-image-generator-v1',
    body=json.dumps({
        "taskType": "INPAINTING",
        "inPaintingParams": {
            "text": "A red sports car",
            "image": input_image,
            "maskPrompt": "the vehicle"
        }
    })
)
```
## Vision Understanding with Claude

### Analyze Images
```python
# The Converse API takes raw image bytes; no base64 encoding is needed.
with open('document.png', 'rb') as f:
    image_bytes = f.read()

response = client.converse(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "image": {
                        "format": "png",
                        "source": {"bytes": image_bytes}
                    }
                },
                {"text": "Describe what you see in this image."}
            ]
        }
    ]
)

print(response['output']['message']['content'][0]['text'])
```
### Document Analysis

```python
response = client.converse(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "image": {
                        "format": "png",
                        "source": {"bytes": document_bytes}
                    }
                },
                {"text": "Extract all text from this document and format as JSON."}
            ]
        }
    ]
)
```
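Models often wrap JSON output in prose or a markdown code fence, so a little defensive parsing helps. One possible approach, reusing `response` from above:

```python
import re

raw = response['output']['message']['content'][0]['text']
# Grab the outermost JSON object, ignoring any surrounding fence or prose.
match = re.search(r'\{.*\}', raw, re.DOTALL)
extracted = json.loads(match.group(0)) if match else None
```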
### Multiple Images

```python
response = client.converse(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    messages=[
        {
            "role": "user",
            "content": [
                {"image": {"format": "png", "source": {"bytes": image1_bytes}}},
                {"image": {"format": "png", "source": {"bytes": image2_bytes}}},
                {"text": "Compare these two product images and list the differences."}
            ]
        }
    ]
)
```
## Image Search with Multimodal Embeddings

Titan Multimodal Embeddings maps text and images into the same vector space, so a text query can be compared directly against image embeddings.

```python
def get_image_embedding(image_path):
    """Embed an image into the shared text-image vector space."""
    with open(image_path, 'rb') as f:
        image_data = base64.b64encode(f.read()).decode()
    response = client.invoke_model(
        modelId='amazon.titan-embed-image-v1',
        body=json.dumps({"inputImage": image_data})
    )
    return json.loads(response['body'].read())['embedding']


def get_text_embedding(text):
    """Embed text into the same vector space as images."""
    response = client.invoke_model(
        modelId='amazon.titan-embed-image-v1',
        body=json.dumps({"inputText": text})
    )
    return json.loads(response['body'].read())['embedding']


query_embedding = get_text_embedding("red car")
```
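Because both modalities share one embedding space, ranking an image collection against the text query reduces to cosine similarity. A minimal in-memory sketch (the image paths are placeholders; a vector database would replace the dictionary at scale):

```python
import numpy as np

def cosine_similarity(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Index a small image collection, then rank it against the text query.
image_paths = ['car1.png', 'car2.png', 'bike.png']  # placeholder files
index = {path: get_image_embedding(path) for path in image_paths}
ranked = sorted(
    index,
    key=lambda path: cosine_similarity(query_embedding, index[path]),
    reverse=True
)
print(ranked[0])  # best match for "red car"
```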
## Complete Multimodal Application

```python
import boto3
import base64
import json


class MultimodalAssistant:
    def __init__(self):
        self.client = boto3.client('bedrock-runtime')

    def generate_image(self, prompt: str, negative_prompt: str | None = None) -> bytes:
        body = {
            "text_prompts": [{"text": prompt, "weight": 1.0}],
            "cfg_scale": 8,
            "steps": 50,
            "width": 1024,
            "height": 1024
        }
        if negative_prompt:
            body["text_prompts"].append({"text": negative_prompt, "weight": -1.0})
        response = self.client.invoke_model(
            modelId='stability.stable-diffusion-xl-v1',
            body=json.dumps(body)
        )
        result = json.loads(response['body'].read())
        return base64.b64decode(result['artifacts'][0]['base64'])

    def analyze_image(self, image_bytes: bytes, question: str) -> str:
        response = self.client.converse(
            modelId='anthropic.claude-3-sonnet-20240229-v1:0',
            messages=[{
                "role": "user",
                "content": [
                    {"image": {"format": "png", "source": {"bytes": image_bytes}}},
                    {"text": question}
                ]
            }]
        )
        return response['output']['message']['content'][0]['text']

    def describe_and_recreate(self, image_path: str) -> tuple[str, bytes]:
        with open(image_path, 'rb') as f:
            image_bytes = f.read()
        description = self.analyze_image(
            image_bytes,
            "Describe this image in detail for recreation."
        )
        new_image = self.generate_image(description)
        return description, new_image


assistant = MultimodalAssistant()
image = assistant.generate_image("A cozy coffee shop interior, warm lighting")
analysis = assistant.analyze_image(image, "What style is this interior design?")
```
## Best Practices

| Practice | Recommendation |
|---|---|
| Prompt clarity | Be specific and detailed |
| Negative prompts | Exclude unwanted elements |
| Image quality | Use appropriate resolution |
| Cost optimization | Start with fewer steps |
| Content safety | Implement content filtering (see the Guardrails sketch below) |
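For the content-safety row, Bedrock Guardrails can be attached directly to a Converse call. A minimal sketch, assuming a guardrail has already been created in your account (the identifier and version are placeholders):

```python
response = client.converse(
    modelId='anthropic.claude-3-sonnet-20240229-v1:0',
    messages=[{"role": "user", "content": [{"text": "Describe this product."}]}],
    guardrailConfig={
        "guardrailIdentifier": "your-guardrail-id",  # placeholder
        "guardrailVersion": "1"                      # placeholder
    }
)
```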
## Key Takeaways
- Multiple generation models - Stable Diffusion and Titan for different needs
- Vision with Claude - Analyze documents, compare images
- Multimodal embeddings - Enable image search with text
- Combine capabilities - Build rich multimodal applications
- Content moderation - Use Guardrails for safety