Java SpringAI

Spring AI Image Generation — DALL-E 3 and Stable Diffusion in Spring Boot

Spring AI Image Generation — DALL-E 3 and Stable Diffusion in Spring Boot

Spring AI's ImageModel interface lets you generate images from text prompts using OpenAI's DALL-E 3, Stability AI's Stable Diffusion, or other image generation APIs. A single consistent API works across providers, making it easy to switch or combine them.

Supported Image Providers

Provider         Models                      Strengths
────────────────────────────────────────────────────────────────
OpenAI DALL-E 3  dall-e-3, dall-e-2         Best prompt adherence
Stability AI     stable-diffusion-xl-1024   Fine-grained control
Vertex AI Imagen imagen-3.0-generate-001    Photorealism

Maven Dependency

<!-- OpenAI (includes DALL-E) -->
<dependency>
    <groupId>org.springframework.ai</groupId>
    <artifactId>spring-ai-openai-spring-boot-starter</artifactId>
</dependency>

<!-- Stability AI -->
<dependency>
    <groupId>org.springframework.ai</groupId>
    <artifactId>spring-ai-stability-ai-spring-boot-starter</artifactId>
</dependency>

application.properties

spring.ai.openai.api-key=${OPENAI_API_KEY}
spring.ai.openai.image.options.model=dall-e-3
spring.ai.openai.image.options.quality=hd
spring.ai.openai.image.options.size=1024x1024
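# response-format accepts b64_json or url (keep this comment on its own line:
# in a .properties file an inline "#" becomes part of the value)
spring.ai.openai.image.options.response-format=b64_json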

Basic Image Generation Service

import org.springframework.ai.image.*;

@Service
public class ImageGenerationService {

    /** Response format under which the image is returned inline as a Base64 string. */
    private static final String B64_JSON = "b64_json";

    /** Upper bound enforced on {@code count} in {@link #generateVariations}. */
    private static final int MAX_VARIATIONS = 10;

    private final ImageModel imageModel;

    public ImageGenerationService(ImageModel imageModel) {
        this.imageModel = imageModel;
    }

    /**
     * Generates one image using the model's configured default options.
     *
     * <p>The image is read from the Base64 payload, so the configured response
     * format must be {@code b64_json}; with the {@code url} format the payload
     * is absent and this method fails with an {@link IllegalStateException}.
     *
     * @param prompt text description of the image
     * @return the decoded (raw) image bytes, not the Base64 text
     * @throws IllegalStateException if the response carries no Base64 image data
     */
    public byte[] generateImage(String prompt) {
        ImageResponse response = imageModel.call(new ImagePrompt(prompt));
        return firstImageBytes(response);
    }

    /**
     * Generates one DALL-E 3 image in HD quality with an explicit size and style.
     *
     * @param prompt text description of the image
     * @param size   "1024x1024", "1792x1024" or "1024x1792"
     * @param style  "natural" or "vivid"
     * @return the decoded (raw) image bytes
     * @throws IllegalStateException if the response carries no Base64 image data
     */
    public byte[] generateWithOptions(String prompt, String size, String style) {
        ImagePrompt imagePrompt = new ImagePrompt(prompt,
                OpenAiImageOptions.builder()
                        .withModel("dall-e-3")
                        .withQuality("hd")
                        .withStyle(style)
                        .withSize(size)
                        .withResponseFormat(B64_JSON)
                        .build()
        );

        return firstImageBytes(imageModel.call(imagePrompt));
    }

    /**
     * Generates several 512x512 images for the same prompt in a single call.
     *
     * <p>Uses DALL-E 2 because it is the only DALL-E model that accepts n &gt; 1.
     *
     * @param prompt text description of the images
     * @param count  number of images to request, between 1 and 10 inclusive
     * @return the decoded (raw) bytes of each generated image
     * @throws IllegalArgumentException if {@code count} is outside 1..10
     * @throws IllegalStateException    if a result carries no Base64 image data
     */
    public List<byte[]> generateVariations(String prompt, int count) {
        // Fail fast locally instead of paying for a round trip the API would reject.
        if (count < 1 || count > MAX_VARIATIONS) {
            throw new IllegalArgumentException(
                    "count must be between 1 and " + MAX_VARIATIONS + ", got " + count);
        }

        ImagePrompt imagePrompt = new ImagePrompt(prompt,
                OpenAiImageOptions.builder()
                        .withModel("dall-e-2")
                        .withN(count)
                        .withSize("512x512")
                        .withResponseFormat(B64_JSON)
                        .build()
        );

        return imageModel.call(imagePrompt)
                .getResults()
                .stream()
                .map(r -> decode(r.getOutput()))
                .toList();
    }

    /** Decodes the first generated image of a response, rejecting an empty response. */
    private static byte[] firstImageBytes(ImageResponse response) {
        var generation = response.getResult();
        if (generation == null) {
            throw new IllegalStateException("Image model returned no results");
        }
        return decode(generation.getOutput());
    }

    /** Decodes an image's Base64 payload, failing clearly when the payload is absent. */
    private static byte[] decode(Image image) {
        // image.getUrl() is populated instead of getB64Json() when the response format is "url".
        String b64 = (image == null) ? null : image.getB64Json();
        if (b64 == null) {
            throw new IllegalStateException(
                    "Image response has no Base64 data; set the response format to " + B64_JSON);
        }
        return Base64.getDecoder().decode(b64);
    }
}

REST Controller — Image Endpoint

@RestController
@RequestMapping("/images")
public class ImageController {

    private final ImageGenerationService imageService;

    public ImageController(ImageGenerationService imageService) {
        this.imageService = imageService;
    }

    /**
     * GET /images/generate — generates an image for {@code prompt} and returns
     * the bytes directly, declared as {@code image/png} and marked for inline display.
     */
    @GetMapping(value = "/generate", produces = MediaType.IMAGE_PNG_VALUE)
    public ResponseEntity<byte[]> generate(@RequestParam String prompt) {
        byte[] png = imageService.generateImage(prompt);
        String disposition = "inline; filename=\"generated.png\"";
        return ResponseEntity.ok()
                .header("Content-Disposition", disposition)
                .body(png);
    }

    /**
     * POST /images/generate-base64 — generates an image with the requested size
     * and style and returns it Base64-encoded under the {@code "image"} key.
     */
    @PostMapping("/generate-base64")
    public Map<String, String> generateBase64(@RequestBody GenerateRequest req) {
        byte[] raw = imageService.generateWithOptions(req.prompt(), req.size(), req.style());
        String encoded = Base64.getEncoder().encodeToString(raw);
        return Map.of("image", encoded);
    }

    /**
     * POST /images/generate-save — generates an image, writes it to the relative
     * {@code images} directory under a random file name, and returns the file
     * name and path.
     *
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    @PostMapping("/generate-save")
    public Map<String, String> generateAndSave(
            @RequestBody GenerateRequest req) throws IOException {
        byte[] content = imageService.generateImage(req.prompt());

        Path directory = Path.of("images");
        String filename = "img_" + UUID.randomUUID() + ".png";
        Path target = directory.resolve(filename);

        // Create the output directory on first use, then persist the image.
        Files.createDirectories(directory);
        Files.write(target, content);

        return Map.of("filename", filename, "path", target.toString());
    }
}

/**
 * JSON request body for image generation.
 *
 * @param prompt text description of the image to generate
 * @param size   requested dimensions, e.g. "1024x1024"
 * @param style  requested rendering style, e.g. "natural" or "vivid"
 */
record GenerateRequest(
        String prompt,
        String size,
        String style
) {
}

AI-Powered Image Prompt Enhancement

@Service
public class SmartImageService {

    private static final System.Logger LOG =
            System.getLogger(SmartImageService.class.getName());

    private final ChatClient chatClient;
    private final ImageModel imageModel;

    public SmartImageService(ChatClient.Builder builder, ImageModel imageModel) {
        this.chatClient = builder.build();
        this.imageModel = imageModel;
    }

    /**
     * Two-step generation: a chat model first rewrites the user's description
     * into a detailed image prompt, then DALL-E 3 renders that prompt as an
     * HD, vivid, 1792x1024 (landscape) image.
     *
     * @param userPrompt the user's plain description of the image
     * @return the decoded (raw) image bytes
     * @throws IllegalStateException if the response carries no Base64 image data
     */
    public byte[] generateWithEnhancedPrompt(String userPrompt) {
        // Step 1: Enhance the prompt with DALL-E best practices
        String enhancedPrompt = enhance(userPrompt);
        LOG.log(System.Logger.Level.INFO, "Enhanced prompt: {0}", enhancedPrompt);

        // Step 2: Generate with the enhanced prompt
        ImageResponse response = imageModel.call(
                new ImagePrompt(enhancedPrompt,
                        OpenAiImageOptions.builder()
                                // The size and style below are DALL-E 3 options, so pin the model.
                                .withModel("dall-e-3")
                                .withQuality("hd")
                                .withSize("1792x1024")   // landscape
                                .withStyle("vivid")
                                // Request the Base64 payload explicitly: it is read below,
                                // whatever response format the application configures.
                                .withResponseFormat("b64_json")
                                .build())
        );

        String b64 = response.getResult().getOutput().getB64Json();
        if (b64 == null) {
            throw new IllegalStateException("Image response has no Base64 data");
        }
        return Base64.getDecoder().decode(b64);
    }

    /**
     * Asks the chat model to expand {@code userPrompt} into a detailed image
     * prompt; falls back to the original text when the model returns nothing usable.
     */
    private String enhance(String userPrompt) {
        String enhanced = chatClient.prompt()
                .system("""
                    You are an expert at writing prompts for DALL-E 3.
                    Transform the user's simple description into a detailed,
                    high-quality image generation prompt. Add: style, lighting,
                    composition, mood, and technical details. Keep under 400 chars.
                    Output ONLY the prompt, nothing else.
                    """)
                .user(userPrompt)
                .call()
                .content();

        if (enhanced == null || enhanced.isBlank()) {
            return userPrompt;
        }
        return enhanced.strip();
    }
}

Output

// GET /images/generate?prompt=a+cute+robot+reading+a+book

User prompt: "a cute robot reading a book"

Enhanced prompt by AI:
"A charming, friendly humanoid robot with soft rounded edges sitting in a
cozy library chair, reading an open leather-bound book. Warm golden hour
lighting, bokeh background of shelves. Pixar-style 3D render,
photorealistic texture, shallow depth of field."

Result: High-quality 1792x1024 PNG image
Response time: ~12-15 seconds for DALL-E 3 HD

// Response header:
Content-Type: image/png
Content-Disposition: inline; filename="generated.png"

Key Points

  • DALL-E 3 produces one image per call at 1024x1024, 1792x1024, or 1024x1792 — use withStyle("vivid") for dramatic images and "natural" for realistic photos
  • Use response-format=b64_json for immediate byte access; url format gives a temporary OpenAI-hosted URL that expires in 60 minutes
  • DALL-E 3 HD quality doubles cost but significantly improves detail — use it for user-facing content, not internal tooling
  • The two-step pattern (AI enhances prompt → DALL-E generates) consistently produces better results than passing raw user input directly
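  • Generation takes 10-20 seconds — for user-facing features, run it asynchronously (for example, return a job ID the client can poll, or push the finished image over SSE/WebSocket) instead of blocking the HTTP request thread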
Topics: Java SpringAI
← Newer Post Older Post →