Java SpringAI

Spring AI ETL Pipeline — Read, Transform, and Load Documents into Vector Store

Spring AI ETL Pipeline — Read, Transform, and Load Documents into Vector Store

Before you can search documents with RAG, you need to load them into a vector store. Spring AI provides an ETL (Extract, Transform, Load) pipeline with three stages: document readers (Extract), transformers such as text splitters and metadata enrichers (Transform), and the vector store writer (Load). This tutorial walks through the most commonly used built-in readers and transformers with complete examples.

ETL Pipeline Overview

EXTRACT (Read)           TRANSFORM (Split + Enrich)     LOAD (Store)
─────────────────        ────────────────────────────    ─────────────
PagePdfDocumentReader    TokenTextSplitter               VectorStore
TextReader           →   ContentFormatTransformer    →   .add(chunks)
JsonReader               KeywordMetadataEnricher
TikaDocumentReader       SummaryMetadataEnricher
GithubDocumentReader

Each stage returns/accepts: List<Document>

Dependencies

<!-- PDF reader -->
<dependency>
    <groupId>org.springframework.ai</groupId>
    <artifactId>spring-ai-pdf-document-reader</artifactId>
</dependency>

<!-- Tika reader (Word, PowerPoint, HTML, etc.) -->
<dependency>
    <groupId>org.springframework.ai</groupId>
    <artifactId>spring-ai-tika-document-reader</artifactId>
</dependency>

<!-- Vector store (PGVector) -->
<dependency>
    <groupId>org.springframework.ai</groupId>
    <artifactId>spring-ai-pgvector-store-spring-boot-starter</artifactId>
</dependency>

Extract — Reading Different Document Types

import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.reader.JsonReader;
import org.springframework.ai.reader.tika.TikaDocumentReader;

/**
 * Extract stage of the ETL pipeline: turns a {@link Resource} into a list of
 * {@link Document}s using the reader that matches the resource's format.
 */
@Service
public class DocumentReaderService {

    /**
     * Reads a PDF with {@link PagePdfDocumentReader}, which yields one Document per page.
     *
     * @param pdfResource the PDF to read
     * @return the documents produced by the reader
     */
    public List<Document> readPdf(Resource pdfResource) {
        return new PagePdfDocumentReader(pdfResource).get();
    }

    /**
     * Reads a plain text file and tags the result with {@code source-type=text}.
     *
     * @param textResource the text file to read
     * @return the documents produced by the reader
     */
    public List<Document> readText(Resource textResource) {
        TextReader textReader = new TextReader(textResource);
        // Custom metadata must be registered before get() so it is attached to the output.
        textReader.getCustomMetadata().put("source-type", "text");
        return textReader.get();
    }

    /**
     * Reads a JSON resource, treating each JSON array element as a Document and
     * using only the listed fields as the document content.
     *
     * @param jsonResource the JSON resource to read
     * @return the documents produced by the reader
     */
    public List<Document> readJson(Resource jsonResource) {
        // jsonKeysToUse: the fields that make up each document's content.
        String[] contentKeys = {"title", "description", "content"};
        return new JsonReader(jsonResource, contentKeys).get();
    }

    /**
     * Reads Word/PowerPoint/HTML/ODT (and other formats) through Apache Tika.
     *
     * @param resource the resource to read
     * @return the documents produced by the reader
     */
    public List<Document> readAnyFormat(Resource resource) {
        return new TikaDocumentReader(resource).get();
    }
}

Transform — Splitting and Enriching

import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.transformer.KeywordMetadataEnricher;
import org.springframework.ai.transformer.SummaryMetadataEnricher;

/**
 * Transform stage of the ETL pipeline: splits documents into chunks and
 * optionally enriches each chunk's metadata with LLM-generated keywords or summaries.
 */
@Service
public class DocumentTransformService {

    /** Target size of each chunk, in tokens. */
    private static final int CHUNK_SIZE_TOKENS = 600;

    /** Minimum size of a chunk, in characters (this is NOT an overlap setting). */
    private static final int MIN_CHUNK_SIZE_CHARS = 100;

    /** Chunks shorter than this many characters are dropped rather than embedded. */
    private static final int MIN_CHUNK_LENGTH_TO_EMBED = 5;

    /** Upper bound on the number of chunks produced from a single text. */
    private static final int MAX_NUM_CHUNKS = 10000;

    /** Whether separators such as newlines are kept in the chunk text. */
    private static final boolean KEEP_SEPARATOR = true;

    /** Number of keywords requested from the model per document. */
    private static final int KEYWORD_COUNT = 5;

    private final ChatModel chatModel;

    public DocumentTransformService(ChatModel chatModel) {
        this.chatModel = chatModel;
    }

    /**
     * Splits documents into chunks of roughly {@value #CHUNK_SIZE_TOKENS} tokens.
     *
     * <p>The five-argument {@link TokenTextSplitter} constructor takes, in order:
     * chunk size (tokens), minimum chunk size (characters), minimum chunk length
     * to embed, maximum number of chunks, and keep-separator. It has no overlap
     * argument, so consecutive chunks do not share text.
     *
     * @param docs the documents to split
     * @return the resulting chunks
     */
    public List<Document> split(List<Document> docs) {
        TokenTextSplitter splitter = new TokenTextSplitter(
                CHUNK_SIZE_TOKENS,
                MIN_CHUNK_SIZE_CHARS,
                MIN_CHUNK_LENGTH_TO_EMBED,
                MAX_NUM_CHUNKS,
                KEEP_SEPARATOR);
        return splitter.apply(docs);
    }

    /**
     * Adds AI-generated keywords to each document's metadata.
     *
     * <p>The keywords are stored under the {@code excerpt_keywords} metadata key, e.g.
     * {@code {"excerpt_keywords": "Spring Boot, JPA, database, CRUD, repository"}}.
     *
     * @param docs the documents to enrich
     * @return the enriched documents
     */
    public List<Document> enrichWithKeywords(List<Document> docs) {
        KeywordMetadataEnricher enricher = new KeywordMetadataEnricher(chatModel, KEYWORD_COUNT);
        return enricher.apply(docs);
    }

    /**
     * Adds AI-generated summaries to each document's metadata: one for the
     * previous document, one for the current document, and one for the next.
     *
     * <p>The summaries are stored under the {@code prev_section_summary},
     * {@code section_summary} and {@code next_section_summary} metadata keys.
     *
     * @param docs the documents to enrich, in reading order
     * @return the enriched documents
     */
    public List<Document> enrichWithSummary(List<Document> docs) {
        SummaryMetadataEnricher enricher = new SummaryMetadataEnricher(
                chatModel,
                List.of(SummaryMetadataEnricher.SummaryType.PREVIOUS,
                        SummaryMetadataEnricher.SummaryType.CURRENT,
                        SummaryMetadataEnricher.SummaryType.NEXT)
        );
        return enricher.apply(docs);
    }
}

Load — Write to Vector Store

import org.springframework.ai.vectorstore.VectorStoreObservationDocumentWriter;

/**
 * Load stage of the ETL pipeline: writes documents to the configured {@link VectorStore}.
 */
@Service
public class VectorStoreLoaderService {

    private final VectorStore vectorStore;

    public VectorStoreLoaderService(VectorStore vectorStore) {
        this.vectorStore = vectorStore;
    }

    /**
     * Adds the given documents to the vector store and prints how many were loaded.
     *
     * @param documents the documents (typically chunks) to store
     */
    public void load(List<Document> documents) {
        this.vectorStore.add(documents);

        int loadedCount = documents.size();
        System.out.println("Loaded " + loadedCount + " documents into vector store");
    }
}

Full ETL Pipeline — Orchestrating All Three Stages

/**
 * Orchestrates the three ETL stages (extract, transform, load) for a single PDF.
 */
@Service
public class EtlPipelineService {

    private final DocumentReaderService documentReader;
    private final DocumentTransformService documentTransformer;
    private final VectorStoreLoaderService vectorStoreLoader;

    public EtlPipelineService(DocumentReaderService reader,
                               DocumentTransformService transformer,
                               VectorStoreLoaderService loader) {
        this.documentReader = reader;
        this.documentTransformer = transformer;
        this.vectorStoreLoader = loader;
    }

    /**
     * Reads the PDF, splits it into chunks, tags every chunk with source
     * metadata, and loads the chunks into the vector store. Progress is
     * printed to standard output.
     *
     * @param pdfResource the PDF to ingest
     * @param sourceTag   caller-supplied tag stored on every chunk as {@code source-tag}
     */
    public void runForPdf(Resource pdfResource, String sourceTag) {
        System.out.println("=== ETL Pipeline Start: " + pdfResource.getFilename());

        // Stage 1 (extract): read the PDF into documents.
        List<Document> pages = documentReader.readPdf(pdfResource);
        System.out.println("Extracted: " + pages.size() + " pages");

        // Stage 2 (transform): split the documents into chunks.
        List<Document> chunks = documentTransformer.split(pages);
        System.out.println("Transformed: " + chunks.size() + " chunks");

        // Attach traceability metadata before the chunks are stored.
        for (Document chunk : chunks) {
            tagChunk(chunk, pdfResource, sourceTag);
        }

        // Stage 3 (load): write the chunks to the vector store.
        vectorStoreLoader.load(chunks);
        System.out.println("=== ETL Pipeline Complete ===");
    }

    /** Records the origin file name, the caller's tag and today's date on one chunk. */
    private void tagChunk(Document chunk, Resource pdfResource, String sourceTag) {
        chunk.getMetadata().put("source", pdfResource.getFilename());
        chunk.getMetadata().put("source-tag", sourceTag);
        chunk.getMetadata().put("loaded-date", LocalDate.now().toString());
    }
}

REST Controller for Pipeline Trigger

/**
 * REST entry point that accepts an uploaded PDF and runs it through the ETL pipeline.
 */
@RestController
@RequestMapping("/etl")
public class EtlController {

    /** File name used for the temp file when the client supplies no usable name. */
    private static final String FALLBACK_FILE_NAME = "document.pdf";

    private final EtlPipelineService pipeline;

    public EtlController(EtlPipelineService pipeline) {
        this.pipeline = pipeline;
    }

    /**
     * Stores the upload in a temporary file, ingests it, and always removes the
     * temporary file afterwards, even when ingestion fails.
     *
     * @param file the uploaded PDF
     * @param tag  tag passed to the pipeline and stored on every chunk
     * @return a 200 response naming the ingested file
     * @throws IOException if the temporary file cannot be created, written or deleted
     */
    @PostMapping("/upload")
    public ResponseEntity<String> upload(@RequestParam("file") MultipartFile file,
                                          @RequestParam String tag) throws IOException {
        String originalName = file.getOriginalFilename();

        // The original name is client-controlled: sanitise it before it becomes
        // part of a path on the local file system.
        Path temp = Files.createTempFile("upload-", "-" + toSafeFileName(originalName));
        try {
            file.transferTo(temp);
            pipeline.runForPdf(new FileSystemResource(temp), tag);
        } finally {
            // Clean up on both success and failure so failed uploads do not pile up.
            Files.deleteIfExists(temp);
        }
        return ResponseEntity.ok("Ingested: " + originalName);
    }

    /**
     * Reduces a client-supplied file name to a safe single path segment: drops any
     * directory part and replaces every character outside {@code [A-Za-z0-9._-]}
     * with an underscore. Falls back to a fixed name for null or blank input.
     */
    private static String toSafeFileName(String originalName) {
        if (originalName == null || originalName.isBlank()) {
            return FALLBACK_FILE_NAME;
        }
        // Some clients send a full path; keep only the last segment.
        int lastSeparator = Math.max(originalName.lastIndexOf('/'), originalName.lastIndexOf('\\'));
        String baseName = originalName.substring(lastSeparator + 1);
        String cleaned = baseName.replaceAll("[^A-Za-z0-9._-]", "_");
        return cleaned.isEmpty() ? FALLBACK_FILE_NAME : cleaned;
    }
}

Output

=== ETL Pipeline Start: spring-boot-guide.pdf
Extracted: 48 pages
Transformed: 92 chunks
Loaded 92 documents into vector store
=== ETL Pipeline Complete ===

Key Points

  • PagePdfDocumentReader splits by page — combine with TokenTextSplitter to get consistently-sized chunks regardless of page length
  • TikaDocumentReader handles 20+ file formats including Word, Excel, PowerPoint, HTML, and OpenDocument — use it as a universal reader
  • KeywordMetadataEnricher calls the LLM once for every document it is given — that is, once per chunk if you apply it after splitting — which increases API cost but improves filtered search
  • Always add source metadata so you can trace which document a retrieved chunk came from
  • Run the ETL pipeline as a one-time batch job or on a schedule when documents change — not on every user query
Topics: Java SpringAI
← Newer Post Older Post →