Java SpringAI

Spring AI Token Usage and Cost Tracking — Monitor and Control AI API Spend

Spring AI Token Usage and Cost Tracking — Monitor and Control AI API Spend

AI API costs are driven entirely by token usage. Without tracking, a single misbehaving feature can generate unexpected bills. This tutorial builds a complete token tracking system that records usage per user, per feature, and per model — and provides a dashboard endpoint and budget alerts.

Understanding Token Costs

OpenAI GPT-4o (June 2025 pricing):
  Input  : $2.50 per 1M tokens
  Output : $10.00 per 1M tokens

GPT-4o-mini:
  Input  : $0.15 per 1M tokens
  Output : $0.60 per 1M tokens

Claude Sonnet 4.5:
  Input  : $3.00 per 1M tokens
  Output : $15.00 per 1M tokens

Example calculation:
  1 question = 100 input + 300 output tokens = 400 tokens
  GPT-4o-mini: (100 × 0.15 + 300 × 0.60) / 1,000,000 = $0.000195 per call
  1,000 calls/day = $0.195/day = ~$5.85/month

Token Usage Entity

/**
 * JPA entity holding one row of AI token usage, mapped to the
 * {@code ai_token_usage} table. Each row describes a single model call:
 * who made it, for which feature, against which model, how many tokens it
 * consumed and what it is estimated to have cost.
 */
@Entity
@Table(name = "ai_token_usage")
public class AiTokenUsage {

    // Surrogate primary key; generation strategy is left to the JPA provider default.
    @Id @GeneratedValue
    private Long id;

    // Identifier of the caller the usage is attributed to.
    private String  userId;
    // Application feature that issued the call.
    private String  feature;      // "chat", "rag", "classify"
    // Model name used as the pricing key for this call.
    private String  model;
    // Number of prompt (input) tokens reported for the call.
    private long    inputTokens;
    // Number of generated (output) tokens reported for the call.
    private long    outputTokens;
    // Estimated cost of the call in US dollars.
    // NOTE(review): double is lossy for money; consider BigDecimal if these values are ever billed or reconciled.
    private double  costUsd;
    // When the usage row was created.
    // NOTE(review): LocalDateTime carries no zone — presumably server-local time; confirm.
    private LocalDateTime timestamp;

    // getters, setters...
}

Cost Calculation Service

/**
 * Converts token counts into an estimated USD cost using a static,
 * per-model price table.
 *
 * <p>Models that are not in the table (including a {@code null} model name)
 * are priced at zero, so callers never fail because of an unrecognised model.
 */
@Component
public class TokenCostCalculator {

    /** Position of the input (prompt) price inside a {@link #PRICING} entry. */
    private static final int INPUT = 0;

    /** Position of the output (generation) price inside a {@link #PRICING} entry. */
    private static final int OUTPUT = 1;

    /** Prices in {@link #PRICING} are quoted per this many tokens. */
    private static final double TOKENS_PER_PRICE_UNIT = 1_000_000.0;

    /** Shared fallback for unknown models; never mutated. */
    private static final double[] UNKNOWN_MODEL_PRICES = {0.0, 0.0};

    // Cost per 1M tokens (in USD), stored as {input, output}.
    // NOTE(review): these are hard-coded snapshots — verify each entry against
    // the provider's current price list before relying on the totals.
    private static final Map<String, double[]> PRICING = Map.of(
        "gpt-4o",          new double[]{2.50, 10.00},
        "gpt-4o-mini",     new double[]{0.15,  0.60},
        "claude-opus-4-5", new double[]{15.00, 75.00},
        "claude-sonnet-4-5", new double[]{3.00, 15.00},
        "claude-haiku-4-5",  new double[]{0.25,  1.25}
    );

    /**
     * Estimates the USD cost of a single call.
     *
     * @param model        model name used as the price-table key; may be {@code null}
     * @param inputTokens  number of prompt tokens
     * @param outputTokens number of generated tokens
     * @return estimated cost in USD, or {@code 0.0} when the model is unknown
     */
    public double calculateCost(String model, long inputTokens, long outputTokens) {
        // Map.of(...) maps reject null keys even on lookup, so guard before querying.
        double[] prices = (model == null)
                ? UNKNOWN_MODEL_PRICES
                : PRICING.getOrDefault(model, UNKNOWN_MODEL_PRICES);
        return (inputTokens * prices[INPUT] + outputTokens * prices[OUTPUT]) / TOKENS_PER_PRICE_UNIT;
    }
}

Token Tracking Advisor

import org.springframework.ai.chat.client.advisor.api.*;

/**
 * Around-advisor that persists one {@link AiTokenUsage} row for every call
 * whose response carries usage metadata.
 *
 * <p>The user, feature and model are read from the advisor context under the
 * keys {@code "userId"}, {@code "feature"} and {@code "model"}. When no model
 * is supplied there, the model name reported in the response metadata is used
 * before falling back to {@code "unknown"}.
 */
@Component
public class TokenTrackingAdvisor implements CallAroundAdvisor {

    private static final String UNKNOWN_MODEL = "unknown";

    private final AiTokenUsageRepository repository;
    private final TokenCostCalculator    calculator;

    public TokenTrackingAdvisor(AiTokenUsageRepository repository,
                                 TokenCostCalculator calculator) {
        this.repository = repository;
        this.calculator = calculator;
    }

    /**
     * Delegates to the rest of the chain, then records the usage reported by
     * the response. The response is always returned unchanged.
     *
     * @param request the advised request, including the advisor context
     * @param chain   the remaining advisor chain
     * @return the response produced by the chain
     */
    @Override
    public AdvisedResponse aroundCall(AdvisedRequest request, CallAroundAdvisorChain chain) {
        AdvisedResponse response = chain.nextAroundCall(request);

        ChatResponse chatResponse = response.response();
        // Nothing to record when the response, its metadata or its usage is missing.
        if (chatResponse == null || chatResponse.getMetadata() == null) {
            return response;
        }
        Usage usage = chatResponse.getMetadata().getUsage();
        if (usage == null) {
            return response;
        }

        // Read the counts once so the stored tokens and the stored cost always agree.
        long inputTokens  = usage.getPromptTokens();
        long outputTokens = usage.getGenerationTokens();

        String model = contextString(request, "model", null);
        if (model == null || model.isEmpty()) {
            // Prefer what the provider reports over recording a zero-cost "unknown" row.
            model = chatResponse.getMetadata().getModel();
        }
        if (model == null || model.isEmpty()) {
            model = UNKNOWN_MODEL;
        }

        AiTokenUsage entry = new AiTokenUsage();
        entry.setUserId(contextString(request, "userId", "anonymous"));
        entry.setFeature(contextString(request, "feature", "general"));
        entry.setModel(model);
        entry.setInputTokens(inputTokens);
        entry.setOutputTokens(outputTokens);
        entry.setCostUsd(calculator.calculateCost(model, inputTokens, outputTokens));
        entry.setTimestamp(LocalDateTime.now());

        repository.save(entry);

        return response;
    }

    @Override
    public int getOrder() { return Ordered.LOWEST_PRECEDENCE; }

    @Override
    public String getName() { return "TokenTrackingAdvisor"; }

    /**
     * Reads an advisor-context value as text. Non-String values are converted
     * with {@code toString()} instead of failing on a cast; a missing or
     * {@code null} value yields {@code fallback}.
     */
    private static String contextString(AdvisedRequest request, String key, String fallback) {
        Object value = request.adviseContext().get(key);
        return value != null ? value.toString() : fallback;
    }
}

Usage-Aware AI Service

/**
 * Thin service over a {@link ChatClient} that has the token-tracking advisor
 * registered as a default advisor, so every call made through it is recorded.
 */
@Service
public class TrackedAiService {

    // Advisor-context keys and the model label attached to each call.
    private static final String USER_ID_PARAM = "userId";
    private static final String FEATURE_PARAM = "feature";
    private static final String MODEL_PARAM   = "model";
    private static final String MODEL_NAME    = "gpt-4o-mini";

    private final ChatClient chatClient;

    /**
     * Builds the client once, with the tracking advisor applied to all prompts.
     */
    public TrackedAiService(ChatClient.Builder builder,
                             TokenTrackingAdvisor trackingAdvisor) {
        ChatClient.Builder withTracking = builder.defaultAdvisors(trackingAdvisor);
        this.chatClient = withTracking.build();
    }

    /**
     * Sends a single user question and returns the model's text answer.
     *
     * @param userId   caller identifier passed to the advisors
     * @param feature  feature name passed to the advisors
     * @param question user message to send
     * @return the content of the model response
     */
    public String ask(String userId, String feature, String question) {
        String answer = chatClient.prompt()
                .user(question)
                .advisors(spec -> spec
                        .param(USER_ID_PARAM, userId)
                        .param(FEATURE_PARAM, feature)
                        .param(MODEL_PARAM, MODEL_NAME))
                .call()
                .content();
        return answer;
    }
}

Usage Dashboard Controller

/**
 * Admin-only REST endpoint that summarises recorded AI token usage.
 */
@RestController
@RequestMapping("/admin/ai-usage")
@PreAuthorize("hasRole('ADMIN')")
public class AiUsageDashboardController {

    private final AiTokenUsageRepository repository;

    /**
     * Constructor injection; required because {@code repository} is final.
     */
    public AiUsageDashboardController(AiTokenUsageRepository repository) {
        this.repository = repository;
    }

    /**
     * Aggregates usage recorded during the last 30 days.
     *
     * @return a map with the period label, call count, total input and output
     *         tokens, the formatted total cost, and cost grouped by user and
     *         by feature
     */
    @GetMapping("/summary")
    public Map<String, Object> getSummary() {
        LocalDateTime since = LocalDateTime.now().minusDays(30);
        // NOTE(review): every matching row is loaded and aggregated in memory;
        // consider aggregate queries in the repository if the table grows large.
        List<AiTokenUsage> usages = repository.findByTimestampAfter(since);

        long totalInput    = usages.stream().mapToLong(AiTokenUsage::getInputTokens).sum();
        long totalOutput   = usages.stream().mapToLong(AiTokenUsage::getOutputTokens).sum();
        double totalCost   = usages.stream().mapToDouble(AiTokenUsage::getCostUsd).sum();

        Map<String, Double> costByUser = usages.stream()
                .collect(Collectors.groupingBy(AiTokenUsage::getUserId,
                        Collectors.summingDouble(AiTokenUsage::getCostUsd)));

        Map<String, Double> costByFeature = usages.stream()
                .collect(Collectors.groupingBy(AiTokenUsage::getFeature,
                        Collectors.summingDouble(AiTokenUsage::getCostUsd)));

        return Map.of(
                "period",          "last 30 days",
                "totalCalls",      usages.size(),
                "totalInputTokens",  totalInput,
                "totalOutputTokens", totalOutput,
                "totalCostUsd",    String.format("$%.4f", totalCost),
                "costByUser",      costByUser,
                "costByFeature",   costByFeature
        );
    }
}

Output

GET /admin/ai-usage/summary

{
  "period": "last 30 days",
  "totalCalls": 2847,
  "totalInputTokens": 892430,
  "totalOutputTokens": 1247890,
  "totalCostUsd": "$1.2847",
  "costByUser": {
    "user-001": 0.342,
    "user-042": 0.178,
    "user-099": 0.091
  },
  "costByFeature": {
    "chat":     0.612,
    "rag":      0.491,
    "classify": 0.181
  }
}

Budget Alert Service

/**
 * Periodically compares today's recorded AI spend with a fixed daily budget
 * and prints a warning at 80% of the budget and an alert once it is exceeded.
 */
@Service
public class BudgetAlertService {

    private static final double DAILY_BUDGET_USD = 10.0;

    /** Fraction of the daily budget at which the early warning is printed. */
    private static final double WARNING_FRACTION = 0.8;

    private final AiTokenUsageRepository repository;

    /**
     * Constructor injection; required because {@code repository} is final.
     */
    public BudgetAlertService(AiTokenUsageRepository repository) {
        this.repository = repository;
    }

    /**
     * Sums the cost recorded since midnight and reports when it crosses the
     * warning threshold or the budget itself.
     *
     * <p>{@code @Scheduled} must sit on the method: it is a method-level
     * annotation and is not picked up when placed on the class.
     */
    @Scheduled(cron = "0 0 * * * *")  // run every hour
    public void checkBudget() {
        LocalDateTime since = LocalDateTime.now().truncatedTo(ChronoUnit.DAYS);

        // An aggregate over zero rows may come back as null; treat that as no spend.
        Double reported = repository.sumCostSince(since);
        double todayCost = (reported != null) ? reported : 0.0;

        if (todayCost > DAILY_BUDGET_USD * WARNING_FRACTION) {
            System.out.printf("WARNING: Daily AI spend at $%.4f (%.0f%% of $%.2f budget)%n",
                    todayCost, (todayCost / DAILY_BUDGET_USD) * 100, DAILY_BUDGET_USD);
            // Send Slack alert, email, or PagerDuty here
        }

        if (todayCost > DAILY_BUDGET_USD) {
            System.out.printf("ALERT: Daily AI budget EXCEEDED! $%.4f spent vs $%.2f limit%n",
                    todayCost, DAILY_BUDGET_USD);
            // Optionally disable AI features via feature flag
        }
    }
}

Key Points

  • Record token usage per call — chatResponse.getMetadata().getUsage() gives exact input and output token counts
  • Use a custom CallAroundAdvisor to intercept every ChatClient call and record usage without modifying service code
  • Track by feature (chat, rag, classify) to identify which parts of your app drive the most cost
  • Set up daily budget alerts at 80% threshold — waiting until 100% is too late if usage spikes
  • Prefer gpt-4o-mini or claude-haiku-4-5 for classification and extraction — they're 10-50x cheaper than flagship models with comparable accuracy on structured tasks
Topics: Java SpringAI
← Newer Post Older Post →