Spring AI Token Usage and Cost Tracking — Monitor and Control AI API Spend
AI API costs are driven entirely by token usage. Without tracking, a single misbehaving feature can generate unexpected bills. This tutorial builds a complete token tracking system that records usage per user, per feature, and per model — and provides a dashboard endpoint and budget alerts.
Understanding Token Costs
OpenAI GPT-4o (June 2025 pricing):
Input : $2.50 per 1M tokens
Output : $10.00 per 1M tokens
GPT-4o-mini:
Input : $0.15 per 1M tokens
Output : $0.60 per 1M tokens
Claude Sonnet 4.5:
Input : $3.00 per 1M tokens
Output : $15.00 per 1M tokens
Example calculation:
1 question = 100 input + 300 output tokens = 400 tokens
GPT-4o-mini: (100 × 0.15 + 300 × 0.60) / 1,000,000 = $0.000195 per call
1,000 calls/day = $0.195/day = ~$5.85/month
Token Usage Entity
/**
 * JPA entity for one recorded AI call, mapped to the {@code ai_token_usage} table.
 * Each row carries the attribution fields (user, feature, model), the input and
 * output token counts, the cost in USD, and a timestamp.
 */
@Entity
@Table(name = "ai_token_usage")
public class AiTokenUsage {
// Primary key; value generation is delegated to the persistence provider (@GeneratedValue).
@Id @GeneratedValue
private Long id;
// Identifier of the user this usage is attributed to.
private String userId;
private String feature; // "chat", "rag", "classify"
// Model name the usage is attributed to.
private String model;
// Token counts for the call: input (prompt) side and output (generated) side.
private long inputTokens;
private long outputTokens;
// Cost of the call in US dollars.
private double costUsd;
// Date-time associated with the record; no time zone is stored (LocalDateTime).
private LocalDateTime timestamp;
// getters, setters...
}
Cost Calculation Service
/**
 * Converts token counts into an estimated USD cost using a static, per-model price table.
 *
 * <p>Prices are expressed in USD per one million tokens. A model that is missing from
 * the table, or a {@code null} model name, is priced at zero: such calls can still be
 * recorded, but they contribute nothing to the computed spend.
 *
 * <p>NOTE(review): provider prices change over time; confirm every entry against the
 * providers' current price lists before relying on the resulting figures.
 */
@Component
public class TokenCostCalculator {

    /** Divisor that turns "price per 1M tokens" into "price per token". */
    private static final double TOKENS_PER_MILLION = 1_000_000.0;

    /** Positions of the input and output price inside each price pair. */
    private static final int INPUT = 0;
    private static final int OUTPUT = 1;

    /** Shared price pair for models without a table entry (never mutated). */
    private static final double[] UNPRICED = {0.0, 0.0};

    // Cost per 1M tokens (in USD): {input, output}
    private static final Map<String, double[]> PRICING = Map.of(
        "gpt-4o", new double[]{2.50, 10.00},
        "gpt-4o-mini", new double[]{0.15, 0.60},
        "claude-opus-4-5", new double[]{15.00, 75.00},
        "claude-sonnet-4-5", new double[]{3.00, 15.00},
        "claude-haiku-4-5", new double[]{0.25, 1.25}
    );

    /**
     * Estimates the cost of a single call.
     *
     * @param model        model name used as the price-table key; may be {@code null}
     * @param inputTokens  number of input (prompt) tokens
     * @param outputTokens number of output (generated) tokens
     * @return estimated cost in USD, or {@code 0.0} when the model has no price entry
     */
    public double calculateCost(String model, long inputTokens, long outputTokens) {
        // Maps created by Map.of(...) throw NullPointerException when queried with a
        // null key, so a null model must be handled before the lookup.
        double[] prices = (model == null) ? UNPRICED : PRICING.getOrDefault(model, UNPRICED);
        return (inputTokens * prices[INPUT] + outputTokens * prices[OUTPUT]) / TOKENS_PER_MILLION;
    }
}
Token Tracking Advisor
import org.springframework.ai.chat.client.advisor.api.*;
/**
 * Call advisor that stores the token usage and estimated cost of each ChatClient call.
 *
 * <p>The advisor lets the rest of the chain run first, then reads the usage from the
 * response metadata and saves one {@link AiTokenUsage} row. Attribution values
 * ({@code userId}, {@code feature}, {@code model}) are read from the advisor context
 * and fall back to {@code "anonymous"}, {@code "general"} and {@code "unknown"}.
 *
 * <p>Recording is best-effort: a failure while reading usage or saving the row is
 * logged and does not prevent the response from being returned to the caller.
 */
@Component
public class TokenTrackingAdvisor implements CallAroundAdvisor {

    private static final System.Logger LOG =
            System.getLogger(TokenTrackingAdvisor.class.getName());

    private final AiTokenUsageRepository repository;
    private final TokenCostCalculator calculator;

    public TokenTrackingAdvisor(AiTokenUsageRepository repository,
                                TokenCostCalculator calculator) {
        this.repository = repository;
        this.calculator = calculator;
    }

    /**
     * Delegates to the next advisor in the chain, then records the usage of the result.
     *
     * @param request the advised request, including the advisor context parameters
     * @param chain   the remaining advisor chain
     * @return the response produced by the chain, unchanged
     */
    @Override
    public AdvisedResponse aroundCall(AdvisedRequest request, CallAroundAdvisorChain chain) {
        AdvisedResponse response = chain.nextAroundCall(request);
        try {
            recordUsage(request, response.response());
        } catch (RuntimeException ex) {
            // The model call has already completed; losing one usage row is preferable
            // to discarding the response the caller is waiting for.
            LOG.log(System.Logger.Level.WARNING, "Failed to record AI token usage", ex);
        }
        return response;
    }

    /** Saves one usage row when the response carries usage metadata; otherwise does nothing. */
    private void recordUsage(AdvisedRequest request, ChatResponse chatResponse) {
        if (chatResponse == null || chatResponse.getMetadata().getUsage() == null) {
            return;
        }
        Usage usage = chatResponse.getMetadata().getUsage();
        long promptTokens = usage.getPromptTokens();
        long generationTokens = usage.getGenerationTokens();
        String model = contextValue(request, "model", "unknown");

        AiTokenUsage record = new AiTokenUsage();
        record.setUserId(contextValue(request, "userId", "anonymous"));
        record.setFeature(contextValue(request, "feature", "general"));
        record.setModel(model);
        record.setInputTokens(promptTokens);
        record.setOutputTokens(generationTokens);
        record.setCostUsd(calculator.calculateCost(model, promptTokens, generationTokens));
        record.setTimestamp(LocalDateTime.now());
        repository.save(record);
    }

    /**
     * Reads an advisor-context value as text. Uses {@code toString()} rather than a
     * cast so a non-String value cannot raise ClassCastException, and treats an
     * absent or null value as the fallback.
     */
    private static String contextValue(AdvisedRequest request, String key, String fallback) {
        Object value = request.adviseContext().get(key);
        return (value == null) ? fallback : value.toString();
    }

    @Override
    public int getOrder() {
        return Ordered.LOWEST_PRECEDENCE;
    }

    @Override
    public String getName() {
        return "TokenTrackingAdvisor";
    }
}
Usage-Aware AI Service
/**
 * Service that sends user questions through a ChatClient which has the
 * {@link TokenTrackingAdvisor} registered as a default advisor.
 */
@Service
public class TrackedAiService {

    /** Model name passed to the advisor as the "model" context parameter. */
    private static final String TRACKED_MODEL = "gpt-4o-mini";

    private final ChatClient chatClient;

    public TrackedAiService(ChatClient.Builder builder,
                            TokenTrackingAdvisor trackingAdvisor) {
        ChatClient.Builder withTracking = builder.defaultAdvisors(trackingAdvisor);
        this.chatClient = withTracking.build();
    }

    /**
     * Asks the model a question and returns the text of its answer.
     *
     * @param userId   value passed to the advisors as the "userId" parameter
     * @param feature  value passed to the advisors as the "feature" parameter
     * @param question the user message to send
     * @return the content of the model response
     */
    public String ask(String userId, String feature, String question) {
        return chatClient.prompt()
            .user(question)
            .advisors(spec -> {
                spec.param("userId", userId);
                spec.param("feature", feature);
                spec.param("model", TRACKED_MODEL);
            })
            .call()
            .content();
    }
}
Usage Dashboard Controller
/**
 * Admin-only REST endpoint that summarizes recorded AI usage for the last 30 days.
 *
 * <p>All matching rows are loaded and aggregated in memory. NOTE(review): for large
 * tables, aggregate queries in the repository would avoid loading every row.
 */
@RestController
@RequestMapping("/admin/ai-usage")
@PreAuthorize("hasRole('ADMIN')")
public class AiUsageDashboardController {

    private final AiTokenUsageRepository repository;
    // Not used by the summary endpoint; kept because the class declares it as a dependency.
    private final TokenCostCalculator calculator;

    /**
     * Constructor injection: both fields are final, so they must be assigned here
     * for the class to compile.
     */
    public AiUsageDashboardController(AiTokenUsageRepository repository,
                                      TokenCostCalculator calculator) {
        this.repository = repository;
        this.calculator = calculator;
    }

    /**
     * Builds the usage summary for the 30 days preceding the request.
     *
     * @return a map with the period label, the call count, the total input and output
     *         tokens, the formatted total cost, and the cost grouped by user and by feature
     */
    @GetMapping("/summary")
    public Map<String, Object> getSummary() {
        LocalDateTime since = LocalDateTime.now().minusDays(30);
        List<AiTokenUsage> usages = repository.findByTimestampAfter(since);

        long totalInput = usages.stream().mapToLong(AiTokenUsage::getInputTokens).sum();
        long totalOutput = usages.stream().mapToLong(AiTokenUsage::getOutputTokens).sum();
        double totalCost = usages.stream().mapToDouble(AiTokenUsage::getCostUsd).sum();

        Map<String, Double> costByUser = usages.stream()
            .collect(Collectors.groupingBy(AiTokenUsage::getUserId,
                Collectors.summingDouble(AiTokenUsage::getCostUsd)));
        Map<String, Double> costByFeature = usages.stream()
            .collect(Collectors.groupingBy(AiTokenUsage::getFeature,
                Collectors.summingDouble(AiTokenUsage::getCostUsd)));

        return Map.of(
            "period", "last 30 days",
            "totalCalls", usages.size(),
            "totalInputTokens", totalInput,
            "totalOutputTokens", totalOutput,
            "totalCostUsd", String.format("$%.4f", totalCost),
            "costByUser", costByUser,
            "costByFeature", costByFeature
        );
    }
}
Output
GET /admin/ai-usage/summary
{
"period": "last 30 days",
"totalCalls": 2847,
"totalInputTokens": 892430,
"totalOutputTokens": 1247890,
"totalCostUsd": "$1.2847",
"costByUser": {
"user-001": 0.342,
"user-042": 0.178,
"user-099": 0.091
},
"costByFeature": {
"chat": 0.612,
"rag": 0.491,
"classify": 0.181
}
}
Budget Alert Service
/**
 * Hourly check of today's AI spend against a fixed daily budget.
 *
 * <p>Prints a warning once spend passes 80% of the budget and an alert once the
 * budget is exceeded; when the budget is exceeded, both messages are printed.
 *
 * <p>NOTE(review): {@code @Scheduled} methods only run when scheduling is enabled in
 * the application (e.g. {@code @EnableScheduling}) — confirm the configuration.
 */
@Service
public class BudgetAlertService {

    private static final double DAILY_BUDGET_USD = 10.0;

    /** Fraction of the daily budget at which the early warning is printed. */
    private static final double WARNING_THRESHOLD = 0.8;

    private final AiTokenUsageRepository repository;

    /** Constructor injection: the final field must be assigned for the class to compile. */
    public BudgetAlertService(AiTokenUsageRepository repository) {
        this.repository = repository;
    }

    /**
     * Sums the cost recorded since local midnight and prints budget messages.
     * {@code @Scheduled} is a method-level annotation, so it is declared here
     * rather than on the class.
     */
    @Scheduled(cron = "0 0 * * * *") // run every hour
    public void checkBudget() {
        LocalDateTime since = LocalDateTime.now().truncatedTo(ChronoUnit.DAYS);

        // Assumes sumCostSince returns double or Double — TODO confirm against the
        // repository. An aggregate over zero rows may come back as null; treat it as no spend.
        Double reported = repository.sumCostSince(since);
        double todayCost = (reported == null) ? 0.0 : reported;

        if (todayCost > DAILY_BUDGET_USD * WARNING_THRESHOLD) {
            System.out.printf("WARNING: Daily AI spend at $%.4f (%.0f%% of $%.2f budget)%n",
                todayCost, (todayCost / DAILY_BUDGET_USD) * 100, DAILY_BUDGET_USD);
            // Send Slack alert, email, or PagerDuty here
        }
        if (todayCost > DAILY_BUDGET_USD) {
            System.out.printf("ALERT: Daily AI budget EXCEEDED! $%.4f spent vs $%.2f limit%n",
                todayCost, DAILY_BUDGET_USD);
            // Optionally disable AI features via feature flag
        }
    }
}
Key Points
- Record token usage per call — `chatResponse.getMetadata().getUsage()` gives exact input and output token counts
- Use a custom `CallAroundAdvisor` to intercept every ChatClient call and record usage without modifying service code
- Track by feature (chat, RAG, batch) to identify which parts of your app drive the most cost
- Set up daily budget alerts at 80% threshold — waiting until 100% is too late if usage spikes
- Prefer `gpt-4o-mini` or `claude-haiku-4-5` for classification and extraction — they're 10-50x cheaper than flagship models with comparable accuracy on structured tasks
Comments