Exa's free tier gives you 1,000 searches per month. An agent workflow that searches on every user query burns through that in a day. The fix is not upgrading -- it is optimizing: cache results, batch similar queries, and route by intent so cheap queries go to cheap APIs and only high-value semantic queries hit Exa. A cost-routing function can cut search spend by 60-80%.
Why agent search is expensive
Agents search aggressively. A single user question might trigger 3-5 search calls: one for the main query, one for each sub-question the agent generates, one to verify claims. At 4 searches per user query and 100 users per day, you need 12,000 searches per month. Exa's free tier covers 8% of that. Even paid tiers ($49/month for 8,000 credits on Websets Starter) run out quickly.
Strategy 1: cache results aggressively
import hashlib, json, time, os
from pathlib import Path
CACHE_DIR = Path("search_cache")
CACHE_DIR.mkdir(exist_ok=True)
CACHE_TTL = 3600 * 6  # 6 hours -- most search results are stable this long


def cached_search(query: str, search_fn, ttl: int = CACHE_TTL) -> dict:
    """Return search results for ``query``, reusing a fresh on-disk entry.

    Each query is stored as one JSON file in ``CACHE_DIR``; the file name is
    the MD5 hex digest of the lower-cased, stripped query, so queries that
    differ only in case or surrounding whitespace share an entry.

    Args:
        query: The search text.
        search_fn: Callable invoked as ``search_fn(query)`` on a cache miss.
            It must return a JSON-serializable dict.
        ttl: Maximum age in seconds for a cached entry to count as fresh.

    Returns:
        The result dict, with ``"_from_cache"`` set to ``True`` when it was
        served from disk and ``False`` when ``search_fn`` was called.
    """
    # MD5 is used only to derive a short file name, not for security.
    cache_key = hashlib.md5(query.lower().strip().encode()).hexdigest()
    cache_path = CACHE_DIR / f"{cache_key}.json"

    try:
        entry = json.loads(cache_path.read_text())
        fresh = time.time() - entry["timestamp"] < ttl
        data = entry["data"]
    except (OSError, ValueError, KeyError, TypeError):
        # Missing, unreadable, or malformed entry: fall through to the API
        # call, which overwrites the file with a valid entry.
        fresh = False

    if fresh:
        # Tag the payload that is actually returned; tagging the wrapper
        # entry would leave callers unable to tell a hit from a miss.
        data["_from_cache"] = True
        return data

    # Cache miss: call the API.
    result = search_fn(query)

    # Persist before tagging so the marker is never written to disk.
    cache_path.write_text(json.dumps({
        "query": query,
        "timestamp": time.time(),
        "data": result,
    }))
    result["_from_cache"] = False
    return result
# With 6-hour TTL:
# - Same query within 6 hours = free (cache hit)
# - Typical cache hit rate for agent workflows: 40-60%
# - 1,000 queries become ~500 API calls (at a ~50% hit rate)
Strategy 2: batch similar queries
from collections import defaultdict
import re
# Filler phrases agents commonly add around the real query.
_FILLER_PHRASES = ("please find", "search for", "look up", "what is", "tell me about")
# Match fillers only as whole phrases so text such as "outlook upgrade"
# (which contains "look up") is left intact.
_FILLER_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(p) for p in _FILLER_PHRASES) + r")\b"
)


def normalize_query(query: str) -> str:
    """Normalize a query so near-duplicates compare equal.

    Lower-cases the text, collapses runs of whitespace, and removes the
    filler phrases listed in ``_FILLER_PHRASES`` wherever they occur as
    whole phrases.

    Args:
        query: Raw query text.

    Returns:
        The normalized query with single spaces and no leading or trailing
        whitespace. May be empty if the query held nothing but fillers.
    """
    q = re.sub(r"\s+", " ", query.lower().strip())
    # Replace with a space (not "") so the neighbouring words stay separate.
    q = _FILLER_PATTERN.sub(" ", q)
    # Removal can leave doubled or edge spaces; collapse again.
    return re.sub(r"\s+", " ", q).strip()
def deduplicate_agent_queries(queries: list) -> list:
    """Drop queries whose normalized form has already been seen.

    The first query of each group of near-duplicates is kept, in its
    original wording and input order. A one-line summary is printed.

    Args:
        queries: Query strings as generated by the agent.

    Returns:
        The surviving queries, in input order.
    """
    seen_keys = set()
    kept = []
    for candidate in queries:
        key = normalize_query(candidate)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        kept.append(candidate)

    removed = len(queries) - len(kept)
    print(f"Deduplicated: {len(queries)} -> {len(kept)} queries "
          f"({removed} duplicates removed)")
    return kept
# Example: the sub-queries an agent might emit for a single user question.
# Several differ only by a filler prefix such as "search for".
agent_queries = [
    "best SERP API pricing 2026",
    "What is the best SERP API pricing 2026",
    "SERP API pricing comparison",
    "search for best SERP API pricing 2026",
]
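# After dedup: 3 unique queries instead of 4 ("search for ..." collapses into
# the first query; "What is the ..." stays distinct because "the" remains)
Strategy 3: cost-routing by intent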
import requests, os
# Indicators of a meaning-based query. Each must start at a word boundary.
# "like" and the "... to" phrases must also end at one, so "likely",
# "unlike" and "similar tokens" no longer count as semantic. The remaining
# stems may carry a suffix ("concepts", "explained").
_SEMANTIC_PATTERN = re.compile(
    r"\b(?:(?:similar|related)\s+to\b|like\b|concept|meaning|explain"
    r"|analogy|comparable)",
    re.IGNORECASE,
)


def classify_query_intent(query: str) -> str:
    """Classify whether a query needs semantic search or keyword search.

    Args:
        query: The search text.

    Returns:
        ``"semantic"`` if the query contains one of the semantic indicator
        words or phrases, otherwise ``"keyword"``.
    """
    if _SEMANTIC_PATTERN.search(query):
        return "semantic"
    return "keyword"
def cost_routed_search(query: str) -> dict:
    """Route a query to the cheapest API that suits its intent.

    Queries classified as ``"semantic"`` go to ``exa_search``; everything
    else is sent to the Scavio keyword-search endpoint.

    Args:
        query: The search text.

    Returns:
        The decoded JSON body of the chosen API's response.

    Raises:
        KeyError: If ``SCAVIO_API_KEY`` is not set for a keyword query.
        requests.HTTPError: If Scavio answers with a 4xx/5xx status.
    """
    if classify_query_intent(query) == "semantic":
        # Exa: $7/1k searches -- best for semantic/meaning-based queries.
        # Only route here when keyword search would miss the point.
        return exa_search(query)

    # Scavio: $5/1k -- structured keyword search.
    # Handles 80%+ of agent queries.
    resp = requests.post(
        "https://api.scavio.dev/api/v1/search",
        headers={"x-api-key": os.environ["SCAVIO_API_KEY"]},
        json={"query": query, "platform": "google"},
        timeout=10,
    )
    # Fail loudly on auth/rate-limit/server errors; otherwise the error
    # payload would be returned as a result and could end up cached.
    resp.raise_for_status()
    return resp.json()
def exa_search(query: str) -> dict:
    """Run an Exa neural (semantic) search -- use sparingly.

    Args:
        query: The search text.

    Returns:
        The decoded JSON body of Exa's response (up to 10 results requested).

    Raises:
        KeyError: If ``EXA_API_KEY`` is not set.
        requests.HTTPError: If Exa answers with a 4xx/5xx status.
    """
    resp = requests.post(
        "https://api.exa.ai/search",
        headers={"x-api-key": os.environ["EXA_API_KEY"]},
        json={"query": query, "numResults": 10, "type": "neural"},
        timeout=10,
    )
    # Surface quota/auth errors instead of returning the error body as if
    # it were search results.
    resp.raise_for_status()
    return resp.json()
# Routing distribution (typical agent workload):
# 80% keyword -> Scavio at $0.005/query
# 20% semantic -> Exa at $0.007/query
# Blended cost: $0.0054/query vs $0.007/query (all-Exa)
# Savings: 23% on per-query cost + cache savings on top
Full optimization pipeline
class OptimizedSearchClient:
    """Combines caching, dedup, and routing for minimum cost.

    ``stats`` holds three counters: ``total_queries`` (calls to ``search``),
    ``api_calls`` (searches that reached a backend API) and ``cache_hits``
    (searches answered without calling a backend).
    """

    def __init__(self):
        self.stats = {"cache_hits": 0, "api_calls": 0, "total_queries": 0}

    def search(self, query: str) -> dict:
        """Search one query through the cache, routing on a miss.

        Args:
            query: The search text.

        Returns:
            The result dict produced by ``cached_search``.
        """
        self.stats["total_queries"] += 1
        backend_called = False

        def tracked_search(q: str) -> dict:
            # Record that the cache layer actually reached the backend.
            nonlocal backend_called
            backend_called = True
            return self._routed_search(q)

        # Layer 1: cache. Hits are detected by whether the backend ran,
        # not by a marker key inside the payload, which cached data may
        # not carry.
        result = cached_search(query, tracked_search)
        if backend_called:
            self.stats["api_calls"] += 1
        else:
            self.stats["cache_hits"] += 1
        return result

    def batch_search(self, queries: list) -> list:
        """Search multiple queries with dedup + cache + routing.

        Args:
            queries: Query strings, possibly containing near-duplicates.

        Returns:
            One result per query that survives deduplication, in order.
            The list can therefore be shorter than ``queries``.
        """
        unique = deduplicate_agent_queries(queries)
        return [self.search(q) for q in unique]

    def _routed_search(self, query: str) -> dict:
        """Layer 2: send the query to the cost-routed backend."""
        return cost_routed_search(query)

    def print_stats(self):
        """Print query, cache-hit and API-call counts plus a cost estimate."""
        total = self.stats["total_queries"]
        hits = self.stats["cache_hits"]
        calls = self.stats["api_calls"]
        # max(total, 1) avoids dividing by zero before any query has run.
        print(f"Total queries: {total}")
        print(f"Cache hits: {hits} ({hits/max(total,1)*100:.0f}%)")
        print(f"API calls: {calls} ({calls/max(total,1)*100:.0f}%)")
        # Flat $0.005 per call; a plain "$" replaces the invalid "\$" escape
        # that printed a stray backslash.
        print(f"Estimated cost: ${calls * 0.005:.2f}")
# Usage:
# client = OptimizedSearchClient()
# results = client.batch_search(agent_queries)
# client.print_stats()
Cost savings at scale
- Raw: 12,000 queries/month on Exa = $84/month
- With caching (50% hit rate): 6,000 API calls = $42/month
- With dedup (30% reduction): 4,200 API calls = $29.40/month
- With routing (80% to Scavio): 840 Exa + 3,360 Scavio = $22.68/month
- Total savings: 73% reduction from $84 to $22.68/month
Start with caching -- it is the highest-impact, lowest-effort optimization. Add dedup next if your agent generates repetitive sub-queries. Add cost routing last, only if you genuinely need semantic search for a portion of your queries. Most agent workloads are keyword-based and do not need Exa's neural retrieval at all.