import asyncio
import json
import time
import uuid
from dataclasses import dataclass
from pathlib import Path
import httpx
# Base URL of the engine under evaluation; run_case POSTs to f"{ENGINE_URL}/execute".
ENGINE_URL = "http://localhost:8000"
# Value sent in the "X-Engine-Key" request header on every /execute call.
ENGINE_KEY = "..."
# Size of the semaphore in run_suite, i.e. how many cases may run at once.
CONCURRENCY = 5
@dataclass
class Case:
    """A single evaluation case: what to send and what to check for."""

    # Identifier of the case; embedded in the task_id sent with the request.
    id: str
    # Grouping label. Not read by the code in this module.
    category: str
    # Request payload; its keys are spread into the JSON body of /execute.
    input: dict
    # Expectations handed to score_against. Keys read there:
    # "must_call_tools", "must_not_call_tools", "output_must_contain",
    # "max_tokens" (all optional).
    expected: dict
@dataclass
class Result:
    """Outcome of running and scoring one evaluation case."""

    # Id of the Case this result belongs to.
    case_id: str
    # Overall verdict; run_case copies it from score["passed"].
    passed: bool
    # Full scoring dict as returned by score_against ("passed" and "notes").
    score: dict
    # Concatenation of the "text" values of all text_delta events.
    text: str
    # The "tool" value of every tool_call event, in arrival order.
    tool_calls: list
    # Token counters summed over all usage events
    # ("input_tokens", "output_tokens", "cache_hit_tokens").
    usage: dict
    # Duration of the case in whole milliseconds, request through scoring.
    elapsed_ms: int
async def run_case(case: Case, run_id: str) -> Result:
    """Run one case against the engine and score the streamed response.

    POSTs ``case.input`` plus a generated ``task_id`` to
    ``{ENGINE_URL}/execute`` and reads the reply line by line as an
    SSE-style stream of ``event:`` / ``data:`` fields.  Three event types
    are consumed; any other type is parsed and ignored:

    * ``text_delta`` - ``data["text"]`` is appended to the output text.
    * ``tool_call``  - ``data["tool"]`` is recorded.
    * ``usage``      - token counters are added to the running totals.

    Args:
        case: The case to execute.
        run_id: Identifier of the enclosing suite run; combined with
            ``case.id`` to form the ``task_id``.

    Returns:
        A ``Result`` holding the collected text, tool calls, usage totals,
        the score from ``score_against`` and the elapsed time in ms.

    Raises:
        json.JSONDecodeError: If a ``data:`` payload is not valid JSON.
        KeyError: If a ``tool_call`` payload has no ``"tool"`` key.
        Exceptions raised by httpx for transport failures propagate as-is.
    """
    task_id = f"eval-{run_id}-{case.id}"
    # perf_counter is monotonic, so the measured duration cannot go negative
    # or jump when the system clock is adjusted mid-run.
    start = time.perf_counter()
    text_parts: list[str] = []
    tool_calls: list = []
    usage = {"input_tokens": 0, "output_tokens": 0, "cache_hit_tokens": 0}

    # timeout=None disables all httpx timeouts for the streamed request.
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream(
            "POST",
            f"{ENGINE_URL}/execute",
            headers={"X-Engine-Key": ENGINE_KEY},
            json={**case.input, "task_id": task_id},
        ) as resp:
            # NOTE(review): the HTTP status is not checked here; an error
            # response without event lines yields an empty, failing result
            # rather than an exception - confirm that is intended.
            event_type = None
            async for line in resp.aiter_lines():
                if not line.strip():
                    # A blank line terminates an SSE event.  Forget the event
                    # name so a later data-only event is not mistaken for
                    # the previous event type.
                    event_type = None
                    continue

                if line.startswith("event:"):
                    event_type = line[6:].strip()
                elif line.startswith("data:"):
                    data = json.loads(line[5:].strip())
                    if event_type == "text_delta":
                        text_parts.append(data.get("text", ""))
                    elif event_type == "tool_call":
                        tool_calls.append(data["tool"])
                    elif event_type == "usage":
                        # Usage may arrive in several events; sum only the
                        # counters tracked here, treating absent ones as 0.
                        for k in usage:
                            usage[k] += data.get(k, 0)

    text = "".join(text_parts)
    score = score_against(case.expected, text, tool_calls, usage)
    return Result(
        case_id=case.id,
        passed=score["passed"],
        score=score,
        text=text,
        tool_calls=tool_calls,
        usage=usage,
        elapsed_ms=int((time.perf_counter() - start) * 1000),
    )
async def run_suite(cases: list[Case]) -> list[Result]:
    """Run every case concurrently, at most ``CONCURRENCY`` at a time.

    All cases of one call share a short run id (the first 8 characters of
    a random UUID) that ``run_case`` embeds in each ``task_id``.

    Args:
        cases: The cases to execute.

    Returns:
        One ``Result`` per case, in the same order as ``cases``
        (``asyncio.gather`` preserves the order of its awaitables).

    Raises:
        Whatever ``run_case`` raises; the first exception propagates out
        of ``asyncio.gather`` and no results are returned.
    """
    run_id = str(uuid.uuid4())[:8]
    sem = asyncio.Semaphore(CONCURRENCY)

    async def bounded(case: Case) -> Result:
        # Hold a semaphore slot for the whole duration of the case.
        async with sem:
            return await run_case(case, run_id)

    return await asyncio.gather(*(bounded(c) for c in cases))
def score_against(expected: dict, text: str, tool_calls: list, usage: dict) -> dict:
    """Check a run's output against a case's expectations.

    Every key of ``expected`` is optional:

    * ``must_call_tools``     - each entry must appear in ``tool_calls``.
    * ``must_not_call_tools`` - no entry may appear in ``tool_calls``.
    * ``output_must_contain`` - each entry must be a substring of ``text``.
    * ``max_tokens``          - ``usage["output_tokens"]`` must not exceed it.

    Args:
        expected: Expectation dict as described above.
        text: Full output text of the run.
        tool_calls: Tools that were called during the run.
        usage: Token counters; only ``"output_tokens"`` is read, and only
            when ``max_tokens`` is set.

    Returns:
        ``{"passed": bool, "notes": list[str]}`` with one note per violated
        expectation, in the order the checks are listed above.

    Raises:
        KeyError: If ``max_tokens`` is set but ``usage`` has no
            ``"output_tokens"`` entry.
    """
    notes = []

    for tool in expected.get("must_call_tools", []):
        if tool not in tool_calls:
            notes.append(f"missing required tool call: {tool}")

    for tool in expected.get("must_not_call_tools", []):
        if tool in tool_calls:
            notes.append(f"forbidden tool call: {tool}")

    for needle in expected.get("output_must_contain", []):
        if needle not in text:
            notes.append(f"output missing: {needle!r}")

    # `is not None` rather than truthiness: a budget of 0 is a valid limit.
    if (max_tokens := expected.get("max_tokens")) is not None:
        if usage["output_tokens"] > max_tokens:
            notes.append(f"output_tokens={usage['output_tokens']} > {max_tokens}")

    # Every failed check adds exactly one note, so the case passed precisely
    # when no notes were collected.
    return {"passed": not notes, "notes": notes}