EvalForge
Eval runner for LLM applications — score outputs against expected answers.
Quick start — hosted API
import httpx
client = httpx.Client(
base_url="https://api.mawlaia.com",
headers={"Authorization": "Bearer mwl_live_..."},
)
# Deterministic scorer
resp = client.post("/v1/eval/score", json={
"scorer": "rouge_l",
"threshold": 0.6,
"cases": [
{"input": "Capital of France?",
"output": "The capital of France is Paris.",
"expected": "Paris"},
],
})
# LLM judge scorer (GPT-4o-mini, subjective tasks)
resp = client.post("/v1/eval/score", json={
"scorer": "llm_judge",
"threshold": 0.7,
"criteria": "Is the answer concise, accurate, and professional?",
"cases": [
{"input": "Summarise our refund policy.",
"output": "Refunds are processed within 5 business days."},
],
})
print(resp.json())
# {"scorer": "llm_judge", "mean_score": 0.92, "pass_rate": 1.0,
# "results": [{"score": 0.92, "rationale": "Clear and professional.", ...}]}
Quick start — Python SDK
pip install mawlaia-evalforge
from evalforge import run_eval, EvalCase, RougeScorer
report = run_eval(
cases=[
EvalCase(input="Capital of France?", output="Paris", expected="Paris"),
],
scorer=RougeScorer(threshold=0.6),
)
print(report.summary())