EvalForge

Eval runner for LLM applications — score model outputs against expected answers, or against free-form criteria using an LLM judge.

Quick start — hosted API

import httpx

client = httpx.Client(
    base_url="https://api.mawlaia.com",
    headers={"Authorization": "Bearer mwl_live_..."},
)

# Deterministic scorer (ROUGE-L): compares each case's "output" to its "expected" text
resp = client.post("/v1/eval/score", json={
    "scorer": "rouge_l",
    "threshold": 0.6,
    "cases": [
        {"input": "Capital of France?",
         "output": "The capital of France is Paris.",
         "expected": "Paris"},
    ],
})

# LLM judge scorer (GPT-4o-mini): for subjective tasks, scores each "output" against "criteria"
resp = client.post("/v1/eval/score", json={
    "scorer": "llm_judge",
    "threshold": 0.7,
    "criteria": "Is the answer concise, accurate, and professional?",
    "cases": [
        {"input": "Summarise our refund policy.",
         "output": "Refunds are processed within 5 business days."},
    ],
})
print(resp.json())
# {"scorer": "llm_judge", "mean_score": 0.92, "pass_rate": 1.0,
#  "results": [{"score": 0.92, "rationale": "Clear and professional.", ...}]}

Quick start — Python SDK

pip install mawlaia-evalforge

from evalforge import run_eval, EvalCase, RougeScorer

report = run_eval(
    cases=[
        EvalCase(input="Capital of France?", output="Paris", expected="Paris"),
    ],
    scorer=RougeScorer(threshold=0.6),
)
print(report.summary())