{
  "kind": "tag",
  "slug": "llm-evals-topic",
  "id": 17791452099123760,
  "name": "LLM Evals",
  "type": "topic",
  "aliases": [
    "AI benchmarks",
    "llm_evals",
    "LLM evals",
    "LLM evaluations",
    "model evaluations"
  ],
  "diffbot_id": null,
  "story_count_14d": 2066,
  "cooccurring_tags": [
    {
      "id": 17723038993834764,
      "slug": "artificial-intelligence-topic",
      "name": "Artificial Intelligence",
      "type": "topic",
      "count": 470
    },
    {
      "id": 17723038994323052,
      "slug": "arxiv-organization",
      "name": "arXiv",
      "type": "organization",
      "count": 448
    },
    {
      "id": 17791452099463022,
      "slug": "reasoning-models-topic",
      "name": "Reasoning Models",
      "type": "topic",
      "count": 128
    },
    {
      "id": 17791452097663640,
      "slug": "ai-agents-topic",
      "name": "AI Agents",
      "type": "topic",
      "count": 128
    },
    {
      "id": 17791452102628180,
      "slug": "inference-optimization-topic",
      "name": "Inference Optimization",
      "type": "topic",
      "count": 86
    },
    {
      "id": 17731005482466606,
      "slug": "model-security-topic",
      "name": "Model Security",
      "type": "topic",
      "count": 74
    },
    {
      "id": 17730948119041167,
      "slug": "multimodal-ai-topic",
      "name": "Multimodal AI",
      "type": "topic",
      "count": 70
    },
    {
      "id": 17730934580874483,
      "slug": "rag-topic",
      "name": "RAG",
      "type": "topic",
      "count": 52
    },
    {
      "id": 17791452103823983,
      "slug": "ai-infrastructure-topic",
      "name": "AI Infrastructure",
      "type": "topic",
      "count": 52
    },
    {
      "id": 17730931225185240,
      "slug": "tool-use-topic",
      "name": "Tool Use",
      "type": "topic",
      "count": 40
    },
    {
      "id": 17730928368970588,
      "slug": "synthetic-data-topic",
      "name": "Synthetic Data",
      "type": "topic",
      "count": 38
    },
    {
      "id": 17791452098785214,
      "slug": "multi-agent-systems-topic",
      "name": "Multi-Agent Systems",
      "type": "topic",
      "count": 32
    },
    {
      "id": 17791452102387080,
      "slug": "ai-observability-topic",
      "name": "AI Observability",
      "type": "topic",
      "count": 32
    },
    {
      "id": 17791452098540351,
      "slug": "code-agents-topic",
      "name": "Code Agents",
      "type": "topic",
      "count": 28
    },
    {
      "id": 17731007202817379,
      "slug": "zhipu-ai-organization",
      "name": "Zhipu AI",
      "type": "organization",
      "count": 26
    }
  ],
  "top_sources": [
    {
      "name": "arxiv-rag-search-knowledge",
      "slug": "arxiv-rag-search-knowledge",
      "count": 216
    },
    {
      "name": "arxiv-frontier-methods-select",
      "slug": "arxiv-frontier-methods-select",
      "count": 152
    },
    {
      "name": "arxiv-multimodal-document-ai",
      "slug": "arxiv-multimodal-document-ai",
      "count": 38
    },
    {
      "name": "zhipu-ai-release-notes",
      "slug": "zhipu-ai-release-notes",
      "count": 22
    },
    {
      "name": "arxiv-model-efficiency-engineering",
      "slug": "arxiv-model-efficiency-engineering",
      "count": 18
    },
    {
      "name": "huggingface-nlp-blog",
      "slug": "huggingface-nlp-blog",
      "count": 16
    },
    {
      "name": "arxiv-ai-agents-tool-use",
      "slug": "arxiv-ai-agents-tool-use",
      "count": 10
    },
    {
      "name": "baseten-blog",
      "slug": "baseten-blog",
      "count": 8
    },
    {
      "name": "arxiv-ai-infra-inference-ops",
      "slug": "arxiv-ai-infra-inference-ops",
      "count": 4
    },
    {
      "name": "huggingface-research-blog",
      "slug": "huggingface-research-blog",
      "count": 4
    }
  ],
  "recent_stories": [
    {
      "id": 1780315440539750544,
      "slug": "reasoning-intensive-regression-9750544",
      "headline": "Reasoning-Intensive Regression",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315440162413506,
      "slug": "reasoning-intensive-regression-2413506",
      "headline": "Reasoning-Intensive Regression",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315301871446910,
      "slug": "from-out-of-distribution-detection-to-hallucination-detectio-1446910",
      "headline": "From Out-of-Distribution Detection to Hallucination Detection: A Geometric View",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315302303714426,
      "slug": "from-out-of-distribution-detection-to-hallucination-detectio-3714426",
      "headline": "From Out-of-Distribution Detection to Hallucination Detection: A Geometric View",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315297531053527,
      "slug": "evodefense-co-evolving-black-box-defense-with-large-language-1053527",
      "headline": "EvoDefense: Co-Evolving Black-Box Defense with Large Language Models",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315297959795628,
      "slug": "evodefense-co-evolving-black-box-defense-with-large-language-9795628",
      "headline": "EvoDefense: Co-Evolving Black-Box Defense with Large Language Models",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315272503096519,
      "slug": "what-gets-unmasked-first-trajectory-analysis-of-diffusion-mo-3096519",
      "headline": "What Gets Unmasked First? Trajectory Analysis of Diffusion Models for Graph-to-Text Generation",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315272158266888,
      "slug": "what-gets-unmasked-first-trajectory-analysis-of-diffusion-mo-8266888",
      "headline": "What Gets Unmasked First? Trajectory Analysis of Diffusion Models for Graph-to-Text Generation",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315235202437979,
      "slug": "maven-improving-generalization-in-agentic-tool-calling-2437979",
      "headline": "MAVEN: Improving Generalization in Agentic Tool Calling",
      "source": "arxiv-ai-agents-tool-use",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315235501143072,
      "slug": "maven-improving-generalization-in-agentic-tool-calling-1143072",
      "headline": "MAVEN: Improving Generalization in Agentic Tool Calling",
      "source": "arxiv-ai-agents-tool-use",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315171250285838,
      "slug": "language-models-learn-constructional-semantics-not-to-mentio-0285838",
      "headline": "Language Models Learn Constructional Semantics, Not To Mention Syntax: Investigating LM Understanding of Paired-Focus Constructions",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315171502080372,
      "slug": "language-models-learn-constructional-semantics-not-to-mentio-2080372",
      "headline": "Language Models Learn Constructional Semantics, Not To Mention Syntax: Investigating LM Understanding of Paired-Focus Constructions",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    }
  ]
}