{
  "kind": "tag",
  "slug": "inference-optimization-topic",
  "id": 17791452102628180,
  "name": "Inference Optimization",
  "type": "topic",
  "aliases": [
    "inference optimization",
    "inference_optimization",
    "LLM inference optimization"
  ],
  "diffbot_id": null,
  "story_count_14d": 856,
  "cooccurring_tags": [
    {
      "id": 17723038993834764,
      "slug": "artificial-intelligence-topic",
      "name": "Artificial Intelligence",
      "type": "topic",
      "count": 452
    },
    {
      "id": 17723038994323052,
      "slug": "arxiv-organization",
      "name": "arXiv",
      "type": "organization",
      "count": 392
    },
    {
      "id": 17791452103823983,
      "slug": "ai-infrastructure-topic",
      "name": "AI Infrastructure",
      "type": "topic",
      "count": 182
    },
    {
      "id": 17791452099123760,
      "slug": "llm-evals-topic",
      "name": "LLM Evals",
      "type": "topic",
      "count": 172
    },
    {
      "id": 17791452097663640,
      "slug": "ai-agents-topic",
      "name": "AI Agents",
      "type": "topic",
      "count": 74
    },
    {
      "id": 17791452099463022,
      "slug": "reasoning-models-topic",
      "name": "Reasoning Models",
      "type": "topic",
      "count": 62
    },
    {
      "id": 17791452103543441,
      "slug": "gpu-clusters-topic",
      "name": "GPU Clusters",
      "type": "topic",
      "count": 52
    },
    {
      "id": 17723038994304984,
      "slug": "baseten-organization",
      "name": "Baseten",
      "type": "organization",
      "count": 50
    },
    {
      "id": 17791452102923593,
      "slug": "quantization-topic",
      "name": "Quantization",
      "type": "topic",
      "count": 42
    },
    {
      "id": 17730948119041167,
      "slug": "multimodal-ai-topic",
      "name": "Multimodal AI",
      "type": "topic",
      "count": 40
    },
    {
      "id": 17730931225185240,
      "slug": "tool-use-topic",
      "name": "Tool Use",
      "type": "topic",
      "count": 34
    },
    {
      "id": 17723038993835295,
      "slug": "cloud-computing-topic",
      "name": "Cloud Computing",
      "type": "topic",
      "count": 28
    },
    {
      "id": 17730981183425062,
      "slug": "distillation-topic",
      "name": "Distillation",
      "type": "topic",
      "count": 26
    },
    {
      "id": 17731005482466606,
      "slug": "model-security-topic",
      "name": "Model Security",
      "type": "topic",
      "count": 24
    },
    {
      "id": 17733541291350092,
      "slug": "ing-organization",
      "name": "ING",
      "type": "organization",
      "count": 22
    }
  ],
  "top_sources": [
    {
      "name": "arxiv-ai-agents-tool-use",
      "slug": "arxiv-ai-agents-tool-use",
      "count": 138
    },
    {
      "name": "arxiv-frontier-methods-select",
      "slug": "arxiv-frontier-methods-select",
      "count": 100
    },
    {
      "name": "arxiv-ai-infra-inference-ops",
      "slug": "arxiv-ai-infra-inference-ops",
      "count": 50
    },
    {
      "name": "baseten-blog",
      "slug": "baseten-blog",
      "count": 50
    },
    {
      "name": "arxiv-model-efficiency-engineering",
      "slug": "arxiv-model-efficiency-engineering",
      "count": 48
    },
    {
      "name": "arxiv-rag-search-knowledge",
      "slug": "arxiv-rag-search-knowledge",
      "count": 40
    },
    {
      "name": "arxiv-multimodal-document-ai",
      "slug": "arxiv-multimodal-document-ai",
      "count": 10
    },
    {
      "name": "huggingface-nlp-blog",
      "slug": "huggingface-nlp-blog",
      "count": 10
    },
    {
      "name": "aws-machine-learning-blog",
      "slug": "aws-machine-learning-blog",
      "count": 6
    },
    {
      "name": "zhipu-ai-release-notes",
      "slug": "zhipu-ai-release-notes",
      "count": 6
    }
  ],
  "recent_stories": [
    {
      "id": 1780315090368785219,
      "slug": "uniscale-adaptive-unified-inference-scaling-via-online-joint-8785219",
      "headline": "UniScale: Adaptive Unified Inference Scaling via Online Joint Optimization of Model Routing and Test-Time Scaling",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780315090102845559,
      "slug": "uniscale-adaptive-unified-inference-scaling-via-online-joint-2845559",
      "headline": "UniScale: Adaptive Unified Inference Scaling via Online Joint Optimization of Model Routing and Test-Time Scaling",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780314955887474283,
      "slug": "self-reflective-generation-at-test-time-7474283",
      "headline": "Self-Reflective Generation at Test Time",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780314956161440660,
      "slug": "self-reflective-generation-at-test-time-1440660",
      "headline": "Self-Reflective Generation at Test Time",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780314124443123297,
      "slug": "stop-the-flip-flop-context-preserving-verification-for-fast-3123297",
      "headline": "Stop the Flip-Flop: Context-Preserving Verification for Fast Revocable Diffusion Decoding",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780314123755466535,
      "slug": "stop-the-flip-flop-context-preserving-verification-for-fast-5466535",
      "headline": "Stop the Flip-Flop: Context-Preserving Verification for Fast Revocable Diffusion Decoding",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780313966495526846,
      "slug": "prism-self-pruning-intrinsic-selection-method-for-training-f-5526846",
      "headline": "PRISM: Self-Pruning Intrinsic Selection Method for Training-Free Multimodal Data Selection",
      "source": "arxiv-model-efficiency-engineering",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780313966726965463,
      "slug": "prism-self-pruning-intrinsic-selection-method-for-training-f-6965463",
      "headline": "PRISM: Self-Pruning Intrinsic Selection Method for Training-Free Multimodal Data Selection",
      "source": "arxiv-model-efficiency-engineering",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780313884519352623,
      "slug": "mads-model-aware-diverse-core-set-selection-for-instruction-9352623",
      "headline": "MADS: Model-Aware Diverse Core Set Selection for Instruction Tuning",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780313884759978241,
      "slug": "mads-model-aware-diverse-core-set-selection-for-instruction-9978241",
      "headline": "MADS: Model-Aware Diverse Core Set Selection for Instruction Tuning",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780313843350251894,
      "slug": "towards-efficient-llms-annealing-with-principled-sample-sele-0251894",
      "headline": "Towards Efficient LLMs Annealing with Principled Sample Selection",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    },
    {
      "id": 1780313843555294596,
      "slug": "towards-efficient-llms-annealing-with-principled-sample-sele-5294596",
      "headline": "Towards Efficient LLMs Annealing with Principled Sample Selection",
      "source": "arxiv-rag-search-knowledge",
      "home_domain": "engineering-technology",
      "published_date": "2026-06-01"
    }
  ]
}