{
  "kind": "story",
  "slug": "building-an-rl-theorem-proving-workflow-on-modal-4507934",
  "id": 1779433399444507934,
  "record_id": 1779430871731850719,
  "headline": "Building an RL Theorem-Proving Workflow on Modal",
  "summary": "",
  "source": "modal-labs-blog",
  "source_url": "https://modal.com/blog/building-an-rl-theorem-proving-workflow-on-modal",
  "home_domain": "engineering-technology",
  "claim_type": null,
  "sentiment": "positive",
  "significance": "medium",
  "claim_count": 99,
  "reading_time_minutes": 7,
  "published_date": "2026-04-29",
  "created_on": "2026-05-22T07:03:19.101923+00:00",
  "claims": [
    {
      "id": 1779433399914101974,
      "text": "The `lean_server_image` uses `projectnumina/kimina-lean-server:2.0.0`.",
      "evidence_type": "direct_quote",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399902835987,
      "text": "The `gpu_image` uses `debian_slim` with Python 3.11 and `vllm`, `torch`, `transformers`, `datasets`.",
      "evidence_type": "direct_quote",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399906869237,
      "text": "The `orchestrator_image` uses `debian_slim` with Python 3.11 and `requests`, `tqdm`, `numpy`.",
      "evidence_type": "direct_quote",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400182478801,
      "text": "Using Modal, a sparse-reward RL workflow was run across three different runtimes without rebuilding the setup.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399838911153,
      "text": "Running the entire workflow on Modal allowed AE Studio to focus on the experiment instead of infrastructure.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400094524556,
      "text": "Early results showed ES matched or outperformed GRPO in verified proofs per iteration in several runs.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400070215760,
      "text": "Modal reduced wasted GPU time by approximately 3.7x compared to less elastic platforms.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400026403831,
      "text": "Reduced complexity on Modal translated to completing a successful training run in less than two days from project kickoff.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399776544144,
      "text": "AE Studio wanted to test Evolution Strategies (ES) as an alternative to GRPO.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399785997131,
      "text": "ES takes an approach inspired by natural selection, creating a \"population\" of slightly different model versions.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399790489012,
      "text": "ES tests all versions in the population and then steers the original model toward the best-scoring versions.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399794149869,
      "text": "Recent research has shown ES can outperform GRPO in some settings.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399798684716,
      "text": "AE Studio aimed to replicate ES's performance for theorem-proving as a first step to accelerating AI-enabled science.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399802720456,
      "text": "For a language model to prove a theorem, it needs to generate 'code' in a specialized language like Lean.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399806640955,
      "text": "The Lean compiler can verify if a generated proof is correct.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399810048003,
      "text": "Code generation by the LLM is GPU/inference heavy.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399814437922,
      "text": "Proof verification by the Lean compiler runs on the CPU.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399818540062,
      "text": "The workload required three different execution environments: GPU for generation, CPU for verification, and a lightweight process for coordination.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399822110719,
      "text": "A vLLM instance running on GPUs is used for generating proof attempts.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399826254535,
      "text": "Each proof is sent to a Lean verifier running on CPUs, which needs to be isolated.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399830160083,
      "text": "A lightweight process supervises the training loop, sending batches, collecting results, and tracking progress.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399834073349,
      "text": "Setting up this system from scratch would involve managing multiple server environments, a job scheduling system, storage for model checkpoints, and a robust verification service.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399842882976,
      "text": "Modal's per-function images allow each step (GPU generation, Lean verification, orchestration) to declare its own environment.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399846138852,
      "text": "Modal's `.map()` feature enables fanning out many independent evaluations per ES iteration and streaming results.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399850837692,
      "text": "Modal Sandboxes provide isolated, short-lived Lean servers for each verification batch, preventing failures from affecting the whole run.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399854668661,
      "text": "Modal Volumes store the original base model weights, allowing GPU workers to load them without repeated downloads from Hugging Face.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399858906111,
      "text": "Modal Secrets inject credentials into remote functions without leaking them into local shell state.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399862611959,
      "text": "GPU sandboxes take time to warm up, and remote debugging is challenging, which are inherent challenges in bursty multi-GPU experiments.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399866676621,
      "text": "Modal features help reduce cold starts and make remote iteration easier, improving workflow manageability.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399868227097,
      "text": "The goal was to train a model to be good at math theorem proving using Lean.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399874520966,
      "text": "Lean provides a verifiable reward for the training loop based on proof correctness.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399878599603,
      "text": "The training loop was fixed, and only the update rule was changed to compare GRPO and ES performance.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399885948002,
      "text": "GRPO applies a gradient update based on relative performance within groups of proof attempts.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399890307008,
      "text": "ES evaluates a population of perturbed models, scores each by proof success rate, and updates the base model using a weighted combination of perturbations based on rewards.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399898507822,
      "text": "Each compute role was given its own image on Modal (GPU, orchestrator, Lean server).",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399918153830,
      "text": "ES was the easiest part of the workload to distribute.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399922585822,
      "text": "Each perturbation evaluation for ES required the current checkpoint, theorem batch, perturbation seed, and generation parameters.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399930592720,
      "text": "Modal's `.map()` was used for parallel GPU fan-out in ES.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399934533571,
      "text": "With ES, weight perturbations are fully determined by their seeds, avoiding expensive weight transfers between GPUs.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399938844461,
      "text": "Each worker reconstructs the current model from base weights and applies its perturbation.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399942441066,
      "text": "Verification was the part of the system where isolation mattered most due to potential hangs, crashes, or resource consumption.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399950762224,
      "text": "Modal Sandboxes were used for each verification batch, starting a Lean server, sending proofs, collecting results, and shutting down.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399954418005,
      "text": "Proofs were verified in parallel using a fan-out pattern with `verify_batch_in_sandbox.map()`.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399958492926,
      "text": "One iteration created 3,840 proof attempts, split into batches of 64.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399962774779,
      "text": "Modal's sandbox model allows scaling verification much further, potentially running each proof attempt in its own sandbox.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399966428133,
      "text": "The entire model state in ES can be described by the base model plus a list of (seed, reward) pairs.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399970995752,
      "text": "Each perturbation is generated from a deterministic random seed, eliminating the need to store noise vectors.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399972411698,
      "text": "The orchestrator maintained and passed a running list of seed/reward entries to each worker.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399994790406,
      "text": "Each history entry for checkpointing was about 200 bytes per iteration.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433399998088866,
      "text": "On the GPU side, workers loaded the original base model from a Modal Volume.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400001697852,
      "text": "Workers replayed the full history to reconstruct current weights before applying their own perturbation.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400003266947,
      "text": "The replay process regenerates noise on GPU using deterministic seeds and applies weighted updates.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400005127234,
      "text": "The base model lives in a Modal Volume, preventing re-downloading from Hugging Face for each worker.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400006318137,
      "text": "The full model state traveled as a plain Python list, small enough to pass as a function argument to every remote call.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400014017473,
      "text": "Modal provided an ideal balance of speed, simplicity, and cost for this experiment.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400018457678,
      "text": "The Modal implementation required approximately 250 lines of platform setup code.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400022959943,
      "text": "Similar experiments on other platforms typically require about 600 lines of setup code.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400030681171,
      "text": "This completion time is 60% less than seen when using alternative platforms.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400031303453,
      "text": "Faster iteration speed is expected for tweaking and optimizing the training pipeline on Modal.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    },
    {
      "id": 1779433400042525222,
      "text": "Runtime efficiency also improved significantly with Modal.",
      "evidence_type": "paraphrase",
      "confidence": "stated",
      "home_domain": "engineering-technology",
      "published_date": "2026-04-29"
    }
  ],
  "tags": [
    {
      "id": 17733546496557335,
      "slug": "enterprise-agents-organization",
      "name": "Enterprise Agents",
      "type": "organization"
    },
    {
      "id": 17730927799169010,
      "slug": "hugging-face-organization",
      "name": "Hugging Face",
      "type": "organization"
    },
    {
      "id": 17733575131389879,
      "slug": "imi-organization",
      "name": "IMI",
      "type": "organization"
    },
    {
      "id": 17733518275501461,
      "slug": "modal-organization",
      "name": "Modal",
      "type": "organization"
    },
    {
      "id": 17730963186726720,
      "slug": "project-numina-organization",
      "name": "Project Numina",
      "type": "organization"
    },
    {
      "id": 17731117272241279,
      "slug": "studio-organization",
      "name": "Studio",
      "type": "organization"
    },
    {
      "id": 17728325767450738,
      "slug": "david-y-gan-person",
      "name": "David Y. Gan",
      "type": "person"
    },
    {
      "id": 17726770817879223,
      "slug": "risto-miikkulainen-person",
      "name": "Risto Miikkulainen",
      "type": "person"
    },
    {
      "id": 17723038994338982,
      "slug": "sebastian-raschka-person",
      "name": "Sebastian Raschka",
      "type": "person"
    },
    {
      "id": 17791452103823983,
      "slug": "ai-infrastructure-topic",
      "name": "AI Infrastructure",
      "type": "topic"
    },
    {
      "id": 17723038993834764,
      "slug": "artificial-intelligence-topic",
      "name": "Artificial Intelligence",
      "type": "topic"
    },
    {
      "id": 17791452103543441,
      "slug": "gpu-clusters-topic",
      "name": "GPU Clusters",
      "type": "topic"
    },
    {
      "id": 17782518580601405,
      "slug": "machine-learning-research-topic",
      "name": "Machine Learning Research",
      "type": "topic"
    },
    {
      "id": 17791452099463022,
      "slug": "reasoning-models-topic",
      "name": "Reasoning Models",
      "type": "topic"
    }
  ]
}