Roblox LLM Leaderboard

Tracking the capabilities of LLMs in Roblox game development.

Benchmarks:

  • RobloxQA: Multiple-choice question answering about Roblox APIs and concepts.
  • RobloxQA_OpenEnded: Open-ended question answering about Roblox APIs and concepts, with no multiple-choice options provided. Response correctness is judged by an ensemble of reasoning LLMs that compare the generated answer to the correct answer.
{
  • "headers": [
    • "Model",
    • "Precision",
    • "Params (B)",
    • "Average",
    • "RobloxQA",
    • "RobloxQA_OpenEnded"
    ],
  • "data": [
    • [
      • "deepseek-ai/DeepSeek-R1",
      • "FP8",
      • 684.531,
      • 61.148,
      • 80.468,
      • 41.829
      ],
    • [
      • "OpenAI/gpt-o3-mini (high)",
      • null,
      • null,
      • 60.931,
      • 81.291,
      • 40.572
      ],
    • [
      • "Anthropic/Claude-3.7-Sonnet (Thinking)",
      • null,
      • null,
      • 60.39,
      • 80.858,
      • 39.922
      ],
    • [
      • "Anthropic/Claude-3.7-Sonnet",
      • null,
      • null,
      • 58.938,
      • 80.121,
      • 37.755
      ],
    • [
      • "OpenAI/gpt-o3-mini (low)",
      • null,
      • null,
      • 57.659,
      • 79.688,
      • 35.631
      ],
    • [
      • "OpenAI/gpt-4o",
      • null,
      • null,
      • 56.078,
      • 77.523,
      • 34.634
      ],
    • [
      • "Anthropic/Claude-3.5-Sonnet",
      • null,
      • null,
      • 55.992,
      • 76.96,
      • 35.024
      ],
    • [
      • "deepseek-ai/DeepSeek-V3",
      • "FP8",
      • 684.531,
      • 55.32,
      • 77.566,
      • 33.073
      ],
    • [
      • "Qwen/QwQ-32B",
      • "FP16",
      • 32.764,
      • 53.89,
      • 76.613,
      • 31.166
      ],
    • [
      • "Google/Gemini-2.0-Flash",
      • null,
      • null,
      • 53.846,
      • 76.7,
      • 30.993
      ],
    • [
      • "boatbomber/Gemma-3-27B-Roblox-Luau",
      • "Q4_K_M",
      • 27.4,
      • 53.175,
      • 75.531,
      • 30.819
      ],
    • [
      • "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
      • "FP16",
      • 70.554,
      • 50.596,
      • 75.401,
      • 25.791
      ],
    • [
      • "Google/Gemini-2.0-Flash-Lite",
      • null,
      • null,
      • 50.379,
      • 74.924,
      • 25.834
      ],
    • [
      • "OpenAI/gpt-4o-mini",
      • null,
      • null,
      • 50.141,
      • 74.015,
      • 26.268
      ],
    • [
      • "Qwen/Qwen2.5-72B-Instruct",
      • "FP8",
      • 72.706,
      • 50.011,
      • 75.314,
      • 24.707
      ],
    • [
      • "meta-llama/Llama-3.3-70B-Instruct",
      • "FP8",
      • 70.554,
      • 49.707,
      • 75.011,
      • 24.404
      ],
    • [
      • "microsoft/phi-4",
      • "Q6_K",
      • 14.66,
      • 49.404,
      • 74.924,
      • 23.884
      ],
    • [
      • "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
      • "Q6_K",
      • 24.011,
      • 48.884,
      • 73.712,
      • 24.057
      ],
    • [
      • "google/gemma-3-27b-it",
      • "Q4_K_M",
      • 27.432,
      • 48.755,
      • 71.893,
      • 25.618
      ],
    • [
      • "mistralai/Mistral-Small-24B-Instruct-2501",
      • "FP16",
      • 23.572,
      • 48.213,
      • 72.889,
      • 23.537
      ],
    • [
      • "Qwen/Qwen2.5-Coder-32B-Instruct",
      • "FP16",
      • 32.764,
      • 48.191,
      • 73.105,
      • 23.277
      ],
    • [
      • "boatbomber/R1-Distill-Qwen-14B-Roblox-Luau",
      • "Q6_K",
      • 14.77,
      • 47.129,
      • 73.971,
      • 20.286
      ],
    • [
      • "google/gemma-3-12b-it",
      • "Q4_K_M",
      • 12.187,
      • 46.003,
      • 70.03,
      • 21.977
      ],
    • [
      • "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
      • "FP8",
      • 14.77,
      • 45.829,
      • 72.109,
      • 19.549
      ],
    • [
      • "Qwen/Qwen2.5-7B-Instruct",
      • "FP8",
      • 7.616,
      • 42.71,
      • 67.692,
      • 17.729
      ],
    • [
      • "internlm/internlm2_5-20b-chat",
      • "Q4_K_M",
      • 19.861,
      • 40.826,
      • 63.794,
      • 17.859
      ],
    • [
      • "mistralai/Mistral-Nemo-Instruct-2407",
      • "Q4_K_M",
      • 12.248,
      • 40.783,
      • 64.227,
      • 17.339
      ],
    • [
      • "ibm-granite/granite-3.2-8b-instruct",
      • "Q4_K_M",
      • 8.171,
      • 40.327,
      • 64.747,
      • 15.908
      ],
    • [
      • "meta-llama/Llama-3.1-8B-Instruct",
      • "FP8",
      • 8.03,
      • 38.268,
      • 65.786,
      • 10.75
      ],
    • [
      • "mistralai/Mistral-7B-Instruct-v0.3",
      • "FP16",
      • 7.248,
      • 37.727,
      • 63.014,
      • 12.44
      ],
    • [
      • "meta-llama/Llama-3.2-3B-Instruct",
      • "FP8",
      • 3.213,
      • 32.917,
      • 59.42,
      • 6.415
      ],
    • [
      • "NousResearch/Hermes-3-Llama-3.2-3B-GGUF",
      • "Q6_K",
      • 3.213,
      • 31.446,
      • 53.703,
      • 9.189
      ],
    • [
      • "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
      • "FP16",
      • 1.777,
      • 23.756,
      • 43.958,
      • 3.554
      ],
    • [
      • "boatbomber/Gemma-3-1B-Roblox-Luau",
      • "Q5_K_M",
      • 1.302,
      • 22.089,
      • 39.541,
      • 4.638
      ],
    • [
      • "google/gemma-3-1b-it",
      • "Q4_K_M",
      • 1,
      • 21.938,
      • 39.541,
      • 4.335
      ],
    • [
      • "boatbomber/R1-Distill-Qwen-1.5B-Roblox-Luau",
      • "Q8_K",
      • 1.5,
      • 20.876,
      • 39.021,
      • 2.731
      ]
    ],
  • "metadata": null
}