Roblox LLM Leaderboard
Tracking LLM capabilities regarding Roblox game development.
Benchmarks:
- RobloxQA: Multiple-choice question answering about Roblox APIs and concepts.
- RobloxQA_OpenEnded: Open-ended question answering about Roblox APIs and concepts, with no answer choices provided. Response correctness is judged by an ensemble of reasoning LLMs that compare the generated answer to the correct answer.
{
- "headers": [
- "Model",
- "Precision",
- "Params (B)",
- "Average",
- "RobloxQA",
- "RobloxQA_OpenEnded"
- "data": [
- [
- "deepseek-ai/DeepSeek-R1",
- "FP8",
- 684.531,
- 61.148,
- 80.468,
- 41.829
- [
- "OpenAI/gpt-o3-mini (high)",
- null,
- null,
- 60.931,
- 81.291,
- 40.572
- [
- "Anthropic/Claude-3.7-Sonnet (Thinking)",
- null,
- null,
- 60.39,
- 80.858,
- 39.922
- [
- "Anthropic/Claude-3.7-Sonnet",
- null,
- null,
- 58.938,
- 80.121,
- 37.755
- [
- "OpenAI/gpt-o3-mini (low)",
- null,
- null,
- 57.659,
- 79.688,
- 35.631
- [
- "OpenAI/gpt-4o",
- null,
- null,
- 56.078,
- 77.523,
- 34.634
- [
- "Anthropic/Claude-3.5-Sonnet",
- null,
- null,
- 55.992,
- 76.96,
- 35.024
- [
- "deepseek-ai/DeepSeek-V3",
- "FP8",
- 684.531,
- 55.32,
- 77.566,
- 33.073
- [
- "Qwen/QwQ-32B",
- "FP16",
- 32.764,
- 53.89,
- 76.613,
- 31.166
- [
- "Google/Gemini-2.0-Flash",
- null,
- null,
- 53.846,
- 76.7,
- 30.993
- [
- "boatbomber/Gemma-3-27B-Roblox-Luau",
- "Q4_K_M",
- 27.4,
- 53.175,
- 75.531,
- 30.819
- [
- "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
- "FP16",
- 70.554,
- 50.596,
- 75.401,
- 25.791
- [
- "Google/Gemini-2.0-Flash-Lite",
- null,
- null,
- 50.379,
- 74.924,
- 25.834
- [
- "OpenAI/gpt-4o-mini",
- null,
- null,
- 50.141,
- 74.015,
- 26.268
- [
- "Qwen/Qwen2.5-72B-Instruct",
- "FP8",
- 72.706,
- 50.011,
- 75.314,
- 24.707
- [
- "meta-llama/Llama-3.3-70B-Instruct",
- "FP8",
- 70.554,
- 49.707,
- 75.011,
- 24.404
- [
- "microsoft/phi-4",
- "Q6_K",
- 14.66,
- 49.404,
- 74.924,
- 23.884
- [
- "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
- "Q6_K",
- 24.011,
- 48.884,
- 73.712,
- 24.057
- [
- "google/gemma-3-27b-it",
- "Q4_K_M",
- 27.432,
- 48.755,
- 71.893,
- 25.618
- [
- "mistralai/Mistral-Small-24B-Instruct-2501",
- "FP16",
- 23.572,
- 48.213,
- 72.889,
- 23.537
- [
- "Qwen/Qwen2.5-Coder-32B-Instruct",
- "FP16",
- 32.764,
- 48.191,
- 73.105,
- 23.277
- [
- "boatbomber/R1-Distill-Qwen-14B-Roblox-Luau",
- "Q6_K",
- 14.77,
- 47.129,
- 73.971,
- 20.286
- [
- "google/gemma-3-12b-it",
- "Q4_K_M",
- 12.187,
- 46.003,
- 70.03,
- 21.977
- [
- "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
- "FP8",
- 14.77,
- 45.829,
- 72.109,
- 19.549
- [
- "Qwen/Qwen2.5-7B-Instruct",
- "FP8",
- 7.616,
- 42.71,
- 67.692,
- 17.729
- [
- "internlm/internlm2_5-20b-chat",
- "Q4_K_M",
- 19.861,
- 40.826,
- 63.794,
- 17.859
- [
- "mistralai/Mistral-Nemo-Instruct-2407",
- "Q4_K_M",
- 12.248,
- 40.783,
- 64.227,
- 17.339
- [
- "ibm-granite/granite-3.2-8b-instruct",
- "Q4_K_M",
- 8.171,
- 40.327,
- 64.747,
- 15.908
- [
- "meta-llama/Llama-3.1-8B-Instruct",
- "FP8",
- 8.03,
- 38.268,
- 65.786,
- 10.75
- [
- "mistralai/Mistral-7B-Instruct-v0.3",
- "FP16",
- 7.248,
- 37.727,
- 63.014,
- 12.44
- [
- "meta-llama/Llama-3.2-3B-Instruct",
- "FP8",
- 3.213,
- 32.917,
- 59.42,
- 6.415
- [
- "NousResearch/Hermes-3-Llama-3.2-3B-GGUF",
- "Q6_K",
- 3.213,
- 31.446,
- 53.703,
- 9.189
- [
- "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
- "FP16",
- 1.777,
- 23.756,
- 43.958,
- 3.554
- [
- "boatbomber/Gemma-3-1B-Roblox-Luau",
- "Q5_K_M",
- 1.302,
- 22.089,
- 39.541,
- 4.638
- [
- "google/gemma-3-1b-it",
- "Q4_K_M",
- 1,
- 21.938,
- 39.541,
- 4.335
- [
- "boatbomber/R1-Distill-Qwen-1.5B-Roblox-Luau",
- "Q8_K",
- 1.5,
- 20.876,
- 39.021,
- 2.731
- [
- "metadata": null