
Nanogpt
Build and train minimal GPT-2 models to understand transformer architecture and language model internals.
Install
npx skills add https://github.com/orchestra-research/ai-research-skills --skill nanogpt
What is this skill?
- Clean GPT-2 implementation in ~300 lines
- Educational transformer architecture with multi-head attention
- Minimal dependencies for learning language model internals
Adoption & trust: 1 install on skills.sh; 9.4k GitHub stars; 2 of 3 security scanners passed (skills.sh audits).
Recommended Skills
- Microsoft Foundry (microsoft/azure-skills)
- Azure Ai (microsoft/azure-skills)
- Azure Hosted Copilot Sdk (microsoft/azure-skills)
- Lark Event (larksuite/cli)
- Running Claude Code Via Litellm Copilot (xixu-me/skills)
- Setup Matt Pocock Skills (mattpocock/skills)
Journey fit
Primary fit
NanoGPT helps builders validate core transformer concepts and LLM mechanics through clean, minimal code before scaling to production systems. The ~300-line implementation is ideal for prototyping and experimenting with GPT architecture without the complexity of production frameworks.
Common Questions / FAQ
Is Nanogpt safe to install?
skills.sh reports 2 of 3 security scanners passed. Review the Security Audits panel on this page before installing in production.
SKILL.md
README | SKILL.md - Nanogpt
# NanoGPT Architecture

## Model Structure (~300 Lines)

NanoGPT implements a clean GPT-2 architecture in minimal code for educational purposes.

### Complete Model (model.py)

```python
import math
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F

class CausalSelfAttention(nn.Module):
    """Multi-head masked self-attention layer."""

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0

        # Key, query, value projections for all heads (batched)
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # Output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        # Regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout

        # Flash attention flag
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')

        if not self.flash:
            # Causal mask (lower triangular)
            self.register_buffer("bias", torch.tril(
                torch.ones(config.block_size, config.block_size)
            ).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()  # batch, seq_len, embedding_dim

        # Calculate Q, K, V for all heads in batch
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)

        # Reshape for multi-head attention
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # Attention
        if self.flash:
            # Flash Attention (PyTorch 2.0+)
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.dropout if self.training else 0,
                is_causal=True
            )
        else:
            # Manual attention implementation
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v  # (B, nh, T, hs)

        # Reassemble all head outputs
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # Output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    """Feedforward network (2-layer with GELU activation)."""

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    """Transformer block (attention + MLP with residuals)."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))  # Pre-norm + residual
        x = x + self.mlp(self.ln_2(x))  # Pre-norm + residual
        return x

@dataclass
class GPTConfig:
    """GPT model configuration."""
    block_size: int = 1024  # Max sequence length
    vocab_size: int = 50304  # GPT-2 vocab size (50257 rounded up for efficiency)
    n_layer: int = 12  # Number of layers
    n_head: int = 12