diff --git a/assets/dash.css b/assets/dash.css new file mode 100644 index 000000000..fa03745b9 --- /dev/null +++ b/assets/dash.css @@ -0,0 +1,22 @@ +:root { + --font-color: #F1F1F1; + --dropdown-bg: #005050; +} + +body { + background-color: black !important; + color: var(--font-color) !important; +} + +.rc-slider-mark-text { + color: var(--font-color) !important; +} + +.Select-control, .Select-menu-outer, .Select-value-label, .Select-option { + color: var(--font-color) !important; + background-color: var(--dropdown-bg) !important; +} + +h1, h2, h3, h4, h5, h6 { + color: var(--font-color) !important; +} diff --git a/cache_data.py b/cache_data.py new file mode 100644 index 000000000..d1d5c0a13 --- /dev/null +++ b/cache_data.py @@ -0,0 +1,185 @@ +import numpy as np + +import json +import glob +import os + + +env_names = sorted([ + 'breakout', + 'impulse_wars', + 'pacman', + 'tetris', + 'g2048', + 'moba', + 'pong', + 'tower_climb', + 'grid', + 'nmmo3', + 'snake', + 'tripletriad' +]) + +HYPERS = [ + 'train/learning_rate', + 'train/ent_coef', + 'train/gamma', + 'train/gae_lambda', + 'train/vtrace_rho_clip', + 'train/vtrace_c_clip', + 'train/clip_coef', + 'train/vf_clip_coef', + 'train/vf_coef', + 'train/max_grad_norm', + 'train/adam_beta1', + 'train/adam_beta2', + 'train/adam_eps', + 'train/prio_alpha', + 'train/prio_beta0', + 'train/bptt_horizon', + 'train/num_minibatches', + 'train/minibatch_size', + 'policy/hidden_size', + 'env/num_envs', +] + +ALL_KEYS = [ + 'agent_steps', + 'cost', + 'environment/score', + 'environment/perf' +] + HYPERS + +def pareto_idx(steps, costs, scores): + idxs = [] + for i in range(len(steps)): + better = [scores[j] >= scores[i] and + costs[j] < costs[i] and steps[j] < steps[i] + for j in range(len(scores))] + if not any(better): + idxs.append(i) + + return idxs + +def load_sweep_data(path): + data = {} + keys = None + for fpath in glob.glob(path): + if 'cache.json' in fpath: + continue + + with open(fpath, 'r') as f: + exp = json.load(f) + + if not data: + for kk in exp.keys(): + if kk == 'data': + for k, v in exp[kk][-1].items(): + data[k] = [] + else: + data[kk] = [] + + discard = False + for kk in list(data.keys()): + if kk not in exp and kk not in exp['data'][-1]: + discard = True + break + + if discard: + continue + + for kk in list(data.keys()): + if kk in exp: + v = exp[kk] + sweep_key = f'sweep/{kk}/distribution' + if sweep_key in data and exp[sweep_key] == 'logit_normal': + v = 1 - v + elif kk in ('train/vtrace_rho_clip', 'train/vtrace_c_clip'): + v = max(v, 0.1) + + data[kk].append(v) + else: + data[kk].append(exp['data'][-1][kk]) + + steps = data['agent_steps'] + costs = data['cost'] + scores = data['environment/score'] + + idxs = pareto_idx(steps, costs, scores) + + # Filter to pareto + for k in data: + data[k] = [data[k][i] for i in idxs] + + # Monkey patch: Cap performance + data['environment/perf'] = [min(e, 1.0) for e in data['environment/perf']] + + # Monkey patch: Adjust steps by frameskip if present + if 'env/frameskip' in data: + skip = data['env/frameskip'] + data['agent_steps'] = [n*m for n, m in zip(data['agent_steps'], skip)] + + return data + +def cached_sweep_load(path, env_name): + cache_file = os.path.join(path, 'c_cache.json') + if not os.path.exists(cache_file): + data = load_sweep_data(os.path.join(path, '*.json')) + with open(cache_file, 'w') as f: + json.dump(data, f) + + with open(cache_file, 'r') as f: + data = json.load(f) + + print(f'Loaded {env_name}') + return data + +def compute_tsne(): + data = {name: 
cached_sweep_load(f'experiments/logs/puffer_{name}', name) for name in env_names} + + flat = [] + flat_mmin = [] + flat_mmax = [] + for env in env_names: + flat.append(np.stack([data[env][hyper] for hyper in HYPERS], axis=1)) + flat_mmin.append(np.stack([data[env][f'sweep/{hyper}/min'] for hyper in HYPERS], axis=1)) + flat_mmax.append(np.stack([data[env][f'sweep/{hyper}/max'] for hyper in HYPERS], axis=1)) + + flat_distribution = [data[env][f'sweep/{hyper}/distribution'] for env in env_names for hyper in HYPERS] + + flat = np.concatenate(flat, axis=0) + flat_mmin = np.concatenate(flat_mmin, axis=0).min(axis=0) + flat_mmax = np.concatenate(flat_mmax, axis=0).max(axis=0) + + normed = flat.copy() + for i in range(len(HYPERS)): + dist = flat_distribution[i] + if 'log' in dist or 'pow2' in dist: + flat_mmin[i] = np.log(flat_mmin[i]) + flat_mmax[i] = np.log(flat_mmax[i]) + normed[:, i] = np.log(flat[:, i]) + + normed[:, i] = (normed[:, i] - flat_mmin[i]) / (flat_mmax[i] - flat_mmin[i]) + + from sklearn.manifold import TSNE + proj = TSNE(n_components=2) + reduced = proj.fit_transform(normed) + + row = 0 + for env in env_names: + ''' + for i, hyper in enumerate(HYPERS): + sz = len(data[env][hyper]) + data[env][hyper] = normed[row:row+sz, i].tolist() + ''' + sz = len(data[env]['agent_steps']) + + data[env] = {k: v for k, v in data[env].items() if k in ALL_KEYS} + data[env]['tsne1'] = reduced[row:row+sz, 0].tolist() + data[env]['tsne2'] = reduced[row:row+sz, 1].tolist() + row += sz + + json.dump(data, open('all_cache.json', 'w')) + +if __name__ == '__main__': + compute_tsne() diff --git a/compile_puffer.py b/compile_puffer.py new file mode 100644 index 000000000..73d6a665c --- /dev/null +++ b/compile_puffer.py @@ -0,0 +1,196 @@ +import torch +from torch import nn +from torch.utils.benchmark import Timer +from torch.utils.flop_counter import FlopCounterMode +from torch import func + +from torch.backends import cudnn +cudnn.benchmark = True +cudnn.deterministic = False +cudnn.benchmark_limit = 32 + +torch.set_float32_matmul_precision('high') +torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True + +class Default(nn.Module): + def __init__(self, input_size, hidden_size, output_size): + super().__init__() + self.hidden_size = hidden_size + self.encoder = torch.nn.Sequential( + nn.Linear(input_size, hidden_size), + nn.GELU(), + ) + self.decoder = nn.Linear(hidden_size, output_size) + self.value = nn.Linear(hidden_size, 1) + + def forward(self, observations): + hidden = self.encode_observations(observations) + logits, values = self.decode_actions(hidden) + return logits, values + + def encode_observations(self, observations, state=None): + batch_size = observations.shape[0] + observations = observations.view(batch_size, -1) + return self.encoder(observations) + + def decode_actions(self, hidden): + logits = self.decoder(hidden) + values = self.value(hidden) + return logits, values + + +class LSTMWrapper(nn.Module): + def __init__(self, policy, input_size, hidden_size, output_size): + super().__init__() + self.policy = policy + input_size = hidden_size + + self.input_size = input_size + self.hidden_size = hidden_size + + self.cell = torch.nn.LSTMCell(input_size, hidden_size) + + def forward(self, observations, h, c): + hidden = self.policy.encode_observations(observations) + hidden, c = self.cell(hidden, (h, c)) + logits, values = self.policy.decode_actions(hidden) + return logits, values, hidden, c + +def get_params_and_buffers(model): + buffers = dict(model.named_buffers()) + 
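+    # func.functional_call expects a single name -> tensor mapping, so buffers and
+    # trainable parameters are merged into one dict below; parameters with
+    # requires_grad=False are skipped.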
param_names = [k for k, v in model.named_parameters() if v.requires_grad] + params = [v for k, v in model.named_parameters() if v.requires_grad] + params_dict = dict(zip(param_names, params)) + return {**buffers, **params_dict} + + +@torch.compile(fullgraph=True, dynamic=False, mode='reduce-overhead') +def functional_forward(model, params_and_buffers, batch, h, c): + return func.functional_call(model, params_and_buffers, (batch, h, c)) + +def rollout(model, params_and_buffers, batch, h, c, seq): + all_logits = [] + all_values = [] + for i in range(seq): + logits, values, h, c = functional_forward(model, params_and_buffers, batch[i], h, c) + all_logits.append(logits) + all_values.append(values) + + logits = torch.stack(all_logits, dim=0) + values = torch.stack(all_values, dim=0) + + return logits, values + +@torch.compile(fullgraph=True, dynamic=False, mode='reduce-overhead') +def fast_rollout(model, batch, h, c, seq): + logits = torch.empty(seq, batch.shape[1], OUTPUT_SIZE, device=batch.device, dtype=batch.dtype) + values = torch.empty(seq, batch.shape[1], 1, device=batch.device, dtype=batch.dtype) + for i in range(seq): + l, v, h, c = model(batch[i], h, c) + logits[i] = l + values[i] = v + + return logits, values + +def evaluate(model, params_and_buffers, batch, h, c, seq): + with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16): + return fast_rollout(model, batch, h, c, seq) + +def compute_loss(params_and_buffers, model, batch, h, c, seq): + logits, values = rollout(model, params_and_buffers, batch, h, c, seq) + loss = -torch.log(torch.softmax(logits, dim=-1)).mean() + (values**2).mean() + return loss + +grad_fn = torch.compile(func.grad(compute_loss), + fullgraph=True, dynamic=False, mode='reduce-overhead') + +#grad_fn = func.grad(compute_loss) + +def train(model, params_and_buffers, batch, h, c, loops, seq): + with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16): + for _ in range(loops): + grads = grad_fn(params_and_buffers, model, batch, h, c, seq) + for name in grads: + params_and_buffers[name].sub_(0.01 * grads[name]) + + return params_and_buffers + +if __name__ == '__main__': + INPUT_SIZE = 128 + HIDDEN_SIZE = 128 + OUTPUT_SIZE = 4 + B = 256 + SEQ = 64 + LOOPS = 4 + dtype = torch.bfloat16 + + model = LSTMWrapper( + Default(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE), + INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE + ).cuda() + + # TODO: carefully test slowdown from this + params_and_buffers = get_params_and_buffers(model) + #model = torch.compile(model, mode='reduce-overhead', dynamic=False, fullgraph=True) + + # Create input batch + batch = torch.randn(SEQ, B, INPUT_SIZE).cuda().to(dtype) + + # Define a multi-step function to run multiple forwards in one compiled graph + # Manual FLOPs calculation + I = INPUT_SIZE + H = HIDDEN_SIZE + O = OUTPUT_SIZE + flops = B * (2*I*H + 16*H*H + 2*H*O + 2*H) + + h = torch.zeros(B, HIDDEN_SIZE).cuda().to(dtype) + c = torch.zeros(B, HIDDEN_SIZE).cuda().to(dtype) + + # Warmup + for _ in range(3): + _ = evaluate(model, params_and_buffers, batch, h, c, SEQ) + # Timing + timer = Timer( + stmt='evaluate(model, params_and_buffers, batch, h, c, SEQ)', + globals={ + 'evaluate': evaluate, + 'params_and_buffers': params_and_buffers, + 'model': model, + 'batch': batch, + 'h': h, + 'c': c, + 'SEQ': SEQ, + } + ) + output = timer.timeit(LOOPS) + + cost = output.mean / SEQ # Average time per forward pass (fixed from times[0] to mean) + FLOPS = flops / cost + perf_evaluate = f'FLOPS: {FLOPS / 1e12:.2f}T, SPS: {B/cost/1e6:.2f}M' 
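+    # Rough accounting behind the manual FLOPs estimate above (2 FLOPs per
+    # multiply-accumulate, elementwise activations ignored):
+    #   encoder Linear:  2*I*H per sample
+    #   LSTMCell:        W_ih (4H x H) and W_hh (4H x H) matmuls -> 8*H*H + 8*H*H = 16*H*H
+    #                    (the cell input is hidden_size-dim since LSTMWrapper sets input_size = hidden_size)
+    #   decoder + value: 2*H*O + 2*H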
+ + # Warmup + for _ in range(1): + _ = train(model, params_and_buffers, batch, h, c, LOOPS, SEQ) + + # Timing + timer = Timer( + stmt='train(model, params_and_buffers, batch, h, c, LOOPS, SEQ)', + globals={ + 'train': train, + 'params_and_buffers': params_and_buffers, + 'model': model, + 'batch': batch, + 'h': h, + 'c': c, + 'LOOPS': LOOPS, + 'SEQ': SEQ, + } + ) + + output = timer.timeit(1) + cost = output.mean / SEQ / LOOPS # Average time per forward pass (fixed from times[0] to mean) + FLOPS = 3*flops / cost + perf_train = f'FLOPS: {FLOPS / 1e12:.2f}T, SPS: {B/cost/1e6:.2f}M' + + print(perf_evaluate) + print(perf_train) diff --git a/constellation.py b/constellation.py new file mode 100644 index 000000000..a0bed137f --- /dev/null +++ b/constellation.py @@ -0,0 +1,890 @@ +from dash import Dash, html, dcc +from dash.dependencies import Input, Output +import pandas as pd +import plotly.graph_objects as go +import plotly.express as px +import numpy as np +import json +import glob +import os + +FONT_FAMILY = 'Arial' +FONT_SIZE_TITLE = 28 +FONT_SIZE_AXIS = 22 +FONT_SIZE_TICK = 20 +FONT_SIZE_TICK_3D = 14 +FONT_SIZE_LEGEND = 18 +FONT_COLOR = '#f1f1f1' +PLOT_BG_COLOR = '#061a1a' +PAPER_BG_COLOR = '#061a1a' +LINE_WIDTH = 4 +LINE_COLORS = ["#0000b3", "#0010d9", "#0020ff", "#0040ff", "#0060ff", "#0080ff", "#009fff", "#00bfff", "#00ffff"][::-1] +roygbiv = np.random.permutation(['aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'grey', 'green', 'greenyellow', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgrey', 'lightgreen', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen']) +#roygbiv = ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'violet'] +TITLE_FONT = dict( + family=FONT_FAMILY, + size=FONT_SIZE_TITLE, + color=FONT_COLOR +) +AXIS_FONT = dict( + family=FONT_FAMILY, + size=FONT_SIZE_AXIS, + color=FONT_COLOR +) +TICK_FONT = 
dict( + family=FONT_FAMILY, + size=FONT_SIZE_TICK, + color=FONT_COLOR +) +GRID_COLOR = '#00f1f1' +TICK_FONT_3D = dict( + family=FONT_FAMILY, + size=FONT_SIZE_TICK_3D, + color=FONT_COLOR +) +LEGEND_FONT = dict( + family=FONT_FAMILY, + size=FONT_SIZE_LEGEND, + color=FONT_COLOR +) +HYPERS = [ + 'train/learning_rate', + 'train/ent_coef', + 'train/gamma', + 'train/gae_lambda', + 'train/vtrace_rho_clip', + 'train/vtrace_c_clip', + 'train/clip_coef', + 'train/vf_clip_coef', + 'train/vf_coef', + 'train/max_grad_norm', + 'train/adam_beta1', + 'train/adam_beta2', + 'train/adam_eps', + 'train/prio_alpha', + 'train/prio_beta0', + 'train/bptt_horizon', + 'train/num_minibatches', + 'train/minibatch_size', + 'policy/hidden_size', + 'env/num_envs', +] +ALL_KEYS = [ + 'agent_steps', + 'cost', + 'environment/score', + 'environment/perf' +] + HYPERS + +SCATTER_COLOR = ['env_name'] + ALL_KEYS + +import colorsys +import numpy as np + +def rgb_to_hex(rgb): + """Convert RGB tuple to hex string.""" + return '#%02x%02x%02x' % (int(rgb[0]*255), int(rgb[1]*255), int(rgb[2]*255)) + +def generate_distinct_palette(n): + """ + Generate a palette with n maximally distinct colors across the hue spectrum. + + Parameters: + n (int): Number of colors to generate. + + Returns: + list: List of hex color strings. + """ + if n < 1: + raise ValueError("n must be at least 1") + + # Generate hues evenly spaced across the spectrum (0 to 1) + hues = np.linspace(0, 1, n, endpoint=False) + colors = [] + for hue in hues: + # Use full saturation and value for vivid colors + rgb = colorsys.hsv_to_rgb(hue, 1.0, 1.0) + colors.append(rgb) + hex_colors = [rgb_to_hex(color) for color in colors] + return hex_colors + +def pareto_idx(steps, costs, scores): + idxs = [] + for i in range(len(steps)): + better = [scores[j] >= scores[i] and + costs[j] < costs[i] and steps[j] < steps[i] + for j in range(len(scores))] + if not any(better): + idxs.append(i) + + return idxs + +def build_dataset(dataframe): + dataset = [] + for hyper in HYPERS: + dat = dataframe[hyper] + #mmin = dataframe[f'sweep/{hyper}/min'] + #mmax = dataframe[f'sweep/{hyper}/max'] + #distribution = dataframe[f'sweep/{hyper}/distribution'] + + + +def load_sweep_data(path): + data = {} + keys = None + for fpath in glob.glob(path): + with open(fpath, 'r') as f: + exp = json.load(f) + + if not data: + for kk in exp.keys(): + if kk == 'data': + for k, v in exp[kk][-1].items(): + data[k] = [] + else: + data[kk] = [] + + discard = False + for kk in list(data.keys()): + if kk not in exp and kk not in exp['data'][-1]: + discard = True + break + + if discard: + continue + + for kk in list(data.keys()): + if kk in exp: + v = exp[kk] + sweep_key = f'sweep/{kk}/distribution' + if sweep_key in data and exp[sweep_key] == 'logit_normal': + v = 1 - v + elif kk in ('train/vtrace_rho_clip', 'train/vtrace_c_clip'): + v = max(v, 0.1) + + data[kk].append(v) + else: + data[kk].append(exp['data'][-1][kk]) + + return data + +def cached_sweep_load(path, env_name): + cache_file = os.path.join(path, 'cache.json') + if not os.path.exists(cache_file): + data = load_sweep_data(os.path.join(path, '*.json')) + with open(cache_file, 'w') as f: + json.dump(data, f) + + with open(cache_file, 'r') as f: + data = json.load(f) + + steps = data['agent_steps'] + costs = data['cost'] + scores = data['environment/score'] + + idxs = pareto_idx(steps, costs, scores) + + # Create a DataFrame for this environment + df_data = {} + for k in data: + df_data[k] = [data[k][i] for i in idxs] + + # Apply performance cap + 
df_data['environment/perf'] = [min(e, 1.0) for e in df_data['environment/perf']] + + # Adjust steps by frameskip if present + if 'env/frameskip' in df_data: + skip = df_data['env/frameskip'] + df_data['agent_steps'] = [n*m for n, m in zip(df_data['agent_steps'], skip)] + + # Add environment name + df_data['env_name'] = [env_name] * len(idxs) + + return pd.DataFrame(df_data) + +def compute_tsne(): + dataset = EXPERIMENTS[HYPERS].copy() # Create a copy to avoid modifying the original + + # Normalize each hyperparameter column using its corresponding min and max columns + for hyper in HYPERS: + min_col = f'sweep/{hyper}/min' + max_col = f'sweep/{hyper}/max' + + mmin = min(EXPERIMENTS[min_col]) + mmax = max(EXPERIMENTS[max_col]) + + distribution = EXPERIMENTS[f'sweep/{hyper}/distribution'] + if 'log' in distribution or 'pow2' in distribution: + mmin = np.log(mmin) + mmax = np.log(mmax) + normed = np.log(dataset[hyper]) + else: + normed = dataset[hyper] + + dataset[hyper] = (normed - mmin) / (mmax - mmin) + # Normalize: (value - min) / (max - min) for each row + + #dataset[hyper] = (dataset[hyper] - EXPERIMENTS[min_col]) / (EXPERIMENTS[max_col] - EXPERIMENTS[min_col]) + + # Filter dataset based on performance threshold + # Apply TSNE + from sklearn.manifold import TSNE + proj = TSNE(n_components=2) + reduced = proj.fit_transform(dataset) + EXPERIMENTS['tsne1'] = reduced[:, 0] + EXPERIMENTS['tsne2'] = reduced[:, 1] + +env_names = ['tripletriad', 'grid', 'moba', 'tower_climb', 'tetris', 'breakout', 'pong', 'g2048', 'snake', 'pacman'] +env_all = ['all'] + env_names +#env_names = ['grid', 'breakout', 'g2048'] +#env_names = ['grid'] + +roygbiv = generate_distinct_palette(len(env_names)) + +# Create a list of DataFrames for each environment +dfs = [cached_sweep_load(f'experiments/logs/puffer_{name}', name) for name in env_names] + +# Concatenate all DataFrames into a single DataFrame +EXPERIMENTS = pd.concat(dfs, ignore_index=True) +#EXPERIMENTS.set_index('env_name', inplace=True) +compute_tsne() + +app = Dash() +app.css.append_css({'external_stylesheets': 'dash.css'}) +app.layout = html.Div([ + html.H1('Puffer Constellation', style={'textAlign': 'center'}), + html.Br(), + + html.Label([ + "X: ", + dcc.Dropdown( + id="optimal-dropdown-x", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="cost", + style={"width": "50%"} + ) + ]), + html.Label([ + "Y: ", + dcc.Dropdown( + id="optimal-dropdown-y", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="agent_steps", + style={"width": "50%"} + ) + ]), + html.Label([ + "Z: ", + dcc.Dropdown( + id="optimal-dropdown-z", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="environment/perf", + style={"width": "50%"} + ) + ]), + dcc.Graph(id='optimal'), + html.Br(), + + html.Label([ + "Environment: ", + dcc.Dropdown( + id="scatter-dropdown-env", + options=[{"label": key, "value": key} for key in env_all], + value="all", + style={"width": "50%"} + ) + ]), + html.Br(), + html.Label([ + "X: ", + dcc.Dropdown( + id="scatter-dropdown-x", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="train/learning_rate", + style={"width": "50%"} + ), + dcc.Checklist( + id="scatter-checkbox-logx", + options=[{"label": "Log", "value": "log"}], + value=["log"], + style={"display": "inline-block", "margin-left": "10px"} + ), + ]), + html.Br(), + html.Label([ + "Y: ", + dcc.Dropdown( + id="scatter-dropdown-y", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="environment/perf", + 
style={"width": "50%"} + ), + dcc.Checklist( + id="scatter-checkbox-logy", + options=[{"label": "Log", "value": "log"}], + value=[], + style={"display": "inline-block", "margin-left": "10px"} + ), + + ]), + html.Br(), + html.Label([ + "Color: ", + dcc.Dropdown( + id="scatter-dropdown-color", + options=[{"label": key, "value": key} for key in SCATTER_COLOR], + value="env_name", + style={"width": "50%"} + ) + ]), + html.Br(), + html.Label([ + "Range 1: ", + dcc.Dropdown( + id="scatter-dropdown-range-1", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="agent_steps", + style={"width": "50%"} + ), + dcc.RangeSlider( + id='scatter-range-1', + min=0.0, + max=1.0, + step=0.05, + value=[0.0, 0.25] + ), + ]), + html.Br(), + html.Label([ + "Range 2: ", + dcc.Dropdown( + id="scatter-dropdown-range-2", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="cost", + style={"width": "50%"} + ), + dcc.RangeSlider( + id='scatter-range-2', + min=0.0, + max=1.0, + step=0.05, + value=[0.0, 0.95] + ), + ]), + dcc.Graph(id='scatter'), + html.Br(), + + #html.Label([ + # "X Axis: ", + # dcc.Dropdown( + # id="hyper-box-x", + # options=[{"label": key, "value": key} for key in ['cost', 'agent_steps']], + # value="agent_steps", + # style={"width": "50%"} + # ) + #]), + #dcc.Graph(id='hyper-box'), + + html.Br(), + html.Label([ + "Range 1: ", + dcc.Dropdown( + id="hyper-dropdown-range-1", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="environment/perf", + style={"width": "50%"} + ), + dcc.RangeSlider( + id='hyper-range-1', + min=0.0, + max=1.0, + step=0.05, + value=[0.8, 1.0] + ), + ]), + html.Br(), + html.Label([ + "Range 2: ", + dcc.Dropdown( + id="hyper-dropdown-range-2", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="agent_steps", + style={"width": "50%"} + ), + dcc.RangeSlider( + id='hyper-range-2', + min=0.0, + max=1.0, + step=0.05, + value=[0.0, 1.0] + ), + ]), + dcc.Graph(id='hyper'), + + + html.Br(), + html.Label([ + "Range 1: ", + dcc.Dropdown( + id="tsnee-dropdown-range-1", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="environment/perf", + style={"width": "50%"} + ), + dcc.RangeSlider( + id='tsnee-range-1', + min=0.0, + max=1.0, + step=0.05, + value=[0.5, 1.0] + ), + ]), + html.Br(), + html.Label([ + "Range 2: ", + dcc.Dropdown( + id="tsnee-dropdown-range-2", + options=[{"label": key, "value": key} for key in ALL_KEYS], + value="cost", + style={"width": "50%"} + ), + dcc.RangeSlider( + id='tsnee-range-2', + min=0.0, + max=1.0, + step=0.05, + value=[0.0, 1.0] + ), + ]), + dcc.Graph(id='tsnee'), + +], +style={"width": 1280} +) + +import plotly.express as px +import plotly.graph_objects as go +import numpy as np +from scipy.spatial.distance import cdist + +# Assuming EXPERIMENTS is your pandas DataFrame, and xkey, ykey, zkey are defined. 
+# Also assuming percentages for cutoffs, e.g.: +percentage1 = 5.0 # Percentage for XYZ distance threshold relative to plot diagonal in transformed space +percentage2 = 0.5 # Percentage for PCA distance threshold relative to PCA diagonal + +@app.callback( + Output("optimal", "figure"), + Input("optimal-dropdown-x", "value"), + Input("optimal-dropdown-y", "value"), + Input("optimal-dropdown-z", "value") +) +def update_optimal_plot(xkey, ykey, zkey): + all_x = EXPERIMENTS[xkey].values + all_y = EXPERIMENTS[ykey].values + all_z = EXPERIMENTS[zkey].values + all_pca1 = EXPERIMENTS['tsne1'].values + all_pca2 = EXPERIMENTS['tsne2'].values + all_env = EXPERIMENTS['env_name'].values# Handle transformed coordinates for XYZ (accounting for log axes) + trans_x = np.log10(all_x) # Assuming all_x > 0 + trans_y = np.log10(all_y) # Assuming all_y > 0 + trans_z = all_z + points_trans_xyz = np.column_stack((trans_x, trans_y, trans_z)) + + # Compute ranges in transformed space + range_tx = np.max(trans_x) - np.min(trans_x) + range_ty = np.max(trans_y) - np.min(trans_y) + range_tz = np.max(trans_z) - np.min(trans_z) + diagonal_xyz = np.sqrt(range_tx**2 + range_ty**2 + range_tz**2) + delta1 = (percentage1 / 100.0) * diagonal_xyz + + # For PCA (assuming linear scales) + points_pca = np.column_stack((all_pca1, all_pca2)) + range_p1 = np.max(all_pca1) - np.min(all_pca1) + range_p2 = np.max(all_pca2) - np.min(all_pca2) + diagonal_pca = np.sqrt(range_p1**2 + range_p2**2) + delta2 = (percentage2 / 100.0) * diagonal_pca + + # Create the base scatter plot + f = px.scatter_3d( + x=all_x, + y=all_y, + z=all_z, + color=all_env, + log_x=True, + log_y=True, + log_z=False, + color_discrete_sequence=roygbiv + ) + + # Compute pairwise L2 distances in transformed spaces + dists_xyz = cdist(points_trans_xyz, points_trans_xyz) + dists_pca = cdist(points_pca, points_pca) + + # Create boolean masks + xyz_mask = dists_xyz < delta1 + pca_mask = dists_pca < delta2 + # Use boolean array for upper triangle to avoid type mismatch + triu_mask = np.triu(np.ones_like(dists_xyz, dtype=bool), k=1) + + # Combine masks with boolean operations + mask = xyz_mask & pca_mask & triu_mask + + # Get indices of valid pairs + i, j = np.where(mask) + + # Collect line segment coordinates (in original space) + line_x = [] + line_y = [] + line_z = [] + for k in range(len(i)): + line_x.extend([all_x[i[k]], all_x[j[k]], None]) + line_y.extend([all_y[i[k]], all_y[j[k]], None]) + line_z.extend([all_z[i[k]], all_z[j[k]], None]) + + # Add the lines as a single trace + if line_x: + f.add_trace( + go.Scatter3d( + x=line_x, + y=line_y, + z=line_z, + mode='lines', + line=dict(color='rgba(255,255,255,0.25)', width=2), + showlegend=False + ) + ) + + # Show the figure + f.show() + + layout_dict = { + 'title': dict(text='Pareto', font=TITLE_FONT), + 'showlegend': True, + 'legend': dict(font=LEGEND_FONT), + 'plot_bgcolor': PLOT_BG_COLOR, + 'paper_bgcolor': PAPER_BG_COLOR, + 'width': 1280, + 'height': 720, + 'autosize': False, + 'scene': dict( + xaxis=dict( + title=dict(text=xkey, font=AXIS_FONT), + tickfont=TICK_FONT_3D, + type='log', + showgrid=True, + gridcolor=GRID_COLOR, + backgroundcolor=PLOT_BG_COLOR, + zeroline=False + ), + yaxis=dict( + title=dict(text=ykey, font=AXIS_FONT), + tickfont=TICK_FONT_3D, + type='log', + showgrid=True, + gridcolor=GRID_COLOR, + backgroundcolor=PLOT_BG_COLOR, + zeroline=False + ), + zaxis=dict( + title=dict(text=zkey, font=AXIS_FONT), + tickfont=TICK_FONT_3D, + type='linear', + showgrid=True, + gridcolor=GRID_COLOR, + 
backgroundcolor=PLOT_BG_COLOR, + zeroline=False + ), + bgcolor=PLOT_BG_COLOR, + ) + } + f.update_layout(**layout_dict) + return f + + +@app.callback( + Output("scatter", "figure"), + Input("scatter-dropdown-env", "value"), + Input("scatter-dropdown-x", "value"), + Input("scatter-checkbox-logx", "value"), + Input("scatter-dropdown-y", "value"), + Input("scatter-checkbox-logy", "value"), + Input("scatter-dropdown-color", "value"), + Input("scatter-dropdown-range-1", "value"), + Input("scatter-range-1", "value"), + Input("scatter-dropdown-range-2", "value"), + Input("scatter-range-2", "value"), +) +def update_scatter(env, xkey, logx, ykey, logy, zkey, range1_key, range1, range2_key, range2): + #env_data = EXPERIMENTS.loc[env] + if env == 'all': + env_data = EXPERIMENTS + else: + env_data = EXPERIMENTS[EXPERIMENTS['env_name'] == env] + + range1_mmin = min(EXPERIMENTS[range1_key]) + range1_mmax = max(EXPERIMENTS[range1_key]) + norm_range1 = (EXPERIMENTS[range1_key] - range1_mmin) / (range1_mmax - range1_mmin) + + range2_mmin = min(EXPERIMENTS[range2_key]) + range2_mmax = max(EXPERIMENTS[range2_key]) + norm_range2 = (EXPERIMENTS[range2_key] - range2_mmin) / (range2_mmax - range2_mmin) + + mask = (norm_range1 >= range1[0]) & (norm_range1 <= range1[1]) & (norm_range2 >= range2[0]) & (norm_range2 <= range2[1]) + + env_data = env_data[mask] + + x = env_data[xkey] + y = env_data[ykey] + z = env_data[zkey] + + if zkey == 'env_name': + f = px.scatter(x=x, y=y, color=z, color_discrete_sequence=roygbiv) + else: + mmin = min(z) + mmax = max(z) + thresh = np.geomspace(mmin, mmax, 8) + all_fx = [] + all_fy = [] + bin_label = [] + for j in range(7): + filter = (thresh[j] < z) & (z < thresh[j+1]) + if filter.sum() <= 2: + continue + + fx = x[filter] + fy = y[filter] + all_fx += fx.tolist() + all_fy += fy.tolist() + bin_label += [str(thresh[j])] * len(fx) + + f = px.scatter(x=all_fx, y=all_fy, color=bin_label, color_discrete_sequence=roygbiv) + + f.update_traces(marker_size=10) + layout_dict = { + 'title': dict(text='Experiments', font=TITLE_FONT), + 'showlegend': True, + 'legend': dict(font=LEGEND_FONT), + 'plot_bgcolor': PLOT_BG_COLOR, + 'paper_bgcolor': PAPER_BG_COLOR, + 'width': 1280, + 'height': 720, + 'autosize': False, + 'xaxis': dict( + title=dict(text=xkey, font=AXIS_FONT), + tickfont=TICK_FONT, + showgrid=False, + type='log' if 'log' in logx else 'linear', + ), + 'yaxis': dict( + title=dict(text=ykey, font=AXIS_FONT), + tickfont=TICK_FONT, + showgrid=False, + type='log' if 'log' in logy else 'linear', + ) + } + f.update_layout(**layout_dict) + return f + +@app.callback( + Output("hyper-box", "figure"), + Input("hyper-box-x", "value") +) +def update_hyper_box(x): + buckets = 4 + env_data = {} + for env in env_names: + #data = EXPERIMENTS.loc[env] + data = EXPERIMENTS[EXPERIMENTS['env_name'] == env] + steps = data['agent_steps'] + costs = data['cost'] + scores = data['environment/score'] + x_data = costs if x == 'cost' else steps + hyper_data = {} + env_data[env] = {'x': x_data, 'hypers': hyper_data} + for h in HYPERS: + hyper_data[h] = data[h] + all_x = [x for env in env_data for x in env_data[env]['x']] + x_min, x_max = min(all_x), max(all_x) + bucket_edges = np.linspace(x_min, x_max, buckets + 1) + bucket_centers = (bucket_edges[:-1] + bucket_edges[1:]) / 2 + heatmap_data = np.zeros((len(HYPERS), buckets)) + for i, hyper in enumerate(HYPERS): + for j in range(buckets): + bucket_means = [] + for env in env_data: + if hyper not in env_data[env]['hypers']: + continue + x_vals = 
np.array(env_data[env]['x']) + hyper_vals = np.array(env_data[env]['hypers'][hyper]) + idxs = (x_vals >= bucket_edges[j]) & (x_vals < bucket_edges[j+1]) + if np.any(idxs): + bucket_means.append(np.mean(hyper_vals[idxs])) + heatmap_data[i, j] = np.mean(bucket_means) if bucket_means else np.nan + heatmap_data = np.log(heatmap_data) + heatmap_data -= heatmap_data[:, 0, None] # Normalize + f = px.imshow(heatmap_data, x=bucket_centers, y=HYPERS, color_continuous_scale='Viridis', zmin=np.nanmin(heatmap_data), zmax=np.nanmax(heatmap_data), labels=dict(color="Value")) + layout_dict = { + 'title': dict(text="Hyperparameter Drift", font=TITLE_FONT), + 'showlegend': True, + 'legend': dict(font=LEGEND_FONT), + 'plot_bgcolor': PLOT_BG_COLOR, + 'paper_bgcolor': PAPER_BG_COLOR, + 'width': 1280, + 'height': 720, + 'autosize': False, + 'xaxis': dict( + title=dict(text=x.capitalize(), font=AXIS_FONT), + tickfont=TICK_FONT, + showgrid=False + ), + 'yaxis': dict( + title=dict(text="Hyperparameters", font=AXIS_FONT), + tickfont=TICK_FONT, + showgrid=False + ) + } + f.update_layout(**layout_dict) + return f + +@app.callback( + Output("hyper", "figure"), + Input("hyper-dropdown-range-1", "value"), + Input("hyper-range-1", "value"), + Input("hyper-dropdown-range-2", "value"), + Input("hyper-range-2", "value"), +) +def update_hyper_plot(xkey, range1, ykey, range2): + # Initialize figure + f = go.Figure() + f.update_layout( + title=dict(text='Hyperparameter Stable Range', font=TITLE_FONT), + xaxis=dict(title=dict(text='Value', font=AXIS_FONT), tickfont=TICK_FONT), + yaxis=dict(title=dict(text='Hyper', font=AXIS_FONT), tickfont=TICK_FONT), + showlegend=True, + legend=dict(font=LEGEND_FONT), + plot_bgcolor=PLOT_BG_COLOR, + paper_bgcolor=PAPER_BG_COLOR, + width=1280, + height=720, + autosize=False, + xaxis_type='log', + barmode='overlay', # Overlay bars instead of stacking + ) + f.update_xaxes(showgrid=False) + f.update_yaxes(showgrid=False) + + range1_mmin = min(EXPERIMENTS[xkey]) + range1_mmax = max(EXPERIMENTS[xkey]) + norm_x = (EXPERIMENTS[xkey] - range1_mmin) / (range1_mmax - range1_mmin) + range2_mmin = min(EXPERIMENTS[ykey]) + range2_mmax = max(EXPERIMENTS[ykey]) + norm_y = (EXPERIMENTS[ykey] - range2_mmin) / (range2_mmax - range2_mmin) + mask = (norm_x >= range1[0]) & (norm_x <= range1[1]) & (norm_y >= range2[0]) & (norm_y <= range2[1]) + filtered = EXPERIMENTS[mask] + + for i, env in enumerate(env_names): + #env_data = EXPERIMENTS.loc[env] + env_data = filtered[filtered['env_name'] == env] + if len(env_data) < 2: + continue + + steps = env_data['agent_steps'] + costs = env_data['cost'] + scores = env_data['environment/score'] + + max_score = max(scores) + max_steps = max(steps) + n = len(scores) + + + for k, hyper in enumerate(HYPERS): + y = env_data[hyper] + + ymin = min(y) + ymax = max(y) + f.add_trace( + go.Bar( + x=[ymax - ymin], + y=[hyper], # Hyperparameter as x-axis + base=ymin, + showlegend=False, + marker_color='#00f1f1', + opacity=0.25, + width=1.0, + orientation='h' + ) + ) + + return f + + +@app.callback( + Output("tsnee", "figure"), + Input("tsnee-dropdown-range-1", "value"), + Input("tsnee-range-1", "value"), + Input("tsnee-dropdown-range-2", "value"), + Input("tsnee-range-2", "value"), +) +def update_pca_plot(xkey, range1, ykey, range2): + # Initialize figure + f = go.Figure() + f.update_layout( + title=dict(text='Hyperparameter Stable Range', font=TITLE_FONT), + xaxis=dict(title=dict(text='Value', font=AXIS_FONT), tickfont=TICK_FONT), + yaxis=dict(title=dict(text='Hyper', font=AXIS_FONT), 
tickfont=TICK_FONT), + showlegend=True, + legend=dict(font=LEGEND_FONT), + plot_bgcolor=PLOT_BG_COLOR, + paper_bgcolor=PAPER_BG_COLOR, + width=1280, + height=720, + autosize=False, + xaxis_type='log', + barmode='overlay', # Overlay bars instead of stacking + ) + f.update_xaxes(showgrid=False) + f.update_yaxes(showgrid=False) + + range1_mmin = min(EXPERIMENTS[xkey]) + range1_mmax = max(EXPERIMENTS[xkey]) + norm_x = (EXPERIMENTS[xkey] - range1_mmin) / (range1_mmax - range1_mmin) + range2_mmin = min(EXPERIMENTS[ykey]) + range2_mmax = max(EXPERIMENTS[ykey]) + norm_y = (EXPERIMENTS[ykey] - range2_mmin) / (range2_mmax - range2_mmin) + mask = (norm_x >= range1[0]) & (norm_x <= range1[1]) & (norm_y >= range2[0]) & (norm_y <= range2[1]) + filtered = EXPERIMENTS[mask] + + f = px.scatter( + x=filtered['tsne1'], + y=filtered['tsne2'], + color=filtered['env_name'], + color_discrete_sequence=roygbiv + ) + + f.update_traces(marker_size=10) + layout_dict = { + 'title': dict(text='Experiments', font=TITLE_FONT), + 'showlegend': True, + 'legend': dict(font=LEGEND_FONT), + 'plot_bgcolor': PLOT_BG_COLOR, + 'paper_bgcolor': PAPER_BG_COLOR, + 'width': 1280, + 'height': 720, + 'autosize': False, + 'xaxis': dict( + title=dict(text='TSNE-1', font=AXIS_FONT), + tickfont=TICK_FONT, + showgrid=False + ), + 'yaxis': dict( + title=dict(text='TSNE-2', font=AXIS_FONT), + tickfont=TICK_FONT, + showgrid=False + ) + } + f.update_layout(**layout_dict) + return f + + +if __name__ == '__main__': + app.run(host='0.0.0.0', port=8000) diff --git a/profile_jax.py b/profile_jax.py new file mode 100644 index 000000000..8d5d51a43 --- /dev/null +++ b/profile_jax.py @@ -0,0 +1,67 @@ +import jax +import jax.numpy as jnp +from jax import jit, random, lax +import timeit + +INPUT_SIZE = 16 +HIDDEN_SIZE = 128 +OUTPUT_SIZE = 16 +B = 2048 +dtype = jnp.bfloat16 +inner_loops = 100 # Number of inner iterations to amortize overhead + +def init_params(key): + keys = random.split(key, 3) + # Use uniform initialization to match PyTorch's Kaiming uniform for ReLU + bound1 = jnp.sqrt(6 / INPUT_SIZE) + w1 = random.uniform(keys[0], shape=(INPUT_SIZE, HIDDEN_SIZE), minval=-bound1, maxval=bound1, dtype=dtype) + b1 = jnp.zeros(HIDDEN_SIZE, dtype=dtype) + bound2 = jnp.sqrt(6 / HIDDEN_SIZE) + w2 = random.uniform(keys[1], shape=(HIDDEN_SIZE, HIDDEN_SIZE), minval=-bound2, maxval=bound2, dtype=dtype) + b2 = jnp.zeros(HIDDEN_SIZE, dtype=dtype) + bound3 = jnp.sqrt(6 / HIDDEN_SIZE) + w3 = random.uniform(keys[2], shape=(HIDDEN_SIZE, OUTPUT_SIZE), minval=-bound3, maxval=bound3, dtype=dtype) + b3 = jnp.zeros(OUTPUT_SIZE, dtype=dtype) + return {'w1': w1, 'b1': b1, 'w2': w2, 'b2': b2, 'w3': w3, 'b3': b3} + +def model(params, x): + precision = lax.Precision.HIGH # Use HIGH precision for 4090 to leverage Tensor Cores + h = jnp.maximum(jnp.dot(x, params['w1'], precision=precision) + params['b1'], 0) + h = jnp.maximum(jnp.dot(h, params['w2'], precision=precision) + params['b2'], 0) + return jnp.dot(h, params['w3'], precision=precision) + params['b3'] + +# Manual FLOPs calculation (ignores bias adds and ReLUs as negligible) +flops_per_forward = ( + 2 * B * INPUT_SIZE * HIDDEN_SIZE + # First matmul + 2 * B * HIDDEN_SIZE * HIDDEN_SIZE + # Second matmul + 2 * B * HIDDEN_SIZE * OUTPUT_SIZE # Third matmul +) + +# Create concrete inputs +key = random.key(0) +params = init_params(key) +batch = random.normal(random.key(1), (B, INPUT_SIZE), dtype=dtype) + +# Define a jitted multi-step function with lax.scan for better optimization +@jit +def multi_step(params, batch): + def 
body_fun(carry, _): + y = model(params, batch) + carry += y.sum() # Forces computation without noise + return carry, None + carry, _ = lax.scan(body_fun, jnp.array(0.0, dtype=jnp.float32), None, length=inner_loops) + return carry + +# Warmup +for _ in range(10): + _ = multi_step(params, batch).block_until_ready() + +# Timing +def run(): + return multi_step(params, batch).block_until_ready() + +t = timeit.timeit(run, number=10) +cost = t / 10 / inner_loops # Average time per forward pass + +FLOPS = flops_per_forward / cost +print(f'TFLOPS: {FLOPS / 1e12:.2f}') diff --git a/profile_kernels.cu b/profile_kernels.cu new file mode 100644 index 000000000..19579197d --- /dev/null +++ b/profile_kernels.cu @@ -0,0 +1,1035 @@ +// profile_kernels.cu +// Minimal standalone profiler for CUDA kernels +// +// Without torch: nvcc -O3 -arch=sm_80 profile_kernels.cu -o profile_kernels -I. +// With torch: Build with cmake/pytorch and -DUSE_TORCH +// +// Run: ./profile_kernels + +#include +#include +#include + +#ifdef USE_TORCH +#include +#include +#include +#include "pufferlib/extensions/cuda/modules.cu" +#else +#include "pufferlib/extensions/cuda/kernels.cu" +#endif + +const int WARMUP_ITERS = 1000; +const int TIMING_ITERS = 10000; + +const int BR = 4096; // Rollout batch (no T dim) +const int BT = 512; // Train batch (with T dim) +const int T = 64; +const int H = 128; +const int A = 4; + +typedef void (*kernel_fn)(void*); + +void print_timing(const char* name, float ms, int N) { + printf(" %-18s %6.1f us %6.2f M elem/s\n", name, ms * 1000, N / ms / 1e3); +} + +void warmup_gpu() { + // Warm up GPU clocks with some busy work + float* dummy; + cudaMalloc(&dummy, 64 * 1024 * 1024); // 64MB + for (int i = 0; i < 100; i++) { + cudaMemset(dummy, 0, 64 * 1024 * 1024); + } + cudaDeviceSynchronize(); + cudaFree(dummy); +} + +float profile_kernel(kernel_fn fn, void* args) { + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + for (int i = 0; i < WARMUP_ITERS; ++i) { + fn(args); + cudaDeviceSynchronize(); + } + + cudaEventRecord(start); + for (int i = 0; i < TIMING_ITERS; ++i) { + fn(args); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + float ms = 0; + cudaEventElapsedTime(&ms, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + cudaDeviceSynchronize(); + c10::cuda::CUDACachingAllocator::emptyCache(); + return ms / TIMING_ITERS; +} + +#ifdef USE_TORCH +float profile_graph(kernel_fn fn, void* args) { + cudaDeviceSynchronize(); + + at::cuda::CUDAGraph cuda_graph; + at::cuda::CUDAStream current_stream = at::cuda::getCurrentCUDAStream(); + + at::cuda::CUDAStream warmup_stream = at::cuda::getStreamFromPool(); + at::cuda::setCurrentCUDAStream(warmup_stream); + for (int i = 0; i < WARMUP_ITERS; ++i) { + fn(args); + } + warmup_stream.synchronize(); + + at::cuda::CUDAStream cap_stream = at::cuda::getStreamFromPool(); + at::cuda::setCurrentCUDAStream(cap_stream); + cuda_graph.capture_begin(); + fn(args); + cuda_graph.capture_end(); + cap_stream.synchronize(); + + cudaDeviceSynchronize(); + at::cuda::setCurrentCUDAStream(current_stream); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + for (int i = 0; i < TIMING_ITERS; ++i) { + cuda_graph.replay(); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + float ms = 0; + cudaEventElapsedTime(&ms, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return ms / TIMING_ITERS; +} +#endif + +float rand1() { + return (float)rand() / 
RAND_MAX * 2.0f - 1.0f; +} + +// Fused mingru_gate for inference: takes combined (B, 1, 3*H) = [hidden, gate, proj] +// Outputs: out = sigmoid(proj) * mingru_out, next_state = mingru_out (for recurrence) +typedef struct { + float* state; // (B, 1, H) - input state + float* combined; // (B, 1, 3*H) = [hidden, gate, proj] + float* out; // (B, 1, H) - sigmoid(proj) * mingru_out + float* next_state; // (B, 1, H) - raw mingru_out + int B; + int H; +} MingruGateArgs; + +MingruGateArgs* create_mingrugateargs(int batch, int hidden) { + MingruGateArgs* args = (MingruGateArgs*)calloc(1, sizeof(MingruGateArgs)); + args->B = batch; + args->H = hidden; + + int N_state = batch * hidden; + int N_combined = batch * 3 * hidden; + + cudaMalloc(&args->state, N_state * sizeof(float)); + cudaMalloc(&args->combined, N_combined * sizeof(float)); + cudaMalloc(&args->out, N_state * sizeof(float)); + cudaMalloc(&args->next_state, N_state * sizeof(float)); + + float* state_buf = (float*)malloc(N_state * sizeof(float)); + float* combined_buf = (float*)malloc(N_combined * sizeof(float)); + + // Initialize state with positive values + for (int i = 0; i < N_state; ++i) { + state_buf[i] = fabsf(rand1()) + 0.1f; + } + // Initialize combined = [hidden, gate, proj] + for (int b = 0; b < batch; ++b) { + int base = b * 3 * hidden; + for (int h = 0; h < hidden; ++h) { + combined_buf[base + h] = rand1() * 5.0f; // hidden + combined_buf[base + hidden + h] = rand1() * 5.0f; // gate + combined_buf[base + 2 * hidden + h] = rand1() * 2.0f; // proj + } + } + + cudaMemcpy(args->state, state_buf, N_state * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->combined, combined_buf, N_combined * sizeof(float), cudaMemcpyHostToDevice); + + free(state_buf); + free(combined_buf); + return args; +} + +void free_mingrugateargs(MingruGateArgs* args) { + cudaFree(args->state); + cudaFree(args->combined); + cudaFree(args->out); + cudaFree(args->next_state); + free(args); +} + +void run_mingrugate_forward(MingruGateArgs* args) { + launch_mingru_gate_inference( + args->out, args->next_state, args->combined, args->state, + args->H, args->B, 0); +} + +#ifdef USE_TORCH + +typedef struct { + torch::Tensor state; // (B, 1, H) + torch::Tensor combined; // (B, 1, 3*H) + int B; + int H; +} MingruGateArgsTorch; + +MingruGateArgsTorch* create_mingrugateargs_torch(MingruGateArgs* raw) { + MingruGateArgsTorch* args = new MingruGateArgsTorch(); + args->B = raw->B; + args->H = raw->H; + + auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + args->state = torch::from_blob(raw->state, {raw->B, 1, raw->H}, opts); + args->combined = torch::from_blob(raw->combined, {raw->B, 1, 3 * raw->H}, opts); + + return args; +} + +void run_mingrugate_forward_torch(MingruGateArgsTorch* args) { + torch::NoGradGuard no_grad; + mingru_gate(args->state, args->combined); +} + +void run_mingrugate_forward_cpp(MingruGateArgsTorch* args) { + torch::NoGradGuard no_grad; + mingru_gate_cpp(args->state, args->combined); +} + +#endif + +void profile_mingrugate(int batch, int hidden) { + MingruGateArgs* args = create_mingrugateargs(batch, hidden); + + printf("mingru_gate (B=%d, H=%d, combined=%dx%d)\n", batch, hidden, batch, 3*hidden); + + float fwd_ms = profile_kernel((kernel_fn)run_mingrugate_forward, args); + print_timing("\tforward", fwd_ms, batch); + +#ifdef USE_TORCH + MingruGateArgsTorch* args_torch = create_mingrugateargs_torch(args); + + float fwd_torch_ms = profile_kernel((kernel_fn)run_mingrugate_forward_torch, args_torch); + 
print_timing("\tforward (torch)", fwd_torch_ms, batch); + + float fwd_cpp_ms = profile_kernel((kernel_fn)run_mingrugate_forward_cpp, args_torch); + print_timing("\tforward (cpp)", fwd_cpp_ms, batch); + + float fwd_graph_ms = profile_graph((kernel_fn)run_mingrugate_forward_cpp, args_torch); + print_timing("\tforward (graph)", fwd_graph_ms, batch); + + delete args_torch; +#endif + printf("\n"); + + free_mingrugateargs(args); +} + +typedef struct { + float* gate; + float* hidden; + float* log_coeffs; + float* log_values; + float* grad_log_coeffs; + float* grad_log_values; + float* grad_gate; + float* grad_hidden; + int N; +} LogCoeffsAndValuesArgs; + +LogCoeffsAndValuesArgs* create_logcoeffsandvaluesargs(int batch, int seq, int hidden) { + LogCoeffsAndValuesArgs* args = (LogCoeffsAndValuesArgs*)calloc(1, sizeof(LogCoeffsAndValuesArgs)); + args->N = batch*seq * hidden; + + cudaMalloc(&args->gate, args->N * sizeof(float)); + cudaMalloc(&args->hidden, args->N * sizeof(float)); + cudaMalloc(&args->log_coeffs, args->N * sizeof(float)); + cudaMalloc(&args->log_values, args->N * sizeof(float)); + cudaMalloc(&args->grad_gate, args->N * sizeof(float)); + cudaMalloc(&args->grad_hidden, args->N * sizeof(float)); + cudaMalloc(&args->grad_log_coeffs, args->N * sizeof(float)); + cudaMalloc(&args->grad_log_values, args->N * sizeof(float)); + + float* gate_buf = (float*)malloc(args->N * sizeof(float)); + float* hidden_buf = (float*)malloc(args->N * sizeof(float)); + float* grad_log_coeffs_buf = (float*)malloc(args->N * sizeof(float)); + float* grad_log_values_buf = (float*)malloc(args->N * sizeof(float)); + for (int i = 0; i < args->N; ++i) { + gate_buf[i] = rand1() * 5.0f; + hidden_buf[i] = rand1() * 5.0f; + grad_log_coeffs_buf[i] = rand1(); + grad_log_values_buf[i] = rand1(); + } + + cudaMemcpy(args->gate, gate_buf, args->N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->hidden, hidden_buf, args->N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->grad_log_coeffs, grad_log_coeffs_buf, args->N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->grad_log_values, grad_log_values_buf, args->N * sizeof(float), cudaMemcpyHostToDevice); + + free(gate_buf); + free(hidden_buf); + free(grad_log_coeffs_buf); + free(grad_log_values_buf); + + return args; +} + +void free_logcoeffsandvaluesargs(LogCoeffsAndValuesArgs* args) { + cudaFree(args->gate); + cudaFree(args->hidden); + cudaFree(args->log_coeffs); + cudaFree(args->log_values); + cudaFree(args->grad_gate); + cudaFree(args->grad_hidden); + cudaFree(args->grad_log_coeffs); + cudaFree(args->grad_log_values); + free(args); +} + +void run_logcoeffsandvalues_forward(LogCoeffsAndValuesArgs* args) { + launch_log_coeffs_and_values( + args->log_coeffs, args->log_values, args->gate, args->hidden, args->N, 0); +} + +void run_logcoeffsandvalues_backward(LogCoeffsAndValuesArgs* args) { + launch_log_coeffs_and_values_backward( + args->grad_gate, args->grad_hidden, args->grad_log_coeffs, + args->grad_log_values, args->gate, args->hidden, args->N, 0); +} + +#ifdef USE_TORCH + +typedef struct { + torch::Tensor gate; + torch::Tensor hidden; + torch::Tensor grad_log_coeffs; + torch::Tensor grad_log_values; + torch::Tensor out_log_coeffs; + torch::Tensor out_log_values; + int N; +} LogCoeffsAndValuesArgsTorch; + +LogCoeffsAndValuesArgsTorch* create_logcoeffsandvaluesargs_torch(LogCoeffsAndValuesArgs* raw) { + LogCoeffsAndValuesArgsTorch* args = new LogCoeffsAndValuesArgsTorch(); + args->N = raw->N; + + auto opts = 
torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + args->gate = torch::from_blob(raw->gate, {raw->N}, opts).requires_grad_(true); + args->hidden = torch::from_blob(raw->hidden, {raw->N}, opts).requires_grad_(true); + args->grad_log_coeffs = torch::from_blob(raw->grad_log_coeffs, {raw->N}, opts); + args->grad_log_values = torch::from_blob(raw->grad_log_values, {raw->N}, opts); + + return args; +} + +void run_logcoeffsandvalues_forward_torch(LogCoeffsAndValuesArgsTorch* args) { + torch::NoGradGuard no_grad; + log_coeffs_and_values(args->gate, args->hidden); +} + +void run_logcoeffsandvalues_backward_torch(LogCoeffsAndValuesArgsTorch* args) { + args->gate.mutable_grad() = torch::Tensor(); + args->hidden.mutable_grad() = torch::Tensor(); + torch::autograd::backward( + {args->out_log_coeffs, args->out_log_values}, + {args->grad_log_coeffs, args->grad_log_values}, + /*retain_graph=*/true); +} + +void run_logcoeffsandvalues_forward_cpp(LogCoeffsAndValuesArgsTorch* args) { + torch::NoGradGuard no_grad; + log_coeffs_and_values_cpp(args->gate, args->hidden); +} + +#endif + +void profile_logcoeffsandvalues(int batch, int seq, int hidden) { + LogCoeffsAndValuesArgs* args = create_logcoeffsandvaluesargs(batch, seq, hidden); + + printf("log_coeffs_and_values (N=%d, %dx%dx%d)\n", args->N, batch, seq, hidden); + + float fwd_ms = profile_kernel((kernel_fn)run_logcoeffsandvalues_forward, args); + print_timing("\tforward", fwd_ms, batch*seq); + + float bwd_ms = profile_kernel((kernel_fn)run_logcoeffsandvalues_backward, args); + print_timing("\tbackward", bwd_ms, batch*seq); + +#ifdef USE_TORCH + LogCoeffsAndValuesArgsTorch* args_torch = create_logcoeffsandvaluesargs_torch(args); + + float fwd_torch_ms = profile_kernel((kernel_fn)run_logcoeffsandvalues_forward_torch, args_torch); + print_timing("\tforward (torch)", fwd_torch_ms, batch*seq); + + auto kernel_outputs = log_coeffs_and_values(args_torch->gate, args_torch->hidden); + args_torch->out_log_coeffs = kernel_outputs[0]; + args_torch->out_log_values = kernel_outputs[1]; + + float bwd_torch_ms = profile_kernel((kernel_fn)run_logcoeffsandvalues_backward_torch, args_torch); + print_timing("\tbackward (torch)", bwd_torch_ms, batch*seq); + + float fwd_cpp_ms = profile_kernel((kernel_fn)run_logcoeffsandvalues_forward_cpp, args_torch); + print_timing("\tforward (cpp)", fwd_cpp_ms, batch*seq); + + auto cpp_outputs = log_coeffs_and_values_cpp(args_torch->gate, args_torch->hidden); + args_torch->out_log_coeffs = cpp_outputs[0]; + args_torch->out_log_values = cpp_outputs[1]; + + float bwd_cpp_ms = profile_kernel((kernel_fn)run_logcoeffsandvalues_backward_torch, args_torch); + print_timing("\tbackward (cpp)", bwd_cpp_ms, batch*seq); + + float fwd_graph_ms = profile_graph((kernel_fn)run_logcoeffsandvalues_forward_cpp, args_torch); + print_timing("\tforward (graph)", fwd_graph_ms, batch*seq); + + delete args_torch; +#endif + printf("\n"); + + free_logcoeffsandvaluesargs(args); +} + +typedef struct { + float* x; + float* out; + double* s_buf; + float* grad_x; + float* grad_out; + int B; + int T; + int H; + int N; +} LogcumsumexpArgs; + +LogcumsumexpArgs* create_logcumsumexpargs(int batch, int seq, int hidden) { + LogcumsumexpArgs* args = (LogcumsumexpArgs*)calloc(1, sizeof(LogcumsumexpArgs)); + args->B = batch; + args->T = seq; + args->H = hidden; + args->N = batch*seq * hidden; + + cudaMalloc(&args->x, args->N * sizeof(float)); + cudaMalloc(&args->out, args->N * sizeof(float)); + cudaMalloc(&args->s_buf, args->N * sizeof(double)); + 
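+    // s_buf is double precision: presumably the per-element running-sum state that the
+    // forward scan saves for reuse in the backward pass (it is passed to both
+    // launch_logcumsumexp_forward and launch_logcumsumexp_backward); fp64 is assumed
+    // here to limit accumulation error over long sequences.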
cudaMalloc(&args->grad_x, args->N * sizeof(float)); + cudaMalloc(&args->grad_out, args->N * sizeof(float)); + + float* buf = (float*)malloc(args->N * sizeof(float) * 2); + float* x_buf = buf; + float* grad_out_buf = buf + args->N; + for (int i = 0; i < args->N; ++i) { + x_buf[i] = rand1(); + grad_out_buf[i] = rand1(); + } + + cudaMemcpy(args->x, x_buf, args->N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->grad_out, grad_out_buf, args->N * sizeof(float), cudaMemcpyHostToDevice); + + free(buf); + return args; +} + +void free_logcumsumexpargs(LogcumsumexpArgs* args) { + cudaFree(args->x); + cudaFree(args->out); + cudaFree(args->s_buf); + cudaFree(args->grad_x); + cudaFree(args->grad_out); + free(args); +} + +void run_logcumsumexp_forward(LogcumsumexpArgs* args) { + launch_logcumsumexp_forward( + args->out, args->s_buf, args->x, args->T, args->H, args->B, 0); +} + +void run_logcumsumexp_backward(LogcumsumexpArgs* args) { + launch_logcumsumexp_backward( + args->grad_x, args->grad_out, args->x, args->s_buf, args->T, args->H, args->B, 0); +} + +#ifdef USE_TORCH + +typedef struct { + torch::Tensor x; + torch::Tensor out; + torch::Tensor grad_out; + int N; +} LogcumsumexpArgsTorch; + +LogcumsumexpArgsTorch* create_logcumsumexpargs_torch(LogcumsumexpArgs* raw) { + LogcumsumexpArgsTorch* args = new LogcumsumexpArgsTorch(); + args->N = raw->N; + + auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + args->x = torch::from_blob(raw->x, {raw->B, raw->T, raw->H}, opts).requires_grad_(true); + args->grad_out = torch::from_blob(raw->grad_out, {raw->B, raw->T, raw->H}, opts); + + return args; +} + +void run_logcumsumexp_forward_torch(LogcumsumexpArgsTorch* args) { + torch::NoGradGuard no_grad; + logcumsumexp_cuda(args->x); +} + +void run_logcumsumexp_backward_torch(LogcumsumexpArgsTorch* args) { + args->x.mutable_grad() = torch::Tensor(); + args->out.backward(args->grad_out, /*retain_graph=*/true); +} + +void run_logcumsumexp_forward_cpp(LogcumsumexpArgsTorch* args) { + torch::NoGradGuard no_grad; + logcumsumexp_cpp(args->x); +} + +#endif + +void profile_logcumsumexp(int batch, int seq, int hidden) { + LogcumsumexpArgs* args = create_logcumsumexpargs(batch, seq, hidden); + + printf("logcumsumexp (N=%d, %dx%dx%d)\n", args->N, batch, seq, hidden); + + float fwd_ms = profile_kernel((kernel_fn)run_logcumsumexp_forward, args); + print_timing("\tforward", fwd_ms, batch*seq); + + float bwd_ms = profile_kernel((kernel_fn)run_logcumsumexp_backward, args); + print_timing("\tbackward", bwd_ms, batch*seq); + +#ifdef USE_TORCH + LogcumsumexpArgsTorch* args_torch = create_logcumsumexpargs_torch(args); + + float fwd_torch_ms = profile_kernel((kernel_fn)run_logcumsumexp_forward_torch, args_torch); + print_timing("\tforward (torch)", fwd_torch_ms, batch*seq); + + args_torch->out = logcumsumexp_cuda(args_torch->x); + + float bwd_torch_ms = profile_kernel((kernel_fn)run_logcumsumexp_backward_torch, args_torch); + print_timing("\tbackward (torch)", bwd_torch_ms, batch*seq); + + float fwd_cpp_ms = profile_kernel((kernel_fn)run_logcumsumexp_forward_cpp, args_torch); + print_timing("\tforward (cpp)", fwd_cpp_ms, batch*seq); + + args_torch->out = logcumsumexp_cpp(args_torch->x); + + float bwd_cpp_ms = profile_kernel((kernel_fn)run_logcumsumexp_backward_torch, args_torch); + print_timing("\tbackward (cpp)", bwd_cpp_ms, batch*seq); + + float fwd_graph_ms = profile_graph((kernel_fn)run_logcumsumexp_forward_cpp, args_torch); + print_timing("\tforward (graph)", fwd_graph_ms, batch*seq); + + 
delete args_torch; +#endif + printf("\n"); + + free_logcumsumexpargs(args); +} + +// New fused_scan takes combined (B, T, 3*H) = [hidden, gate, proj] and state (B, 1, H) +// Outputs: out (B, T, H) = sigmoid(proj) * scan_result, next_state (B, 1, H) +typedef struct { + float* combined; // (B, T, 3*H) = [hidden, gate, proj] + float* state; // (B, 1, H) + float* out; // (B, T, H) + float* next_state; // (B, 1, H) + float* a_star; // (B, T+1, H) + float* s_vals; // (B, T+1, H) + float* log_values_buf; // (B, T+1, H) + float* grad_combined; // (B, T, 3*H) + float* grad_state; // (B, 1, H) + float* grad_out; // (B, T, H) + float* grad_next_state;// (B, 1, H) + int B; + int T; + int H; + int N; +} FusedScanArgs; + +FusedScanArgs* create_fusedscanargs(int batch, int seq, int hidden) { + FusedScanArgs* args = (FusedScanArgs*)calloc(1, sizeof(FusedScanArgs)); + args->B = batch; + args->T = seq; + args->H = hidden; + args->N = batch * seq * hidden; + + int N_combined = batch * seq * 3 * hidden; + int N_state = batch * hidden; + int N_buf = batch * (seq + 1) * hidden; + + cudaMalloc(&args->combined, N_combined * sizeof(float)); + cudaMalloc(&args->state, N_state * sizeof(float)); + cudaMalloc(&args->out, args->N * sizeof(float)); + cudaMalloc(&args->next_state, N_state * sizeof(float)); + cudaMalloc(&args->a_star, N_buf * sizeof(float)); + cudaMalloc(&args->s_vals, N_buf * sizeof(float)); + cudaMalloc(&args->log_values_buf, N_buf * sizeof(float)); + cudaMalloc(&args->grad_combined, N_combined * sizeof(float)); + cudaMalloc(&args->grad_state, N_state * sizeof(float)); + cudaMalloc(&args->grad_out, args->N * sizeof(float)); + cudaMalloc(&args->grad_next_state, N_state * sizeof(float)); + + // Allocate and initialize host buffers + float* combined_buf = (float*)malloc(N_combined * sizeof(float)); + float* state_buf = (float*)malloc(N_state * sizeof(float)); + float* grad_out_buf = (float*)malloc(args->N * sizeof(float)); + float* grad_next_state_buf = (float*)malloc(N_state * sizeof(float)); + + // Initialize combined = [hidden, gate, proj] with reasonable values + for (int b = 0; b < batch; ++b) { + for (int t = 0; t < seq; ++t) { + for (int h = 0; h < hidden; ++h) { + int base = b * seq * 3 * hidden + t * 3 * hidden; + combined_buf[base + h] = rand1() * 5.0f; // hidden + combined_buf[base + hidden + h] = rand1() * 5.0f; // gate + combined_buf[base + 2 * hidden + h] = rand1() * 2.0f; // proj + } + } + } + // Initialize state with positive values (will be log'd) + for (int i = 0; i < N_state; ++i) { + state_buf[i] = fabsf(rand1()) + 0.1f; + } + // Initialize gradients + for (int i = 0; i < args->N; ++i) { + grad_out_buf[i] = rand1(); + } + for (int i = 0; i < N_state; ++i) { + grad_next_state_buf[i] = rand1(); + } + + cudaMemcpy(args->combined, combined_buf, N_combined * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->state, state_buf, N_state * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->grad_out, grad_out_buf, args->N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->grad_next_state, grad_next_state_buf, N_state * sizeof(float), cudaMemcpyHostToDevice); + + free(combined_buf); + free(state_buf); + free(grad_out_buf); + free(grad_next_state_buf); + return args; +} + +void free_fusedscanargs(FusedScanArgs* args) { + cudaFree(args->combined); + cudaFree(args->state); + cudaFree(args->out); + cudaFree(args->next_state); + cudaFree(args->a_star); + cudaFree(args->s_vals); + cudaFree(args->log_values_buf); + cudaFree(args->grad_combined); + cudaFree(args->grad_state); + 
cudaFree(args->grad_out); + cudaFree(args->grad_next_state); + free(args); +} + +void run_fusedscan_forward(FusedScanArgs* args) { + launch_fused_scan_forward( + args->out, args->next_state, + args->a_star, args->s_vals, args->log_values_buf, + args->combined, args->state, + args->T, args->H, args->B, 0); +} + +void run_fusedscan_backward(FusedScanArgs* args) { + launch_fused_scan_backward( + args->grad_combined, args->grad_state, + args->grad_out, args->grad_next_state, + args->combined, args->state, + args->a_star, args->s_vals, args->log_values_buf, + args->T, args->H, args->B, 0); +} + +#ifdef USE_TORCH + +typedef struct { + torch::Tensor combined; // (B, T, 3*H) + torch::Tensor state; // (B, 1, H) + torch::Tensor out; // (B, T, H) + torch::Tensor next_state; // (B, 1, H) + torch::Tensor grad_out; // (B, T, H) + torch::Tensor grad_next_state; // (B, 1, H) + int B; + int T; + int H; +} FusedScanArgsTorch; + +FusedScanArgsTorch* create_fusedscanargs_torch(FusedScanArgs* raw) { + FusedScanArgsTorch* args = new FusedScanArgsTorch(); + args->B = raw->B; + args->T = raw->T; + args->H = raw->H; + + auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + args->combined = torch::from_blob(raw->combined, {raw->B, raw->T, 3 * raw->H}, opts).requires_grad_(true); + args->state = torch::from_blob(raw->state, {raw->B, 1, raw->H}, opts).requires_grad_(true); + args->grad_out = torch::from_blob(raw->grad_out, {raw->B, raw->T, raw->H}, opts); + args->grad_next_state = torch::from_blob(raw->grad_next_state, {raw->B, 1, raw->H}, opts); + + return args; +} + +void run_fusedscan_forward_torch(FusedScanArgsTorch* args) { + torch::NoGradGuard no_grad; + fused_scan(args->combined, args->state); +} + +void run_fusedscan_backward_torch(FusedScanArgsTorch* args) { + args->combined.mutable_grad() = torch::Tensor(); + args->state.mutable_grad() = torch::Tensor(); + torch::autograd::backward( + {args->out, args->next_state}, + {args->grad_out, args->grad_next_state}, + /*retain_graph=*/true); +} + +void run_fusedscan_forward_cpp(FusedScanArgsTorch* args) { + torch::NoGradGuard no_grad; + fused_scan_cpp(args->combined, args->state); +} + +#endif + +void profile_fusedscan(int batch, int seq, int hidden) { + FusedScanArgs* args = create_fusedscanargs(batch, seq, hidden); + + printf("fused_scan (N=%d, %dx%dx%d, combined=%dx%dx%d)\n", + args->N, batch, seq, hidden, batch, seq, 3*hidden); + + float fwd_ms = profile_kernel((kernel_fn)run_fusedscan_forward, args); + print_timing("\tforward", fwd_ms, batch*seq); + + float bwd_ms = profile_kernel((kernel_fn)run_fusedscan_backward, args); + print_timing("\tbackward", bwd_ms, batch*seq); + +#ifdef USE_TORCH + FusedScanArgsTorch* args_torch = create_fusedscanargs_torch(args); + + float fwd_torch_ms = profile_kernel((kernel_fn)run_fusedscan_forward_torch, args_torch); + print_timing("\tforward (torch)", fwd_torch_ms, batch*seq); + + auto scan_out = fused_scan(args_torch->combined, args_torch->state); + args_torch->out = scan_out[0]; + args_torch->next_state = scan_out[1]; + + float bwd_torch_ms = profile_kernel((kernel_fn)run_fusedscan_backward_torch, args_torch); + print_timing("\tbackward (torch)", bwd_torch_ms, batch*seq); + + float fwd_cpp_ms = profile_kernel((kernel_fn)run_fusedscan_forward_cpp, args_torch); + print_timing("\tforward (cpp)", fwd_cpp_ms, batch*seq); + + auto scan_out_cpp = fused_scan_cpp(args_torch->combined, args_torch->state); + args_torch->out = scan_out_cpp[0]; + args_torch->next_state = scan_out_cpp[1]; + + float bwd_cpp_ms = 
profile_kernel((kernel_fn)run_fusedscan_backward_torch, args_torch); + print_timing("\tbackward (cpp)", bwd_cpp_ms, batch*seq); + + float fwd_graph_ms = profile_graph((kernel_fn)run_fusedscan_forward_cpp, args_torch); + print_timing("\tforward (graph)", fwd_graph_ms, batch*seq); + + delete args_torch; +#endif + printf("\n"); + + free_fusedscanargs(args); +} + +typedef struct { + float* logits; + float* values_pred; + int64_t* actions; + float* old_logprobs; + float* advantages; + float* prio; + float* values; + float* returns; + float* adv_mean; + float* adv_std; + float* loss; + double* saved_for_backward; + float* grad_logits; + float* grad_values_pred; + float* grad_loss; + float clip_coef; + float vf_clip_coef; + float vf_coef; + float ent_coef; + int N; + int T; + int A; +} PPOLossArgs; + +PPOLossArgs* create_ppolossargs(int batch, int seq, int actions) { + PPOLossArgs* args = (PPOLossArgs*)calloc(1, sizeof(PPOLossArgs)); + args->N = batch; + args->T = seq; + args->A = actions; + + int NT = batch*seq; + int NTA = batch*seq * actions; + + cudaMalloc(&args->logits, NTA * sizeof(float)); + cudaMalloc(&args->values_pred, NT * sizeof(float)); + cudaMalloc(&args->actions, NT * sizeof(int64_t)); + cudaMalloc(&args->old_logprobs, NT * sizeof(float)); + cudaMalloc(&args->advantages, NT * sizeof(float)); + cudaMalloc(&args->prio, batch * sizeof(float)); + cudaMalloc(&args->values, NT * sizeof(float)); + cudaMalloc(&args->returns, NT * sizeof(float)); + cudaMalloc(&args->adv_mean, sizeof(float)); + cudaMalloc(&args->adv_std, sizeof(float)); + cudaMalloc(&args->loss, sizeof(float)); + cudaMalloc(&args->saved_for_backward, NT * 5 * sizeof(double)); + cudaMalloc(&args->grad_logits, NTA * sizeof(float)); + cudaMalloc(&args->grad_values_pred, NT * sizeof(float)); + cudaMalloc(&args->grad_loss, sizeof(float)); + + float* buf = (float*)malloc((NTA + NT * 5 + batch) * sizeof(float)); + float* logits_buf = buf; + float* values_pred_buf = buf + NTA; + float* old_logprobs_buf = buf + NTA + NT; + float* advantages_buf = buf + NTA + NT * 2; + float* values_buf = buf + NTA + NT * 3; + float* returns_buf = buf + NTA + NT * 4; + float* prio_buf = buf + NTA + NT * 5; + + int64_t* actions_buf = (int64_t*)malloc(NT * sizeof(int64_t)); + + float adv_sum = 0.0f, adv_sq_sum = 0.0f; + for (int i = 0; i < NT; ++i) { + advantages_buf[i] = rand1(); + adv_sum += advantages_buf[i]; + adv_sq_sum += advantages_buf[i] * advantages_buf[i]; + } + float adv_mean = adv_sum / NT; + float adv_std = sqrtf(adv_sq_sum / NT - adv_mean * adv_mean); + + for (int i = 0; i < NTA; ++i) { + logits_buf[i] = rand1() * 2.0f; + } + for (int i = 0; i < NT; ++i) { + values_pred_buf[i] = rand1(); + actions_buf[i] = rand() % actions; + old_logprobs_buf[i] = rand1() * 2.0f; + values_buf[i] = rand1(); + returns_buf[i] = rand1(); + } + for (int i = 0; i < batch; ++i) { + prio_buf[i] = (float)rand() / RAND_MAX; + } + + cudaMemcpy(args->logits, logits_buf, NTA * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->values_pred, values_pred_buf, NT * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->actions, actions_buf, NT * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(args->old_logprobs, old_logprobs_buf, NT * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->advantages, advantages_buf, NT * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->prio, prio_buf, batch * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->values, values_buf, NT * sizeof(float), cudaMemcpyHostToDevice); + 
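+    /* adv_mean / adv_std were computed on the host above so that the raw CUDA
+     * kernels and the torch / cpp reference paths all normalize advantages with
+     * identical statistics. */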
cudaMemcpy(args->returns, returns_buf, NT * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->adv_mean, &adv_mean, sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->adv_std, &adv_std, sizeof(float), cudaMemcpyHostToDevice); + + float grad_loss_val = 1.0f; + cudaMemcpy(args->grad_loss, &grad_loss_val, sizeof(float), cudaMemcpyHostToDevice); + + args->clip_coef = 0.1f; + args->vf_clip_coef = 0.1f; + args->vf_coef = 0.5f; + args->ent_coef = 0.01f; + + free(buf); + free(actions_buf); + return args; +} + +void free_ppolossargs(PPOLossArgs* args) { + cudaFree(args->logits); + cudaFree(args->values_pred); + cudaFree(args->actions); + cudaFree(args->old_logprobs); + cudaFree(args->advantages); + cudaFree(args->prio); + cudaFree(args->values); + cudaFree(args->returns); + cudaFree(args->adv_mean); + cudaFree(args->adv_std); + cudaFree(args->loss); + cudaFree(args->saved_for_backward); + cudaFree(args->grad_logits); + cudaFree(args->grad_values_pred); + cudaFree(args->grad_loss); + free(args); +} + +void run_ppoloss_forward(PPOLossArgs* args) { + launch_ppo_loss_forward( + args->loss, args->saved_for_backward, + args->logits, args->values_pred, args->actions, + args->old_logprobs, args->advantages, args->prio, + args->values, args->returns, args->adv_mean, args->adv_std, + args->clip_coef, args->vf_clip_coef, args->vf_coef, args->ent_coef, + args->T, args->A, args->N, 0); +} + +void run_ppoloss_backward(PPOLossArgs* args) { + launch_ppo_loss_backward( + args->grad_logits, args->grad_values_pred, args->grad_loss, + args->logits, args->actions, args->old_logprobs, + args->advantages, args->prio, args->values, args->returns, + args->saved_for_backward, args->adv_mean, args->adv_std, + args->clip_coef, args->vf_clip_coef, args->vf_coef, args->ent_coef, + args->T, args->A, args->N, 0); +} + +#ifdef USE_TORCH + +typedef struct { + torch::Tensor logits; + torch::Tensor values_pred; + torch::Tensor actions; + torch::Tensor old_logprobs; + torch::Tensor advantages; + torch::Tensor prio; + torch::Tensor values; + torch::Tensor returns; + torch::Tensor adv_mean; + torch::Tensor adv_std; + torch::Tensor loss; + float clip_coef; + float vf_clip_coef; + float vf_coef; + float ent_coef; + int N; + int T; + int A; +} PPOLossArgsTorch; + +PPOLossArgsTorch* create_ppolossargs_torch(PPOLossArgs* raw) { + PPOLossArgsTorch* args = new PPOLossArgsTorch(); + args->N = raw->N; + args->T = raw->T; + args->A = raw->A; + args->clip_coef = raw->clip_coef; + args->vf_clip_coef = raw->vf_clip_coef; + args->vf_coef = raw->vf_coef; + args->ent_coef = raw->ent_coef; + + auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + auto opts_int = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA); + + args->logits = torch::from_blob(raw->logits, {raw->N, raw->T, raw->A}, opts).requires_grad_(true); + args->values_pred = torch::from_blob(raw->values_pred, {raw->N, raw->T}, opts).requires_grad_(true); + args->actions = torch::from_blob(raw->actions, {raw->N, raw->T}, opts_int); + args->old_logprobs = torch::from_blob(raw->old_logprobs, {raw->N, raw->T}, opts); + args->advantages = torch::from_blob(raw->advantages, {raw->N, raw->T}, opts); + args->prio = torch::from_blob(raw->prio, {raw->N}, opts); + args->values = torch::from_blob(raw->values, {raw->N, raw->T}, opts); + args->returns = torch::from_blob(raw->returns, {raw->N, raw->T}, opts); + args->adv_mean = torch::from_blob(raw->adv_mean, {1}, opts); + args->adv_std = torch::from_blob(raw->adv_std, {1}, opts); + + return args; +} 
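+// Reference-only sketch (an assumption, not the fused kernel itself): the wrappers
+// below are presumed to benchmark a standard clipped PPO surrogate with a clipped
+// value loss and an entropy bonus. The real fused_ppo_loss may differ in details
+// (e.g. how the prio weights enter); this helper is documentation only and is never called.
+static torch::Tensor ppo_loss_reference(
+        torch::Tensor logits, torch::Tensor values_pred, torch::Tensor actions,
+        torch::Tensor old_logprobs, torch::Tensor advantages,
+        torch::Tensor values, torch::Tensor returns,
+        torch::Tensor adv_mean, torch::Tensor adv_std,
+        float clip_coef, float vf_clip_coef, float vf_coef, float ent_coef) {
+    auto logprobs = torch::log_softmax(logits, -1);                      // (N, T, A)
+    auto new_logprob = logprobs.gather(-1, actions.unsqueeze(-1)).squeeze(-1);
+    auto entropy = -(logprobs.exp() * logprobs).sum(-1).mean();
+    auto ratio = (new_logprob - old_logprobs).exp();
+    auto adv = (advantages - adv_mean) / (adv_std + 1e-8);               // normalized advantages
+    auto pg_loss = torch::max(-adv * ratio,
+        -adv * torch::clamp(ratio, 1.0f - clip_coef, 1.0f + clip_coef)).mean();
+    auto v_clipped = values + torch::clamp(values_pred - values, -vf_clip_coef, vf_clip_coef);
+    auto v_loss = 0.5f * torch::max((values_pred - returns).pow(2),
+        (v_clipped - returns).pow(2)).mean();
+    return pg_loss + vf_coef * v_loss - ent_coef * entropy;
+}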
+ +void run_ppoloss_forward_torch(PPOLossArgsTorch* args) { + torch::NoGradGuard no_grad; + fused_ppo_loss( + args->logits, args->values_pred, args->actions, + args->old_logprobs, args->advantages, args->prio, + args->values, args->returns, args->adv_mean, args->adv_std, + args->clip_coef, args->vf_clip_coef, args->vf_coef, args->ent_coef); +} + +void run_ppoloss_backward_torch(PPOLossArgsTorch* args) { + args->logits.mutable_grad() = torch::Tensor(); + args->values_pred.mutable_grad() = torch::Tensor(); + args->loss.backward({}, /*retain_graph=*/true); +} + +void run_ppoloss_forward_cpp(PPOLossArgsTorch* args) { + torch::NoGradGuard no_grad; + fused_ppo_loss_cpp( + args->logits, args->values_pred, args->actions, + args->old_logprobs, args->advantages, args->prio, + args->values, args->returns, args->adv_mean, args->adv_std, + args->clip_coef, args->vf_clip_coef, args->vf_coef, args->ent_coef); +} + +#endif + +void profile_ppoloss(int batch, int seq, int actions) { + PPOLossArgs* args = create_ppolossargs(batch, seq, actions); + + int NT = batch*seq; + printf("ppo_loss (NT=%d, %dx%d, A=%d)\n", NT, batch, seq, actions); + + float fwd_ms = profile_kernel((kernel_fn)run_ppoloss_forward, args); + print_timing("\tforward", fwd_ms, NT); + + float bwd_ms = profile_kernel((kernel_fn)run_ppoloss_backward, args); + print_timing("\tbackward", bwd_ms, NT); + +#ifdef USE_TORCH + PPOLossArgsTorch* args_torch = create_ppolossargs_torch(args); + + float fwd_torch_ms = profile_kernel((kernel_fn)run_ppoloss_forward_torch, args_torch); + print_timing("\tforward (torch)", fwd_torch_ms, NT); + + args_torch->loss = fused_ppo_loss( + args_torch->logits, args_torch->values_pred, args_torch->actions, + args_torch->old_logprobs, args_torch->advantages, args_torch->prio, + args_torch->values, args_torch->returns, args_torch->adv_mean, args_torch->adv_std, + args_torch->clip_coef, args_torch->vf_clip_coef, args_torch->vf_coef, args_torch->ent_coef)[0]; + + float bwd_torch_ms = profile_kernel((kernel_fn)run_ppoloss_backward_torch, args_torch); + print_timing("\tbackward (torch)", bwd_torch_ms, NT); + + float fwd_cpp_ms = profile_kernel((kernel_fn)run_ppoloss_forward_cpp, args_torch); + print_timing("\tforward (cpp)", fwd_cpp_ms, NT); + + args_torch->loss = fused_ppo_loss_cpp( + args_torch->logits, args_torch->values_pred, args_torch->actions, + args_torch->old_logprobs, args_torch->advantages, args_torch->prio, + args_torch->values, args_torch->returns, args_torch->adv_mean, args_torch->adv_std, + args_torch->clip_coef, args_torch->vf_clip_coef, args_torch->vf_coef, args_torch->ent_coef); + + float bwd_cpp_ms = profile_kernel((kernel_fn)run_ppoloss_backward_torch, args_torch); + print_timing("\tbackward (cpp)", bwd_cpp_ms, NT); + + float fwd_graph_ms = profile_graph((kernel_fn)run_ppoloss_forward_cpp, args_torch); + print_timing("\tforward (graph)", fwd_graph_ms, NT); + + delete args_torch; +#endif + printf("\n"); + + free_ppolossargs(args); +} + +int main(int argc, char** argv) { + warmup_gpu(); + profile_mingrugate(BR, H); + profile_logcoeffsandvalues(BT, T, H); + profile_logcumsumexp(BT, T, H); + profile_fusedscan(BT, T, H); + profile_ppoloss(BT, T, A); + return 0; +} diff --git a/profile_torch.py b/profile_torch.py new file mode 100644 index 000000000..656e7447a --- /dev/null +++ b/profile_torch.py @@ -0,0 +1,65 @@ +import torch +from torch import nn +from torch.utils.benchmark import Timer +from torch.utils.flop_counter import FlopCounterMode + +from torch.backends import cudnn +cudnn.benchmark = True 
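+# benchmark=True lets cuDNN autotune kernels per input shape; together with the
+# deterministic / benchmark_limit settings below, reproducibility is traded for speed.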
+cudnn.deterministic = False +cudnn.benchmark_limit = 32 + +torch.set_float32_matmul_precision('high') +torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True + +INPUT_SIZE = 128 +HIDDEN_SIZE1 = 128 +HIDDEN_SIZE2 = 512 +OUTPUT_SIZE = 128 +B = 8192 +dtype = torch.bfloat16 +inner_loops = 100 # Number of inner iterations to amortize overhead + +# Define the model with explicit Kaiming uniform initialization to match JAX +model = torch.nn.Sequential( + torch.nn.Linear(INPUT_SIZE, HIDDEN_SIZE1), + torch.nn.ReLU(), + torch.nn.Linear(HIDDEN_SIZE1, HIDDEN_SIZE2), + torch.nn.ReLU(), + torch.nn.Linear(HIDDEN_SIZE2, OUTPUT_SIZE), +).cuda().to(dtype) + +# Create input batch +batch = torch.randn(B, INPUT_SIZE).cuda().to(dtype) + +# Define a multi-step function to run multiple forwards in one compiled graph +@torch.compile(mode='max-autotune') +def multi_step(model, batch, inner_loops): + with torch.no_grad(): + carry = torch.tensor(0.0, dtype=torch.float32, device='cuda') + for i in range(inner_loops): + y = model(batch) + carry = carry + y.sum() + + return carry + +# Manual FLOPs calculation to match JAX (ignores bias adds and ReLUs as negligible) +flops = ( + 2 * B * INPUT_SIZE * HIDDEN_SIZE1 + + 2 * B * HIDDEN_SIZE1 * HIDDEN_SIZE2 + + 2 * B * HIDDEN_SIZE2 * OUTPUT_SIZE +) + +# Warmup +for _ in range(10): + _ = multi_step(model, batch, inner_loops) + +# Timing +timer = Timer( + stmt='multi_step(model, batch, inner_loops)', + globals={'multi_step': multi_step, 'model': model, 'batch': batch, 'inner_loops': inner_loops} +) +output = timer.timeit(50) + +cost = output.mean / inner_loops # Average time per forward pass (fixed from times[0] to mean) +FLOPS = flops / cost +print(f'TFLOPS: {FLOPS / 1e12:.2f}') diff --git a/pufferlib/config/cogames.ini b/pufferlib/config/cogames.ini index 674b48e2e..b50ad3bef 100644 --- a/pufferlib/config/cogames.ini +++ b/pufferlib/config/cogames.ini @@ -5,7 +5,7 @@ policy_name = Policy rnn_name = Recurrent [vec] -num_envs = 64 +num_envs = 4096 num_workers = 16 batch_size = auto zero_copy = True @@ -15,7 +15,7 @@ render_mode = none variants = heart_chorus inventory_heart_tune [train] -total_timesteps = 50_000_000 +total_timesteps = 3_000_000_000 batch_size = auto -minibatch_size = 1024 +minibatch_size = 32768 bptt_horizon = 64 diff --git a/pufferlib/config/default.ini b/pufferlib/config/default.ini index cc4bf1dae..595dc2261 100644 --- a/pufferlib/config/default.ini +++ b/pufferlib/config/default.ini @@ -32,7 +32,7 @@ anneal_lr = True min_lr_ratio = 0.0 gamma = 0.995 gae_lambda = 0.90 -update_epochs = 1 +num_minibatches = 16 clip_coef = 0.2 vf_coef = 2.0 vf_clip_coef = 0.2 @@ -51,7 +51,7 @@ minibatch_size = 8192 max_minibatch_size = 32768 bptt_horizon = 64 compile = False -compile_mode = max-autotune-no-cudagraphs +compile_mode = reduce-overhead compile_fullgraph = True vtrace_rho_clip = 1.0 @@ -60,6 +60,8 @@ vtrace_c_clip = 1.0 prio_alpha = 0.8 prio_beta0 = 0.2 +max_cost = -1 + [sweep] method = Protein metric = score @@ -82,6 +84,20 @@ min = 3e7 max = 1e10 scale = time +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 16 +max = 1024 +mean = 128 +scale = auto + +[sweep.env.num_envs] +distribution = uniform_pow2 +min = 1 +max = 4096 +mean = 2048 +scale = auto + [sweep.train.bptt_horizon] distribution = uniform_pow2 min = 16 @@ -90,7 +106,7 @@ scale = auto [sweep.train.minibatch_size] distribution = uniform_pow2 -min = 8192 +min = 512 max = 65536 scale = auto @@ -130,11 +146,12 @@ min = 0.1 max = 5.0 scale = auto -#[sweep.train.update_epochs] 
-#distribution = int_uniform -#min = 1 -#max = 8 -#scale = 2.0 +[sweep.train.num_minibatches] +distribution = uniform_pow2 +min = 1 +max = 1024 +mean = 32 +scale = auto [sweep.train.clip_coef] distribution = uniform diff --git a/pufferlib/config/ocean/breakout.ini b/pufferlib/config/ocean/breakout.ini index d261503f5..dabe4e737 100644 --- a/pufferlib/config/ocean/breakout.ini +++ b/pufferlib/config/ocean/breakout.ini @@ -1,14 +1,16 @@ [base] package = ocean env_name = puffer_breakout -policy_name = Policy +policy_name = MinGRU rnn_name = Recurrent [vec] -num_envs = 8 +#num_envs = 4 +num_envs = 1 [env] -num_envs = 1024 +#num_envs = 2048 +num_envs = 8192 frameskip = 4 width = 576 height = 330 @@ -27,13 +29,46 @@ continuous = 0 [policy] hidden_size = 128 +num_layers = 4 +expansion_factor = 1 -[rnn] -input_size = 128 -hidden_size = 128 +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 16 +max = 512 +mean = 128 +scale = auto + +[sweep.policy.num_layers] +distribution = int_uniform +min = 1 +max = 4 +mean = 2 +scale = auto + +#[sweep.policy.d_state] +#distribution = uniform_pow2 +#min = 32 +#max = 128 +#mean = 32 +#scale = auto + +#[sweep.policy.d_conv] +#distribution = int_uniform +#min = 1 +#max = 4 +#mean = 2 +#scale = auto + +[sweep.policy.expansion_factor] +distribution = int_uniform +min = 1 +max = 2 +mean = 1 +scale = auto [train] -total_timesteps = 90_000_000 +total_timesteps = 120_000_000 adam_beta1 = 0.8946507418260217 adam_beta2 = 0.9 adam_eps = 0.0001 @@ -53,11 +88,56 @@ vf_coef = 1.6832989594296321 vtrace_c_clip = 2.878171091654008 vtrace_rho_clip = 0.7876748061547312 -[sweep.train.total_timesteps] -distribution = log_normal -min = 3e7 -max = 2e8 -mean = 8e7 +#total_timesteps = 120_000_000 +#adam_beta1 = 0.8166332218104871 +#adam_beta2 = 0.9984879989750705 +#adam_eps = 0.0001 +#batch_size = auto +#bptt_horizon = 64 +#clip_coef = 0.42526610231849393 +#ent_coef = 0.0026822968018267775 +#gae_lambda = 0.995 +#gamma = 0.9731819086255716 +#learning_rate = 0.04301709139429238 +#max_grad_norm = 0.7029618837611082 +#minibatch_size = 16384 +#prio_alpha = 0.09999999999999998 +#prio_beta0 = 0.8437844355214735 +#vf_clip_coef = 0.807798225723059 +#vf_coef = 2.9089121311247554 +#vtrace_c_clip = 1.6205569942514606 +#vtrace_rho_clip = 1.1777184656786774 + +#total_timesteps = 40_000_000 +#adam_beta1 = 0.9389740236912132 +#adam_beta2 = 0.9998225039929157 +#adam_eps = 1.0267361590791064e-8 +#batch_size = auto +#bptt_horizon = 64 +#clip_coef = 0.01557913923814178 +#ent_coef = 0.0031759371032913 +#gae_lambda = 0.916681264452842 +#gamma = 0.9997053654668936 +#learning_rate = 0.012744235594115342 +#max_grad_norm = 1.8013800046071862 +#num_minibatches = 8 +#minibatch_size = 4096 +#prio_alpha = 0.9500430793857082 +#prio_beta0 = 0.9436845548994959 +#vf_clip_coef = 0.1 +#vf_coef = 2.5994729835919834 +#vtrace_c_clip = 2.878171091654008 +#vtrace_rho_clip = 1.3235791596831579 + +[sweep] +downsample = 10 +max_cost = 300 + +[sweep.env.num_envs] +distribution = uniform_pow2 +min = 1 +max = 4096 +mean = 2048 scale = auto [sweep.env.frameskip] diff --git a/pufferlib/config/ocean/g2048.ini b/pufferlib/config/ocean/g2048.ini index 3ca7f4e8c..1e8196051 100644 --- a/pufferlib/config/ocean/g2048.ini +++ b/pufferlib/config/ocean/g2048.ini @@ -1,11 +1,13 @@ [base] package = ocean env_name = puffer_g2048 -policy_name = G2048 +policy_name = G2048LSTM rnn_name = Recurrent [policy] hidden_size = 512 +#num_layers = 4 +expansion_factor = 1 [rnn] input_size = 512 @@ -22,6 +24,42 @@ scaffolding_ratio = 0.67 
use_heuristic_rewards = True snake_reward_weight = 0.0005 +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 16 +max = 256 +mean = 128 +scale = auto + +[sweep.policy.num_layers] +distribution = int_uniform +min = 1 +max = 4 +mean = 2 +scale = auto + +[sweep.policy.d_state] +distribution = uniform_pow2 +min = 8 +max = 128 +mean = 32 +scale = auto + +[sweep.policy.d_conv] +distribution = int_uniform +min = 1 +max = 4 +mean = 2 +scale = auto + +[sweep.policy.expand] +distribution = int_uniform +min = 1 +max = 2 +mean = 1 +scale = auto + + [train] # 512 hidden: https://wandb.ai/kywch/pufferlib/runs/5thsjr61?nw=nwuserkywch total_timesteps = 6_767_676_767 @@ -30,6 +68,7 @@ min_lr_ratio = 0.15 batch_size = auto bptt_horizon = 64 minibatch_size = 32768 +num_minibatches = 32 clip_coef = 0.067 ent_coef = 0.0267 @@ -164,4 +203,4 @@ scale = auto ; min = 0.001 ; max = 0.5 ; mean = 0.05 -; scale = auto \ No newline at end of file +; scale = auto diff --git a/pufferlib/config/ocean/grid.ini b/pufferlib/config/ocean/grid.ini index 65bd540b6..e28885c2b 100644 --- a/pufferlib/config/ocean/grid.ini +++ b/pufferlib/config/ocean/grid.ini @@ -7,10 +7,6 @@ rnn_name = Recurrent [policy] hidden_size = 512 -[rnn] -input_size = 512 -hidden_size = 512 - [vec] #num_envs = 8 num_envs = 1 @@ -63,10 +59,25 @@ vtrace_rho_clip = 4.7398234531013985 [sweep] downsample = 0 +max_cost = 300 [sweep.train.total_timesteps] distribution = log_normal -min = 3e8 -max = 6e8 +min = 1e7 +max = 1e9 mean = 3e8 scale = time + +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 16 +max = 1024 +mean = 128 +scale = auto + +[sweep.env.num_envs] +distribution = uniform_pow2 +min = 1 +max = 4096 +mean = 2048 +scale = auto diff --git a/pufferlib/config/ocean/impulse_wars.ini b/pufferlib/config/ocean/impulse_wars.ini index c4b3bdcc1..50adb8008 100644 --- a/pufferlib/config/ocean/impulse_wars.ini +++ b/pufferlib/config/ocean/impulse_wars.ini @@ -6,9 +6,8 @@ rnn_name = ImpulseWarsLSTM max_suggestion_cost = 10_800 [policy] -cnn_channels = 64 -input_size = 512 hidden_size = 512 +cnn_channels = 64 # These must match what's set in env below continuous = False @@ -16,12 +15,12 @@ num_drones = 2 is_training = True [vec] -num_envs = 16 -num_workers = 16 -batch_size = 4 +num_envs = 4 +#num_workers = 4 +#batch_size = 4 [env] -num_envs = 256 +num_envs = 1024 num_drones = 2 num_agents = 1 enable_teams = False @@ -40,10 +39,14 @@ compile_mode = reduce-overhead compile_fullgraph = False device = cuda +[sweep] +downsample = 10 +max_cost = 900 + [sweep.env.num_envs] distribution = uniform_pow2 -min = 16 -max = 512 +min = 1 +max = 1024 mean = 128 scale = auto @@ -140,51 +143,3 @@ max = 256 mean = 128 scale = auto -[sweep.train.minibatch_size] -distribution = uniform_pow2 -min = 1024 -max = 262_144 -mean = 16_384 -scale = auto - -[sweep.train.learning_rate] -distribution = log_normal -min = 0.00001 -mean = 0.001 -max = 0.1 -scale = 0.5 - -[sweep.train.ent_coef] -distribution = log_normal -min = 0.000001 -mean = 0.001 -max = 0.2 -scale = auto - -[sweep.train.gamma] -distribution = logit_normal -min = 0.8 -mean = 0.98 -max = 0.99999 -scale = auto - -[sweep.train.gae_lambda] -distribution = logit_normal -min = 0.6 -mean = 0.93 -max = 0.995 -scale = auto - -[sweep.train.vf_coef] -distribution = uniform -min = 0.0 -max = 5.0 -mean = 1.0 -scale = auto - -[sweep.train.max_grad_norm] -distribution = uniform -min = 0.0 -mean = 1.0 -max = 5.0 -scale = auto diff --git a/pufferlib/config/ocean/moba.ini b/pufferlib/config/ocean/moba.ini index 
2e0e8cea3..73bcdeb68 100644 --- a/pufferlib/config/ocean/moba.ini +++ b/pufferlib/config/ocean/moba.ini @@ -12,18 +12,36 @@ reward_tower = 4.525112152099609 num_envs = 128 [vec] -num_envs = 8 +num_envs = 4 [train] total_timesteps = 150_000_000 +[sweep] +downsample = 10 +max_cost = 500 + [sweep.train.total_timesteps] distribution = log_normal min = 2e7 -max = 2e8 +max = 5e8 mean = 1e8 scale = auto +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 16 +max = 1024 +mean = 128 +scale = auto + +[sweep.env.num_envs] +distribution = uniform_pow2 +min = 1 +max = 4096 +mean = 2048 +scale = auto + [sweep.env.reward_death] distribution = uniform min = -1.0 diff --git a/pufferlib/config/ocean/nmmo3.ini b/pufferlib/config/ocean/nmmo3.ini index c04c77dc3..cb719cc41 100644 --- a/pufferlib/config/ocean/nmmo3.ini +++ b/pufferlib/config/ocean/nmmo3.ini @@ -1,11 +1,11 @@ [base] package = ocean env_name = puffer_nmmo3 -policy_name = NMMO3 +policy_name = NMMO3MinGRU rnn_name = NMMO3LSTM [vec] -num_envs = 8 +num_envs = 4 [env] reward_combat_level = 1.0 @@ -13,7 +13,12 @@ reward_prof_level = 1.0 reward_item_level = 1.0 reward_market = 0.0 reward_death = -1.0 -num_envs = 1 +num_envs = 2 + +[policy] +hidden_size = 512 +num_layers = 4 +expansion_factor = 1 [train] total_timesteps = 107000000000 @@ -31,6 +36,7 @@ max_minibatch_size = 32768 [sweep] metric = min_comb_prof +max_cost = 900 [sweep.env.num_envs] distribution = uniform_pow2 @@ -39,13 +45,6 @@ max = 8 mean = 4 scale = 0.5 -[sweep.train.total_timesteps] -distribution = log_normal -min = 2e8 -max = 1e9 -mean = 5e8 -scale = 0.5 - [sweep.env.reward_combat_level] distribution = uniform min = 0.0 diff --git a/pufferlib/config/ocean/pacman.ini b/pufferlib/config/ocean/pacman.ini index 45055e79b..07f03517e 100644 --- a/pufferlib/config/ocean/pacman.ini +++ b/pufferlib/config/ocean/pacman.ini @@ -5,7 +5,7 @@ policy_name = Policy rnn_name = Recurrent [vec] -num_envs = 8 +num_envs = 4 [env] num_envs = 1024 @@ -31,3 +31,27 @@ vf_coef = 0.31518694995467555 vtrace_c_clip = 0.30575543665366217 vtrace_rho_clip = 1.5301756939690652 +[sweep] +downsample = 10 +max_cost = 300 + +[sweep.train.total_timesteps] +distribution = log_normal +min = 2e7 +max = 5e8 +mean = 1e8 +scale = auto + +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 16 +max = 1024 +mean = 128 +scale = auto + +[sweep.env.num_envs] +distribution = uniform_pow2 +min = 1 +max = 4096 +mean = 2048 +scale = auto diff --git a/pufferlib/config/ocean/pong.ini b/pufferlib/config/ocean/pong.ini index a0bf24d93..9ca522646 100644 --- a/pufferlib/config/ocean/pong.ini +++ b/pufferlib/config/ocean/pong.ini @@ -4,15 +4,20 @@ env_name = puffer_pong policy_name = Policy rnn_name = Recurrent +[policy] +hidden_size = 512 + [vec] num_envs = 4 [env] -num_envs = 1024 -frameskip = 8 +num_envs = 1024 +frameskip = 4 [train] -total_timesteps = 12_000_000 +max_cost=20 + +total_timesteps = 500_000 adam_beta1 = 0.9766295300012044 adam_beta2 = 0.9998113167362397 adam_eps = 6.301709731262074e-9 @@ -24,6 +29,7 @@ gamma = 0.9608378504980243 learning_rate = 0.07109386062895108 max_grad_norm = 1.7820203601055993 minibatch_size = 32768 +num_minibatches = 8 prio_alpha = 0.09999999999999998 prio_beta0 = 0.7475661360032159 vf_clip_coef = 2.7025841941932303 @@ -31,16 +37,51 @@ vf_coef = 1.9960893747329385 vtrace_c_clip = 1.0873122745787867 vtrace_rho_clip = 2.784150207139061 +#total_timesteps = 20000000.0 +#learning_rate = 0.08878791349515394 +#gamma = 0.9354145180237635 +#gae_lambda = 0.9020935398076688 +#num_minibatches = 
32 +#clip_coef = 0.5882777043345978 +#vf_coef = 4.196442104147645 +#vf_clip_coef = 0.265385659520976 +#max_grad_norm = 0.3661413663411234 +#ent_coef = 0.0011560317997450196 +#adam_beta1 = 0.9462393585831101 +#adam_beta2 = 0.9667417156941432 +#adam_eps = 1.1005478999774079e-09 +#minibatch_size = 65536 +#max_minibatch_size = 32768 +#bptt_horizon = 64 +#vtrace_rho_clip = 1.8180933155594725 +#vtrace_c_clip = 1.4235484929825957 +#prio_alpha = 0.9553779337727483 +#prio_beta0 = 0.7125182812602482 + + +[sweep] +downsample = 0 + +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 16 +max = 1024 +scale = auto + [sweep.train.total_timesteps] distribution = log_normal -min = 1e7 -max = 2e8 -mean = 8e7 +min = 5e5 +max = 5e6 +scale = auto + +[sweep.env.num_envs] +distribution = uniform_pow2 +min = 1 +max = 4096 scale = auto [sweep.env.frameskip] distribution = int_uniform min = 1 max = 8 -mean = 4 scale = 2.0 diff --git a/pufferlib/config/ocean/rware.ini b/pufferlib/config/ocean/rware.ini index 705e0af3e..791c426f4 100644 --- a/pufferlib/config/ocean/rware.ini +++ b/pufferlib/config/ocean/rware.ini @@ -5,10 +5,10 @@ policy_name = Policy rnn_name = Recurrent [vec] -num_envs = 8 +num_envs = 4 [env] -num_envs = 128 +num_envs = 256 map_choice = 2 num_agents = 8 num_requested_shelves = 8 @@ -17,10 +17,3 @@ num_requested_shelves = 8 total_timesteps = 100_000_000 learning_rate = 0.05 minibatch_size = 32768 - -[sweep.train.total_timesteps] -distribution = log_normal -min = 3e7 -max = 3e8 -mean = 1e8 -scale = 0.25 diff --git a/pufferlib/config/ocean/snake.ini b/pufferlib/config/ocean/snake.ini index 3827b0252..9eafe9400 100644 --- a/pufferlib/config/ocean/snake.ini +++ b/pufferlib/config/ocean/snake.ini @@ -6,7 +6,7 @@ policy_name = Snake rnn_name = Recurrent [env] -num_envs = 4 +num_envs = 16 width = 640 height = 360 num_snakes = 256 @@ -18,7 +18,7 @@ reward_corpse = 0.1 reward_death = -1.0 [vec] -num_envs = 16 +num_envs = 1 [train] total_timesteps = 500_000_000 @@ -40,6 +40,9 @@ vf_coef = 3.9655925817980053 vtrace_c_clip = 0 vtrace_rho_clip = 0.9285200248552337 +[sweep] +max_cost = 500 + [sweep.env.reward_food] distribution = uniform min = 0.0 @@ -56,7 +59,21 @@ scale = auto [sweep.train.total_timesteps] distribution = log_normal -min = 5e7 -max = 2e8 +min = 2e7 +max = 5e8 mean = 1e8 scale = auto + +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 16 +max = 1024 +mean = 128 +scale = auto + +[sweep.env.num_envs] +distribution = uniform_pow2 +min = 1 +max = 32 +mean = 8 +scale = auto diff --git a/pufferlib/config/ocean/squared.ini b/pufferlib/config/ocean/squared.ini index ac9f69d0f..c4e4ad9db 100644 --- a/pufferlib/config/ocean/squared.ini +++ b/pufferlib/config/ocean/squared.ini @@ -1,14 +1,27 @@ [base] package = ocean env_name = puffer_squared -policy_name = Policy +policy_name = MinGRU rnn_name = Recurrent +[vec] +num_envs = 1 +backend = Serial + +[policy] +hidden_size = 128 +num_layers = 1 +expand = 2 + [env] num_envs = 4096 [train] -total_timesteps = 20_000_000 -gamma = 0.95 -learning_rate = 0.05 +optimizer = adam +total_timesteps = 200_000_000 +gamma = 0.99 +learning_rate = 0.01 minibatch_size = 32768 +num_minibatches = 8 +ent_coef = 0.0 # TODO: Are numerics bad here in cpp? 
+#adam_eps = 1e-5 diff --git a/pufferlib/config/ocean/tetris.ini b/pufferlib/config/ocean/tetris.ini index 5aab21422..6d53031d4 100644 --- a/pufferlib/config/ocean/tetris.ini +++ b/pufferlib/config/ocean/tetris.ini @@ -1,11 +1,11 @@ [base] package = ocean env_name = puffer_tetris -policy_name = Policy +policy_name = MinGRU rnn_name = Recurrent [vec] -num_envs = 8 +num_envs = 4 [env] num_envs = 2048 @@ -18,10 +18,45 @@ n_noise_obs = 0 [policy] hidden_size = 256 +num_layers = 1 +#d_state = 32 +#d_conv = 4 +expand = 2 -[rnn] -input_size = 256 -hidden_size = 256 +[sweep.policy.hidden_size] +distribution = uniform_pow2 +min = 16 +max = 512 +mean = 128 +scale = auto + +[sweep.policy.num_layers] +distribution = int_uniform +min = 1 +max = 4 +mean = 2 +scale = auto + +[sweep.policy.d_state] +distribution = uniform_pow2 +min = 8 +max = 128 +mean = 32 +scale = auto + +#[sweep.policy.d_conv] +#distribution = int_uniform +#min = 1 +#max = 4 +#mean = 2 +#scale = auto + +[sweep.policy.expand] +distribution = int_uniform +min = 1 +max = 2 +mean = 1 +scale = auto [train] # https://wandb.ai/kywch/pufferlib/runs/era6a8p6?nw=nwuserkywch @@ -46,10 +81,10 @@ vf_coef = 4.74 vtrace_c_clip = 1.29 vtrace_rho_clip = 0.70 - [sweep] metric = score goal = maximize +max_cost = 3600 [sweep.train.total_timesteps] distribution = log_normal @@ -78,3 +113,10 @@ min = 0.5 mean = 0.95 max = 0.999 scale = auto + +[sweep.env.num_envs] +distribution = uniform_pow2 +min = 1 +max = 4096 +mean = 2048 +scale = auto diff --git a/pufferlib/config/ocean/tower_climb.ini b/pufferlib/config/ocean/tower_climb.ini index ce6f75d59..629c09cbc 100644 --- a/pufferlib/config/ocean/tower_climb.ini +++ b/pufferlib/config/ocean/tower_climb.ini @@ -2,56 +2,86 @@ package = ocean env_name = puffer_tower_climb policy_name = TowerClimb -rnn_name = TowerClimbLSTM +rnn_name = Recurrent + +[policy] +hidden_size = 256 + +[rnn] +hidden_size = 256 +num_layers = 1 [vec] -num_envs = 8 +num_envs = 4 [env] num_envs = 1024 -num_maps = 50 -reward_climb_row = 0.636873185634613 -reward_fall_row = -0.15898257493972778 -reward_illegal_move = -0.003928301855921745 -reward_move_block = 0.235064297914505 +reward_climb_row = 0.16 +reward_fall_row = -0.13 +reward_illegal_move = -0.005 +reward_move_block = 0.035 [train] -total_timesteps = 150_000_000 -#gamma = 0.98 -#learning_rate = 0.05 -minibatch_size = 32768 +# https://wandb.ai/kywch/pufferlib/runs/b8ym2mvu/overview +total_timesteps = 600_000_000 +anneal_lr = True +min_lr_ratio = 0.1 +batch_size = auto +bptt_horizon = 64 +minibatch_size = 65536 + +clip_coef = 0.6 +ent_coef = 0.08 +gae_lambda = 0.6 +gamma = 0.95 +vf_clip_coef = 5.0 +vf_coef = 5.0 + +learning_rate = 0.023 +max_grad_norm = 5.0 + +adam_beta1 = 0.81 +adam_beta2 = 0.95 +adam_eps = 1.0e-8 +prio_alpha = 0.99 +prio_beta0 = 0.99 +vtrace_c_clip = 3.7 +vtrace_rho_clip = 3.8 + +[sweep] +metric = perf +metric_distribution = percentile + +# configs for targeted sweep. 
Comment these out for broad sweep +; downsample = 1 +; sweep_only = reward_climb_row, reward_fall_row, reward_illegal_move, reward_move_block, learning_rate, adam_beta1, adam_beta2, adam_eps, vtrace_c_clip, vtrace_rho_clip [sweep.train.total_timesteps] distribution = uniform -min = 50_000_000 -max = 200_000_000 -mean = 100_000_000 +min = 100_000_000 +max = 2_000_000_000 scale = 0.5 [sweep.env.reward_climb_row] distribution = uniform min = 0.0 max = 1.0 -mean = 0.5 scale = auto [sweep.env.reward_fall_row] distribution = uniform min = -1.0 max = 0.0 -mean = -0.5 scale = auto [sweep.env.reward_illegal_move] distribution = uniform min = -1e-2 max = -1e-4 -mean = -1e-3 scale = auto [sweep.env.reward_move_block] distribution = uniform min = 0.0 max = 1.0 -mean = 0.5 scale = auto diff --git a/pufferlib/config/ocean/tripletriad.ini b/pufferlib/config/ocean/tripletriad.ini index aae55d096..4d4a1ffd8 100644 --- a/pufferlib/config/ocean/tripletriad.ini +++ b/pufferlib/config/ocean/tripletriad.ini @@ -8,14 +8,14 @@ rnn_name = Recurrent num_envs = 1024 [vec] -num_envs = 8 +num_envs = 4 [train] total_timesteps = 100_000_000 [sweep.train.total_timesteps] distribution = log_normal -min = 5e7 +min = 1e7 max = 2e8 mean = 1e8 -scale = 0.25 +scale = time diff --git a/pufferlib/environments/cogames/environment.py b/pufferlib/environments/cogames/environment.py index 0fbe47595..61e36cd27 100644 --- a/pufferlib/environments/cogames/environment.py +++ b/pufferlib/environments/cogames/environment.py @@ -21,7 +21,7 @@ def make(name="cogames.cogs_v_clips.machina_1.open_world", variants=None, cogs=N simulator = Simulator() simulator.add_event_handler(StatsTracker(NoopStatsWriter())) env = PufferMettaGridEnv(simulator=simulator, cfg=env_cfg, buf=buf, seed=seed or 0) - env.render_mode = render + #env.render_mode = render if seed: env.reset(seed) return env diff --git a/pufferlib/extensions/breakout.c b/pufferlib/extensions/breakout.c new file mode 100644 index 000000000..560fa16a2 --- /dev/null +++ b/pufferlib/extensions/breakout.c @@ -0,0 +1,34 @@ +#include "../ocean/breakout/breakout.h" +#define OBS_SIZE 118 +#define ACT_SIZE 1 +#define OBS_TYPE FLOAT +#define ACT_TYPE FLOAT + +#define Env Breakout +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->frameskip = dict_get(kwargs, "frameskip")->int_value; + env->width = dict_get(kwargs, "width")->int_value; + env->height = dict_get(kwargs, "height")->int_value; + env->initial_paddle_width = dict_get(kwargs, "paddle_width")->int_value; + env->paddle_height = dict_get(kwargs, "paddle_height")->int_value; + env->ball_width = dict_get(kwargs, "ball_width")->int_value; + env->ball_height = dict_get(kwargs, "ball_height")->int_value; + env->brick_width = dict_get(kwargs, "brick_width")->int_value; + env->brick_height = dict_get(kwargs, "brick_height")->int_value; + env->brick_rows = dict_get(kwargs, "brick_rows")->int_value; + env->brick_cols = dict_get(kwargs, "brick_cols")->int_value; + env->initial_ball_speed = dict_get(kwargs, "initial_ball_speed")->int_value; + env->max_ball_speed = dict_get(kwargs, "max_ball_speed")->int_value; + env->paddle_speed = dict_get(kwargs, "paddle_speed")->int_value; + env->continuous = dict_get(kwargs, "continuous")->int_value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set_float(out, "perf", log->perf); + dict_set_float(out, "score", log->score); + dict_set_float(out, "episode_return", log->episode_return); + dict_set_float(out, "episode_length", log->episode_length); +} diff --git 
a/pufferlib/extensions/cuda/kernels.cu b/pufferlib/extensions/cuda/kernels.cu new file mode 100644 index 000000000..8bfdfcf2a --- /dev/null +++ b/pufferlib/extensions/cuda/kernels.cu @@ -0,0 +1,1518 @@ +/* Kernels must launch on the current torch stream to be traced by cudagraphs. + * Launch functions take cudaStream_t as parameter - callers (modules.cu) should + * pass at::cuda::getCurrentCUDAStream() when using with torch. + */ + +#include +#include "ops.cuh" +#include +#include + +#include +#include + +#define SEQ_SIZE 32 +#define BLOCK_SIZE 256 +inline int grid_size(int N) { + return (N + BLOCK_SIZE - 1) / BLOCK_SIZE; +} +inline int seq_size(int N) { + return (N + SEQ_SIZE - 1) / SEQ_SIZE; +} + +// If you can get this to work, go ahead. I tried. +// NVCC won't parse templated types in kernel launches +/* +template