VOOZH about

URL: https://huggingface.co/George-Ogden/gptr2-nano-with-momentum-with-weight-decay

⇱ George-Ogden/gptr2-nano-with-momentum-with-weight-decay · Hugging Face


This model is significantly undertrained and designed for research purposes only.
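To use it with the `transformers` library, define the model class below, which replaces every LayerNorm in GPT-2 with an RMS norm, and then load the checkpoint: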

from transformers import AutoTokenizer, GPT2Model

import torch.nn as nn
import torch

class RMSLayerNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension.

    Each vector along the last axis is divided by
    ``sqrt(mean(x ** 2) + eps)``. No mean is subtracted and no bias is
    added. When ``affine`` is true the result is multiplied by a single
    learnable scalar ``weight`` (initialised to 1), not a per-feature vector.

    Args:
        normalized_shape: Stored on the module but not used by the
            computation, which always reduces over the last dimension.
        eps: Constant added to the mean square before taking the square
            root, so an all-zero input does not divide by zero.
        affine: Whether to create the learnable scalar gain.
    """

    def __init__(self, normalized_shape, eps=1e-8, affine=True):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.affine = affine

        if self.affine:
            # Zero-dimensional (scalar) gain. NOTE(review): presumably existing
            # checkpoints store this shape, so do not change it without checking.
            self.weight = nn.Parameter(torch.ones(()))
        else:
            self.register_parameter("weight", None)
        # There is never a bias term. Registering it as None in both modes
        # makes ``self.bias`` readable regardless of ``affine``; a None
        # parameter adds no entry to the state dict.
        self.register_parameter("bias", None)

    def forward(self, x):
        """Return ``x`` scaled to unit RMS along its last dimension.

        Args:
            x: Input tensor; the mean square is taken over ``dim=-1``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        x_normalized = x / rms
        if self.affine:
            x_normalized = x_normalized * self.weight
        return x_normalized


def replace(model):
    """Recursively swap every ``nn.LayerNorm`` in *model* for an ``RMSLayerNorm``.

    The module tree is modified in place. Each replacement copies the
    original layer's ``normalized_shape`` and ``eps`` and always enables the
    affine gain. Children that are not LayerNorm are descended into.

    Args:
        model: The ``nn.Module`` whose submodules should be rewritten.

    Returns:
        The same ``model`` object, to allow call chaining.
    """
    # Iterate over a snapshot so that reassigning attributes on ``model``
    # cannot interfere with the traversal of its children.
    for name, child in list(model.named_children()):
        if isinstance(child, nn.LayerNorm):
            replacement = RMSLayerNorm(
                child.normalized_shape, eps=child.eps, affine=True
            )
            setattr(model, name, replacement)
        else:
            replace(child)
    return model


class GPTR2Model(GPT2Model):
    """``GPT2Model`` variant whose LayerNorm modules are ``RMSLayerNorm``.

    Args:
        config: Model configuration, forwarded unchanged to ``GPT2Model``.
    """

    def __init__(self, config):
        super().__init__(config)
        # Swap the norms once the parent constructor has built the submodules.
        replace(self)

# Load the checkpoint into the RMS-norm variant of GPT-2 defined above.
model = GPTR2Model.from_pretrained(
    "George-Ogden/gptr2-nano-with-momentum-with-weight-decay"
)
# The tokenizer is loaded from the stock "gpt2" repository.
tokenizer = AutoTokenizer.from_pretrained(
    "gpt2"
)

For more details and example usage, see https://github.com/George-Ogden/residual-streams

Downloads last month
6
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Dataset used to train George-Ogden/gptr2-nano-with-momentum-with-weight-decay