
Bigram Language Model

  • This is a simple transformer-based Bigram Language Model.

  • The model has approximately 0.21 million (209,729) parameters.

  • For the complete code implementation, refer to the notebook.

import torch

# Path to the trained checkpoint (a saved state_dict) on Google Drive
model_path = "/content/drive/MyDrive/Rakuten/GPT/model/BiGmodel.pth"

# Instantiate the architecture (the class is defined in the notebook) and load the weights on CPU
model = BigramLanguageModel()
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
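
The parameter count quoted above and the memory sizes reported below can be reproduced from the loaded model. A minimal sketch (the helper model_size_mb is my name, not the notebook's; it counts parameters plus any registered buffers, such as the causal-mask tensors, in MiB):

# Total trainable parameters: prints 0.209729 million for this model
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.6f} million parameters")

# Approximate in-memory footprint: parameters + registered buffers
def model_size_mb(m):
    param_bytes = sum(p.nelement() * p.element_size() for p in m.parameters())
    buffer_bytes = sum(b.nelement() * b.element_size() for b in m.buffers())
    return (param_bytes + buffer_bytes) / 1024 ** 2

print(f"Model size: {model_size_mb(model):.2f} MB")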

Before Quantization

  • Model:

    BigramLanguageModel(
      (token_embedding_table): Embedding(65, 64)
      (position_embedding_table): Embedding(32, 64)
      (blocks): Sequential(
        (0): Block(
          (sa): MultiHeadAttention(
            (heads): ModuleList(
              (0-3): 4 x Head(
                (key): Linear(in_features=64, out_features=16, bias=False)
                (query): Linear(in_features=64, out_features=16, bias=False)
                (value): Linear(in_features=64, out_features=16, bias=False)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): Linear(in_features=64, out_features=64, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ffwd): FeedFoward(
            (net): Sequential(
              (0): Linear(in_features=64, out_features=256, bias=True)
              (1): ReLU()
              (2): Linear(in_features=256, out_features=64, bias=True)
              (3): Dropout(p=0.0, inplace=False)
            )
          )
          (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (1): Block(
          (sa): MultiHeadAttention(
            (heads): ModuleList(
              (0-3): 4 x Head(
                (key): Linear(in_features=64, out_features=16, bias=False)
                (query): Linear(in_features=64, out_features=16, bias=False)
                (value): Linear(in_features=64, out_features=16, bias=False)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): Linear(in_features=64, out_features=64, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ffwd): FeedFoward(
            (net): Sequential(
              (0): Linear(in_features=64, out_features=256, bias=True)
              (1): ReLU()
              (2): Linear(in_features=256, out_features=64, bias=True)
              (3): Dropout(p=0.0, inplace=False)
            )
          )
          (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (2): Block(
          (sa): MultiHeadAttention(
            (heads): ModuleList(
              (0-3): 4 x Head(
                (key): Linear(in_features=64, out_features=16, bias=False)
                (query): Linear(in_features=64, out_features=16, bias=False)
                (value): Linear(in_features=64, out_features=16, bias=False)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): Linear(in_features=64, out_features=64, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ffwd): FeedFoward(
            (net): Sequential(
              (0): Linear(in_features=64, out_features=256, bias=True)
              (1): ReLU()
              (2): Linear(in_features=256, out_features=64, bias=True)
              (3): Dropout(p=0.0, inplace=False)
            )
          )
          (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (3): Block(
          (sa): MultiHeadAttention(
            (heads): ModuleList(
              (0-3): 4 x Head(
                (key): Linear(in_features=64, out_features=16, bias=False)
                (query): Linear(in_features=64, out_features=16, bias=False)
                (value): Linear(in_features=64, out_features=16, bias=False)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): Linear(in_features=64, out_features=64, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ffwd): FeedFoward(
            (net): Sequential(
              (0): Linear(in_features=64, out_features=256, bias=True)
              (1): ReLU()
              (2): Linear(in_features=256, out_features=64, bias=True)
              (3): Dropout(p=0.0, inplace=False)
            )
          )
          (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
      )
      (ln_f): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (lm_head): Linear(in_features=64, out_features=65, bias=True)
    )
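
For reference, the hyperparameters this tree implies (the variable names below are assumptions in the usual notation; the values are read directly off the printed modules):

vocab_size = 65    # token_embedding_table rows and lm_head outputs
n_embd     = 64    # embedding / residual width
block_size = 32    # position_embedding_table rows, i.e. the context length
n_layer    = 4     # Blocks 0-3
n_head     = 4     # heads per MultiHeadAttention
head_size  = 16    # per-head key/query/value width (n_embd // n_head)
dropout    = 0.0
# feed-forward hidden width: 256 = 4 * n_embd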
    

  • Model Size:

    Model size: 0.86 MB
    

  • Model Generation with max_new_tokens=100:
    BUCKINGHAM:
    Now Good thet, for
    gair, my but stail, frele with was you said, I her did-you as this b
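
The samples on this page are presumably produced with the model's generate method. A sketch, assuming the notebook provides the usual character-level decode helper and a generate(idx, max_new_tokens) method on the model:

# Start from a single zero token as context and sample 100 new tokens
context = torch.zeros((1, 1), dtype=torch.long)
out = model.generate(context, max_new_tokens=100)
print(decode(out[0].tolist()))   # decode(): notebook helper mapping token ids back to characters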
    

After Quantization
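
In the module tree below, every nn.Linear inside the transformer blocks has been replaced by a W8A16LinearLayer: the weights are stored as int8 with one scale per output channel, while the activations and the matmul stay in a higher-precision floating-point format (the "A16" in the name; fp32 in this model), the int8 weights being dequantized on the fly in the forward pass. The actual implementation is in the notebook; the sketch below is only illustrative, and every internal name in it (the buffers, quantize, replace_linear_with_w8a16) is an assumption.

import torch
import torch.nn as nn
import torch.nn.functional as F

class W8A16LinearLayer(nn.Module):
    """Linear layer that stores int8 weights plus a per-output-channel scale."""
    def __init__(self, in_features, out_features, bias=True, dtype=torch.float32):
        super().__init__()
        # int8 weights, scales and bias are buffers, not trainable parameters
        self.register_buffer("int8_weights",
                             torch.zeros((out_features, in_features), dtype=torch.int8))
        self.register_buffer("scales", torch.ones(out_features, dtype=dtype))
        self.register_buffer("bias",
                             torch.zeros(out_features, dtype=dtype) if bias else None)

    def quantize(self, weights):
        # Symmetric per-channel quantization of a floating-point weight matrix
        w = weights.detach().to(torch.float32)
        scales = w.abs().amax(dim=-1).clamp(min=1e-8) / 127.0
        self.int8_weights = torch.round(w / scales.unsqueeze(1)).to(torch.int8)
        self.scales = scales.to(weights.dtype)

    def forward(self, x):
        # Dequantize on the fly and run an ordinary linear op in the activation dtype
        w = self.int8_weights.to(x.dtype) * self.scales.unsqueeze(1)
        return F.linear(x, w, self.bias)

def replace_linear_with_w8a16(module, exclude=("lm_head",)):
    """Recursively swap nn.Linear children for quantized layers, copying their weights."""
    for name, child in module.named_children():
        if isinstance(child, nn.Linear) and name not in exclude:
            q = W8A16LinearLayer(child.in_features, child.out_features,
                                 bias=child.bias is not None, dtype=child.weight.dtype)
            q.quantize(child.weight.data)
            if child.bias is not None:
                q.bias = child.bias.data.clone()
            setattr(module, name, q)
        else:
            replace_linear_with_w8a16(child, exclude)

replace_linear_with_w8a16(model)   # however the notebook actually invokes it

Only the Linears inside the blocks are swapped; the embeddings, LayerNorms and the final lm_head stay in full precision, which matches the printed tree.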

  • Quantized Model:

    BigramLanguageModel(
      (token_embedding_table): Embedding(65, 64)
      (position_embedding_table): Embedding(32, 64)
      (blocks): Sequential(
        (0): Block(
          (sa): MultiHeadAttention(
            (heads): ModuleList(
              (0-3): 4 x Head(
                (key): W8A16LinearLayer()
                (query): W8A16LinearLayer()
                (value): W8A16LinearLayer()
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): W8A16LinearLayer()
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ffwd): FeedFoward(
            (net): Sequential(
              (0): W8A16LinearLayer()
              (1): ReLU()
              (2): W8A16LinearLayer()
              (3): Dropout(p=0.0, inplace=False)
            )
          )
          (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (1): Block(
          (sa): MultiHeadAttention(
            (heads): ModuleList(
              (0-3): 4 x Head(
                (key): W8A16LinearLayer()
                (query): W8A16LinearLayer()
                (value): W8A16LinearLayer()
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): W8A16LinearLayer()
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ffwd): FeedFoward(
            (net): Sequential(
              (0): W8A16LinearLayer()
              (1): ReLU()
              (2): W8A16LinearLayer()
              (3): Dropout(p=0.0, inplace=False)
            )
          )
          (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (2): Block(
          (sa): MultiHeadAttention(
            (heads): ModuleList(
              (0-3): 4 x Head(
                (key): W8A16LinearLayer()
                (query): W8A16LinearLayer()
                (value): W8A16LinearLayer()
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): W8A16LinearLayer()
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ffwd): FeedFoward(
            (net): Sequential(
              (0): W8A16LinearLayer()
              (1): ReLU()
              (2): W8A16LinearLayer()
              (3): Dropout(p=0.0, inplace=False)
            )
          )
          (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (3): Block(
          (sa): MultiHeadAttention(
            (heads): ModuleList(
              (0-3): 4 x Head(
                (key): W8A16LinearLayer()
                (query): W8A16LinearLayer()
                (value): W8A16LinearLayer()
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): W8A16LinearLayer()
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ffwd): FeedFoward(
            (net): Sequential(
              (0): W8A16LinearLayer()
              (1): ReLU()
              (2): W8A16LinearLayer()
              (3): Dropout(p=0.0, inplace=False)
            )
          )
          (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
      )
      (ln_f): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (lm_head): Linear(in_features=64, out_features=65, bias=True)
    )
    

  • Model Size:

    Model size: 0.31 MB
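
This reduction is roughly what the arithmetic predicts: of the ~210 K parameters, about 197 K are the Linear weights inside the four blocks, and quantization drops those from 4 bytes to 1 byte each (about 0.56 MB saved), while the embeddings, lm_head, LayerNorms, biases, the newly added per-channel scales and the attention-mask buffers stay in full precision. That takes roughly 0.86 MB down to about 0.31 MB rather than to a quarter of the original size.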
    

  • Quantized Model Generation with max_new_tokens=100:

    Wilt to thy bodve all's palt
    Than maked my segvereign ROMBELIZABETH:
    Wt slavefpore of me tout.
    
    HAST
    
