{ "metadata": { "kernelspec": { "language": "python", "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python", "version": "3.10.14", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" }, "kaggle": { "accelerator": "nvidiaTeslaT4", "dataSources": [], "dockerImageVersionId": 30787, "isInternetEnabled": true, "language": "python", "sourceType": "notebook", "isGpuEnabled": true }, "colab": { "name": "Mixtral-Experiment", "provenance": [] } }, "nbformat_minor": 0, "nbformat": 4, "cells": [ { "cell_type": "code", "source": [ "from huggingface_hub import login\n", "\n", "# Fetch Hugging Face username and token from Colab secrets\n", "HF_USERNAME = \"pritam3355\"\n", "HF_TOKEN = \"HF_TOKEN\"\n", "\n", "# Login to Hugging Face\n", "try:\n", " login(token=HF_TOKEN)\n", "except ValueError:\n", " # If token is not valid or found, login with username and token\n", " # (likely requires manual authorization)\n", " login(username=HF_USERNAME, token=HF_TOKEN)" ], "metadata": { "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "trusted": true, "execution": { "iopub.status.busy": "2024-10-18T18:20:43.271188Z", "iopub.execute_input": "2024-10-18T18:20:43.271841Z", "iopub.status.idle": "2024-10-18T18:20:43.353056Z", "shell.execute_reply.started": "2024-10-18T18:20:43.271801Z", "shell.execute_reply": "2024-10-18T18:20:43.35218Z" }, "id": "H5JWFz2XAAak", "outputId": "af45db86-89f6-4349-c2d9-15d969f3d3f2" }, "outputs": [ { "name": "stdout", "text": "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\nToken is valid (permission: fineGrained).\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n", "output_type": "stream" } ], "execution_count": null }, { "cell_type": "markdown", "source": [ "Here Using Mixtral model to extract and analyze how the input sequence is processed in Forward pass. Mixtral is similar to Mistal model but has more parameters" ], "metadata": { "id": "bLD_CkBUAQMy" } }, { "cell_type": "code", "source": [ "import torch\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model_id = \"mistralai/Mixtral-8x7B-v0.1\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_id,device=\"auto\")\n", "\n", "model = AutoModelForCausalLM.from_pretrained(model_id,trust_remote_code=True,\n", " torch_dtype=torch.bfloat16,\n", " low_cpu_mem_usage=True,device_map=\"auto\")\n" ], "metadata": { "trusted": true, "execution": { "iopub.status.busy": "2024-10-18T18:20:43.354757Z", "iopub.execute_input": "2024-10-18T18:20:43.355493Z", "iopub.status.idle": "2024-10-18T18:30:40.651163Z", "shell.execute_reply.started": "2024-10-18T18:20:43.355448Z", "shell.execute_reply": "2024-10-18T18:30:40.650377Z" }, "id": "E3nm2iWcAAaq", "outputId": "82d9a608-7eff-4578-b328-f9f773ed4f39", "colab": { "referenced_widgets": [ "fa5c2b7f05bc412993098a3731e72989", "e64a4b6de34d4f40b88305ce507e3658", "4675ed906a964735b4334458935ab4b9", "e4f24bafae8f4397b76818a34ca9d6e4", "3621e97c28544d34ab3953c22d227cd0", "dd02aa16c10b4ab78373aa3dae939489", "44e75ecc95b74f03a7a58e6ea21165c1", "6d26de44c0334077b6c14104747a48ad", "57c7fa8051a94bcb96c0309651ab8298", "b736720173fd4ba5bbe54cbcc1177423", "368fe041fff84949ac30d3d45ac78a0d", "79ff492b16e946c8a6238d31b181ffc8", "2a12b5905b434c11beaaceaf7e1a6394", "9f16b85fde7148b7931c30fb024c87d5", "f0bae3fc9925442e82d58ecd7a305808", "2181a83c39114bc78b1e4859b3ccdfed", "14ad494e78084d8983bc6c0751f9d941", "280600190e10484db98261256542f236", "562e9f5c0d0d4228b218553019e483b6", "cc6675e71cea4018b6adff29d60f0a82", "39633f760e104265b1ddc2bcb3e4961d", "64288ea1c3074a528339b9d0f9729d18", "584114fa6b554a1495f6aa14011e0cc6", "2756416bfbcf474c94c1ca2ab4b7d8e3", "8c6e4f33682040feb42c1385c66b7ba2", "68cc9722525c46328cf963c2a4f2740a", "06367bbf0c094ba1bc7d481fb1bfc3f9", "1434b26ed3b4449b8fd6a76e0f1e5c97" ] } }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": "tokenizer_config.json: 0%| | 0.00/967 [00:00" }, "metadata": {} } ], "execution_count": null }, { "cell_type": "markdown", "source": [ "## Forward Pass" ], "metadata": { "id": "vORmhXXTAAa3" } }, { "cell_type": "code", "source": [ "input_text = \"The quick brown fox jumps over the lazy dog !\"\n", "\n", "# Tokenize the input text\n", "inputs = tokenizer(input_text, return_tensors=\"pt\")\n", "print(\"Tokenized inputs {'input_ids','attention_mask'} - \",inputs)\n", "print(\"Decoded tokens : \",tokenizer.decode(inputs['input_ids'][0]))" ], "metadata": { "trusted": true, "execution": { "iopub.status.busy": "2024-10-18T18:30:40.706383Z", "iopub.execute_input": "2024-10-18T18:30:40.706895Z", "iopub.status.idle": "2024-10-18T18:30:40.728093Z", "shell.execute_reply.started": "2024-10-18T18:30:40.706863Z", "shell.execute_reply": "2024-10-18T18:30:40.727243Z" }, "id": "dcC4RjNTAAa4", "outputId": "f0d4c2e3-b7f6-471d-9b9b-ce5316c47431" }, "outputs": [ { "name": "stdout", "text": "Tokenized inputs {'input_ids','attention_mask'} - {'input_ids': tensor([[ 1, 415, 2936, 9060, 285, 1142, 461, 10575, 754, 272,\n 17898, 3914, 918]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\nDecoded tokens : The quick brown fox jumps over the lazy dog !\n", "output_type": "stream" } ], "execution_count": null }, { "cell_type": "code", "source": [ "\n", "with torch.no_grad():\n", " model_output = model(**inputs)" ], "metadata": { "trusted": true, "execution": { "iopub.status.busy": "2024-10-18T18:30:40.729287Z", "iopub.execute_input": "2024-10-18T18:30:40.729872Z", "iopub.status.idle": "2024-10-18T18:36:43.660892Z", "shell.execute_reply.started": "2024-10-18T18:30:40.72983Z", "shell.execute_reply": "2024-10-18T18:36:43.660087Z" }, "id": "4x2A5-m-AAa6", "outputId": "d0fc43d2-1229-4582-d4d3-6b5f745be24e" }, "outputs": [ { "name": "stderr", "text": "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)\n", "output_type": "stream" } ], "execution_count": null }, { "cell_type": "code", "source": [ "for layer, output in outputs.items():\n", " print(f\"Output at {layer}: \")\n", " if isinstance(output, torch.Tensor):\n", " print(output.shape, type(output))\n", " elif isinstance(output, tuple):\n", " for i, o in enumerate(output):\n", " print(f\"Output {i}: {o.shape if isinstance(o, torch.Tensor) else type(o)}\")\n", " else:\n", " print(type(output))\n", " print(\"-\" * 100)" ], "metadata": { "trusted": true, "execution": { "iopub.status.busy": "2024-10-18T18:56:11.28238Z", "iopub.execute_input": "2024-10-18T18:56:11.283252Z", "iopub.status.idle": "2024-10-18T18:56:11.291437Z", "shell.execute_reply.started": "2024-10-18T18:56:11.283214Z", "shell.execute_reply": "2024-10-18T18:56:11.290478Z" }, "id": "xVuaYV3pAAa7", "outputId": "84e2f36e-0f10-4be1-9fdd-581fe61fabb1" }, "outputs": [ { "name": "stdout", "text": "Output at embed_tokens: \ntorch.Size([1, 13, 4096]) \n----------------------------------------------------------------------------------------------------\nOutput at self_attn_layer_1: \nOutput 0: torch.Size([1, 13, 4096])\nOutput 1: \nOutput 2: \n----------------------------------------------------------------------------------------------------\nOutput at block_sparse_moe_experts: \ntorch.Size([3, 4096]) \n----------------------------------------------------------------------------------------------------\nOutput at post_attention_layernorm: \ntorch.Size([1, 13, 4096]) \n----------------------------------------------------------------------------------------------------\nOutput at norm: \ntorch.Size([1, 13, 4096]) \n----------------------------------------------------------------------------------------------------\nOutput at lm_head: \ntorch.Size([1, 13, 32000]) \n----------------------------------------------------------------------------------------------------\nOutput at input_layernorm: \ntorch.Size([1, 13, 4096]) \n----------------------------------------------------------------------------------------------------\nOutput at self_attn_q_proj: \ntorch.Size([1, 13, 4096]) \n----------------------------------------------------------------------------------------------------\nOutput at self_attn_k_proj: \ntorch.Size([1, 13, 1024]) \n----------------------------------------------------------------------------------------------------\nOutput at self_attn_v_proj: \ntorch.Size([1, 13, 1024]) \n----------------------------------------------------------------------------------------------------\nOutput at self_attn_o_proj: \ntorch.Size([1, 13, 4096]) \n----------------------------------------------------------------------------------------------------\nOutput at block_sparse_moe_gate: \ntorch.Size([13, 8]) \n----------------------------------------------------------------------------------------------------\n", "output_type": "stream" } ], "execution_count": null }, { "cell_type": "markdown", "source": [ "\n", "\n", "### Explanation of Shapes:\n", "\n", "### 1. **embed_tokens**\n", "- **Shape:** `torch.Size([1, 13, 4096])`\n", "- **Explanation:**\n", " - `1`: Batch size (number of sequences in this batch, here it's 1 sequence).\n", " - `13`: Sequence length (the number of tokens in the input, here 13 tokens).\n", " - `4096`: Embedding size (each token is mapped to a 4096-dimensional vector).\n", " \n", " **Shape Format:** `(batch_size, seq_len, embed_dim)`\n", "\n", "### 2. **self_attn_layer_1**\n", "- **Shape (Output 0):** `torch.Size([1, 13, 4096])`\n", "- **Explanation:**\n", " - `1`: Batch size.\n", " - `13`: Sequence length.\n", " - `4096`: Hidden size (output of the attention mechanism).\n", "\n", " **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n", "\n", "### 3. **block_sparse_moe_experts**\n", "- **Shape:** `torch.Size([3, 4096])`\n", "- **Explanation:**\n", " - `3`: Number of activated experts for the MoE (Mixture of Experts) layer. In the `block_sparse_moe`, two experts are chosen per token (2 tokens in the batch may activate the same experts, hence the 3).\n", " - `4096`: Expert embedding size (the dimensionality of the expert's output).\n", " \n", " **Shape Format:** `(num_experts, expert_embed_dim)`\n", "\n", "### 4. **post_attention_layernorm**\n", "- **Shape:** `torch.Size([1, 13, 4096])`\n", "- **Explanation:**\n", " - `1`: Batch size.\n", " - `13`: Sequence length.\n", " - `4096`: Output dimension after the post-attention normalization step.\n", " \n", " **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n", "\n", "### 5. **norm**\n", "- **Shape:** `torch.Size([1, 13, 4096])`\n", "- **Explanation:**\n", " - `1`: Batch size.\n", " - `13`: Sequence length.\n", " - `4096`: Output dimension after applying the final normalization layer.\n", " \n", " **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n", "\n", "### 6. **lm_head**\n", "- **Shape:** `torch.Size([1, 13, 32000])`\n", "- **Explanation:**\n", " - `1`: Batch size.\n", " - `13`: Sequence length.\n", " - `32000`: Vocabulary size (logits over the vocabulary for each token in the sequence).\n", " \n", " **Shape Format:** `(batch_size, seq_len, vocab_size)`\n", "\n", "### 7. **input_layernorm**\n", "- **Shape:** `torch.Size([1, 13, 4096])`\n", "- **Explanation:**\n", " - `1`: Batch size.\n", " - `13`: Sequence length.\n", " - `4096`: Output dimension after the input layer normalization step.\n", " \n", " **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n", "\n", "### 8. **self_attn_q_proj**\n", "- **Shape:** `torch.Size([1, 13, 4096])`\n", "- **Explanation:**\n", " - `1`: Batch size.\n", " - `13`: Sequence length.\n", " - `4096`: Query projection size (the hidden state is projected to the query vector space).\n", " \n", " **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n", "\n", "### 9. **self_attn_k_proj**\n", "- **Shape:** `torch.Size([1, 13, 1024])`\n", "- **Explanation:**\n", " - `1`: Batch size.\n", " - `13`: Sequence length.\n", " - `1024`: Key projection size (here, the key is projected to a smaller dimensional space compared to queries/values).\n", " \n", " **Shape Format:** `(batch_size, seq_len, key_dim)`\n", "\n", "### 10. **self_attn_v_proj**\n", "- **Shape:** `torch.Size([1, 13, 1024])`\n", "- **Explanation:**\n", " - `1`: Batch size.\n", " - `13`: Sequence length.\n", " - `1024`: Value projection size (the values are also projected to the same size as the keys).\n", " \n", " **Shape Format:** `(batch_size, seq_len, value_dim)`\n", "\n", "### 11. **self_attn_o_proj**\n", "- **Shape:** `torch.Size([1, 13, 4096])`\n", "- **Explanation:**\n", " - `1`: Batch size.\n", " - `13`: Sequence length.\n", " - `4096`: Output projection size (the final result after the attention mechanism is projected back to the original hidden dimension).\n", " \n", " **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n", "\n", "### 12. **block_sparse_moe_gate**\n", "- **Shape:** `torch.Size([13, 8])`\n", "- **Explanation:**\n", " - `13`: Sequence length (the gate operates per token).\n", " - `8`: Number of experts (gating decisions are made over all available experts).\n", " \n", " **Shape Format:** `(seq_len, num_experts)`\n", "\n", "### Summary Table:\n", "\n", "| Layer Name | Shape Format | Dimensions | Notes |\n", "|------------------------------|------------------------------------|------------|------------------------------------------------------------|\n", "| `embed_tokens` | `(batch_size, seq_len, embed_dim)` | `[1, 13, 4096]` | Embedding tokens from vocabulary. |\n", "| `self_attn_layer_1` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Output of first attention layer. |\n", "| `block_sparse_moe_experts` | `(num_experts, expert_embed_dim)` | `[3, 4096]` | Expert outputs in MoE block. |\n", "| `post_attention_layernorm` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Layer norm after attention. |\n", "| `norm` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Final normalization layer. |\n", "| `lm_head` | `(batch_size, seq_len, vocab_size)`| `[1, 13, 32000]` | Logits for each token over the vocabulary. |\n", "| `input_layernorm` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Input layer normalization. |\n", "| `self_attn_q_proj` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Query projection in self-attention. |\n", "| `self_attn_k_proj` | `(batch_size, seq_len, key_dim)` | `[1, 13, 1024]` | Key projection in self-attention. |\n", "| `self_attn_v_proj` | `(batch_size, seq_len, value_dim)` | `[1, 13, 1024]` | Value projection in self-attention. |\n", "| `self_attn_o_proj` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Output projection after attention. |\n", "| `block_sparse_moe_gate` | `(seq_len, num_experts)` | `[13, 8]` | Gating decisions for the mixture of experts. |\n", "\n" ], "metadata": { "id": "FyugDOzXAAa8" } } ] }