From 998eed43a3938bf6c5b2b7bd1f2496095249754c Mon Sep 17 00:00:00 2001
From: Pritam Das <69068731+Pritam3355@users.noreply.github.com>
Date: Sat, 19 Oct 2024 01:08:56 +0530
Subject: [PATCH] Add files via upload

---
 llm_experiments/Mixtral_Experiment.ipynb | 965 +++++++++++++++++++++++
 1 file changed, 965 insertions(+)
 create mode 100644 llm_experiments/Mixtral_Experiment.ipynb

diff --git a/llm_experiments/Mixtral_Experiment.ipynb b/llm_experiments/Mixtral_Experiment.ipynb
new file mode 100644
index 000000000..4903471af
--- /dev/null
+++ b/llm_experiments/Mixtral_Experiment.ipynb
@@ -0,0 +1,965 @@
+{
+ "metadata": {
+  "kernelspec": {
+   "language": "python",
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.14",
+   "mimetype": "text/x-python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "pygments_lexer": "ipython3",
+   "nbconvert_exporter": "python",
+   "file_extension": ".py"
+  },
+  "kaggle": {
+   "accelerator": "nvidiaTeslaT4",
+   "dataSources": [],
+   "dockerImageVersionId": 30787,
+   "isInternetEnabled": true,
+   "language": "python",
+   "sourceType": "notebook",
+   "isGpuEnabled": true
+  },
+  "colab": {
+   "name": "Mixtral-Experiment",
+   "provenance": []
+  }
+ },
+ "nbformat_minor": 0,
+ "nbformat": 4,
+ "cells": [
+  {
+   "cell_type": "code",
+   "source": [
+    "from huggingface_hub import login\n",
+    "\n",
+    "# Hugging Face credentials (placeholder token; replace with your own,\n",
+    "# or better, load it from a Colab/Kaggle secret instead of hardcoding it)\n",
+    "HF_USERNAME = \"pritam3355\"\n",
+    "HF_TOKEN = \"HF_TOKEN\"\n",
+    "\n",
+    "# Login to Hugging Face\n",
+    "try:\n",
+    "    login(token=HF_TOKEN)\n",
+    "except ValueError:\n",
+    "    # If the token is invalid or missing, fall back to an interactive\n",
+    "    # prompt (login() without a token asks for one)\n",
+    "    login()"
+   ],
+   "metadata": {
+    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
+    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
+    "trusted": true,
+    "execution": {
+     "iopub.status.busy": "2024-10-18T18:20:43.271188Z",
+     "iopub.execute_input": "2024-10-18T18:20:43.271841Z",
+     "iopub.status.idle": "2024-10-18T18:20:43.353056Z",
+     "shell.execute_reply.started": "2024-10-18T18:20:43.271801Z",
+     "shell.execute_reply": "2024-10-18T18:20:43.35218Z"
+    },
+    "id": "H5JWFz2XAAak",
+    "outputId": "af45db86-89f6-4349-c2d9-15d969f3d3f2"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "text": "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\nToken is valid (permission: fineGrained).\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n",
+     "output_type": "stream"
+    }
+   ],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Here we use a Mixtral model to trace how an input sequence is processed during the forward pass. Mixtral shares the Mistral architecture but replaces each feed-forward block with a sparse mixture-of-experts (MoE) layer, which is where its much larger parameter count comes from."
+   ],
+   "metadata": {
+    "id": "bLD_CkBUAQMy"
+   }
+  },
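+  {
+   "cell_type": "markdown",
+   "source": [
+    "As a quick sanity check before loading the weights, we can print the config fields that determine the activation shapes seen later in the forward pass. The attribute names below follow `transformers`' `MixtralConfig`; the commented values are what Mixtral-8x7B-v0.1 reports."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "from transformers import AutoConfig\n",
+    "\n",
+    "# Architecture hyperparameters that determine the activation shapes\n",
+    "config = AutoConfig.from_pretrained('mistralai/Mixtral-8x7B-v0.1')\n",
+    "print('hidden size       :', config.hidden_size)          # 4096\n",
+    "print('attention heads   :', config.num_attention_heads)  # 32\n",
+    "print('key/value heads   :', config.num_key_value_heads)  # 8 (grouped-query attention)\n",
+    "print('experts per layer :', config.num_local_experts)    # 8\n",
+    "print('experts per token :', config.num_experts_per_tok)  # 2 (top-2 routing)\n",
+    "print('vocab size        :', config.vocab_size)           # 32000"
+   ],
+   "metadata": {},
+   "outputs": [],
+   "execution_count": null
+  },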
+  {
+   "cell_type": "code",
+   "source": [
+    "import torch\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "\n",
+    "model_id = \"mistralai/Mixtral-8x7B-v0.1\"\n",
+    "\n",
+    "# Tokenizers run on the CPU and take no device argument\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+    "\n",
+    "# Load the weights in bfloat16 and let accelerate spread them\n",
+    "# across the available devices\n",
+    "model = AutoModelForCausalLM.from_pretrained(model_id,\n",
+    "                                             trust_remote_code=True,\n",
+    "                                             torch_dtype=torch.bfloat16,\n",
+    "                                             low_cpu_mem_usage=True,\n",
+    "                                             device_map=\"auto\")\n"
+   ],
+   "metadata": {
+    "trusted": true,
+    "execution": {
+     "iopub.status.busy": "2024-10-18T18:20:43.354757Z",
+     "iopub.execute_input": "2024-10-18T18:20:43.355493Z",
+     "iopub.status.idle": "2024-10-18T18:30:40.651163Z",
+     "shell.execute_reply.started": "2024-10-18T18:20:43.355448Z",
+     "shell.execute_reply": "2024-10-18T18:30:40.650377Z"
+    },
+    "id": "E3nm2iWcAAaq",
+    "outputId": "82d9a608-7eff-4578-b328-f9f773ed4f39",
+    "colab": {
+     "referenced_widgets": [
+      "fa5c2b7f05bc412993098a3731e72989",
+      "e64a4b6de34d4f40b88305ce507e3658",
+      "4675ed906a964735b4334458935ab4b9",
+      "e4f24bafae8f4397b76818a34ca9d6e4",
+      "3621e97c28544d34ab3953c22d227cd0",
+      "dd02aa16c10b4ab78373aa3dae939489",
+      "44e75ecc95b74f03a7a58e6ea21165c1",
+      "6d26de44c0334077b6c14104747a48ad",
+      "57c7fa8051a94bcb96c0309651ab8298",
+      "b736720173fd4ba5bbe54cbcc1177423",
+      "368fe041fff84949ac30d3d45ac78a0d",
+      "79ff492b16e946c8a6238d31b181ffc8",
+      "2a12b5905b434c11beaaceaf7e1a6394",
+      "9f16b85fde7148b7931c30fb024c87d5",
+      "f0bae3fc9925442e82d58ecd7a305808",
+      "2181a83c39114bc78b1e4859b3ccdfed",
+      "14ad494e78084d8983bc6c0751f9d941",
+      "280600190e10484db98261256542f236",
+      "562e9f5c0d0d4228b218553019e483b6",
+      "cc6675e71cea4018b6adff29d60f0a82",
+      "39633f760e104265b1ddc2bcb3e4961d",
+      "64288ea1c3074a528339b9d0f9729d18",
+      "584114fa6b554a1495f6aa14011e0cc6",
+      "2756416bfbcf474c94c1ca2ab4b7d8e3",
+      "8c6e4f33682040feb42c1385c66b7ba2",
+      "68cc9722525c46328cf963c2a4f2740a",
+      "06367bbf0c094ba1bc7d481fb1bfc3f9",
+      "1434b26ed3b4449b8fd6a76e0f1e5c97"
+     ]
+    }
+   },
+   "outputs": [
+    {
+     "output_type": "display_data",
+     "data": {
+      "text/plain": "tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]"
+     },
+     "metadata": {}
+    }
+   ],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Forward Pass"
+   ],
+   "metadata": {
+    "id": "vORmhXXTAAa3"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "input_text = \"The quick brown fox jumps over the lazy dog !\"\n",
+    "\n",
+    "# Tokenize the input text\n",
+    "inputs = tokenizer(input_text, return_tensors=\"pt\")\n",
+    "print(\"Tokenized inputs {'input_ids','attention_mask'} - \", inputs)\n",
+    "print(\"Decoded tokens : \", tokenizer.decode(inputs['input_ids'][0]))"
+   ],
+   "metadata": {
+    "trusted": true,
+    "execution": {
+     "iopub.status.busy": "2024-10-18T18:30:40.706383Z",
+     "iopub.execute_input": "2024-10-18T18:30:40.706895Z",
+     "iopub.status.idle": "2024-10-18T18:30:40.728093Z",
+     "shell.execute_reply.started": "2024-10-18T18:30:40.706863Z",
+     "shell.execute_reply": "2024-10-18T18:30:40.727243Z"
+    },
+    "id": "dcC4RjNTAAa4",
+    "outputId": "f0d4c2e3-b7f6-471d-9b9b-ce5316c47431"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "text": "Tokenized inputs {'input_ids','attention_mask'} -  {'input_ids': tensor([[ 1, 415, 2936, 9060, 285, 1142, 461, 10575, 754, 272,\n 17898, 3914, 918]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\nDecoded tokens :  <s> The quick brown fox jumps over the lazy dog !\n",
+     "output_type": "stream"
+    }
+   ],
+   "execution_count": null
+  },
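+  {
+   "cell_type": "markdown",
+   "source": [
+    "The loop further below iterates over an `outputs` dict that maps module names to the activations captured during the forward pass. A minimal sketch of how to populate it: register PyTorch forward hooks on a representative set of submodules. The module paths follow `transformers`' `MixtralForCausalLM`; the `save_to` helper is a local convenience, and only decoder layer 0 plus the model-level embedding, final norm, and LM head are tapped."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# Capture intermediate activations with forward hooks.\n",
+    "# Module paths follow transformers' MixtralForCausalLM.\n",
+    "outputs = {}\n",
+    "\n",
+    "def save_to(name):\n",
+    "    def hook(module, args, output):\n",
+    "        outputs[name] = output\n",
+    "    return hook\n",
+    "\n",
+    "layer0 = model.model.layers[0]\n",
+    "hooks = [\n",
+    "    model.model.embed_tokens.register_forward_hook(save_to('embed_tokens')),\n",
+    "    layer0.input_layernorm.register_forward_hook(save_to('input_layernorm')),\n",
+    "    layer0.self_attn.q_proj.register_forward_hook(save_to('self_attn_q_proj')),\n",
+    "    layer0.self_attn.k_proj.register_forward_hook(save_to('self_attn_k_proj')),\n",
+    "    layer0.self_attn.v_proj.register_forward_hook(save_to('self_attn_v_proj')),\n",
+    "    layer0.self_attn.o_proj.register_forward_hook(save_to('self_attn_o_proj')),\n",
+    "    layer0.self_attn.register_forward_hook(save_to('self_attn_layer_1')),\n",
+    "    layer0.post_attention_layernorm.register_forward_hook(save_to('post_attention_layernorm')),\n",
+    "    layer0.block_sparse_moe.gate.register_forward_hook(save_to('block_sparse_moe_gate')),\n",
+    "    # experts[0] only sees the tokens the router assigns to it\n",
+    "    layer0.block_sparse_moe.experts[0].register_forward_hook(save_to('block_sparse_moe_experts')),\n",
+    "    model.model.norm.register_forward_hook(save_to('norm')),\n",
+    "    model.lm_head.register_forward_hook(save_to('lm_head')),\n",
+    "]\n",
+    "# Call h.remove() on each handle when the analysis is done"
+   ],
+   "metadata": {},
+   "outputs": [],
+   "execution_count": null
+  },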
+  {
+   "cell_type": "code",
+   "source": [
+    "# One forward pass; the hooks above record each module's output\n",
+    "with torch.no_grad():\n",
+    "    model_output = model(**inputs)"
+   ],
+   "metadata": {
+    "trusted": true,
+    "execution": {
+     "iopub.status.busy": "2024-10-18T18:30:40.729287Z",
+     "iopub.execute_input": "2024-10-18T18:30:40.729872Z",
+     "iopub.status.idle": "2024-10-18T18:36:43.660892Z",
+     "shell.execute_reply.started": "2024-10-18T18:30:40.72983Z",
+     "shell.execute_reply": "2024-10-18T18:36:43.660087Z"
+    },
+    "id": "4x2A5-m-AAa6",
+    "outputId": "d0fc43d2-1229-4582-d4d3-6b5f745be24e"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "text": "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)\n",
+     "output_type": "stream"
+    }
+   ],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "for layer, output in outputs.items():\n",
+    "    print(f\"Output at {layer}: \")\n",
+    "    if isinstance(output, torch.Tensor):\n",
+    "        print(output.shape, type(output))\n",
+    "    elif isinstance(output, tuple):\n",
+    "        for i, o in enumerate(output):\n",
+    "            print(f\"Output {i}: {o.shape if isinstance(o, torch.Tensor) else type(o)}\")\n",
+    "    else:\n",
+    "        print(type(output))\n",
+    "    print(\"-\" * 100)"
+   ],
+   "metadata": {
+    "trusted": true,
+    "execution": {
+     "iopub.status.busy": "2024-10-18T18:56:11.28238Z",
+     "iopub.execute_input": "2024-10-18T18:56:11.283252Z",
+     "iopub.status.idle": "2024-10-18T18:56:11.291437Z",
+     "shell.execute_reply.started": "2024-10-18T18:56:11.283214Z",
+     "shell.execute_reply": "2024-10-18T18:56:11.290478Z"
+    },
+    "id": "xVuaYV3pAAa7",
+    "outputId": "84e2f36e-0f10-4be1-9fdd-581fe61fabb1"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "text": "Output at embed_tokens: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_layer_1: \nOutput 0: torch.Size([1, 13, 4096])\nOutput 1: <class 'NoneType'>\nOutput 2: <class 'transformers.cache_utils.DynamicCache'>\n----------------------------------------------------------------------------------------------------\nOutput at block_sparse_moe_experts: \ntorch.Size([3, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at post_attention_layernorm: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at norm: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at lm_head: \ntorch.Size([1, 13, 32000]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at input_layernorm: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_q_proj: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_k_proj: \ntorch.Size([1, 13, 1024]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_v_proj: \ntorch.Size([1, 13, 1024]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_o_proj: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at block_sparse_moe_gate: \ntorch.Size([13, 8]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\n",
+     "output_type": "stream"
+    }
+   ],
+   "execution_count": null
+  },
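+  {
+   "cell_type": "markdown",
+   "source": [
+    "A closer look at the router: `block_sparse_moe_gate` holds one row of 8 expert logits per token. The sketch below (using the `outputs` dict and `inputs` from above) reproduces Mixtral's top-2 routing (softmax over the experts, keep the best two, renormalize) to show which experts each token was sent to. This is also why the hooked expert above only saw 3 of the 13 tokens."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "import torch.nn.functional as F\n",
+    "\n",
+    "# Router logits from layer 0: one row of 8 expert scores per token\n",
+    "gate_logits = outputs['block_sparse_moe_gate']            # [13, 8]\n",
+    "routing_weights = F.softmax(gate_logits.float(), dim=-1)\n",
+    "\n",
+    "# Keep the top-2 experts per token and renormalize their weights,\n",
+    "# mirroring MixtralSparseMoeBlock\n",
+    "topk_weights, topk_experts = torch.topk(routing_weights, k=2, dim=-1)\n",
+    "topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)\n",
+    "\n",
+    "tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())\n",
+    "for tok, experts, weights in zip(tokens, topk_experts.tolist(), topk_weights.tolist()):\n",
+    "    print(f'{tok:>10} -> experts {experts}, weights {[round(w, 2) for w in weights]}')"
+   ],
+   "metadata": {},
+   "outputs": [],
+   "execution_count": null
+  },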
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Explanation of Shapes:\n",
+    "\n",
+    "### 1. **embed_tokens**\n",
+    "- **Shape:** `torch.Size([1, 13, 4096])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size (number of sequences in the batch; here a single sequence).\n",
+    "  - `13`: Sequence length (the number of tokens in the input).\n",
+    "  - `4096`: Embedding size (each token is mapped to a 4096-dimensional vector).\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, embed_dim)`\n",
+    "\n",
+    "### 2. **self_attn_layer_1**\n",
+    "- **Shape (Output 0):** `torch.Size([1, 13, 4096])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size.\n",
+    "  - `13`: Sequence length.\n",
+    "  - `4096`: Hidden size (output of the attention mechanism). The attention module also returns the attention weights (`None` unless `output_attentions=True`) and the key/value cache, which is why Outputs 1 and 2 are not tensors.\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
+    "\n",
+    "### 3. **block_sparse_moe_experts**\n",
+    "- **Shape:** `torch.Size([3, 4096])`\n",
+    "- **Explanation:**\n",
+    "  - `3`: Number of tokens routed to the hooked expert. The router assigns each of the 13 tokens to 2 of the 8 experts, so any single expert only processes the subset of tokens sent to it (here, 3 tokens).\n",
+    "  - `4096`: Hidden size of the expert's output (the model's hidden dimension).\n",
+    "\n",
+    "  **Shape Format:** `(num_routed_tokens, hidden_dim)`\n",
+    "\n",
+    "### 4. **post_attention_layernorm**\n",
+    "- **Shape:** `torch.Size([1, 13, 4096])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size.\n",
+    "  - `13`: Sequence length.\n",
+    "  - `4096`: Output dimension after the post-attention normalization step.\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
+    "\n",
+    "### 5. **norm**\n",
+    "- **Shape:** `torch.Size([1, 13, 4096])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size.\n",
+    "  - `13`: Sequence length.\n",
+    "  - `4096`: Output dimension after applying the final normalization layer.\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
+    "\n",
+    "### 6. **lm_head**\n",
+    "- **Shape:** `torch.Size([1, 13, 32000])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size.\n",
+    "  - `13`: Sequence length.\n",
+    "  - `32000`: Vocabulary size (logits over the vocabulary for each token in the sequence).\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, vocab_size)`\n",
+    "\n",
+    "### 7. **input_layernorm**\n",
+    "- **Shape:** `torch.Size([1, 13, 4096])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size.\n",
+    "  - `13`: Sequence length.\n",
+    "  - `4096`: Output dimension after the input layer normalization step.\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
+    "\n",
+    "### 8. **self_attn_q_proj**\n",
+    "- **Shape:** `torch.Size([1, 13, 4096])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size.\n",
+    "  - `13`: Sequence length.\n",
+    "  - `4096`: Query projection size (32 query heads x 128 dimensions each).\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
+    "\n",
+    "### 9. **self_attn_k_proj**\n",
+    "- **Shape:** `torch.Size([1, 13, 1024])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size.\n",
+    "  - `13`: Sequence length.\n",
+    "  - `1024`: Key projection size. Mixtral uses grouped-query attention: 8 key/value heads x 128 dimensions each = 1024, versus 32 query heads x 128 = 4096 for the queries.\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, key_dim)`\n",
+    "\n",
+    "### 10. **self_attn_v_proj**\n",
+    "- **Shape:** `torch.Size([1, 13, 1024])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size.\n",
+    "  - `13`: Sequence length.\n",
+    "  - `1024`: Value projection size (the values share the same 8-head grouped-query layout as the keys).\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, value_dim)`\n",
+    "\n",
+    "### 11. **self_attn_o_proj**\n",
+    "- **Shape:** `torch.Size([1, 13, 4096])`\n",
+    "- **Explanation:**\n",
+    "  - `1`: Batch size.\n",
+    "  - `13`: Sequence length.\n",
+    "  - `4096`: Output projection size (the attention result is projected back to the original hidden dimension).\n",
+    "\n",
+    "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
+    "\n",
+    "### 12. **block_sparse_moe_gate**\n",
+    "- **Shape:** `torch.Size([13, 8])`\n",
+    "- **Explanation:**\n",
+    "  - `13`: Number of tokens (the router flattens the batch and sequence dimensions, so 1 x 13 = 13 rows).\n",
+    "  - `8`: Number of experts (the gate produces one routing logit per expert).\n",
+    "\n",
+    "  **Shape Format:** `(num_tokens, num_experts)`\n",
+    "\n",
+    "### Summary Table:\n",
+    "\n",
+    "| Layer Name | Shape Format | Dimensions | Notes |\n",
+    "|------------------------------|------------------------------------|------------|------------------------------------------------------------|\n",
+    "| `embed_tokens` | `(batch_size, seq_len, embed_dim)` | `[1, 13, 4096]` | Token embeddings from the vocabulary. |\n",
+    "| `self_attn_layer_1` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Output of the first attention layer. |\n",
+    "| `block_sparse_moe_experts` | `(num_routed_tokens, hidden_dim)` | `[3, 4096]` | Output of the hooked expert (3 of 13 tokens routed to it). |\n",
+    "| `post_attention_layernorm` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Layer norm after attention. |\n",
+    "| `norm` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Final normalization layer. |\n",
+    "| `lm_head` | `(batch_size, seq_len, vocab_size)`| `[1, 13, 32000]` | Logits for each token over the vocabulary. |\n",
+    "| `input_layernorm` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Input layer normalization. |\n",
+    "| `self_attn_q_proj` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Query projection (32 heads x 128). |\n",
+    "| `self_attn_k_proj` | `(batch_size, seq_len, key_dim)` | `[1, 13, 1024]` | Key projection (8 KV heads x 128, grouped-query attention). |\n",
+    "| `self_attn_v_proj` | `(batch_size, seq_len, value_dim)` | `[1, 13, 1024]` | Value projection (same grouped layout as the keys). |\n",
+    "| `self_attn_o_proj` | `(batch_size, seq_len, hidden_dim)`| `[1, 13, 4096]` | Output projection after attention. |\n",
+    "| `block_sparse_moe_gate` | `(num_tokens, num_experts)` | `[13, 8]` | Router logits over the 8 experts for each token. |\n",
+    "\n"
+   ],
+   "metadata": {
+    "id": "FyugDOzXAAa8"
+   }
+  },
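+  {
+   "cell_type": "markdown",
+   "source": [
+    "Finally, the `lm_head` logits are everything needed for next-token prediction. As a quick sanity check, the sketch below (using `model_output` from the forward pass above) greedily decodes the most likely continuation at the last position."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# Greedy next-token check from the captured logits: the last\n",
+    "# position scores every entry in the 32000-token vocabulary\n",
+    "logits = model_output.logits                  # [1, 13, 32000]\n",
+    "probs = logits[0, -1].float().softmax(dim=-1)\n",
+    "next_id = int(probs.argmax())\n",
+    "print('next token :', repr(tokenizer.decode([next_id])))\n",
+    "print('probability:', round(probs[next_id].item(), 4))"
+   ],
+   "metadata": {},
+   "outputs": [],
+   "execution_count": null
+  }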
+ ]
+}
\ No newline at end of file