mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-03-16 19:49:48 +00:00
Delete llm_experiments directory
This commit is contained in:
parent
998eed43a3
commit
7019bf4a7f
@@ -1,965 +0,0 @@
{
  "metadata": {
    "kernelspec": {
      "language": "python",
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.10.14",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "kaggle": {
      "accelerator": "nvidiaTeslaT4",
      "dataSources": [],
      "dockerImageVersionId": 30787,
      "isInternetEnabled": true,
      "language": "python",
      "sourceType": "notebook",
      "isGpuEnabled": true
    },
    "colab": {
      "name": "Mixtral-Experiment",
      "provenance": []
    }
  },
  "nbformat_minor": 0,
  "nbformat": 4,
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "from huggingface_hub import login\n",
        "\n",
        "# Hugging Face username and API token (placeholders here; in Colab/Kaggle,\n",
        "# fetch the real token from the secrets manager instead of hardcoding it)\n",
        "HF_USERNAME = \"pritam3355\"\n",
        "HF_TOKEN = \"HF_TOKEN\"\n",
        "\n",
        "# Login to Hugging Face\n",
        "try:\n",
        "    login(token=HF_TOKEN)\n",
        "except ValueError:\n",
        "    # If the token is invalid or missing, fall back to an interactive prompt\n",
        "    # (`login` takes no username parameter)\n",
        "    login()"
      ],
      "metadata": {
        "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
        "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
        "trusted": true,
        "execution": {
          "iopub.status.busy": "2024-10-18T18:20:43.271188Z",
          "iopub.execute_input": "2024-10-18T18:20:43.271841Z",
          "iopub.status.idle": "2024-10-18T18:20:43.353056Z",
          "shell.execute_reply.started": "2024-10-18T18:20:43.271801Z",
          "shell.execute_reply": "2024-10-18T18:20:43.35218Z"
        },
        "id": "H5JWFz2XAAak",
        "outputId": "af45db86-89f6-4349-c2d9-15d969f3d3f2"
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\nToken is valid (permission: fineGrained).\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n",
          "output_type": "stream"
        }
      ],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "source": [
        "Here we use the Mixtral model to extract and analyze how the input sequence is processed in the forward pass. Mixtral is similar to the Mistral model but has more parameters, since each decoder layer carries eight experts."
      ],
      "metadata": {
        "id": "bLD_CkBUAQMy"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
        "\n",
        "model_id = \"mistralai/Mixtral-8x7B-v0.1\"\n",
        "# Note: AutoTokenizer.from_pretrained takes no `device` argument; tokenizers run on CPU\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
        "\n",
        "model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True,\n",
        "                                             torch_dtype=torch.bfloat16,\n",
        "                                             low_cpu_mem_usage=True, device_map=\"auto\")\n"
      ],
      "metadata": {
        "trusted": true,
        "execution": {
          "iopub.status.busy": "2024-10-18T18:20:43.354757Z",
          "iopub.execute_input": "2024-10-18T18:20:43.355493Z",
          "iopub.status.idle": "2024-10-18T18:30:40.651163Z",
          "shell.execute_reply.started": "2024-10-18T18:20:43.355448Z",
          "shell.execute_reply": "2024-10-18T18:30:40.650377Z"
        },
        "id": "E3nm2iWcAAaq",
        "outputId": "82d9a608-7eff-4578-b328-f9f773ed4f39"
      },
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": "[progress-bar widget outputs: downloads of tokenizer_config.json, tokenizer.model, tokenizer.json, special_tokens_map.json, config.json and model.safetensors.index.json, then 19 model shards (model-00001-of-00019.safetensors ... model-00019-of-00019.safetensors, ~4.2-5.0 GB each), 'Loading checkpoint shards: 0/19' and generation_config.json]"
          },
          "metadata": {}
        }
      ],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Mixtral Model Architecture\n",
        "\n",
        "```python\n",
        "MixtralForCausalLM(\n",
        "  (model): MixtralModel(\n",
        "    (embed_tokens): Embedding(32000, 4096)\n",
        "    (layers): ModuleList(\n",
        "      (0-31): 32 x MixtralDecoderLayer(\n",
        "        (self_attn): MixtralSdpaAttention(\n",
        "          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
        "          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
        "          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
        "          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
        "          (rotary_emb): MixtralRotaryEmbedding()\n",
        "        )\n",
        "        (block_sparse_moe): MixtralSparseMoeBlock(\n",
        "          (gate): Linear(in_features=4096, out_features=8, bias=False)\n",
        "          (experts): ModuleList(\n",
        "            (0-7): 8 x MixtralBlockSparseTop2MLP(\n",
        "              (w1): Linear(in_features=4096, out_features=14336, bias=False)\n",
        "              (w2): Linear(in_features=14336, out_features=4096, bias=False)\n",
        "              (w3): Linear(in_features=4096, out_features=14336, bias=False)\n",
        "              (act_fn): SiLU()\n",
        "            )\n",
        "          )\n",
        "        )\n",
        "        (input_layernorm): MixtralRMSNorm((4096,), eps=1e-05)\n",
        "        (post_attention_layernorm): MixtralRMSNorm((4096,), eps=1e-05)\n",
        "      )\n",
        "    )\n",
        "    (norm): MixtralRMSNorm((4096,), eps=1e-05)\n",
        "  )\n",
        "  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n",
        ")\n",
        "```"
      ],
      "metadata": {
        "id": "DaKAIgb6AAas"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Model Config\n",
        "\n",
        "```python\n",
        "MixtralConfig {\n",
        "  \"_name_or_path\": \"mistralai/Mixtral-8x7B-v0.1\",\n",
        "  \"architectures\": [\n",
        "    \"MixtralForCausalLM\"\n",
        "  ],\n",
        "  \"attention_dropout\": 0.0,\n",
        "  \"bos_token_id\": 1,\n",
        "  \"eos_token_id\": 2,\n",
        "  \"hidden_act\": \"silu\",\n",
        "  \"hidden_size\": 4096,\n",
        "  \"initializer_range\": 0.02,\n",
        "  \"intermediate_size\": 14336,\n",
        "  \"max_position_embeddings\": 32768,\n",
        "  \"model_type\": \"mixtral\",\n",
        "  \"num_attention_heads\": 32,\n",
        "  \"num_experts_per_tok\": 2,\n",
        "  \"num_hidden_layers\": 32,\n",
        "  \"num_key_value_heads\": 8,\n",
        "  \"num_local_experts\": 8,\n",
        "  \"output_router_logits\": false,\n",
        "  \"rms_norm_eps\": 1e-05,\n",
        "  \"rope_theta\": 1000000.0,\n",
        "  \"router_aux_loss_coef\": 0.02,\n",
        "  \"router_jitter_noise\": 0.0,\n",
        "  \"sliding_window\": null,\n",
        "  \"tie_word_embeddings\": false,\n",
        "  \"torch_dtype\": \"bfloat16\",\n",
        "  \"transformers_version\": \"4.45.1\",\n",
        "  \"use_cache\": true,\n",
        "  \"vocab_size\": 32000\n",
        "}\n",
        "```"
      ],
      "metadata": {
        "id": "tQtMnYC-AAav"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## The Most Important Parts of the Model\n",
        "\n",
        "1. **Embedding Layer**: converts token IDs to embeddings.\n",
        "2. **Self-Attention Layer**: performs the self-attention mechanism.\n",
        "3. **Block Sparse MoE Experts**: applies the Mixture of Experts (MoE) mechanism.\n",
        "4. **Post-Attention LayerNorm**: normalizes the output after the attention mechanism.\n",
        "5. **Final Norm Layer**: normalizes the final output of the model.\n",
        "6. **Language Model Head**: converts the final hidden states to logits.\n",
        "\n",
        "A conceptual sketch of how these pieces fit together follows below.\n"
      ],
      "metadata": {
        "id": "YLZWKqhxAAax"
      }
    },
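    {
      "cell_type": "markdown",
      "source": [
        "The sketch below is not from the original notebook; it is a simplified pseudocode view of one forward pass through the modules listed above (residual connections and top-2 routing follow the printed architecture; everything else is schematic):\n",
        "\n",
        "```python\n",
        "hidden = embed_tokens(input_ids)                    # (batch, seq_len, 4096)\n",
        "for layer in layers:                                # 32 decoder layers\n",
        "    attn_out = self_attn(input_layernorm(hidden))   # q/k/v/o projections inside\n",
        "    hidden = hidden + attn_out                      # residual connection\n",
        "    moe_out = block_sparse_moe(post_attention_layernorm(hidden))  # top-2 of 8 experts\n",
        "    hidden = hidden + moe_out                       # residual connection\n",
        "logits = lm_head(norm(hidden))                      # (batch, seq_len, 32000)\n",
        "```\n"
      ],
      "metadata": {}
    },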
    {
      "cell_type": "code",
      "source": [
        "total_params = sum(p.numel() for p in model.parameters())\n",
        "trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
        "\n",
        "print(f\"Total Parameters: {total_params}\")\n",
        "print(f\"Trainable Parameters: {trainable_params}\")\n",
        "print(f\"Non-Trainable Parameters: {total_params - trainable_params}\")"
      ],
      "metadata": {
        "trusted": true,
        "execution": {
          "iopub.status.busy": "2024-10-18T18:30:40.652378Z",
          "iopub.execute_input": "2024-10-18T18:30:40.652918Z",
          "iopub.status.idle": "2024-10-18T18:30:40.673546Z",
          "shell.execute_reply.started": "2024-10-18T18:30:40.652882Z",
          "shell.execute_reply": "2024-10-18T18:30:40.672605Z"
        },
        "id": "X_LcNEfDAAax",
        "outputId": "14a7e102-c12a-40d6-d9c2-da12acbbc4fb"
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "Total Parameters: 46702792704\nTrainable Parameters: 46702792704\nNon-Trainable Parameters: 0\n",
          "output_type": "stream"
        }
      ],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Define a dictionary to store the outputs\n"
      ],
      "metadata": {
        "id": "ajT_agN8AAay"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "outputs = {\n",
        "    \"embed_tokens\": None,\n",
        "    \"self_attn_layer_1\": None,\n",
        "    \"block_sparse_moe_experts\": None,\n",
        "    \"post_attention_layernorm\": None,\n",
        "    \"norm\": None,\n",
        "    \"lm_head\": None,\n",
        "    \"input_layernorm\": None,        # hook for input layernorm\n",
        "    \"self_attn_q_proj\": None,       # hook for q_proj in self_attn\n",
        "    \"self_attn_k_proj\": None,       # hook for k_proj in self_attn\n",
        "    \"self_attn_v_proj\": None,       # hook for v_proj in self_attn\n",
        "    \"self_attn_o_proj\": None,       # hook for o_proj in self_attn\n",
        "    \"block_sparse_moe_gate\": None,  # hook for gate in block_sparse_moe\n",
        "}\n"
      ],
      "metadata": {
        "trusted": true,
        "execution": {
          "iopub.status.busy": "2024-10-18T18:30:40.675424Z",
          "iopub.execute_input": "2024-10-18T18:30:40.675923Z",
          "iopub.status.idle": "2024-10-18T18:30:40.686803Z",
          "shell.execute_reply.started": "2024-10-18T18:30:40.675885Z",
          "shell.execute_reply": "2024-10-18T18:30:40.685864Z"
        },
        "id": "WslzL6fIAAaz"
      },
      "outputs": [],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Define & Register hooks"
      ],
      "metadata": {
        "id": "uZzq-iu_AAa0"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Define the hook factory: each hook stores its module's output under `name`\n",
        "def hook_fn(name):\n",
        "    def hook(module, input, output):\n",
        "        outputs[name] = output\n",
        "    return hook\n",
        "\n",
        "# Register hooks\n",
        "model.model.embed_tokens.register_forward_hook(hook_fn(\"embed_tokens\"))\n",
        "model.model.layers[0].self_attn.register_forward_hook(hook_fn(\"self_attn_layer_1\"))\n",
        "model.model.layers[0].block_sparse_moe.experts[0].register_forward_hook(hook_fn(\"block_sparse_moe_experts\"))\n",
        "model.model.layers[0].post_attention_layernorm.register_forward_hook(hook_fn(\"post_attention_layernorm\"))\n",
        "model.model.norm.register_forward_hook(hook_fn(\"norm\"))\n",
        "model.lm_head.register_forward_hook(hook_fn(\"lm_head\"))\n",
        "\n",
        "# Additional hooks\n",
        "model.model.layers[0].input_layernorm.register_forward_hook(hook_fn(\"input_layernorm\"))\n",
        "model.model.layers[0].self_attn.q_proj.register_forward_hook(hook_fn(\"self_attn_q_proj\"))\n",
        "model.model.layers[0].self_attn.k_proj.register_forward_hook(hook_fn(\"self_attn_k_proj\"))\n",
        "model.model.layers[0].self_attn.v_proj.register_forward_hook(hook_fn(\"self_attn_v_proj\"))\n",
        "model.model.layers[0].self_attn.o_proj.register_forward_hook(hook_fn(\"self_attn_o_proj\"))\n",
        "model.model.layers[0].block_sparse_moe.gate.register_forward_hook(hook_fn(\"block_sparse_moe_gate\"))"
      ],
      "metadata": {
        "trusted": true,
        "execution": {
          "iopub.status.busy": "2024-10-18T18:30:40.68794Z",
          "iopub.execute_input": "2024-10-18T18:30:40.688277Z",
          "iopub.status.idle": "2024-10-18T18:30:40.705334Z",
          "shell.execute_reply.started": "2024-10-18T18:30:40.688245Z",
          "shell.execute_reply": "2024-10-18T18:30:40.704462Z"
        },
        "id": "8LONwVHwAAa1",
        "outputId": "8cc24463-f06b-4069-b873-af792beecfd5"
      },
      "outputs": [
        {
          "execution_count": 7,
          "output_type": "execute_result",
          "data": {
            "text/plain": "<torch.utils.hooks.RemovableHandle at 0x7d29af68c820>"
          },
          "metadata": {}
        }
      ],
      "execution_count": null
    },
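    {
      "cell_type": "markdown",
      "source": [
        "A usage note (not in the original notebook): `register_forward_hook` returns a `torch.utils.hooks.RemovableHandle`, which is why the cell above echoes one. If you keep the handles, you can detach all hooks later; `named_modules` here is a hypothetical list of (name, module) pairs:\n",
        "\n",
        "```python\n",
        "handles = [module.register_forward_hook(hook_fn(name)) for name, module in named_modules]\n",
        "for h in handles:\n",
        "    h.remove()  # detach every hook\n",
        "```\n"
      ],
      "metadata": {}
    },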
    {
      "cell_type": "markdown",
      "source": [
        "## Forward Pass"
      ],
      "metadata": {
        "id": "vORmhXXTAAa3"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "input_text = \"The quick brown fox jumps over the lazy dog !\"\n",
        "\n",
        "# Tokenize the input text\n",
        "inputs = tokenizer(input_text, return_tensors=\"pt\")\n",
        "print(\"Tokenized inputs {'input_ids','attention_mask'} - \", inputs)\n",
        "print(\"Decoded tokens : \", tokenizer.decode(inputs['input_ids'][0]))"
      ],
      "metadata": {
        "trusted": true,
        "execution": {
          "iopub.status.busy": "2024-10-18T18:30:40.706383Z",
          "iopub.execute_input": "2024-10-18T18:30:40.706895Z",
          "iopub.status.idle": "2024-10-18T18:30:40.728093Z",
          "shell.execute_reply.started": "2024-10-18T18:30:40.706863Z",
          "shell.execute_reply": "2024-10-18T18:30:40.727243Z"
        },
        "id": "dcC4RjNTAAa4",
        "outputId": "f0d4c2e3-b7f6-471d-9b9b-ce5316c47431"
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "Tokenized inputs {'input_ids','attention_mask'} - {'input_ids': tensor([[ 1, 415, 2936, 9060, 285, 1142, 461, 10575, 754, 272,\n 17898, 3914, 918]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\nDecoded tokens : <s> The quick brown fox jumps over the lazy dog !\n",
          "output_type": "stream"
        }
      ],
      "execution_count": null
    },
    {
      "cell_type": "code",
      "source": [
        "with torch.no_grad():\n",
        "    model_output = model(**inputs)"
      ],
      "metadata": {
        "trusted": true,
        "execution": {
          "iopub.status.busy": "2024-10-18T18:30:40.729287Z",
          "iopub.execute_input": "2024-10-18T18:30:40.729872Z",
          "iopub.status.idle": "2024-10-18T18:36:43.660892Z",
          "shell.execute_reply.started": "2024-10-18T18:30:40.72983Z",
          "shell.execute_reply": "2024-10-18T18:36:43.660087Z"
        },
        "id": "4x2A5-m-AAa6",
        "outputId": "d0fc43d2-1229-4582-d4d3-6b5f745be24e"
      },
      "outputs": [
        {
          "name": "stderr",
          "text": "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)\n",
          "output_type": "stream"
        }
      ],
      "execution_count": null
    },
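    {
      "cell_type": "markdown",
      "source": [
        "A quick sanity check (a sketch added here, not part of the original run): the greedy next-token prediction can be read off the final position of the `lm_head` logits.\n",
        "\n",
        "```python\n",
        "next_id = model_output.logits[0, -1].argmax().item()  # most probable next token\n",
        "print(tokenizer.decode([next_id]))\n",
        "```\n"
      ],
      "metadata": {}
    },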
    {
      "cell_type": "code",
      "source": [
        "for layer, output in outputs.items():\n",
        "    print(f\"Output at {layer}: \")\n",
        "    if isinstance(output, torch.Tensor):\n",
        "        print(output.shape, type(output))\n",
        "    elif isinstance(output, tuple):\n",
        "        for i, o in enumerate(output):\n",
        "            print(f\"Output {i}: {o.shape if isinstance(o, torch.Tensor) else type(o)}\")\n",
        "    else:\n",
        "        print(type(output))\n",
        "    print(\"-\" * 100)"
      ],
      "metadata": {
        "trusted": true,
        "execution": {
          "iopub.status.busy": "2024-10-18T18:56:11.28238Z",
          "iopub.execute_input": "2024-10-18T18:56:11.283252Z",
          "iopub.status.idle": "2024-10-18T18:56:11.291437Z",
          "shell.execute_reply.started": "2024-10-18T18:56:11.283214Z",
          "shell.execute_reply": "2024-10-18T18:56:11.290478Z"
        },
        "id": "xVuaYV3pAAa7",
        "outputId": "84e2f36e-0f10-4be1-9fdd-581fe61fabb1"
      },
      "outputs": [
        {
          "name": "stdout",
          "text": "Output at embed_tokens: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_layer_1: \nOutput 0: torch.Size([1, 13, 4096])\nOutput 1: <class 'NoneType'>\nOutput 2: <class 'transformers.cache_utils.DynamicCache'>\n----------------------------------------------------------------------------------------------------\nOutput at block_sparse_moe_experts: \ntorch.Size([3, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at post_attention_layernorm: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at norm: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at lm_head: \ntorch.Size([1, 13, 32000]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at input_layernorm: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_q_proj: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_k_proj: \ntorch.Size([1, 13, 1024]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_v_proj: \ntorch.Size([1, 13, 1024]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at self_attn_o_proj: \ntorch.Size([1, 13, 4096]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\nOutput at block_sparse_moe_gate: \ntorch.Size([13, 8]) <class 'torch.Tensor'>\n----------------------------------------------------------------------------------------------------\n",
          "output_type": "stream"
        }
      ],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Explanation of Shapes:\n",
        "\n",
        "### 1. **embed_tokens**\n",
        "- **Shape:** `torch.Size([1, 13, 4096])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size (number of sequences in this batch, here 1 sequence).\n",
        "  - `13`: Sequence length (the number of tokens in the input, here 13 tokens).\n",
        "  - `4096`: Embedding size (each token is mapped to a 4096-dimensional vector).\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, embed_dim)`\n",
        "\n",
        "### 2. **self_attn_layer_1**\n",
        "- **Shape (Output 0):** `torch.Size([1, 13, 4096])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size.\n",
        "  - `13`: Sequence length.\n",
        "  - `4096`: Hidden size (output of the attention mechanism).\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
        "\n",
        "### 3. **block_sparse_moe_experts**\n",
        "- **Shape:** `torch.Size([3, 4096])`\n",
        "- **Explanation:**\n",
        "  - `3`: Number of tokens routed to the hooked expert (expert 0). The router selects the top 2 of 8 experts for each token, so each expert only processes the subset of the 13 tokens routed to it; here that subset was 3 tokens.\n",
        "  - `4096`: Hidden size (the dimensionality of the expert's output).\n",
        "\n",
        "  **Shape Format:** `(num_routed_tokens, hidden_dim)`\n",
        "\n",
        "### 4. **post_attention_layernorm**\n",
        "- **Shape:** `torch.Size([1, 13, 4096])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size.\n",
        "  - `13`: Sequence length.\n",
        "  - `4096`: Output dimension after the post-attention normalization step.\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
        "\n",
        "### 5. **norm**\n",
        "- **Shape:** `torch.Size([1, 13, 4096])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size.\n",
        "  - `13`: Sequence length.\n",
        "  - `4096`: Output dimension after applying the final normalization layer.\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
        "\n",
        "### 6. **lm_head**\n",
        "- **Shape:** `torch.Size([1, 13, 32000])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size.\n",
        "  - `13`: Sequence length.\n",
        "  - `32000`: Vocabulary size (logits over the vocabulary for each token in the sequence).\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, vocab_size)`\n",
        "\n",
        "### 7. **input_layernorm**\n",
        "- **Shape:** `torch.Size([1, 13, 4096])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size.\n",
        "  - `13`: Sequence length.\n",
        "  - `4096`: Output dimension after the input layer normalization step.\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
        "\n",
        "### 8. **self_attn_q_proj**\n",
        "- **Shape:** `torch.Size([1, 13, 4096])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size.\n",
        "  - `13`: Sequence length.\n",
        "  - `4096`: Query projection size (the hidden state is projected to the query vector space).\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
        "\n",
        "### 9. **self_attn_k_proj**\n",
        "- **Shape:** `torch.Size([1, 13, 1024])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size.\n",
        "  - `13`: Sequence length.\n",
        "  - `1024`: Key projection size. Mixtral uses grouped-query attention: 8 key/value heads versus 32 query heads, so keys are projected to 8 × 128 = 1024 dimensions instead of 4096.\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, key_dim)`\n",
        "\n",
        "### 10. **self_attn_v_proj**\n",
        "- **Shape:** `torch.Size([1, 13, 1024])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size.\n",
        "  - `13`: Sequence length.\n",
        "  - `1024`: Value projection size (the values are projected to the same size as the keys).\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, value_dim)`\n",
        "\n",
        "### 11. **self_attn_o_proj**\n",
        "- **Shape:** `torch.Size([1, 13, 4096])`\n",
        "- **Explanation:**\n",
        "  - `1`: Batch size.\n",
        "  - `13`: Sequence length.\n",
        "  - `4096`: Output projection size (the final result of the attention mechanism is projected back to the original hidden dimension).\n",
        "\n",
        "  **Shape Format:** `(batch_size, seq_len, hidden_dim)`\n",
        "\n",
        "### 12. **block_sparse_moe_gate**\n",
        "- **Shape:** `torch.Size([13, 8])`\n",
        "- **Explanation:**\n",
        "  - `13`: Number of tokens (batch and sequence dimensions are flattened; the gate operates per token).\n",
        "  - `8`: Number of experts (gating decisions are made over all available experts).\n",
        "\n",
        "  **Shape Format:** `(num_tokens, num_experts)`\n",
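        "\n",
        "To see the routing yourself, you could inspect the captured gate logits (a sketch using the hook dictionary `outputs` defined earlier; not part of the original run):\n",
        "\n",
        "```python\n",
        "gate_logits = outputs[\"block_sparse_moe_gate\"]       # (13, 8)\n",
        "top2 = torch.topk(gate_logits, k=2, dim=-1).indices  # top-2 expert indices per token\n",
        "print((top2 == 0).sum().item())                      # tokens routed to expert 0 (3 here)\n",
        "```\n",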
        "\n",
        "### Summary Table:\n",
        "\n",
        "| Layer Name | Shape Format | Dimensions | Notes |\n",
        "|------------|--------------|------------|-------|\n",
        "| `embed_tokens` | `(batch_size, seq_len, embed_dim)` | `[1, 13, 4096]` | Embedding tokens from vocabulary. |\n",
        "| `self_attn_layer_1` | `(batch_size, seq_len, hidden_dim)` | `[1, 13, 4096]` | Output of first attention layer. |\n",
        "| `block_sparse_moe_experts` | `(num_routed_tokens, hidden_dim)` | `[3, 4096]` | Output of expert 0 in the MoE block. |\n",
        "| `post_attention_layernorm` | `(batch_size, seq_len, hidden_dim)` | `[1, 13, 4096]` | Layer norm after attention. |\n",
        "| `norm` | `(batch_size, seq_len, hidden_dim)` | `[1, 13, 4096]` | Final normalization layer. |\n",
        "| `lm_head` | `(batch_size, seq_len, vocab_size)` | `[1, 13, 32000]` | Logits for each token over the vocabulary. |\n",
        "| `input_layernorm` | `(batch_size, seq_len, hidden_dim)` | `[1, 13, 4096]` | Input layer normalization. |\n",
        "| `self_attn_q_proj` | `(batch_size, seq_len, hidden_dim)` | `[1, 13, 4096]` | Query projection in self-attention. |\n",
        "| `self_attn_k_proj` | `(batch_size, seq_len, key_dim)` | `[1, 13, 1024]` | Key projection in self-attention. |\n",
        "| `self_attn_v_proj` | `(batch_size, seq_len, value_dim)` | `[1, 13, 1024]` | Value projection in self-attention. |\n",
        "| `self_attn_o_proj` | `(batch_size, seq_len, hidden_dim)` | `[1, 13, 4096]` | Output projection after attention. |\n",
        "| `block_sparse_moe_gate` | `(num_tokens, num_experts)` | `[13, 8]` | Gating decisions for the mixture of experts. |\n"
      ],
      "metadata": {
        "id": "FyugDOzXAAa8"
      }
    }
  ]
}
@@ -1,89 +0,0 @@
# LLM-Experiment Series

Welcome to the LLM-Experiment series! This series of notebooks and scripts aims to provide a comprehensive guide to investigating the internal workings of Large Language Models (LLMs), understanding how they process inputs, and experimenting with their architectures.

## Table of Contents

- [Introduction](#introduction)
- [Series Overview](#series-overview)
- [Getting Started](#getting-started)
- [Notebooks and Scripts](#notebooks-and-scripts)
- [Contributing](#contributing)
- [License](#license)

## Introduction

Large Language Models (LLMs) have revolutionized the field of natural language processing (NLP) by achieving state-of-the-art performance on various tasks. However, understanding their internal workings and how they process inputs can be challenging. This series aims to demystify LLMs by providing detailed explanations, hands-on experiments, and practical tips for tweaking their architectures.

## Series Overview

The LLM-Experiment series will cover the following topics:

1. **Understanding LLM Architectures**:
   - An overview of popular open-source LLM architectures like Whisper, Llama, and Mixtral.
   - Key troubleshooting tips for common issues encountered during experimentation.

2. **Investigating Input Processing**:
   - How inputs are tokenized and embedded (see the short sketch after this list).
   - The role of attention mechanisms in processing sequences.
   - Visualizing and analyzing the outputs at various layers of the model.

3. **Tweaking LLM Architectures**:
   - Experimenting with different configurations and hyperparameters.
   - Modifying existing LLM architectures to improve performance or adapt them to specific tasks.
   - Implementing custom layers and components.

4. **Conducting New Experiments**:
   - Designing and implementing new experiments to test hypotheses about LLM behavior.
   - Evaluating the impact of architectural changes on model performance.
   - Sharing insights and findings with the community.
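
As a taste of the input-processing topic in item 2, here is a minimal sketch (not from the series notebooks; it assumes the `transformers` library and uses GPT-2 only because it is small and ungated):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox", return_tensors="pt")
print(inputs["input_ids"])          # token IDs, shape (1, num_tokens)

# Each token ID selects a row of the embedding matrix
embeddings = model.get_input_embeddings()(inputs["input_ids"])
print(embeddings.shape)             # (batch, seq_len, hidden_dim), e.g. (1, 4, 768) here
```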

## Getting Started

To get started with the LLM-Experiment series, you will need the following:

1. **Python Environment**:
   - All of these notebooks were created in Kaggle or Google Colab, so it is recommended to use one of those platforms to reproduce the results for other models.

2. **Hugging Face Account**:
   - Create a Hugging Face account and obtain an API token.
   - Log in to Hugging Face using the token (see the sketch after this list).
   - Most Mistral and Llama models are gated and require accepting a license agreement on the model page.

3. **Notebooks and Scripts**:
   - Clone this repository to access the notebooks and scripts, or open them directly in Google Colab.
   - Follow the instructions in each notebook to run the experiments and analyze the results.
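
For step 2, a minimal login sketch (the token below is a placeholder; never commit a real one):

```python
from huggingface_hub import login

login(token="hf_xxx")  # placeholder; use your own fine-grained token
```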

## Notebooks and Scripts

The series will include the following notebooks and scripts:

1. **Mixtral Model Analysis**:
   - Analyzing the architecture and configuration of the Mixtral model.
   - Registering hooks to capture the outputs at various layers (see the sketch after this list).

2. **Input Processing and Embedding** - Upcoming

3. **Attention Mechanisms and Improvements** - Upcoming

4. **Rolling Buffer, KV-Cache, Sliding Window Attention** - Upcoming

5. **Tweaking Model Architectures - Adapters, Down-Casting** - Upcoming
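
As referenced in item 1 above, the core technique in the Mixtral analysis notebook is PyTorch forward hooks. Here is a minimal, self-contained sketch with a toy model standing in for Mixtral:

```python
import torch
from torch import nn

captured = {}

def hook_fn(name):
    def hook(module, inputs, output):
        captured[name] = output  # stash this module's output under its name
    return hook

# Toy stand-in for an LLM: an embedding followed by a projection
model = nn.Sequential(nn.Embedding(100, 16), nn.Linear(16, 16))
model[0].register_forward_hook(hook_fn("embed_tokens"))
model[1].register_forward_hook(hook_fn("proj"))

with torch.no_grad():
    model(torch.tensor([[1, 2, 3]]))

print({name: t.shape for name, t in captured.items()})
# {'embed_tokens': torch.Size([1, 3, 16]), 'proj': torch.Size([1, 3, 16])}
```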

## Contributing

We welcome contributions from the community! If you have any ideas, suggestions, or improvements, please feel free to open an issue or submit a pull request.

## License

This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details.