diff --git a/roBERTa.ipynb b/roBERTa.ipynb
index 511ed8f..f3a73f1 100644
--- a/roBERTa.ipynb
+++ b/roBERTa.ipynb
@@ -2,30 +2,50 @@
 "cells": [
 {
 "cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
 },
 "id": "YXAL6gpkijkz",
- "outputId": "c13a31b1-bcff-414e-9b99-ea4cc8865582"
+ "outputId": "fad6b4d3-9a24-466d-bd3d-ddd388a66617"
 },
 "outputs": [
 {
 "output_type": "stream",
 "name": "stdout",
 "text": [
- "\u001b[K |████████████████████████████████| 24.2 MB 1.1 MB/s \n",
- "\u001b[K |████████████████████████████████| 1.2 MB 14.0 MB/s \n",
- "\u001b[K |████████████████████████████████| 4.4 MB 14.7 MB/s \n",
- "\u001b[K |████████████████████████████████| 596 kB 76.2 MB/s \n",
- "\u001b[K |████████████████████████████████| 6.6 MB 85.9 MB/s \n",
- "\u001b[K |████████████████████████████████| 101 kB 12.7 MB/s \n",
+ "\u001b[K |████████████████████████████████| 21.6 MB 4.8 MB/s \n",
+ "\u001b[K |████████████████████████████████| 4.6 MB 5.3 MB/s \n",
+ "\u001b[K |████████████████████████████████| 511.7 MB 5.1 kB/s \n",
+ "\u001b[K |████████████████████████████████| 5.8 MB 39.8 MB/s \n",
+ "\u001b[K |████████████████████████████████| 438 kB 97.9 MB/s \n",
+ "\u001b[K |████████████████████████████████| 1.6 MB 83.3 MB/s \n",
+ "\u001b[K |████████████████████████████████| 2.1 MB 5.1 MB/s \n",
+ "\u001b[K |████████████████████████████████| 636 kB 92.0 MB/s \n",
+ "\u001b[K |████████████████████████████████| 352 kB 89.2 MB/s \n",
+ "\u001b[K |████████████████████████████████| 43 kB 2.4 MB/s \n",
+ "\u001b[K |████████████████████████████████| 237 kB 78.5 MB/s \n",
+ "\u001b[K |████████████████████████████████| 99 kB 10.7 MB/s \n",
+ "\u001b[K |████████████████████████████████| 1.1 MB 65.9 MB/s \n",
+ "\u001b[K |████████████████████████████████| 1.2 MB 75.3 MB/s \n",
+ "\u001b[K |████████████████████████████████| 92 kB 13.0 MB/s \n",
+ "\u001b[?25h Building wheel for py-cpuinfo (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "\u001b[K |████████████████████████████████| 24.2 MB 5.2 MB/s \n",
+ "\u001b[K |████████████████████████████████| 4.4 MB 5.2 MB/s \n",
+ "\u001b[K |████████████████████████████████| 101 kB 13.0 MB/s \n",
+ "\u001b[K |████████████████████████████████| 6.6 MB 57.1 MB/s \n",
 "\u001b[?25h"
 ]
 }
 ],
 "source": [
+ "!pip uninstall -y opencv-python --quiet\n",
+ "!pip install \"opencv-python-headless<4.3\" --quiet\n",
+ "!pip install -U \"tensorflow-text==2.9.*\" --quiet\n",
+ "!pip install tf-models-official --quiet\n",
+ "!pip install keras-metrics --quiet\n",
 "!pip install gensim==3.8.3 --quiet\n",
 "!pip install pydot --quiet\n",
 "!pip install sentencepiece --quiet\n",
@@ -34,7 +54,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 15,
+ "execution_count": 3,
 "metadata": {
 "id": "Kd0xo5RHVbg-"
 },
@@ -49,12 +69,14 @@
 "import pandas as pd\n",
 "import tensorflow as tf\n",
 "from tensorflow import keras\n",
+ "from tensorflow.keras import metrics\n",
+ "import tensorflow_models as tfm\n",
 "\n",
 "from tensorflow.keras.layers import Embedding, Input, Dense, Lambda\n",
 "from tensorflow.keras.models import Model\n",
 "import tensorflow.keras.backend as K\n",
 "import tensorflow_datasets as tfds\n",
- "from keras.preprocessing.sequence import pad_sequences\n",
+ "#from keras.preprocessing.sequence import pad_sequences\n",
 "import torch\n",
 "\n",
 "import sklearn as sk\n",
@@ -89,13 +111,13 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
 },
 "id": "3Umy49AUj-QH",
- "outputId": "d8d406e7-14d5-496a-b5b2-540c1d271485"
+ "outputId": "dd1b3cca-cbe5-47f2-f40f-b2eda465c468"
 },
 "outputs": [
 {
@@ -123,7 +145,7 @@
 "metadata": {
 "id": "RuHXI9hByBSX"
 },
- "execution_count": 6,
+ "execution_count": 5,
 "outputs": []
 },
 {
@@ -141,7 +163,7 @@
 "metadata": {
 "id": "WZRn3TO4yBim"
 },
- "execution_count": 7,
+ "execution_count": 6,
 "outputs": []
 },
 {
@@ -163,45 +185,45 @@
 "base_uri": "https://localhost:8080/",
 "height": 113,
 "referenced_widgets": [
- "1c14f7620aed423490665bf394979c2a",
- "1b910353b4f447baaa7eebd7775cd460",
- "85c9e7dfd26c4f4183edfd55a94796e5",
- "f3530bb9ec0648c683e80bcbfe94928d",
- "78790dd55d0645ce9d1cfd24324bef99",
- "ff0d5b56520442c6b696265e899b6d83",
- "56fc27b211e848f5ba0210a300e6f67f",
- "3a1eb3acb42e400992dab6be8e1a1fc5",
- "2239139bfb2b47b082401c18fda79b4e",
- "348b7dfdbbff440284b6675b81e078cf",
- "8f2f2d92ff704b76a52045ccbb2f2e6c",
- "4bf6b3f3dfae48b9ae3e19cb7f6479f6",
- "61a9c669bb9341c0bad274c2bdb393ca",
- "43be6db4353f4512b2c1dee8853be562",
- "5694734f4a4043bd9c949eddd2898a3d",
- "c1a5cf55fb38421581ae170bde50f566",
- "699fe666f5a241799ab27ee4ce3b6520",
- "e070a551ebf148d0a822537ca2c0f793",
- "239cf5902f3f4292b1222185dd91548f",
- "df592f2845974e0daf3a397712080c5e",
- "a1999cb40900410aaee638d4b860b9e2",
- "b61ffe9dc2a24f369627109ab2dd6fff",
- "6a8d50cd3e7e497089b08f4c76c41c0a",
- "8486fbe5f15245c8abf0398f900872e6",
- "7b30a2600071419a84e93f98af079e32",
- "880b766905e34ada8435d948851f5a9c",
- "ae3b81471e3046a0949cb15a2342b684",
- "26777f9afb58454e819de2504781651e",
- "84dc5d805eba417bbbe12889cc03479d",
- "c25d815426834860a9428944727f1448",
- "44e7b70238cd483eb004c0fba2a5aabb",
- "1dc0c315388548ce84519e6bcfce247e",
- "e4648b40f7994915bf1eec0d401219b6"
+ "a896342be0dc4486bac61aa604128b90",
+ "273a2764131e465189a2ed29d5484e34",
+ "e8af2ab322b84518879662d0ca0201a1",
+ "60a865cc60a04d6a863e7df45f6b3cbc",
+ "0427845a9b0742c9b58bc74bb6e490ea",
+ "026d2f69850d4c42b0f32b7e55f42019",
+ "b838b7d4cdb548e48ef441fd5014c57f",
"b838b7d4cdb548e48ef441fd5014c57f", + "d8b8395567a9405eaef69ac18ade55cc", + "f9bcacc9c768433cb5567dba41300cff", + "9afa1df07a6b44dba52ccdcfc2f1ba42", + "a691bc6164f6485496d75934ced86c07", + "34f65b030e35449faa123ff6419fce32", + "c52fec1a0cb04c11ad70de5766d180d5", + "61b63e7371de45e79df278b532bae014", + "bfae99ff42064b269b7c637a21834cb8", + "e273e13619be4095a78dba814b2966ff", + "57fd37ed5d0a4bf39a73e2b0b7b48761", + "860c3f8064b94d2f82f6fc46f0644936", + "8d1fc95c952c49f98eab60e3d9dc937b", + "84a303d3191b427a99422ba19e095ac5", + "74bfd5cd85844ba59098df83d2cf0aa1", + "274c17f08d2c4dd4b0eec23a8b0e05aa", + "f7ac1486af3145e79837f42b2e27ee1e", + "8a2bd4f4bdf64692b850a80ff6cb633c", + "8ed8f190517c477b8874339fbd2f1d6a", + "bbd71f11d1194dbdb25a86f8efe6a656", + "155d6081457f4348b9552e6afe44bb96", + "4dd74c3b0bd644fe81308c5237348f1b", + "2bc9a33616e94636884ec818095cf3a0", + "a8894b47bd7b4ae993dd56a2906c26aa", + "fa5322d165d74af1a6d635a586840845", + "655b1f7f9a48411a8bfce2d2de5265e1", + "bc277c3869d2463ca2272eb38139e359" ] }, "id": "aU1dhDMtdWws", - "outputId": "ac7136a5-a276-4c52-dad5-b084eb54bb5a" + "outputId": "28093ce4-4231-4275-a236-b44b873e563f" }, - "execution_count": 19, + "execution_count": 7, "outputs": [ { "output_type": "display_data", @@ -212,7 +234,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "1c14f7620aed423490665bf394979c2a" + "model_id": "a896342be0dc4486bac61aa604128b90" } }, "metadata": {} @@ -226,7 +248,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "4bf6b3f3dfae48b9ae3e19cb7f6479f6" + "model_id": "34f65b030e35449faa123ff6419fce32" } }, "metadata": {} @@ -240,7 +262,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "6a8d50cd3e7e497089b08f4c76c41c0a" + "model_id": "f7ac1486af3145e79837f42b2e27ee1e" } }, "metadata": {} @@ -250,13 +272,11 @@ { "cell_type": "code", "source": [ - "def create_roberta_model():\n", - " \"\"\"Create a roBERTa model using the model and parameters specified in the roBERTa paper:\n", + "def create_roberta_model(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08)):\n", + " \"\"\"Create a roBERTa model based on the roBERTa paper:\n", " https://arxiv.org/pdf/1907.11692.pdf \n", "\n", " - model: TFRobertaForSequenceClassification\n", - " - learning rate: 2e-5\n", - " - epsilon: 1e-8\n", " \"\"\"\n", " roberta_model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)\n", "\n", @@ -272,7 +292,7 @@ "\n", " # Compile the model:\n", " roberta_model.compile(\n", - " optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08),\n", + " optimizer = optimizer,\n", " loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), \n", " metrics = [tf.keras.metrics.SparseCategoricalAccuracy(\"accuracy\")]\n", " )\n", @@ -282,7 +302,7 @@ "metadata": { "id": "IOJQDQHqgfMI" }, - "execution_count": 24, + "execution_count": 8, "outputs": [] }, { @@ -301,10 +321,10 @@ " return_tensors='tf')\n", " return encodings\n", "\n", - "def run_roberta(length, tokenizer):\n", + "def run_roberta(length, tokenizer, model_name, optimizer=None):\n", " \"\"\" Tokenizes, trains and evaluates roBERTa models for different max_lengths\n", " \"\"\"\n", - "\n", + " print(f\"This model will be saved as {model_name}\")\n", " print(f'Running roBERTa for encoding max_length: {length}')\n", " print('Tokenizing data...')\n", " train_encodings_roberta = tokenize(length, 
@@ -315,7 +335,11 @@
 " print(f'Created encoding for validation data with shape {valid_encodings_roberta.input_ids.shape}')\n",
 " print(f'Created encoding for test data with shape {test_encodings_roberta.input_ids.shape}')\n",
 "\n",
- " model = create_roberta_model()\n",
+ " if optimizer:\n",
+ " print(\"Using custom optimizer\")\n",
+ " model = create_roberta_model(optimizer=optimizer)\n",
+ " else:\n",
+ " model = create_roberta_model()\n",
 " print('Training model...')\n",
 " history = model.fit(\n",
 " [train_encodings_roberta.input_ids, train_encodings_roberta.attention_mask], \n",
@@ -340,7 +364,7 @@
 " print(classification_report(y_test, preds))\n",
 "\n",
 " model.save(\n",
- " str.format(\"/content/drive/My Drive/models/Project W266/roberta_model_{length}\", length=length),\n",
+ " str.format(\"/content/drive/My Drive/models/Project W266/{name}\", name = model_name),\n",
 " overwrite=True,\n",
 " include_optimizer=True,\n",
 " save_format=None,\n",
@@ -352,9 +376,18 @@
 "metadata": {
 "id": "qMRC9B0RZLbO"
 },
- "execution_count": 77,
+ "execution_count": 14,
 "outputs": []
 },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Running for various embedding lengths"
+ ],
+ "metadata": {
+ "id": "8YHIQu-2vybh"
+ }
+ },
 {
 "cell_type": "code",
 "source": [
@@ -1289,193 +1322,1189 @@
 ]
 }
 ]
- }
- ],
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "background_execution": "on",
- "collapsed_sections": [],
- "machine_shape": "hm",
- "name": "roBERTa.ipynb",
- "provenance": []
 },
- "gpuClass": "standard",
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Trying out different learning rates"
+ ],
+ "metadata": {
+ "id": "fQ0Cv7Nuv4EZ"
+ }
 },
- "language_info": {
- "name": "python"
+ {
+ "cell_type": "code",
+ "source": [
+ "def create_learning_schedule(initial_learning_rate=2e-5):\n",
+ " # Set up epochs and steps\n",
+ " epochs = 4\n",
+ " batch_size = 32\n",
+ "\n",
+ " train_data_size = len(x_train)\n",
+ " steps_per_epoch = int(train_data_size / batch_size)\n",
+ " num_train_steps = steps_per_epoch * epochs\n",
+ " # Warm up over 6% of the total training steps, as done in the roBERTa paper\n",
+ " warmup_steps = int(0.06 * num_train_steps)\n",
+ "\n",
+ " linear_decay = tf.keras.optimizers.schedules.PolynomialDecay(\n",
+ " initial_learning_rate=initial_learning_rate,\n",
+ " end_learning_rate=0,\n",
+ " decay_steps=num_train_steps)\n",
+ "\n",
+ " warmup_schedule = tfm.optimization.lr_schedule.LinearWarmup(\n",
+ " warmup_learning_rate = 0,\n",
+ " after_warmup_lr_sched = linear_decay,\n",
+ " warmup_steps = warmup_steps\n",
+ " )\n",
+ " return warmup_schedule\n"
+ ],
+ "metadata": {
+ "id": "0sob0lWav8xV"
+ },
+ "execution_count": 16,
+ "outputs": []
 },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "1c14f7620aed423490665bf394979c2a": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "HBoxModel",
- "model_module_version": "1.5.0",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_1b910353b4f447baaa7eebd7775cd460",
- "IPY_MODEL_85c9e7dfd26c4f4183edfd55a94796e5",
- "IPY_MODEL_f3530bb9ec0648c683e80bcbfe94928d"
+ {
+ "cell_type": "code",
+ "source": [
+ ""
+ ],
+ "metadata": {
+ "id": "T3hyZ4cgxihL"
"T3hyZ4cgxihL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#max_lengths = [64,128,256,320,384,448,512]\n", + "max_lengths = [384]\n", + "learning_rates = [1e-5, 2e-5, 3e-5]\n", + "for initial_learning_rate in learning_rates:\n", + " warmup_schedule = create_learning_schedule(initial_learning_rate)\n", + " for length in max_lengths:\n", + " model_name = str.format('roberta_model_{length}_lr-{learning_rate}', length=384, learning_rate=initial_learning_rate)\n", + " run_roberta(length, roberta_tokenizer, model_name, optimizer = tf.keras.optimizers.experimental.Adam(\n", + " learning_rate = warmup_schedule))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 906, + "referenced_widgets": [ + "8839985a012f4d72813673ad2d288488", + "1ca91bee29f540709a52e68c17ea0486", + "443a4b408f984311afe0d44df5655252", + "a995024669924dfc815e0732d5637155", + "0deabd256d2241c8b9a07dbfa38885c7", + "f70df6007d3849c79de647cdf4b0da9f", + "688e482f012142b3ba1e5c8d6edf7423", + "261c5622d53842609b166627807f916d", + "c489efa4a09d4a0f8f4b1f660486762f", + "d766338135bf47a1aca35ded50fdce3f", + "a0ecd8b159e24b08aba14889756bdcff" + ] + }, + "id": "XYlbWNXe9Ohh", + "outputId": "3f3ad898-1c37-48c6-fc80-da8fd31b2940" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Running roBERTa for encoding max_length: 384\n", + "Tokenizing data...\n", + "Created encoding for training data with shape (47146, 384)\n", + "Created encoding for validation data with shape (5893, 384)\n", + "Created encoding for test data with shape (5894, 384)\n", + "Using custom optimizer\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/627M [00:00