diff --git a/roBERTa.ipynb b/roBERTa.ipynb index f3a73f1..fd2b1e4 100644 --- a/roBERTa.ipynb +++ b/roBERTa.ipynb @@ -2,40 +2,40 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YXAL6gpkijkz", - "outputId": "fad6b4d3-9a24-466d-bd3d-ddd388a66617" + "outputId": "c5815152-f8d7-4b27-a07b-fb811b9e6b16" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "\u001b[K |████████████████████████████████| 21.6 MB 4.8 MB/s \n", + "\u001b[K |████████████████████████████████| 21.6 MB 2.5 MB/s \n", "\u001b[K |████████████████████████████████| 4.6 MB 5.3 MB/s \n", - "\u001b[K |████████████████████████████████| 511.7 MB 5.1 kB/s \n", - "\u001b[K |████████████████████████████████| 5.8 MB 39.8 MB/s \n", - "\u001b[K |████████████████████████████████| 438 kB 97.9 MB/s \n", - "\u001b[K |████████████████████████████████| 1.6 MB 83.3 MB/s \n", + "\u001b[K |████████████████████████████████| 511.7 MB 5.4 kB/s \n", + "\u001b[K |████████████████████████████████| 438 kB 94.0 MB/s \n", + "\u001b[K |████████████████████████████████| 1.6 MB 71.1 MB/s \n", + "\u001b[K |████████████████████████████████| 5.8 MB 89.1 MB/s \n", "\u001b[K |████████████████████████████████| 2.1 MB 5.1 MB/s \n", - "\u001b[K |████████████████████████████████| 636 kB 92.0 MB/s \n", - "\u001b[K |████████████████████████████████| 352 kB 89.2 MB/s \n", - "\u001b[K |████████████████████████████████| 43 kB 2.4 MB/s \n", - "\u001b[K |████████████████████████████████| 237 kB 78.5 MB/s \n", - "\u001b[K |████████████████████████████████| 99 kB 10.7 MB/s \n", - "\u001b[K |████████████████████████████████| 1.1 MB 65.9 MB/s \n", - "\u001b[K |████████████████████████████████| 1.2 MB 75.3 MB/s \n", - "\u001b[K |████████████████████████████████| 92 kB 13.0 MB/s \n", + "\u001b[K |████████████████████████████████| 237 kB 80.4 MB/s \n", + "\u001b[K |████████████████████████████████| 352 kB 94.3 MB/s \n", + "\u001b[K |████████████████████████████████| 92 kB 13.3 MB/s \n", + "\u001b[K |████████████████████████████████| 636 kB 90.5 MB/s \n", + "\u001b[K |████████████████████████████████| 99 kB 11.1 MB/s \n", + "\u001b[K |████████████████████████████████| 1.1 MB 74.4 MB/s \n", + "\u001b[K |████████████████████████████████| 43 kB 2.2 MB/s \n", + "\u001b[K |████████████████████████████████| 1.2 MB 81.9 MB/s \n", "\u001b[?25h Building wheel for py-cpuinfo (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[K |████████████████████████████████| 24.2 MB 5.2 MB/s \n", - "\u001b[K |████████████████████████████████| 4.4 MB 5.2 MB/s \n", - "\u001b[K |████████████████████████████████| 101 kB 13.0 MB/s \n", - "\u001b[K |████████████████████████████████| 6.6 MB 57.1 MB/s \n", + "\u001b[K |████████████████████████████████| 24.2 MB 5.1 MB/s \n", + "\u001b[K |████████████████████████████████| 4.4 MB 5.0 MB/s \n", + "\u001b[K |████████████████████████████████| 101 kB 10.8 MB/s \n", + "\u001b[K |████████████████████████████████| 6.6 MB 61.8 MB/s \n", "\u001b[?25h" ] } @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "id": "Kd0xo5RHVbg-" }, @@ -111,13 +111,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3Umy49AUj-QH", - "outputId": "dd1b3cca-cbe5-47f2-f40f-b2eda465c468" + "outputId": "d625463f-444b-47bc-a491-dc259af91246" }, "outputs": [ { @@ -145,7 +145,7 @@ "metadata": { "id": "RuHXI9hByBSX" }, - "execution_count": 5, + "execution_count": 4, "outputs": [] }, { @@ -163,7 +163,7 @@ "metadata": { "id": "WZRn3TO4yBim" }, - "execution_count": 6, + "execution_count": 5, "outputs": [] }, { @@ -185,45 +185,45 @@ "base_uri": "https://localhost:8080/", "height": 113, "referenced_widgets": [ - "a896342be0dc4486bac61aa604128b90", - "273a2764131e465189a2ed29d5484e34", - "e8af2ab322b84518879662d0ca0201a1", - "60a865cc60a04d6a863e7df45f6b3cbc", - "0427845a9b0742c9b58bc74bb6e490ea", - "026d2f69850d4c42b0f32b7e55f42019", - "b838b7d4cdb548e48ef441fd5014c57f", - "d8b8395567a9405eaef69ac18ade55cc", - "f9bcacc9c768433cb5567dba41300cff", - "9afa1df07a6b44dba52ccdcfc2f1ba42", - "a691bc6164f6485496d75934ced86c07", - "34f65b030e35449faa123ff6419fce32", - "c52fec1a0cb04c11ad70de5766d180d5", - "61b63e7371de45e79df278b532bae014", - "bfae99ff42064b269b7c637a21834cb8", - "e273e13619be4095a78dba814b2966ff", - "57fd37ed5d0a4bf39a73e2b0b7b48761", - "860c3f8064b94d2f82f6fc46f0644936", - "8d1fc95c952c49f98eab60e3d9dc937b", - "84a303d3191b427a99422ba19e095ac5", - "74bfd5cd85844ba59098df83d2cf0aa1", - "274c17f08d2c4dd4b0eec23a8b0e05aa", - "f7ac1486af3145e79837f42b2e27ee1e", - "8a2bd4f4bdf64692b850a80ff6cb633c", - "8ed8f190517c477b8874339fbd2f1d6a", - "bbd71f11d1194dbdb25a86f8efe6a656", - "155d6081457f4348b9552e6afe44bb96", - "4dd74c3b0bd644fe81308c5237348f1b", - "2bc9a33616e94636884ec818095cf3a0", - "a8894b47bd7b4ae993dd56a2906c26aa", - "fa5322d165d74af1a6d635a586840845", - "655b1f7f9a48411a8bfce2d2de5265e1", - "bc277c3869d2463ca2272eb38139e359" + "cfe1b073ae4a4ee9b1d953825053ed8f", + "afcc11e497874c7a9067bed3d8a7e349", + "1d8bfa9df8424b6c937024bced838cbe", + "27b476022a6c46d08ec76dc08f34595d", + "dce42174fc8e4a77a14c21738a7a4c54", + "49d8f9cdf4da4e0a9abd2b1d3019ca12", + "15ad7701d80b47809e3141f55eb6f285", + "65fdf996228e4e7cb7d1e6e3a681e738", + "4f2901bf693a48ec89713d45e78115c0", + "83274eb7518d49699adbc895aa87dc44", + "4ea9c46c59d5409eb9f8bf8475781e61", + "c061120a886c4b049d72fcab8d1b05e1", + "e5abae6d69e4498a936e942c5a74e273", + "f0da005a128e46ebbf1156414ce96fed", + "31dc9779e3e54dcaa4a0897252d4e8c9", + "ccc41275e6d44b35bdfe2e466fe1d852", + "db019f2172c34309acecb6b90eafa5e6", + "dbb09d1246434067958ef5cd078da148", + "217063072bce45cdb34724b44f915068", + "b98b83b3c26c4a15ba2d3083ecaad9c0", + "dffaae07ce6a495090a049865f998dae", + "6c5230b90aef4a2e9f742fb12a1d10e9", + "40db94c12725430abf3d7107cd177033", + "b474f608b4af4f42a7048143f9b45dcd", + "9867d7d0ee9947a4bc469c28d5f27f9b", + "bac09e226e9d4e15a353ea75a7735076", + "3b4a4fcdc02e4d068200cb28f416ddb9", + "639168419e6048f5bc098e690739f879", + "b3ee9b9a3ee2431a9c1a4740cf9c66b7", + "eb4378c6fb5b4870ae7d244b272cc9dd", + "5ce9eb35602f498c9f8ed9b7988b0e04", + "b333b09dddfd44269541ebfb2479ac0a", + "6209c0c641cd48a896ca16b61a5e97cd" ] }, "id": "aU1dhDMtdWws", - "outputId": "28093ce4-4231-4275-a236-b44b873e563f" + "outputId": "7cca290b-7084-4931-d911-8523cd256852" }, - "execution_count": 7, + "execution_count": 6, "outputs": [ { "output_type": "display_data", @@ -234,7 +234,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "a896342be0dc4486bac61aa604128b90" + "model_id": "cfe1b073ae4a4ee9b1d953825053ed8f" } }, "metadata": {} @@ -248,7 +248,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "34f65b030e35449faa123ff6419fce32" + "model_id": "c061120a886c4b049d72fcab8d1b05e1" } }, "metadata": {} @@ -262,7 +262,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "f7ac1486af3145e79837f42b2e27ee1e" + "model_id": "40db94c12725430abf3d7107cd177033" } }, "metadata": {} @@ -302,7 +302,7 @@ "metadata": { "id": "IOJQDQHqgfMI" }, - "execution_count": 8, + "execution_count": 7, "outputs": [] }, { @@ -376,7 +376,7 @@ "metadata": { "id": "qMRC9B0RZLbO" }, - "execution_count": 14, + "execution_count": 8, "outputs": [] }, { @@ -1976,6 +1976,213 @@ ] } ] + }, + { + "cell_type": "code", + "source": [ + "# Larger data set:\n", + "train = pd.read_csv(\"/content/drive/My Drive/Colab Notebooks/W266 Project/data/yelp_train_large.csv\")\n", + "test = pd.read_csv(\"/content/drive/My Drive/Colab Notebooks/W266 Project/data/yelp_test_large.csv\")\n", + "valid = pd.read_csv(\"/content/drive/My Drive/Colab Notebooks/W266 Project/data/yelp_valid_large.csv\")\n", + "\n", + "x_train = list(train[['text']].text)\n", + "y_train = np.asarray(train[['label']].label)\n", + "\n", + "x_test = list(test[['text']].text)\n", + "y_test = np.asarray(test[['label']].label)\n", + "\n", + "x_valid = list(valid[['text']].text)\n", + "y_valid = np.asarray(valid[['label']].label)\n", + "\n", + "len(x_train)" + ], + "metadata": { + "id": "hOWQ9sr0-ldC", + "outputId": "159152df-2fe1-4a6d-855f-95803e169574", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "471465" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "max_lengths = [384]\n", + "for length in max_lengths:\n", + " run_roberta(length, roberta_tokenizer, 'roberta_384_large')" + ], + "metadata": { + "id": "K133rKHA2JLu", + "outputId": "5231c8d5-2170-4b69-d2a1-6e90540af18d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 835, + "referenced_widgets": [ + "ff91ad40e0cb44bcae67dd2eb9ff7dbe", + "69a8b638873d484694629efb8b988e93", + "e92cd265403c4417b787742f8fe2756a", + "e1c4ac149c324ceeae62d0528b1dd025", + "a78d6c7b85ba44e78160f5d054ad1288", + "738de4b5332942749435e1539cd90098", + "c174dbb3fbfd426687040e83e9f12421", + "b21b352052da4de8af45fcfabd237302", + "8ca96f2c389f45d58d8b60bb34765bb3", + "447b0a4d2c474f7e9777a23e72fc8d92", + "2900ac0595cc4a7fa79b4d95374c94f2" + ] + } + }, + "execution_count": 10, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "This model will be saved as roberta_384_large\n", + "Running roBERTa for encoding max_length: 384\n", + "Tokenizing data...\n", + "Created encoding for training data with shape (471465, 384)\n", + "Created encoding for validation data with shape (58933, 384)\n", + "Created encoding for test data with shape (58934, 384)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ff91ad40e0cb44bcae67dd2eb9ff7dbe", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/627M [00:00