diff --git a/sdk/python/examples/create-pytorchjob-from-func.ipynb b/sdk/python/examples/create-pytorchjob-from-func.ipynb index 945160fe17..aaafdf8132 100644 --- a/sdk/python/examples/create-pytorchjob-from-func.ipynb +++ b/sdk/python/examples/create-pytorchjob-from-func.ipynb @@ -17,7 +17,9 @@ { "cell_type": "markdown", "id": "a8bb6564-fde3-4c28-841c-012122643dd9", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "## Install Kubeflow Python SDKs\n", "\n", @@ -368,14 +370,14 @@ "source": [ "## Start Distributive Training with PyTorchJob\n", "\n", - "Before creating PyTorchJob, you have to create `PyTorchJobClient()`. It uses [Kubernetes Python client](https://github.com/kubernetes-client/python) to communicate with Kubernetes API server. You can set path and context for [the kubeconfig file](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/). The default location for the kubeconfig is `~/.kube/config`.\n", + "Before creating PyTorchJob, you have to create `TrainingClient()`. It uses [Kubernetes Python client](https://github.com/kubernetes-client/python) to communicate with Kubernetes API server. You can set path and context for [the kubeconfig file](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/). The default location for the kubeconfig is `~/.kube/config`.\n", "\n", "Kubeflow Training Operator automatically set the appropriate env variables (`MASTER_PORT`, `MASTER_ADDR`, `WORLD_SIZE`, `RANK`) for each PyTorchJob container." 
] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "id": "eb1acd34-ebcf-409b-8bb3-0225cee37110", "metadata": { "tags": [] @@ -385,18 +387,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-09-12T18:42:12Z INFO PyTorchJob train-pytorch has been created\n" + "PyTorchJob kubeflow-user-example-com/train-pytorch has been created\n" ] } ], "source": [ - "from kubeflow.training import PyTorchJobClient\n", + "from kubeflow.training import TrainingClient\n", "\n", "# Start PyTorchJob Training.\n", "pytorchjob_name = \"train-pytorch\"\n", - "pytorchjob_client = PyTorchJobClient()\n", + "training_client = TrainingClient()\n", "\n", - "pytorchjob_client.create_pytorchjob_from_func(\n", + "training_client.create_pytorchjob_from_func(\n", " name=pytorchjob_name,\n", " func=train_pytorch_model,\n", " num_worker_replicas=3, # How many PyTorch Workers will be run.\n", @@ -408,14 +410,14 @@ "id": "e44c3ad7-62c4-4b58-b52a-15fd8746b772", "metadata": {}, "source": [ - "### Get PyTorchJob Status\n", + "### Check PyTorchJob Status\n", "\n", - "Use `PyTorchJobClient` API to get information about created PyTorchJob." + "Use `TrainingClient` APIs to get information about the created PyTorchJob." 
] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "id": "4141f6c2-c38f-4972-b68a-35d150ef7485", "metadata": { "tags": [] @@ -425,12 +427,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "PyTorchJob Status: Running\n" + "PyTorchJob Status: True\n" ] } ], "source": [ - "print(f\"PyTorchJob Status: {pytorchjob_client.get_job_status(pytorchjob_name)}\")" + "print(f\"PyTorchJob Status: {training_client.is_job_running(name=pytorchjob_name, job_kind='PyTorchJob')}\")" ] }, { @@ -443,26 +445,26 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 19, "id": "49b53308-a19b-45e8-942f-4333e727ee48", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'train-pytorch-master-0',\n", + "['train-pytorch-master-0',\n", " 'train-pytorch-worker-0',\n", " 'train-pytorch-worker-1',\n", - " 'train-pytorch-worker-2'}" + " 'train-pytorch-worker-2']" ] }, - "execution_count": 15, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pytorchjob_client.get_pod_names(pytorchjob_name)" + "training_client.get_job_pod_names(pytorchjob_name)" ] }, { @@ -483,7 +485,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 27, "id": "5232d542-d4bf-4c51-8b11-ad0534fb0b9d", "metadata": { "tags": [] @@ -493,225 +495,225 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-09-12T19:02:17Z INFO The logs of Pod train-pytorch-master-0:\n", - " 2022-09-12T18:50:25Z INFO Added key: store_based_barrier_key:1 to store for rank: 0\n", - "2022-09-12T18:50:25Z INFO Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.\n", + "The logs of pod train-pytorch-master-0:\n", + " 2023-01-12T18:55:33Z INFO Added key: store_based_barrier_key:1 to store for rank: 0\n", + "2023-01-12T18:55:33Z INFO Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.\n", "Downloading 
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n", "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n", - "100%|██████████| 26421880/26421880 [06:26<00:00, 68445.84it/s] ]\n", + "100%|██████████| 26421880/26421880 [00:02<00:00, 12562567.98it/s]\n", "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n", "\n", "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n", "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n", - "100%|██████████| 29515/29515 [00:00<00:00, 216810.86it/s]\n", "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n", "\n", "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n", + "100%|██████████| 29515/29515 [00:00<00:00, 211170.82it/s]\n", "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n", - "100%|██████████| 4422102/4422102 [01:23<00:00, 52722.58it/s]\n", + "100%|██████████| 4422102/4422102 [00:00<00:00, 4511582.77it/s]\n", "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n", "\n", "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n", "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n", - "100%|██████████| 5148/5148 [00:00<00:00, 18743296.00it/s]\n", + "100%|██████████| 5148/5148 [00:00<00:00, 23675742.32it/s]\n", "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n", "\n", - "2022-09-12T18:58:20Z INFO Start 
training for RANK: 0. WORLD_SIZE: 4\n", - "2022-09-12T18:58:31Z INFO Train Epoch: 0 [0/60000 (0%)]\tloss=2.2902\n", - "2022-09-12T18:58:31Z INFO Reducer buckets have been rebuilt in this iteration.\n", - "2022-09-12T18:58:32Z INFO Train Epoch: 0 [320/60000 (1%)]\tloss=2.2901\n", - "2022-09-12T18:58:33Z INFO Train Epoch: 0 [640/60000 (1%)]\tloss=2.2797\n", - "2022-09-12T18:58:33Z INFO Train Epoch: 0 [960/60000 (2%)]\tloss=2.2889\n", - "2022-09-12T18:58:34Z INFO Train Epoch: 0 [1280/60000 (2%)]\tloss=2.2720\n", - "2022-09-12T18:58:34Z INFO Train Epoch: 0 [1600/60000 (3%)]\tloss=2.2684\n", - "2022-09-12T18:58:35Z INFO Train Epoch: 0 [1920/60000 (3%)]\tloss=2.2588\n", - "2022-09-12T18:58:36Z INFO Train Epoch: 0 [2240/60000 (4%)]\tloss=2.2018\n", - "2022-09-12T18:58:36Z INFO Train Epoch: 0 [2560/60000 (4%)]\tloss=2.2099\n", - "2022-09-12T18:58:37Z INFO Train Epoch: 0 [2880/60000 (5%)]\tloss=2.2041\n", - "2022-09-12T18:58:37Z INFO Train Epoch: 0 [3200/60000 (5%)]\tloss=2.1588\n", - "2022-09-12T18:58:38Z INFO Train Epoch: 0 [3520/60000 (6%)]\tloss=2.0929\n", - "2022-09-12T18:58:39Z INFO Train Epoch: 0 [3840/60000 (6%)]\tloss=1.9030\n", - "2022-09-12T18:58:39Z INFO Train Epoch: 0 [4160/60000 (7%)]\tloss=1.7375\n", - "2022-09-12T18:58:40Z INFO Train Epoch: 0 [4480/60000 (7%)]\tloss=1.7278\n", - "2022-09-12T18:58:40Z INFO Train Epoch: 0 [4800/60000 (8%)]\tloss=1.4415\n", - "2022-09-12T18:58:41Z INFO Train Epoch: 0 [5120/60000 (9%)]\tloss=1.2989\n", - "2022-09-12T18:58:42Z INFO Train Epoch: 0 [5440/60000 (9%)]\tloss=1.2891\n", - "2022-09-12T18:58:42Z INFO Train Epoch: 0 [5760/60000 (10%)]\tloss=1.3265\n", - "2022-09-12T18:58:43Z INFO Train Epoch: 0 [6080/60000 (10%)]\tloss=1.1599\n", - "2022-09-12T18:58:44Z INFO Train Epoch: 0 [6400/60000 (11%)]\tloss=1.0840\n", - "2022-09-12T18:58:44Z INFO Train Epoch: 0 [6720/60000 (11%)]\tloss=1.2574\n", - "2022-09-12T18:58:45Z INFO Train Epoch: 0 [7040/60000 (12%)]\tloss=1.0064\n", - "2022-09-12T18:58:45Z INFO Train Epoch: 0 [7360/60000 
(12%)]\tloss=1.0433\n", - "2022-09-12T18:58:46Z INFO Train Epoch: 0 [7680/60000 (13%)]\tloss=1.0249\n", - "2022-09-12T18:58:47Z INFO Train Epoch: 0 [8000/60000 (13%)]\tloss=1.2595\n", - "2022-09-12T18:58:47Z INFO Train Epoch: 0 [8320/60000 (14%)]\tloss=1.0006\n", - "2022-09-12T18:58:48Z INFO Train Epoch: 0 [8640/60000 (14%)]\tloss=1.0372\n", - "2022-09-12T18:58:49Z INFO Train Epoch: 0 [8960/60000 (15%)]\tloss=1.1736\n", - "2022-09-12T18:58:49Z INFO Train Epoch: 0 [9280/60000 (15%)]\tloss=0.6428\n", - "2022-09-12T18:58:50Z INFO Train Epoch: 0 [9600/60000 (16%)]\tloss=1.2883\n", - "2022-09-12T18:58:51Z INFO Train Epoch: 0 [9920/60000 (17%)]\tloss=0.8874\n", - "2022-09-12T18:58:51Z INFO Train Epoch: 0 [10240/60000 (17%)]\tloss=1.1232\n", - "2022-09-12T18:58:52Z INFO Train Epoch: 0 [10560/60000 (18%)]\tloss=1.0875\n", - "2022-09-12T18:58:52Z INFO Train Epoch: 0 [10880/60000 (18%)]\tloss=0.8829\n", - "2022-09-12T18:58:53Z INFO Train Epoch: 0 [11200/60000 (19%)]\tloss=0.6489\n", - "2022-09-12T18:58:54Z INFO Train Epoch: 0 [11520/60000 (19%)]\tloss=0.9545\n", - "2022-09-12T18:58:54Z INFO Train Epoch: 0 [11840/60000 (20%)]\tloss=1.1365\n", - "2022-09-12T18:58:55Z INFO Train Epoch: 0 [12160/60000 (20%)]\tloss=0.9353\n", - "2022-09-12T18:58:56Z INFO Train Epoch: 0 [12480/60000 (21%)]\tloss=0.7308\n", - "2022-09-12T18:58:56Z INFO Train Epoch: 0 [12800/60000 (21%)]\tloss=0.7806\n", - "2022-09-12T18:58:57Z INFO Train Epoch: 0 [13120/60000 (22%)]\tloss=1.1674\n", - "2022-09-12T18:58:58Z INFO Train Epoch: 0 [13440/60000 (22%)]\tloss=0.8342\n", - "2022-09-12T18:58:58Z INFO Train Epoch: 0 [13760/60000 (23%)]\tloss=0.8479\n", - "2022-09-12T18:58:59Z INFO Train Epoch: 0 [14080/60000 (23%)]\tloss=0.8669\n", - "2022-09-12T18:59:00Z INFO Train Epoch: 0 [14400/60000 (24%)]\tloss=0.8408\n", - "2022-09-12T18:59:00Z INFO Train Epoch: 0 [14720/60000 (25%)]\tloss=0.6637\n", - "2022-09-12T18:59:01Z INFO Train Epoch: 0 [15040/60000 (25%)]\tloss=0.8141\n", - "2022-09-12T18:59:02Z INFO Train 
Epoch: 0 [15360/60000 (26%)]\tloss=0.9311\n", - "2022-09-12T18:59:03Z INFO Train Epoch: 0 [15680/60000 (26%)]\tloss=0.7325\n", - "2022-09-12T18:59:04Z INFO Train Epoch: 0 [16000/60000 (27%)]\tloss=1.1148\n", - "2022-09-12T18:59:04Z INFO Train Epoch: 0 [16320/60000 (27%)]\tloss=0.5028\n", - "2022-09-12T18:59:05Z INFO Train Epoch: 0 [16640/60000 (28%)]\tloss=0.9469\n", - "2022-09-12T18:59:05Z INFO Train Epoch: 0 [16960/60000 (28%)]\tloss=0.6968\n", - "2022-09-12T18:59:06Z INFO Train Epoch: 0 [17280/60000 (29%)]\tloss=0.7775\n", - "2022-09-12T18:59:07Z INFO Train Epoch: 0 [17600/60000 (29%)]\tloss=0.6287\n", - "2022-09-12T18:59:07Z INFO Train Epoch: 0 [17920/60000 (30%)]\tloss=0.6075\n", - "2022-09-12T18:59:08Z INFO Train Epoch: 0 [18240/60000 (30%)]\tloss=1.1254\n", - "2022-09-12T18:59:09Z INFO Train Epoch: 0 [18560/60000 (31%)]\tloss=1.1152\n", - "2022-09-12T18:59:09Z INFO Train Epoch: 0 [18880/60000 (31%)]\tloss=0.7017\n", - "2022-09-12T18:59:10Z INFO Train Epoch: 0 [19200/60000 (32%)]\tloss=0.5707\n", - "2022-09-12T18:59:10Z INFO Train Epoch: 0 [19520/60000 (33%)]\tloss=0.7994\n", - "2022-09-12T18:59:11Z INFO Train Epoch: 0 [19840/60000 (33%)]\tloss=0.9094\n", - "2022-09-12T18:59:12Z INFO Train Epoch: 0 [20160/60000 (34%)]\tloss=1.1098\n", - "2022-09-12T18:59:12Z INFO Train Epoch: 0 [20480/60000 (34%)]\tloss=0.5613\n", - "2022-09-12T18:59:13Z INFO Train Epoch: 0 [20800/60000 (35%)]\tloss=0.9604\n", - "2022-09-12T18:59:13Z INFO Train Epoch: 0 [21120/60000 (35%)]\tloss=0.4959\n", - "2022-09-12T18:59:14Z INFO Train Epoch: 0 [21440/60000 (36%)]\tloss=0.9506\n", - "2022-09-12T18:59:15Z INFO Train Epoch: 0 [21760/60000 (36%)]\tloss=0.6677\n", - "2022-09-12T18:59:15Z INFO Train Epoch: 0 [22080/60000 (37%)]\tloss=0.7729\n", - "2022-09-12T18:59:16Z INFO Train Epoch: 0 [22400/60000 (37%)]\tloss=0.5282\n", - "2022-09-12T18:59:16Z INFO Train Epoch: 0 [22720/60000 (38%)]\tloss=0.6309\n", - "2022-09-12T18:59:17Z INFO Train Epoch: 0 [23040/60000 (38%)]\tloss=1.0241\n", - 
"2022-09-12T18:59:18Z INFO Train Epoch: 0 [23360/60000 (39%)]\tloss=0.5549\n", - "2022-09-12T18:59:18Z INFO Train Epoch: 0 [23680/60000 (39%)]\tloss=0.7683\n", - "2022-09-12T18:59:19Z INFO Train Epoch: 0 [24000/60000 (40%)]\tloss=0.9024\n", - "2022-09-12T18:59:20Z INFO Train Epoch: 0 [24320/60000 (41%)]\tloss=0.8187\n", - "2022-09-12T18:59:21Z INFO Train Epoch: 0 [24640/60000 (41%)]\tloss=0.6414\n", - "2022-09-12T18:59:21Z INFO Train Epoch: 0 [24960/60000 (42%)]\tloss=0.8111\n", - "2022-09-12T18:59:22Z INFO Train Epoch: 0 [25280/60000 (42%)]\tloss=0.4828\n", - "2022-09-12T18:59:23Z INFO Train Epoch: 0 [25600/60000 (43%)]\tloss=0.7490\n", - "2022-09-12T18:59:23Z INFO Train Epoch: 0 [25920/60000 (43%)]\tloss=0.5983\n", - "2022-09-12T18:59:24Z INFO Train Epoch: 0 [26240/60000 (44%)]\tloss=0.9854\n", - "2022-09-12T18:59:25Z INFO Train Epoch: 0 [26560/60000 (44%)]\tloss=0.7044\n", - "2022-09-12T18:59:25Z INFO Train Epoch: 0 [26880/60000 (45%)]\tloss=0.6213\n", - "2022-09-12T18:59:26Z INFO Train Epoch: 0 [27200/60000 (45%)]\tloss=0.9710\n", - "2022-09-12T18:59:27Z INFO Train Epoch: 0 [27520/60000 (46%)]\tloss=0.4506\n", - "2022-09-12T18:59:27Z INFO Train Epoch: 0 [27840/60000 (46%)]\tloss=0.7417\n", - "2022-09-12T18:59:28Z INFO Train Epoch: 0 [28160/60000 (47%)]\tloss=0.8037\n", - "2022-09-12T18:59:29Z INFO Train Epoch: 0 [28480/60000 (47%)]\tloss=0.8103\n", - "2022-09-12T18:59:29Z INFO Train Epoch: 0 [28800/60000 (48%)]\tloss=1.0093\n", - "2022-09-12T18:59:30Z INFO Train Epoch: 0 [29120/60000 (49%)]\tloss=0.6391\n", - "2022-09-12T18:59:30Z INFO Train Epoch: 0 [29440/60000 (49%)]\tloss=0.9008\n", - "2022-09-12T18:59:31Z INFO Train Epoch: 0 [29760/60000 (50%)]\tloss=0.7537\n", - "2022-09-12T18:59:32Z INFO Train Epoch: 0 [30080/60000 (50%)]\tloss=0.9524\n", - "2022-09-12T18:59:32Z INFO Train Epoch: 0 [30400/60000 (51%)]\tloss=0.6028\n", - "2022-09-12T18:59:33Z INFO Train Epoch: 0 [30720/60000 (51%)]\tloss=0.6095\n", - "2022-09-12T18:59:33Z INFO Train Epoch: 0 [31040/60000 
(52%)]\tloss=0.4763\n", - "2022-09-12T18:59:34Z INFO Train Epoch: 0 [31360/60000 (52%)]\tloss=0.5009\n", - "2022-09-12T18:59:35Z INFO Train Epoch: 0 [31680/60000 (53%)]\tloss=0.7307\n", - "2022-09-12T18:59:35Z INFO Train Epoch: 0 [32000/60000 (53%)]\tloss=0.8121\n", - "2022-09-12T18:59:36Z INFO Train Epoch: 0 [32320/60000 (54%)]\tloss=0.5977\n", - "2022-09-12T18:59:36Z INFO Train Epoch: 0 [32640/60000 (54%)]\tloss=0.6981\n", - "2022-09-12T18:59:37Z INFO Train Epoch: 0 [32960/60000 (55%)]\tloss=0.6279\n", - "2022-09-12T18:59:38Z INFO Train Epoch: 0 [33280/60000 (55%)]\tloss=0.5949\n", - "2022-09-12T18:59:38Z INFO Train Epoch: 0 [33600/60000 (56%)]\tloss=0.5335\n", - "2022-09-12T18:59:39Z INFO Train Epoch: 0 [33920/60000 (57%)]\tloss=0.4350\n", - "2022-09-12T18:59:39Z INFO Train Epoch: 0 [34240/60000 (57%)]\tloss=0.4548\n", - "2022-09-12T18:59:40Z INFO Train Epoch: 0 [34560/60000 (58%)]\tloss=0.3458\n", - "2022-09-12T18:59:41Z INFO Train Epoch: 0 [34880/60000 (58%)]\tloss=0.6637\n", - "2022-09-12T18:59:41Z INFO Train Epoch: 0 [35200/60000 (59%)]\tloss=0.5401\n", - "2022-09-12T18:59:42Z INFO Train Epoch: 0 [35520/60000 (59%)]\tloss=0.5323\n", - "2022-09-12T18:59:42Z INFO Train Epoch: 0 [35840/60000 (60%)]\tloss=0.5373\n", - "2022-09-12T18:59:43Z INFO Train Epoch: 0 [36160/60000 (60%)]\tloss=0.6909\n", - "2022-09-12T18:59:44Z INFO Train Epoch: 0 [36480/60000 (61%)]\tloss=0.7216\n", - "2022-09-12T18:59:44Z INFO Train Epoch: 0 [36800/60000 (61%)]\tloss=0.6451\n", - "2022-09-12T18:59:45Z INFO Train Epoch: 0 [37120/60000 (62%)]\tloss=0.7345\n", - "2022-09-12T18:59:45Z INFO Train Epoch: 0 [37440/60000 (62%)]\tloss=0.5737\n", - "2022-09-12T18:59:46Z INFO Train Epoch: 0 [37760/60000 (63%)]\tloss=0.4804\n", - "2022-09-12T18:59:47Z INFO Train Epoch: 0 [38080/60000 (63%)]\tloss=0.7796\n", - "2022-09-12T18:59:47Z INFO Train Epoch: 0 [38400/60000 (64%)]\tloss=0.7034\n", - "2022-09-12T18:59:48Z INFO Train Epoch: 0 [38720/60000 (65%)]\tloss=0.5887\n", - "2022-09-12T18:59:49Z INFO 
Train Epoch: 0 [39040/60000 (65%)]\tloss=0.5303\n", - "2022-09-12T18:59:49Z INFO Train Epoch: 0 [39360/60000 (66%)]\tloss=0.4477\n", - "2022-09-12T18:59:50Z INFO Train Epoch: 0 [39680/60000 (66%)]\tloss=0.5510\n", - "2022-09-12T18:59:51Z INFO Train Epoch: 0 [40000/60000 (67%)]\tloss=0.4812\n", - "2022-09-12T18:59:51Z INFO Train Epoch: 0 [40320/60000 (67%)]\tloss=0.4678\n", - "2022-09-12T18:59:52Z INFO Train Epoch: 0 [40640/60000 (68%)]\tloss=0.2526\n", - "2022-09-12T18:59:52Z INFO Train Epoch: 0 [40960/60000 (68%)]\tloss=0.5467\n", - "2022-09-12T18:59:53Z INFO Train Epoch: 0 [41280/60000 (69%)]\tloss=0.7217\n", - "2022-09-12T18:59:54Z INFO Train Epoch: 0 [41600/60000 (69%)]\tloss=0.8281\n", - "2022-09-12T18:59:54Z INFO Train Epoch: 0 [41920/60000 (70%)]\tloss=0.5504\n", - "2022-09-12T18:59:55Z INFO Train Epoch: 0 [42240/60000 (70%)]\tloss=0.6440\n", - "2022-09-12T18:59:56Z INFO Train Epoch: 0 [42560/60000 (71%)]\tloss=0.4030\n", - "2022-09-12T18:59:56Z INFO Train Epoch: 0 [42880/60000 (71%)]\tloss=0.7278\n", - "2022-09-12T18:59:57Z INFO Train Epoch: 0 [43200/60000 (72%)]\tloss=0.6447\n", - "2022-09-12T18:59:58Z INFO Train Epoch: 0 [43520/60000 (73%)]\tloss=0.4235\n", - "2022-09-12T18:59:59Z INFO Train Epoch: 0 [43840/60000 (73%)]\tloss=0.6513\n", - "2022-09-12T18:59:59Z INFO Train Epoch: 0 [44160/60000 (74%)]\tloss=0.5926\n", - "2022-09-12T19:00:00Z INFO Train Epoch: 0 [44480/60000 (74%)]\tloss=0.4309\n", - "2022-09-12T19:00:01Z INFO Train Epoch: 0 [44800/60000 (75%)]\tloss=0.5905\n", - "2022-09-12T19:00:02Z INFO Train Epoch: 0 [45120/60000 (75%)]\tloss=0.5037\n", - "2022-09-12T19:00:03Z INFO Train Epoch: 0 [45440/60000 (76%)]\tloss=0.7945\n", - "2022-09-12T19:00:04Z INFO Train Epoch: 0 [45760/60000 (76%)]\tloss=0.4317\n", - "2022-09-12T19:00:05Z INFO Train Epoch: 0 [46080/60000 (77%)]\tloss=0.5603\n", - "2022-09-12T19:00:06Z INFO Train Epoch: 0 [46400/60000 (77%)]\tloss=0.4657\n", - "2022-09-12T19:00:07Z INFO Train Epoch: 0 [46720/60000 (78%)]\tloss=0.5834\n", - 
"2022-09-12T19:00:07Z INFO Train Epoch: 0 [47040/60000 (78%)]\tloss=0.3848\n", - "2022-09-12T19:00:08Z INFO Train Epoch: 0 [47360/60000 (79%)]\tloss=0.6270\n", - "2022-09-12T19:00:09Z INFO Train Epoch: 0 [47680/60000 (79%)]\tloss=0.4031\n", - "2022-09-12T19:00:09Z INFO Train Epoch: 0 [48000/60000 (80%)]\tloss=0.5808\n", - "2022-09-12T19:00:10Z INFO Train Epoch: 0 [48320/60000 (81%)]\tloss=0.5529\n", - "2022-09-12T19:00:11Z INFO Train Epoch: 0 [48640/60000 (81%)]\tloss=0.7345\n", - "2022-09-12T19:00:11Z INFO Train Epoch: 0 [48960/60000 (82%)]\tloss=0.5727\n", - "2022-09-12T19:00:12Z INFO Train Epoch: 0 [49280/60000 (82%)]\tloss=0.6785\n", - "2022-09-12T19:00:12Z INFO Train Epoch: 0 [49600/60000 (83%)]\tloss=0.3206\n", - "2022-09-12T19:00:13Z INFO Train Epoch: 0 [49920/60000 (83%)]\tloss=0.3703\n", - "2022-09-12T19:00:14Z INFO Train Epoch: 0 [50240/60000 (84%)]\tloss=0.5272\n", - "2022-09-12T19:00:14Z INFO Train Epoch: 0 [50560/60000 (84%)]\tloss=0.8197\n", - "2022-09-12T19:00:15Z INFO Train Epoch: 0 [50880/60000 (85%)]\tloss=0.4263\n", - "2022-09-12T19:00:15Z INFO Train Epoch: 0 [51200/60000 (85%)]\tloss=0.4994\n", - "2022-09-12T19:00:16Z INFO Train Epoch: 0 [51520/60000 (86%)]\tloss=0.5168\n", - "2022-09-12T19:00:17Z INFO Train Epoch: 0 [51840/60000 (86%)]\tloss=0.7186\n", - "2022-09-12T19:00:17Z INFO Train Epoch: 0 [52160/60000 (87%)]\tloss=0.4517\n", - "2022-09-12T19:00:18Z INFO Train Epoch: 0 [52480/60000 (87%)]\tloss=0.8989\n", - "2022-09-12T19:00:18Z INFO Train Epoch: 0 [52800/60000 (88%)]\tloss=0.5387\n", - "2022-09-12T19:00:19Z INFO Train Epoch: 0 [53120/60000 (89%)]\tloss=0.7302\n", - "2022-09-12T19:00:20Z INFO Train Epoch: 0 [53440/60000 (89%)]\tloss=0.5866\n", - "2022-09-12T19:00:20Z INFO Train Epoch: 0 [53760/60000 (90%)]\tloss=0.5319\n", - "2022-09-12T19:00:21Z INFO Train Epoch: 0 [54080/60000 (90%)]\tloss=0.7869\n", - "2022-09-12T19:00:22Z INFO Train Epoch: 0 [54400/60000 (91%)]\tloss=0.7421\n", - "2022-09-12T19:00:22Z INFO Train Epoch: 0 [54720/60000 
(91%)]\tloss=0.4713\n", - "2022-09-12T19:00:23Z INFO Train Epoch: 0 [55040/60000 (92%)]\tloss=0.3956\n", - "2022-09-12T19:00:24Z INFO Train Epoch: 0 [55360/60000 (92%)]\tloss=0.4628\n", - "2022-09-12T19:00:24Z INFO Train Epoch: 0 [55680/60000 (93%)]\tloss=0.5494\n", - "2022-09-12T19:00:25Z INFO Train Epoch: 0 [56000/60000 (93%)]\tloss=0.8519\n", - "2022-09-12T19:00:25Z INFO Train Epoch: 0 [56320/60000 (94%)]\tloss=0.6107\n", - "2022-09-12T19:00:26Z INFO Train Epoch: 0 [56640/60000 (94%)]\tloss=0.3419\n", - "2022-09-12T19:00:27Z INFO Train Epoch: 0 [56960/60000 (95%)]\tloss=0.7939\n", - "2022-09-12T19:00:27Z INFO Train Epoch: 0 [57280/60000 (95%)]\tloss=0.5046\n", - "2022-09-12T19:00:28Z INFO Train Epoch: 0 [57600/60000 (96%)]\tloss=0.5847\n", - "2022-09-12T19:00:29Z INFO Train Epoch: 0 [57920/60000 (97%)]\tloss=0.2835\n", - "2022-09-12T19:00:31Z INFO Train Epoch: 0 [58240/60000 (97%)]\tloss=0.4612\n", - "2022-09-12T19:00:32Z INFO Train Epoch: 0 [58560/60000 (98%)]\tloss=0.5352\n", - "2022-09-12T19:00:33Z INFO Train Epoch: 0 [58880/60000 (98%)]\tloss=0.7347\n", - "2022-09-12T19:00:34Z INFO Train Epoch: 0 [59200/60000 (99%)]\tloss=0.4075\n", - "2022-09-12T19:00:35Z INFO Train Epoch: 0 [59520/60000 (99%)]\tloss=0.5479\n", - "2022-09-12T19:00:36Z INFO Train Epoch: 0 [59840/60000 (100%)]\tloss=0.6257\n", + "2023-01-12T18:55:39Z INFO Start training for RANK: 0. 
WORLD_SIZE: 4\n", + "2023-01-12T18:55:40Z INFO Train Epoch: 0 [0/60000 (0%)]\tloss=2.3033\n", + "2023-01-12T18:55:40Z INFO Reducer buckets have been rebuilt in this iteration.\n", + "2023-01-12T18:55:42Z INFO Train Epoch: 0 [320/60000 (1%)]\tloss=2.3035\n", + "2023-01-12T18:55:43Z INFO Train Epoch: 0 [640/60000 (1%)]\tloss=2.2942\n", + "2023-01-12T18:55:43Z INFO Train Epoch: 0 [960/60000 (2%)]\tloss=2.2920\n", + "2023-01-12T18:55:44Z INFO Train Epoch: 0 [1280/60000 (2%)]\tloss=2.2875\n", + "2023-01-12T18:55:45Z INFO Train Epoch: 0 [1600/60000 (3%)]\tloss=2.2658\n", + "2023-01-12T18:55:46Z INFO Train Epoch: 0 [1920/60000 (3%)]\tloss=2.2676\n", + "2023-01-12T18:55:46Z INFO Train Epoch: 0 [2240/60000 (4%)]\tloss=2.2092\n", + "2023-01-12T18:55:47Z INFO Train Epoch: 0 [2560/60000 (4%)]\tloss=2.2292\n", + "2023-01-12T18:55:47Z INFO Train Epoch: 0 [2880/60000 (5%)]\tloss=2.2402\n", + "2023-01-12T18:55:48Z INFO Train Epoch: 0 [3200/60000 (5%)]\tloss=2.1984\n", + "2023-01-12T18:55:48Z INFO Train Epoch: 0 [3520/60000 (6%)]\tloss=2.1415\n", + "2023-01-12T18:55:49Z INFO Train Epoch: 0 [3840/60000 (6%)]\tloss=2.0092\n", + "2023-01-12T18:55:49Z INFO Train Epoch: 0 [4160/60000 (7%)]\tloss=1.8847\n", + "2023-01-12T18:55:50Z INFO Train Epoch: 0 [4480/60000 (7%)]\tloss=1.8625\n", + "2023-01-12T18:55:51Z INFO Train Epoch: 0 [4800/60000 (8%)]\tloss=1.5723\n", + "2023-01-12T18:55:51Z INFO Train Epoch: 0 [5120/60000 (9%)]\tloss=1.4135\n", + "2023-01-12T18:55:52Z INFO Train Epoch: 0 [5440/60000 (9%)]\tloss=1.3640\n", + "2023-01-12T18:55:52Z INFO Train Epoch: 0 [5760/60000 (10%)]\tloss=1.3703\n", + "2023-01-12T18:55:53Z INFO Train Epoch: 0 [6080/60000 (10%)]\tloss=1.1940\n", + "2023-01-12T18:55:53Z INFO Train Epoch: 0 [6400/60000 (11%)]\tloss=1.1059\n", + "2023-01-12T18:55:54Z INFO Train Epoch: 0 [6720/60000 (11%)]\tloss=1.2499\n", + "2023-01-12T18:55:54Z INFO Train Epoch: 0 [7040/60000 (12%)]\tloss=0.9975\n", + "2023-01-12T18:55:55Z INFO Train Epoch: 0 [7360/60000 (12%)]\tloss=1.0447\n", 
+ "2023-01-12T18:55:56Z INFO Train Epoch: 0 [7680/60000 (13%)]\tloss=1.0539\n", + "2023-01-12T18:55:56Z INFO Train Epoch: 0 [8000/60000 (13%)]\tloss=1.2946\n", + "2023-01-12T18:55:57Z INFO Train Epoch: 0 [8320/60000 (14%)]\tloss=1.0458\n", + "2023-01-12T18:55:57Z INFO Train Epoch: 0 [8640/60000 (14%)]\tloss=1.1081\n", + "2023-01-12T18:55:58Z INFO Train Epoch: 0 [8960/60000 (15%)]\tloss=1.2158\n", + "2023-01-12T18:56:01Z INFO Train Epoch: 0 [9280/60000 (15%)]\tloss=0.6873\n", + "2023-01-12T18:56:01Z INFO Train Epoch: 0 [9600/60000 (16%)]\tloss=1.3140\n", + "2023-01-12T18:56:02Z INFO Train Epoch: 0 [9920/60000 (17%)]\tloss=0.9072\n", + "2023-01-12T18:56:02Z INFO Train Epoch: 0 [10240/60000 (17%)]\tloss=1.1416\n", + "2023-01-12T18:56:03Z INFO Train Epoch: 0 [10560/60000 (18%)]\tloss=1.2440\n", + "2023-01-12T18:56:04Z INFO Train Epoch: 0 [10880/60000 (18%)]\tloss=0.9684\n", + "2023-01-12T18:56:04Z INFO Train Epoch: 0 [11200/60000 (19%)]\tloss=0.7044\n", + "2023-01-12T18:56:05Z INFO Train Epoch: 0 [11520/60000 (19%)]\tloss=0.9956\n", + "2023-01-12T18:56:05Z INFO Train Epoch: 0 [11840/60000 (20%)]\tloss=1.1197\n", + "2023-01-12T18:56:06Z INFO Train Epoch: 0 [12160/60000 (20%)]\tloss=0.9295\n", + "2023-01-12T18:56:06Z INFO Train Epoch: 0 [12480/60000 (21%)]\tloss=0.7795\n", + "2023-01-12T18:56:07Z INFO Train Epoch: 0 [12800/60000 (21%)]\tloss=0.8194\n", + "2023-01-12T18:56:07Z INFO Train Epoch: 0 [13120/60000 (22%)]\tloss=1.1227\n", + "2023-01-12T18:56:08Z INFO Train Epoch: 0 [13440/60000 (22%)]\tloss=0.9001\n", + "2023-01-12T18:56:08Z INFO Train Epoch: 0 [13760/60000 (23%)]\tloss=0.9062\n", + "2023-01-12T18:56:09Z INFO Train Epoch: 0 [14080/60000 (23%)]\tloss=0.9513\n", + "2023-01-12T18:56:10Z INFO Train Epoch: 0 [14400/60000 (24%)]\tloss=0.8561\n", + "2023-01-12T18:56:11Z INFO Train Epoch: 0 [14720/60000 (25%)]\tloss=0.7293\n", + "2023-01-12T18:56:12Z INFO Train Epoch: 0 [15040/60000 (25%)]\tloss=0.8429\n", + "2023-01-12T18:56:12Z INFO Train Epoch: 0 [15360/60000 
(26%)]\tloss=0.9922\n", + "2023-01-12T18:56:13Z INFO Train Epoch: 0 [15680/60000 (26%)]\tloss=0.7432\n", + "2023-01-12T18:56:15Z INFO Train Epoch: 0 [16000/60000 (27%)]\tloss=1.0907\n", + "2023-01-12T18:56:16Z INFO Train Epoch: 0 [16320/60000 (27%)]\tloss=0.5217\n", + "2023-01-12T18:56:16Z INFO Train Epoch: 0 [16640/60000 (28%)]\tloss=0.9695\n", + "2023-01-12T18:56:17Z INFO Train Epoch: 0 [16960/60000 (28%)]\tloss=0.7314\n", + "2023-01-12T18:56:17Z INFO Train Epoch: 0 [17280/60000 (29%)]\tloss=0.8013\n", + "2023-01-12T18:56:18Z INFO Train Epoch: 0 [17600/60000 (29%)]\tloss=0.6232\n", + "2023-01-12T18:56:18Z INFO Train Epoch: 0 [17920/60000 (30%)]\tloss=0.6004\n", + "2023-01-12T18:56:19Z INFO Train Epoch: 0 [18240/60000 (30%)]\tloss=1.1647\n", + "2023-01-12T18:56:19Z INFO Train Epoch: 0 [18560/60000 (31%)]\tloss=1.1845\n", + "2023-01-12T18:56:20Z INFO Train Epoch: 0 [18880/60000 (31%)]\tloss=0.7494\n", + "2023-01-12T18:56:21Z INFO Train Epoch: 0 [19200/60000 (32%)]\tloss=0.6017\n", + "2023-01-12T18:56:21Z INFO Train Epoch: 0 [19520/60000 (33%)]\tloss=0.8297\n", + "2023-01-12T18:56:22Z INFO Train Epoch: 0 [19840/60000 (33%)]\tloss=0.8827\n", + "2023-01-12T18:56:22Z INFO Train Epoch: 0 [20160/60000 (34%)]\tloss=1.1165\n", + "2023-01-12T18:56:23Z INFO Train Epoch: 0 [20480/60000 (34%)]\tloss=0.5660\n", + "2023-01-12T18:56:23Z INFO Train Epoch: 0 [20800/60000 (35%)]\tloss=0.9627\n", + "2023-01-12T18:56:24Z INFO Train Epoch: 0 [21120/60000 (35%)]\tloss=0.4962\n", + "2023-01-12T18:56:24Z INFO Train Epoch: 0 [21440/60000 (36%)]\tloss=1.0196\n", + "2023-01-12T18:56:25Z INFO Train Epoch: 0 [21760/60000 (36%)]\tloss=0.7316\n", + "2023-01-12T18:56:25Z INFO Train Epoch: 0 [22080/60000 (37%)]\tloss=0.7878\n", + "2023-01-12T18:56:26Z INFO Train Epoch: 0 [22400/60000 (37%)]\tloss=0.5671\n", + "2023-01-12T18:56:27Z INFO Train Epoch: 0 [22720/60000 (38%)]\tloss=0.6081\n", + "2023-01-12T18:56:27Z INFO Train Epoch: 0 [23040/60000 (38%)]\tloss=1.0035\n", + "2023-01-12T18:56:28Z INFO 
Train Epoch: 0 [23360/60000 (39%)]\tloss=0.5702\n", + "2023-01-12T18:56:30Z INFO Train Epoch: 0 [23680/60000 (39%)]\tloss=0.7771\n", + "2023-01-12T18:56:31Z INFO Train Epoch: 0 [24000/60000 (40%)]\tloss=0.9109\n", + "2023-01-12T18:56:32Z INFO Train Epoch: 0 [24320/60000 (41%)]\tloss=0.8138\n", + "2023-01-12T18:56:32Z INFO Train Epoch: 0 [24640/60000 (41%)]\tloss=0.7430\n", + "2023-01-12T18:56:33Z INFO Train Epoch: 0 [24960/60000 (42%)]\tloss=0.7815\n", + "2023-01-12T18:56:33Z INFO Train Epoch: 0 [25280/60000 (42%)]\tloss=0.5246\n", + "2023-01-12T18:56:34Z INFO Train Epoch: 0 [25600/60000 (43%)]\tloss=0.7377\n", + "2023-01-12T18:56:34Z INFO Train Epoch: 0 [25920/60000 (43%)]\tloss=0.6146\n", + "2023-01-12T18:56:35Z INFO Train Epoch: 0 [26240/60000 (44%)]\tloss=0.9728\n", + "2023-01-12T18:56:35Z INFO Train Epoch: 0 [26560/60000 (44%)]\tloss=0.7355\n", + "2023-01-12T18:56:36Z INFO Train Epoch: 0 [26880/60000 (45%)]\tloss=0.6064\n", + "2023-01-12T18:56:36Z INFO Train Epoch: 0 [27200/60000 (45%)]\tloss=1.0344\n", + "2023-01-12T18:56:37Z INFO Train Epoch: 0 [27520/60000 (46%)]\tloss=0.4730\n", + "2023-01-12T18:56:38Z INFO Train Epoch: 0 [27840/60000 (46%)]\tloss=0.7260\n", + "2023-01-12T18:56:38Z INFO Train Epoch: 0 [28160/60000 (47%)]\tloss=0.8061\n", + "2023-01-12T18:56:39Z INFO Train Epoch: 0 [28480/60000 (47%)]\tloss=0.8537\n", + "2023-01-12T18:56:39Z INFO Train Epoch: 0 [28800/60000 (48%)]\tloss=1.0247\n", + "2023-01-12T18:56:40Z INFO Train Epoch: 0 [29120/60000 (49%)]\tloss=0.6724\n", + "2023-01-12T18:56:41Z INFO Train Epoch: 0 [29440/60000 (49%)]\tloss=0.9595\n", + "2023-01-12T18:56:43Z INFO Train Epoch: 0 [29760/60000 (50%)]\tloss=0.7610\n", + "2023-01-12T18:56:44Z INFO Train Epoch: 0 [30080/60000 (50%)]\tloss=0.9843\n", + "2023-01-12T18:56:45Z INFO Train Epoch: 0 [30400/60000 (51%)]\tloss=0.6334\n", + "2023-01-12T18:56:45Z INFO Train Epoch: 0 [30720/60000 (51%)]\tloss=0.6374\n", + "2023-01-12T18:56:46Z INFO Train Epoch: 0 [31040/60000 (52%)]\tloss=0.5124\n", + 
"2023-01-12T18:56:46Z INFO Train Epoch: 0 [31360/60000 (52%)]\tloss=0.5240\n", + "2023-01-12T18:56:47Z INFO Train Epoch: 0 [31680/60000 (53%)]\tloss=0.6984\n", + "2023-01-12T18:56:47Z INFO Train Epoch: 0 [32000/60000 (53%)]\tloss=0.8143\n", + "2023-01-12T18:56:48Z INFO Train Epoch: 0 [32320/60000 (54%)]\tloss=0.6173\n", + "2023-01-12T18:56:49Z INFO Train Epoch: 0 [32640/60000 (54%)]\tloss=0.6989\n", + "2023-01-12T18:56:49Z INFO Train Epoch: 0 [32960/60000 (55%)]\tloss=0.6109\n", + "2023-01-12T18:56:50Z INFO Train Epoch: 0 [33280/60000 (55%)]\tloss=0.5810\n", + "2023-01-12T18:56:50Z INFO Train Epoch: 0 [33600/60000 (56%)]\tloss=0.5392\n", + "2023-01-12T18:56:51Z INFO Train Epoch: 0 [33920/60000 (57%)]\tloss=0.4317\n", + "2023-01-12T18:56:51Z INFO Train Epoch: 0 [34240/60000 (57%)]\tloss=0.4624\n", + "2023-01-12T18:56:52Z INFO Train Epoch: 0 [34560/60000 (58%)]\tloss=0.3868\n", + "2023-01-12T18:56:52Z INFO Train Epoch: 0 [34880/60000 (58%)]\tloss=0.6871\n", + "2023-01-12T18:56:53Z INFO Train Epoch: 0 [35200/60000 (59%)]\tloss=0.5277\n", + "2023-01-12T18:56:54Z INFO Train Epoch: 0 [35520/60000 (59%)]\tloss=0.5487\n", + "2023-01-12T18:56:54Z INFO Train Epoch: 0 [35840/60000 (60%)]\tloss=0.5509\n", + "2023-01-12T18:56:55Z INFO Train Epoch: 0 [36160/60000 (60%)]\tloss=0.7043\n", + "2023-01-12T18:56:55Z INFO Train Epoch: 0 [36480/60000 (61%)]\tloss=0.7568\n", + "2023-01-12T18:56:56Z INFO Train Epoch: 0 [36800/60000 (61%)]\tloss=0.6199\n", + "2023-01-12T18:56:56Z INFO Train Epoch: 0 [37120/60000 (62%)]\tloss=0.7296\n", + "2023-01-12T18:56:57Z INFO Train Epoch: 0 [37440/60000 (62%)]\tloss=0.5492\n", + "2023-01-12T18:56:58Z INFO Train Epoch: 0 [37760/60000 (63%)]\tloss=0.4943\n", + "2023-01-12T18:56:59Z INFO Train Epoch: 0 [38080/60000 (63%)]\tloss=0.8262\n", + "2023-01-12T18:57:01Z INFO Train Epoch: 0 [38400/60000 (64%)]\tloss=0.6767\n", + "2023-01-12T18:57:02Z INFO Train Epoch: 0 [38720/60000 (65%)]\tloss=0.6093\n", + "2023-01-12T18:57:02Z INFO Train Epoch: 0 [39040/60000 
(65%)]\tloss=0.5222\n", + "2023-01-12T18:57:03Z INFO Train Epoch: 0 [39360/60000 (66%)]\tloss=0.4399\n", + "2023-01-12T18:57:03Z INFO Train Epoch: 0 [39680/60000 (66%)]\tloss=0.6005\n", + "2023-01-12T18:57:04Z INFO Train Epoch: 0 [40000/60000 (67%)]\tloss=0.5421\n", + "2023-01-12T18:57:04Z INFO Train Epoch: 0 [40320/60000 (67%)]\tloss=0.4670\n", + "2023-01-12T18:57:05Z INFO Train Epoch: 0 [40640/60000 (68%)]\tloss=0.2799\n", + "2023-01-12T18:57:06Z INFO Train Epoch: 0 [40960/60000 (68%)]\tloss=0.5594\n", + "2023-01-12T18:57:06Z INFO Train Epoch: 0 [41280/60000 (69%)]\tloss=0.7234\n", + "2023-01-12T18:57:07Z INFO Train Epoch: 0 [41600/60000 (69%)]\tloss=0.8179\n", + "2023-01-12T18:57:08Z INFO Train Epoch: 0 [41920/60000 (70%)]\tloss=0.5361\n", + "2023-01-12T18:57:08Z INFO Train Epoch: 0 [42240/60000 (70%)]\tloss=0.6700\n", + "2023-01-12T18:57:09Z INFO Train Epoch: 0 [42560/60000 (71%)]\tloss=0.4328\n", + "2023-01-12T18:57:09Z INFO Train Epoch: 0 [42880/60000 (71%)]\tloss=0.7155\n", + "2023-01-12T18:57:10Z INFO Train Epoch: 0 [43200/60000 (72%)]\tloss=0.6536\n", + "2023-01-12T18:57:11Z INFO Train Epoch: 0 [43520/60000 (73%)]\tloss=0.4034\n", + "2023-01-12T18:57:12Z INFO Train Epoch: 0 [43840/60000 (73%)]\tloss=0.6295\n", + "2023-01-12T18:57:13Z INFO Train Epoch: 0 [44160/60000 (74%)]\tloss=0.6419\n", + "2023-01-12T18:57:15Z INFO Train Epoch: 0 [44480/60000 (74%)]\tloss=0.4257\n", + "2023-01-12T18:57:15Z INFO Train Epoch: 0 [44800/60000 (75%)]\tloss=0.6005\n", + "2023-01-12T18:57:16Z INFO Train Epoch: 0 [45120/60000 (75%)]\tloss=0.5280\n", + "2023-01-12T18:57:17Z INFO Train Epoch: 0 [45440/60000 (76%)]\tloss=0.7624\n", + "2023-01-12T18:57:17Z INFO Train Epoch: 0 [45760/60000 (76%)]\tloss=0.4500\n", + "2023-01-12T18:57:18Z INFO Train Epoch: 0 [46080/60000 (77%)]\tloss=0.6136\n", + "2023-01-12T18:57:18Z INFO Train Epoch: 0 [46400/60000 (77%)]\tloss=0.4631\n", + "2023-01-12T18:57:19Z INFO Train Epoch: 0 [46720/60000 (78%)]\tloss=0.6543\n", + "2023-01-12T18:57:19Z INFO 
Train Epoch: 0 [47040/60000 (78%)]\tloss=0.3783\n", + "2023-01-12T18:57:20Z INFO Train Epoch: 0 [47360/60000 (79%)]\tloss=0.6068\n", + "2023-01-12T18:57:20Z INFO Train Epoch: 0 [47680/60000 (79%)]\tloss=0.4288\n", + "2023-01-12T18:57:21Z INFO Train Epoch: 0 [48000/60000 (80%)]\tloss=0.5632\n", + "2023-01-12T18:57:22Z INFO Train Epoch: 0 [48320/60000 (81%)]\tloss=0.5509\n", + "2023-01-12T18:57:22Z INFO Train Epoch: 0 [48640/60000 (81%)]\tloss=0.7985\n", + "2023-01-12T18:57:23Z INFO Train Epoch: 0 [48960/60000 (82%)]\tloss=0.5953\n", + "2023-01-12T18:57:23Z INFO Train Epoch: 0 [49280/60000 (82%)]\tloss=0.6759\n", + "2023-01-12T18:57:24Z INFO Train Epoch: 0 [49600/60000 (83%)]\tloss=0.3233\n", + "2023-01-12T18:57:24Z INFO Train Epoch: 0 [49920/60000 (83%)]\tloss=0.3583\n", + "2023-01-12T18:57:25Z INFO Train Epoch: 0 [50240/60000 (84%)]\tloss=0.5348\n", + "2023-01-12T18:57:25Z INFO Train Epoch: 0 [50560/60000 (84%)]\tloss=0.8532\n", + "2023-01-12T18:57:26Z INFO Train Epoch: 0 [50880/60000 (85%)]\tloss=0.4251\n", + "2023-01-12T18:57:27Z INFO Train Epoch: 0 [51200/60000 (85%)]\tloss=0.4953\n", + "2023-01-12T18:57:27Z INFO Train Epoch: 0 [51520/60000 (86%)]\tloss=0.5538\n", + "2023-01-12T18:57:28Z INFO Train Epoch: 0 [51840/60000 (86%)]\tloss=0.7728\n", + "2023-01-12T18:57:29Z INFO Train Epoch: 0 [52160/60000 (87%)]\tloss=0.4604\n", + "2023-01-12T18:57:31Z INFO Train Epoch: 0 [52480/60000 (87%)]\tloss=0.8828\n", + "2023-01-12T18:57:32Z INFO Train Epoch: 0 [52800/60000 (88%)]\tloss=0.5369\n", + "2023-01-12T18:57:32Z INFO Train Epoch: 0 [53120/60000 (89%)]\tloss=0.7731\n", + "2023-01-12T18:57:33Z INFO Train Epoch: 0 [53440/60000 (89%)]\tloss=0.6234\n", + "2023-01-12T18:57:33Z INFO Train Epoch: 0 [53760/60000 (90%)]\tloss=0.5501\n", + "2023-01-12T18:57:34Z INFO Train Epoch: 0 [54080/60000 (90%)]\tloss=0.7707\n", + "2023-01-12T18:57:34Z INFO Train Epoch: 0 [54400/60000 (91%)]\tloss=0.7441\n", + "2023-01-12T18:57:35Z INFO Train Epoch: 0 [54720/60000 (91%)]\tloss=0.5040\n", + 
"2023-01-12T18:57:36Z INFO Train Epoch: 0 [55040/60000 (92%)]\tloss=0.4233\n", + "2023-01-12T18:57:36Z INFO Train Epoch: 0 [55360/60000 (92%)]\tloss=0.4983\n", + "2023-01-12T18:57:37Z INFO Train Epoch: 0 [55680/60000 (93%)]\tloss=0.5547\n", + "2023-01-12T18:57:37Z INFO Train Epoch: 0 [56000/60000 (93%)]\tloss=0.7808\n", + "2023-01-12T18:57:38Z INFO Train Epoch: 0 [56320/60000 (94%)]\tloss=0.5937\n", + "2023-01-12T18:57:38Z INFO Train Epoch: 0 [56640/60000 (94%)]\tloss=0.3243\n", + "2023-01-12T18:57:39Z INFO Train Epoch: 0 [56960/60000 (95%)]\tloss=0.7926\n", + "2023-01-12T18:57:39Z INFO Train Epoch: 0 [57280/60000 (95%)]\tloss=0.5203\n", + "2023-01-12T18:57:40Z INFO Train Epoch: 0 [57600/60000 (96%)]\tloss=0.5806\n", + "2023-01-12T18:57:41Z INFO Train Epoch: 0 [57920/60000 (97%)]\tloss=0.2864\n", + "2023-01-12T18:57:42Z INFO Train Epoch: 0 [58240/60000 (97%)]\tloss=0.4806\n", + "2023-01-12T18:57:43Z INFO Train Epoch: 0 [58560/60000 (98%)]\tloss=0.5448\n", + "2023-01-12T18:57:44Z INFO Train Epoch: 0 [58880/60000 (98%)]\tloss=0.7353\n", + "2023-01-12T18:57:45Z INFO Train Epoch: 0 [59200/60000 (99%)]\tloss=0.3771\n", + "2023-01-12T18:57:45Z INFO Train Epoch: 0 [59520/60000 (99%)]\tloss=0.5527\n", + "2023-01-12T18:57:46Z INFO Train Epoch: 0 [59840/60000 (100%)]\tloss=0.5935\n", "\n" ] } ], "source": [ - "pytorchjob_client.get_logs(pytorchjob_name)" + "training_client.get_job_logs(pytorchjob_name, container=\"pytorch\")" ] }, { @@ -726,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 28, "id": "32ae88fd-5b5d-4ba1-a560-9a35c5ac17de", "metadata": { "tags": [] @@ -736,12 +738,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-09-12T19:02:27Z INFO PyTorchJob train-pytorch has been deleted\n" + "PyTorchJob kubeflow-user-example-com/train-pytorch has been deleted\n" ] } ], "source": [ - "pytorchjob_client.delete(pytorchjob_name)" + "training_client.delete_pytorchjob(pytorchjob_name)" ] }, { @@ -755,7 +757,7 @@ ], "metadata": 
{ "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb b/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb index bb9d04c31b..0c2c28e02d 100644 --- a/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb +++ b/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb @@ -19,50 +19,35 @@ } }, "source": [ - "This is a sample for Kubeflow PyTorchJob SDK `kubeflow-pytorchjob`.\n", + "This is a sample for Kubeflow Training SDK `kubeflow-training`.\n", "\n", - "The notebook shows how to use Kubeflow PyTorchJob SDK to create, get, wait, check and delete PyTorchJob." + "The notebook shows how to use Kubeflow Training SDK to create, get, wait, check and delete PyTorchJob." ] }, { - "cell_type": "code", - "execution_count": 1, + "cell_type": "markdown", "metadata": { - "pycharm": { - "name": "#%%\n" - } + "tags": [] }, - "outputs": [], "source": [ - "from kubernetes.client import V1PodTemplateSpec\n", - "from kubernetes.client import V1ObjectMeta\n", - "from kubernetes.client import V1PodSpec\n", - "from kubernetes.client import V1Container\n", - "from kubernetes.client import V1ResourceRequirements\n", + "## Install Kubeflow Training Python SDKs\n", "\n", - "from kubeflow.training import constants\n", - "from kubeflow.training.utils import utils\n", - "from kubeflow.training import V1ReplicaSpec\n", - "from kubeflow.training import KubeflowOrgV1PyTorchJob\n", - "from kubeflow.training import KubeflowOrgV1PyTorchJobSpec\n", - "from kubeflow.training import V1RunPolicy\n", - "from kubeflow.training import PyTorchJobClient" + "You need to install Kubeflow Training SDK to run this Notebook." ] }, { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "Define namespace where pytorchjob needs to be created to. 
If not specified, below function defines namespace to the current one where SDK is running in the cluster, otherwise it will deploy to default namespace." + "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", + "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": { "pycharm": { "name": "#%%\n" @@ -70,7 +55,16 @@ }, "outputs": [], "source": [ - "namespace = utils.get_default_target_namespace()" + "from kubernetes.client import V1PodTemplateSpec\n", + "from kubernetes.client import V1ObjectMeta\n", + "from kubernetes.client import V1PodSpec\n", + "from kubernetes.client import V1Container\n", + "\n", + "from kubeflow.training import V1ReplicaSpec\n", + "from kubeflow.training import KubeflowOrgV1PyTorchJob\n", + "from kubeflow.training import KubeflowOrgV1PyTorchJobSpec\n", + "from kubeflow.training import V1RunPolicy\n", + "from kubeflow.training import TrainingClient" ] }, { @@ -81,7 +75,7 @@ } }, "source": [ - "### Define PyTorchJob" + "## Define PyTorchJob" ] }, { @@ -97,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 37, "metadata": { "pycharm": { "name": "#%%\n" @@ -105,28 +99,35 @@ }, "outputs": [], "source": [ + "name = \"pytorch-dist-mnist-gloo\"\n", + "namespace = \"kubeflow-user-example-com\"\n", + "container_name = \"pytorch\"\n", + "\n", "container = V1Container(\n", - " name=\"pytorch\",\n", + " name=container_name,\n", " image=\"gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0\",\n", - " args=[\"--backend\",\"gloo\"]\n", - ")\n", - "\n", - "master = V1ReplicaSpec(\n", - " replicas=1,\n", - " restart_policy=\"OnFailure\",\n", - " template=V1PodTemplateSpec(\n", - " spec=V1PodSpec(\n", - " containers=[container]\n", - " )\n", - " )\n", + " args=[\"--backend\", \"gloo\"],\n", ")\n", "\n", - "worker = V1ReplicaSpec(\n", + "replica_spec = V1ReplicaSpec(\n", 
" replicas=1,\n", " restart_policy=\"OnFailure\",\n", " template=V1PodTemplateSpec(\n", + " metadata=V1ObjectMeta(\n", + " name=name,\n", + " namespace=namespace,\n", + " annotations={\n", + " \"sidecar.istio.io/inject\": \"false\"\n", + " }\n", + " ),\n", " spec=V1PodSpec(\n", - " containers=[container]\n", + " containers=[\n", + " V1Container(\n", + " name=container_name,\n", + " image=\"gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0\",\n", + " args=[\"--backend\", \"gloo\"],\n", + " )\n", + " ]\n", " )\n", " )\n", ")\n", @@ -134,12 +135,14 @@ "pytorchjob = KubeflowOrgV1PyTorchJob(\n", " api_version=\"kubeflow.org/v1\",\n", " kind=\"PyTorchJob\",\n", - " metadata=V1ObjectMeta(name=\"pytorch-dist-mnist-gloo\",namespace=namespace),\n", + " metadata=V1ObjectMeta(name=name, namespace=namespace),\n", " spec=KubeflowOrgV1PyTorchJobSpec(\n", " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", - " pytorch_replica_specs={\"Master\": master,\n", - " \"Worker\": worker}\n", - " )\n", + " pytorch_replica_specs={\n", + " \"Master\": replica_spec,\n", + " \"Worker\": replica_spec\n", + " },\n", + " ),\n", ")" ] }, @@ -151,12 +154,14 @@ } }, "source": [ - "### Create PyTorchJob" + "## Create PyTorchJob\n", + "\n", + "You have to create a Training Client to deploy your PyTorchJob in your cluster." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 38, "metadata": { "pycharm": { "name": "#%%\n" @@ -164,53 +169,16 @@ }, "outputs": [ { - "data": { - "text/plain": [ - "{'apiVersion': 'kubeflow.org/v1',\n", - " 'kind': 'PyTorchJob',\n", - " 'metadata': {'creationTimestamp': '2021-10-02T18:55:16Z',\n", - " 'generation': 1,\n", - " 'managedFields': [{'apiVersion': 'kubeflow.org/v1',\n", - " 'fieldsType': 'FieldsV1',\n", - " 'fieldsV1': {'f:spec': {'.': {},\n", - " 'f:pytorchReplicaSpecs': {'.': {},\n", - " 'f:Master': {'.': {},\n", - " 'f:replicas': {},\n", - " 'f:restartPolicy': {},\n", - " 'f:template': {'.': {}, 'f:spec': {'.': {}, 'f:containers': {}}}},\n", - " 'f:Worker': {'.': {},\n", - " 'f:replicas': {},\n", - " 'f:restartPolicy': {},\n", - " 'f:template': {'.': {}, 'f:spec': {'.': {}, 'f:containers': {}}}}},\n", - " 'f:runPolicy': {'.': {}, 'f:cleanPodPolicy': {}}}},\n", - " 'manager': 'OpenAPI-Generator',\n", - " 'operation': 'Update',\n", - " 'time': '2021-10-02T18:55:16Z'}],\n", - " 'name': 'pytorch-dist-mnist-gloo',\n", - " 'namespace': 'default',\n", - " 'resourceVersion': '5169',\n", - " 'uid': '583b9831-8b6d-44e1-86c1-9a171c472fe3'},\n", - " 'spec': {'pytorchReplicaSpecs': {'Master': {'replicas': 1,\n", - " 'restartPolicy': 'OnFailure',\n", - " 'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],\n", - " 'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',\n", - " 'name': 'pytorch'}]}}},\n", - " 'Worker': {'replicas': 1,\n", - " 'restartPolicy': 'OnFailure',\n", - " 'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],\n", - " 'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',\n", - " 'name': 'pytorch'}]}}}},\n", - " 'runPolicy': {'cleanPodPolicy': 'None'}}}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "PyTorchJob kubeflow-user-example-com/pytorch-dist-mnist-gloo has been 
created\n" + ] } ], "source": [ - "pytorchjob_client = PyTorchJobClient()\n", - "pytorchjob_client.create(pytorchjob)" + "training_client = TrainingClient()\n", + "training_client.create_pytorchjob(pytorchjob, namespace=namespace)" ] }, { @@ -221,12 +189,14 @@ } }, "source": [ - "### Get the created PyTorchJob " + "## Get the Created PyTorchJob\n", + "\n", + "You can verify the created PyTorchJob name" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 39, "metadata": { "pycharm": { "name": "#%%\n" @@ -236,73 +206,16 @@ { "data": { "text/plain": [ - "{'apiVersion': 'kubeflow.org/v1',\n", - " 'kind': 'PyTorchJob',\n", - " 'metadata': {'creationTimestamp': '2021-10-02T18:55:16Z',\n", - " 'generation': 1,\n", - " 'managedFields': [{'apiVersion': 'kubeflow.org/v1',\n", - " 'fieldsType': 'FieldsV1',\n", - " 'fieldsV1': {'f:spec': {'.': {},\n", - " 'f:pytorchReplicaSpecs': {'.': {},\n", - " 'f:Master': {'.': {},\n", - " 'f:replicas': {},\n", - " 'f:restartPolicy': {},\n", - " 'f:template': {'.': {}, 'f:spec': {'.': {}, 'f:containers': {}}}},\n", - " 'f:Worker': {'.': {},\n", - " 'f:replicas': {},\n", - " 'f:restartPolicy': {},\n", - " 'f:template': {'.': {}, 'f:spec': {'.': {}, 'f:containers': {}}}}},\n", - " 'f:runPolicy': {'.': {}, 'f:cleanPodPolicy': {}}}},\n", - " 'manager': 'OpenAPI-Generator',\n", - " 'operation': 'Update',\n", - " 'time': '2021-10-02T18:55:16Z'},\n", - " {'apiVersion': 'kubeflow.org/v1',\n", - " 'fieldsType': 'FieldsV1',\n", - " 'fieldsV1': {'f:status': {'.': {},\n", - " 'f:conditions': {},\n", - " 'f:replicaStatuses': {'.': {},\n", - " 'f:Master': {'.': {}, 'f:active': {}},\n", - " 'f:Worker': {'.': {}, 'f:active': {}}}}},\n", - " 'manager': 'manager',\n", - " 'operation': 'Update',\n", - " 'time': '2021-10-02T18:55:17Z'}],\n", - " 'name': 'pytorch-dist-mnist-gloo',\n", - " 'namespace': 'default',\n", - " 'resourceVersion': '5204',\n", - " 'uid': '583b9831-8b6d-44e1-86c1-9a171c472fe3'},\n", - " 'spec': 
{'pytorchReplicaSpecs': {'Master': {'replicas': 1,\n", - " 'restartPolicy': 'OnFailure',\n", - " 'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],\n", - " 'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',\n", - " 'name': 'pytorch'}]}}},\n", - " 'Worker': {'replicas': 1,\n", - " 'restartPolicy': 'OnFailure',\n", - " 'template': {'spec': {'containers': [{'args': ['--backend', 'gloo'],\n", - " 'image': 'gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0',\n", - " 'name': 'pytorch'}]}}}},\n", - " 'runPolicy': {'cleanPodPolicy': 'None'}},\n", - " 'status': {'conditions': [{'lastTransitionTime': '2021-10-02T18:55:16Z',\n", - " 'lastUpdateTime': '2021-10-02T18:55:16Z',\n", - " 'message': 'PyTorchJob pytorch-dist-mnist-gloo is created.',\n", - " 'reason': 'PyTorchJobCreated',\n", - " 'status': 'True',\n", - " 'type': 'Created'},\n", - " {'lastTransitionTime': '2021-10-02T18:55:16Z',\n", - " 'lastUpdateTime': '2021-10-02T18:55:16Z',\n", - " 'message': 'PyTorchJob pytorch-dist-mnist-gloo is running.',\n", - " 'reason': 'JobRunning',\n", - " 'status': 'True',\n", - " 'type': 'Running'}],\n", - " 'replicaStatuses': {'Master': {'active': 1}, 'Worker': {'active': 1}}}}" + "'pytorch-dist-mnist-gloo'" ] }, - "execution_count": 5, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pytorchjob_client.get('pytorch-dist-mnist-gloo')" + "training_client.get_pytorchjob(name).metadata.name" ] }, { @@ -313,12 +226,12 @@ } }, "source": [ - "### Get the PyTorchJob status, check if the PyTorchJob has been started." 
+ "## Get the PyTorchJob Conditions" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 40, "metadata": { "pycharm": { "name": "#%%\n" @@ -328,16 +241,27 @@ { "data": { "text/plain": [ - "'Running'" + "[{'last_transition_time': datetime.datetime(2023, 1, 12, 18, 30, 13, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 12, 18, 30, 13, tzinfo=tzlocal()),\n", + " 'message': 'PyTorchJob pytorch-dist-mnist-gloo is created.',\n", + " 'reason': 'PyTorchJobCreated',\n", + " 'status': 'True',\n", + " 'type': 'Created'},\n", + " {'last_transition_time': datetime.datetime(2023, 1, 12, 18, 30, 18, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 12, 18, 30, 18, tzinfo=tzlocal()),\n", + " 'message': 'PyTorchJob pytorch-dist-mnist-gloo is running.',\n", + " 'reason': 'JobRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}]" ] }, - "execution_count": 6, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pytorchjob_client.get_job_status('pytorch-dist-mnist-gloo', namespace=namespace)" + "training_client.get_job_conditions(name=name, namespace=namespace, job_kind=\"PyTorchJob\")" ] }, { @@ -348,12 +272,12 @@ } }, "source": [ - "### Wait for the specified PyTorchJob to finish" + "## Wait Until PyTorchJob Finishes" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 41, "metadata": { "pycharm": { "name": "#%%\n" @@ -364,15 +288,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "NAME STATE TIME \n", - "pytorch-dist-mnist-gloo Running 2021-10-02T18:55:16Z \n", - "pytorch-dist-mnist-gloo Running 2021-10-02T18:55:16Z \n", - "pytorch-dist-mnist-gloo Succeeded 2021-10-02T18:57:38Z \n" + "pytorch-dist-mnist-gloo Running 2023-01-12 18:30:18+00:00\n", + "pytorch-dist-mnist-gloo Running 2023-01-12 18:30:18+00:00\n", + "pytorch-dist-mnist-gloo Running 2023-01-12 18:30:18+00:00\n", + "pytorch-dist-mnist-gloo Succeeded 2023-01-12 
18:36:48+00:00\n", + "Succeeded number of replicas: 1\n" ] } ], "source": [ - "pytorchjob_client.wait_for_job('pytorch-dist-mnist-gloo', namespace=namespace, watch=True)" + "pytorchjob = training_client.wait_for_job_conditions(name=name, namespace=namespace, job_kind=\"PyTorchJob\")\n", + "\n", + "print(f\"Succeeded number of replicas: {pytorchjob.status.replica_statuses['Master'].succeeded}\")" ] }, { @@ -383,12 +310,12 @@ } }, "source": [ - "### Check if the PyTorchJob succeeded" + "## Verify That the PyTorchJob Succeeded" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 42, "metadata": { "pycharm": { "name": "#%%\n" @@ -401,13 +328,13 @@ "True" ] }, - "execution_count": 8, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pytorchjob_client.is_job_succeeded('pytorch-dist-mnist-gloo', namespace=namespace)" + "training_client.is_job_succeeded(name=name, namespace=namespace, job_kind=\"PyTorchJob\")" ] }, { @@ -418,12 +345,12 @@ } }, "source": [ - "### Get the PyTorchJob training logs." 
+ "## Get the PyTorchJob Training Logs" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 43, "metadata": { "pycharm": { "name": "#%%\n" @@ -434,7 +361,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "The logs of Pod pytorch-dist-mnist-gloo-master-0:\n", + "The logs of pod pytorch-dist-mnist-gloo-master-0:\n", " Using distributed PyTorch with gloo backend\n", "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", @@ -449,102 +376,102 @@ "Train Epoch: 1 [2560/60000 (4%)]\tloss=1.8679\n", "Train Epoch: 1 [3200/60000 (5%)]\tloss=1.4135\n", "Train Epoch: 1 [3840/60000 (6%)]\tloss=1.0003\n", - "Train Epoch: 1 [4480/60000 (7%)]\tloss=0.7763\n", + "Train Epoch: 1 [4480/60000 (7%)]\tloss=0.7762\n", "Train Epoch: 1 [5120/60000 (9%)]\tloss=0.4598\n", - "Train Epoch: 1 [5760/60000 (10%)]\tloss=0.4870\n", - "Train Epoch: 1 [6400/60000 (11%)]\tloss=0.4381\n", - "Train Epoch: 1 [7040/60000 (12%)]\tloss=0.4089\n", - "Train Epoch: 1 [7680/60000 (13%)]\tloss=0.4618\n", - "Train Epoch: 1 [8320/60000 (14%)]\tloss=0.4284\n", - "Train Epoch: 1 [8960/60000 (15%)]\tloss=0.3992\n", - "Train Epoch: 1 [9600/60000 (16%)]\tloss=0.3840\n", - "Train Epoch: 1 [10240/60000 (17%)]\tloss=0.2981\n", - "Train Epoch: 1 [10880/60000 (18%)]\tloss=0.5013\n", - "Train Epoch: 1 [11520/60000 (19%)]\tloss=0.5246\n", - "Train Epoch: 1 [12160/60000 (20%)]\tloss=0.3376\n", - "Train Epoch: 1 [12800/60000 (21%)]\tloss=0.3678\n", - "Train Epoch: 1 [13440/60000 (22%)]\tloss=0.4515\n", - "Train Epoch: 1 [14080/60000 (23%)]\tloss=0.3043\n", - "Train Epoch: 1 [14720/60000 (25%)]\tloss=0.3581\n", - "Train Epoch: 1 [15360/60000 (26%)]\tloss=0.3301\n", - "Train Epoch: 1 [16000/60000 (27%)]\tloss=0.4392\n", - "Train Epoch: 1 [16640/60000 (28%)]\tloss=0.3626\n", - "Train Epoch: 1 [17280/60000 (29%)]\tloss=0.3179\n", - "Train Epoch: 1 [17920/60000 (30%)]\tloss=0.2013\n", - "Train Epoch: 1 
[18560/60000 (31%)]\tloss=0.5004\n", - "Train Epoch: 1 [19200/60000 (32%)]\tloss=0.3266\n", - "Train Epoch: 1 [19840/60000 (33%)]\tloss=0.1194\n", - "Train Epoch: 1 [20480/60000 (34%)]\tloss=0.1898\n", - "Train Epoch: 1 [21120/60000 (35%)]\tloss=0.1402\n", - "Train Epoch: 1 [21760/60000 (36%)]\tloss=0.3161\n", - "Train Epoch: 1 [22400/60000 (37%)]\tloss=0.1499\n", - "Train Epoch: 1 [23040/60000 (38%)]\tloss=0.2888\n", - "Train Epoch: 1 [23680/60000 (39%)]\tloss=0.4680\n", - "Train Epoch: 1 [24320/60000 (41%)]\tloss=0.2159\n", - "Train Epoch: 1 [24960/60000 (42%)]\tloss=0.1518\n", - "Train Epoch: 1 [25600/60000 (43%)]\tloss=0.2247\n", - "Train Epoch: 1 [26240/60000 (44%)]\tloss=0.2634\n", - "Train Epoch: 1 [26880/60000 (45%)]\tloss=0.2333\n", - "Train Epoch: 1 [27520/60000 (46%)]\tloss=0.2626\n", + "Train Epoch: 1 [5760/60000 (10%)]\tloss=0.4860\n", + "Train Epoch: 1 [6400/60000 (11%)]\tloss=0.4389\n", + "Train Epoch: 1 [7040/60000 (12%)]\tloss=0.4084\n", + "Train Epoch: 1 [7680/60000 (13%)]\tloss=0.4602\n", + "Train Epoch: 1 [8320/60000 (14%)]\tloss=0.4289\n", + "Train Epoch: 1 [8960/60000 (15%)]\tloss=0.3990\n", + "Train Epoch: 1 [9600/60000 (16%)]\tloss=0.3852\n", + "Train Epoch: 1 [10240/60000 (17%)]\tloss=0.2984\n", + "Train Epoch: 1 [10880/60000 (18%)]\tloss=0.5029\n", + "Train Epoch: 1 [11520/60000 (19%)]\tloss=0.5236\n", + "Train Epoch: 1 [12160/60000 (20%)]\tloss=0.3378\n", + "Train Epoch: 1 [12800/60000 (21%)]\tloss=0.3674\n", + "Train Epoch: 1 [13440/60000 (22%)]\tloss=0.4508\n", + "Train Epoch: 1 [14080/60000 (23%)]\tloss=0.3034\n", + "Train Epoch: 1 [14720/60000 (25%)]\tloss=0.3574\n", + "Train Epoch: 1 [15360/60000 (26%)]\tloss=0.3313\n", + "Train Epoch: 1 [16000/60000 (27%)]\tloss=0.4405\n", + "Train Epoch: 1 [16640/60000 (28%)]\tloss=0.3642\n", + "Train Epoch: 1 [17280/60000 (29%)]\tloss=0.3172\n", + "Train Epoch: 1 [17920/60000 (30%)]\tloss=0.2016\n", + "Train Epoch: 1 [18560/60000 (31%)]\tloss=0.4978\n", + "Train Epoch: 1 [19200/60000 
(32%)]\tloss=0.3254\n", + "Train Epoch: 1 [19840/60000 (33%)]\tloss=0.1191\n", + "Train Epoch: 1 [20480/60000 (34%)]\tloss=0.1905\n", + "Train Epoch: 1 [21120/60000 (35%)]\tloss=0.1408\n", + "Train Epoch: 1 [21760/60000 (36%)]\tloss=0.3150\n", + "Train Epoch: 1 [22400/60000 (37%)]\tloss=0.1506\n", + "Train Epoch: 1 [23040/60000 (38%)]\tloss=0.2899\n", + "Train Epoch: 1 [23680/60000 (39%)]\tloss=0.4676\n", + "Train Epoch: 1 [24320/60000 (41%)]\tloss=0.2157\n", + "Train Epoch: 1 [24960/60000 (42%)]\tloss=0.1520\n", + "Train Epoch: 1 [25600/60000 (43%)]\tloss=0.2244\n", + "Train Epoch: 1 [26240/60000 (44%)]\tloss=0.2632\n", + "Train Epoch: 1 [26880/60000 (45%)]\tloss=0.2335\n", + "Train Epoch: 1 [27520/60000 (46%)]\tloss=0.2619\n", "Train Epoch: 1 [28160/60000 (47%)]\tloss=0.2126\n", - "Train Epoch: 1 [28800/60000 (48%)]\tloss=0.1335\n", - "Train Epoch: 1 [29440/60000 (49%)]\tloss=0.2777\n", - "Train Epoch: 1 [30080/60000 (50%)]\tloss=0.0940\n", - "Train Epoch: 1 [30720/60000 (51%)]\tloss=0.1276\n", - "Train Epoch: 1 [31360/60000 (52%)]\tloss=0.2465\n", - "Train Epoch: 1 [32000/60000 (53%)]\tloss=0.3388\n", - "Train Epoch: 1 [32640/60000 (54%)]\tloss=0.1522\n", - "Train Epoch: 1 [33280/60000 (55%)]\tloss=0.0904\n", + "Train Epoch: 1 [28800/60000 (48%)]\tloss=0.1324\n", + "Train Epoch: 1 [29440/60000 (49%)]\tloss=0.2795\n", + "Train Epoch: 1 [30080/60000 (50%)]\tloss=0.0951\n", + "Train Epoch: 1 [30720/60000 (51%)]\tloss=0.1284\n", + "Train Epoch: 1 [31360/60000 (52%)]\tloss=0.2461\n", + "Train Epoch: 1 [32000/60000 (53%)]\tloss=0.3394\n", + "Train Epoch: 1 [32640/60000 (54%)]\tloss=0.1517\n", + "Train Epoch: 1 [33280/60000 (55%)]\tloss=0.0916\n", "Train Epoch: 1 [33920/60000 (57%)]\tloss=0.1449\n", - "Train Epoch: 1 [34560/60000 (58%)]\tloss=0.1985\n", - "Train Epoch: 1 [35200/60000 (59%)]\tloss=0.2195\n", - "Train Epoch: 1 [35840/60000 (60%)]\tloss=0.0631\n", - "Train Epoch: 1 [36480/60000 (61%)]\tloss=0.1359\n", - "Train Epoch: 1 [37120/60000 (62%)]\tloss=0.1165\n", 
- "Train Epoch: 1 [37760/60000 (63%)]\tloss=0.2356\n", - "Train Epoch: 1 [38400/60000 (64%)]\tloss=0.0635\n", - "Train Epoch: 1 [39040/60000 (65%)]\tloss=0.1068\n", - "Train Epoch: 1 [39680/60000 (66%)]\tloss=0.1600\n", - "Train Epoch: 1 [40320/60000 (67%)]\tloss=0.1089\n", + "Train Epoch: 1 [34560/60000 (58%)]\tloss=0.1978\n", + "Train Epoch: 1 [35200/60000 (59%)]\tloss=0.2189\n", + "Train Epoch: 1 [35840/60000 (60%)]\tloss=0.0637\n", + "Train Epoch: 1 [36480/60000 (61%)]\tloss=0.1368\n", + "Train Epoch: 1 [37120/60000 (62%)]\tloss=0.1153\n", + "Train Epoch: 1 [37760/60000 (63%)]\tloss=0.2358\n", + "Train Epoch: 1 [38400/60000 (64%)]\tloss=0.0631\n", + "Train Epoch: 1 [39040/60000 (65%)]\tloss=0.1063\n", + "Train Epoch: 1 [39680/60000 (66%)]\tloss=0.1602\n", + "Train Epoch: 1 [40320/60000 (67%)]\tloss=0.1098\n", "Train Epoch: 1 [40960/60000 (68%)]\tloss=0.1781\n", - "Train Epoch: 1 [41600/60000 (69%)]\tloss=0.2301\n", - "Train Epoch: 1 [42240/60000 (70%)]\tloss=0.0741\n", - "Train Epoch: 1 [42880/60000 (71%)]\tloss=0.1549\n", - "Train Epoch: 1 [43520/60000 (72%)]\tloss=0.2785\n", - "Train Epoch: 1 [44160/60000 (74%)]\tloss=0.1427\n", - "Train Epoch: 1 [44800/60000 (75%)]\tloss=0.1164\n", - "Train Epoch: 1 [45440/60000 (76%)]\tloss=0.1217\n", - "Train Epoch: 1 [46080/60000 (77%)]\tloss=0.0779\n", - "Train Epoch: 1 [46720/60000 (78%)]\tloss=0.1949\n", - "Train Epoch: 1 [47360/60000 (79%)]\tloss=0.0687\n", - "Train Epoch: 1 [48000/60000 (80%)]\tloss=0.2096\n", - "Train Epoch: 1 [48640/60000 (81%)]\tloss=0.1387\n", - "Train Epoch: 1 [49280/60000 (82%)]\tloss=0.0942\n", - "Train Epoch: 1 [49920/60000 (83%)]\tloss=0.1073\n", - "Train Epoch: 1 [50560/60000 (84%)]\tloss=0.1198\n", - "Train Epoch: 1 [51200/60000 (85%)]\tloss=0.1442\n", - "Train Epoch: 1 [51840/60000 (86%)]\tloss=0.0656\n", + "Train Epoch: 1 [41600/60000 (69%)]\tloss=0.2297\n", + "Train Epoch: 1 [42240/60000 (70%)]\tloss=0.0735\n", + "Train Epoch: 1 [42880/60000 (71%)]\tloss=0.1562\n", + "Train Epoch: 1 
[43520/60000 (72%)]\tloss=0.2771\n", + "Train Epoch: 1 [44160/60000 (74%)]\tloss=0.1429\n", + "Train Epoch: 1 [44800/60000 (75%)]\tloss=0.1172\n", + "Train Epoch: 1 [45440/60000 (76%)]\tloss=0.1202\n", + "Train Epoch: 1 [46080/60000 (77%)]\tloss=0.0767\n", + "Train Epoch: 1 [46720/60000 (78%)]\tloss=0.1938\n", + "Train Epoch: 1 [47360/60000 (79%)]\tloss=0.0699\n", + "Train Epoch: 1 [48000/60000 (80%)]\tloss=0.2114\n", + "Train Epoch: 1 [48640/60000 (81%)]\tloss=0.1373\n", + "Train Epoch: 1 [49280/60000 (82%)]\tloss=0.0934\n", + "Train Epoch: 1 [49920/60000 (83%)]\tloss=0.1075\n", + "Train Epoch: 1 [50560/60000 (84%)]\tloss=0.1185\n", + "Train Epoch: 1 [51200/60000 (85%)]\tloss=0.1457\n", + "Train Epoch: 1 [51840/60000 (86%)]\tloss=0.0694\n", "Train Epoch: 1 [52480/60000 (87%)]\tloss=0.0242\n", - "Train Epoch: 1 [53120/60000 (88%)]\tloss=0.2644\n", - "Train Epoch: 1 [53760/60000 (90%)]\tloss=0.0932\n", - "Train Epoch: 1 [54400/60000 (91%)]\tloss=0.1294\n", - "Train Epoch: 1 [55040/60000 (92%)]\tloss=0.1901\n", - "Train Epoch: 1 [55680/60000 (93%)]\tloss=0.0341\n", - "Train Epoch: 1 [56320/60000 (94%)]\tloss=0.0358\n", - "Train Epoch: 1 [56960/60000 (95%)]\tloss=0.0770\n", - "Train Epoch: 1 [57600/60000 (96%)]\tloss=0.1181\n", - "Train Epoch: 1 [58240/60000 (97%)]\tloss=0.1945\n", - "Train Epoch: 1 [58880/60000 (98%)]\tloss=0.2064\n", - "Train Epoch: 1 [59520/60000 (99%)]\tloss=0.0642\n", + "Train Epoch: 1 [53120/60000 (88%)]\tloss=0.2635\n", + "Train Epoch: 1 [53760/60000 (90%)]\tloss=0.0922\n", + "Train Epoch: 1 [54400/60000 (91%)]\tloss=0.1287\n", + "Train Epoch: 1 [55040/60000 (92%)]\tloss=0.1908\n", + "Train Epoch: 1 [55680/60000 (93%)]\tloss=0.0350\n", + "Train Epoch: 1 [56320/60000 (94%)]\tloss=0.0359\n", + "Train Epoch: 1 [56960/60000 (95%)]\tloss=0.0762\n", + "Train Epoch: 1 [57600/60000 (96%)]\tloss=0.1173\n", + "Train Epoch: 1 [58240/60000 (97%)]\tloss=0.1948\n", + "Train Epoch: 1 [58880/60000 (98%)]\tloss=0.2035\n", + "Train Epoch: 1 [59520/60000 
(99%)]\tloss=0.0639\n", "\n", - "accuracy=0.9667\n", + "accuracy=0.9665\n", "\n", "\n" ] } ], "source": [ - "pytorchjob_client.get_logs('pytorch-dist-mnist-gloo', namespace=namespace)" + "training_client.get_job_logs(name=name, namespace=namespace, container=container_name)" ] }, { @@ -555,12 +482,12 @@ } }, "source": [ - "### Delete the PyTorchJob" + "## Delete the PyTorchJob" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 44, "metadata": { "pycharm": { "name": "#%%\n" @@ -568,31 +495,28 @@ }, "outputs": [ { - "data": { - "text/plain": [ - "{'kind': 'Status',\n", - " 'apiVersion': 'v1',\n", - " 'metadata': {},\n", - " 'status': 'Success',\n", - " 'details': {'name': 'pytorch-dist-mnist-gloo',\n", - " 'group': 'kubeflow.org',\n", - " 'kind': 'pytorchjobs',\n", - " 'uid': '583b9831-8b6d-44e1-86c1-9a171c472fe3'}}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "PyTorchJob kubeflow-user-example-com/pytorch-dist-mnist-gloo has been deleted\n" + ] } ], "source": [ - "pytorchjob_client.delete('pytorch-dist-mnist-gloo')" + "training_client.delete_pytorchjob(name)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -606,9 +530,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/sdk/python/examples/kubeflow-tfjob-sdk.ipynb b/sdk/python/examples/kubeflow-tfjob-sdk.ipynb index 673b32bc27..1c0112b91d 100644 --- a/sdk/python/examples/kubeflow-tfjob-sdk.ipynb +++ b/sdk/python/examples/kubeflow-tfjob-sdk.ipynb @@ -12,6 +12,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", 
"metadata": { "pycharm": { @@ -19,6 +20,8 @@ } }, "source": [ + "TODO (andreyvelich): This example should be updated with the new SDK version.\n", + "\n", "This is a sample for Kubeflow TFJob SDK `kubeflow-tfjob`.\n", "\n", "The notebook shows how to use Kubeflow TFJob SDK to create, get, wait, check and delete tfjob." @@ -453,10 +456,10 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "scrolled": true, "pycharm": { "name": "#%%\n" - } + }, + "scrolled": true }, "outputs": [ { @@ -708,4 +711,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}