From f2bacd167950580da96ad09049124cbb1afe0ace Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 7 Jun 2021 11:37:40 -0700 Subject: [PATCH 01/87] dataflow: minimal gpu examples --- dataflow/gpu-workers/Dockerfile | 54 ---------------- dataflow/gpu-workers/README.md | 5 -- .../gpu-workers/pytorch-minimal/.dockerignore | 5 ++ .../gpu-workers/pytorch-minimal/.gcloudignore | 5 ++ .../gpu-workers/pytorch-minimal/Dockerfile | 30 +++++++++ .../gpu-workers/pytorch-minimal/README.md | 45 +++++++++++++ .../gpu-workers/pytorch-minimal/build.yaml | 32 ++++++++++ dataflow/gpu-workers/pytorch-minimal/main.py | 61 ++++++++++++++++++ .../pytorch-minimal/requirements.txt | 2 + dataflow/gpu-workers/pytorch-minimal/run.yaml | 51 +++++++++++++++ .../{ => tensorflow-landsat}/.dockerignore | 0 .../{ => tensorflow-landsat}/.gcloudignore | 0 .../gpu-workers/tensorflow-landsat/Dockerfile | 41 ++++++++++++ .../gpu-workers/tensorflow-landsat/README.md | 48 ++++++++++++++ .../gpu-workers/tensorflow-landsat/build.yaml | 32 ++++++++++ .../{ => tensorflow-landsat}/cloudbuild.yaml | 0 .../{ => tensorflow-landsat}/e2e_test.py | 0 .../main.py} | 63 ++++++++++--------- .../noxfile_config.py | 0 .../requirements-test.txt | 0 .../{ => tensorflow-landsat}/requirements.txt | 0 .../gpu-workers/tensorflow-landsat/run.yaml | 53 ++++++++++++++++ .../tensorflow-minimal/.dockerignore | 5 ++ .../tensorflow-minimal/.gcloudignore | 5 ++ .../gpu-workers/tensorflow-minimal/Dockerfile | 41 ++++++++++++ .../gpu-workers/tensorflow-minimal/README.md | 45 +++++++++++++ .../gpu-workers/tensorflow-minimal/build.yaml | 32 ++++++++++ .../gpu-workers/tensorflow-minimal/main.py | 62 ++++++++++++++++++ .../tensorflow-minimal/requirements.txt | 2 + .../gpu-workers/tensorflow-minimal/run.yaml | 50 +++++++++++++++ 30 files changed, 680 insertions(+), 89 deletions(-) delete mode 100644 dataflow/gpu-workers/Dockerfile delete mode 100644 dataflow/gpu-workers/README.md create mode 100644 dataflow/gpu-workers/pytorch-minimal/.dockerignore create mode 100644 dataflow/gpu-workers/pytorch-minimal/.gcloudignore create mode 100644 dataflow/gpu-workers/pytorch-minimal/Dockerfile create mode 100644 dataflow/gpu-workers/pytorch-minimal/README.md create mode 100644 dataflow/gpu-workers/pytorch-minimal/build.yaml create mode 100644 dataflow/gpu-workers/pytorch-minimal/main.py create mode 100644 dataflow/gpu-workers/pytorch-minimal/requirements.txt create mode 100644 dataflow/gpu-workers/pytorch-minimal/run.yaml rename dataflow/gpu-workers/{ => tensorflow-landsat}/.dockerignore (100%) rename dataflow/gpu-workers/{ => tensorflow-landsat}/.gcloudignore (100%) create mode 100644 dataflow/gpu-workers/tensorflow-landsat/Dockerfile create mode 100644 dataflow/gpu-workers/tensorflow-landsat/README.md create mode 100644 dataflow/gpu-workers/tensorflow-landsat/build.yaml rename dataflow/gpu-workers/{ => tensorflow-landsat}/cloudbuild.yaml (100%) rename dataflow/gpu-workers/{ => tensorflow-landsat}/e2e_test.py (100%) rename dataflow/gpu-workers/{landsat_view.py => tensorflow-landsat/main.py} (90%) rename dataflow/gpu-workers/{ => tensorflow-landsat}/noxfile_config.py (100%) rename dataflow/gpu-workers/{ => tensorflow-landsat}/requirements-test.txt (100%) rename dataflow/gpu-workers/{ => tensorflow-landsat}/requirements.txt (100%) create mode 100644 dataflow/gpu-workers/tensorflow-landsat/run.yaml create mode 100644 dataflow/gpu-workers/tensorflow-minimal/.dockerignore create mode 100644 dataflow/gpu-workers/tensorflow-minimal/.gcloudignore create mode 100644 
dataflow/gpu-workers/tensorflow-minimal/Dockerfile create mode 100644 dataflow/gpu-workers/tensorflow-minimal/README.md create mode 100644 dataflow/gpu-workers/tensorflow-minimal/build.yaml create mode 100644 dataflow/gpu-workers/tensorflow-minimal/main.py create mode 100644 dataflow/gpu-workers/tensorflow-minimal/requirements.txt create mode 100644 dataflow/gpu-workers/tensorflow-minimal/run.yaml diff --git a/dataflow/gpu-workers/Dockerfile b/dataflow/gpu-workers/Dockerfile deleted file mode 100644 index 7243acc3142..00000000000 --- a/dataflow/gpu-workers/Dockerfile +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Make sure the CUDA and cuDNN versions are compatible with your TensorFlow version. -# https://www.tensorflow.org/install/source#gpu -# Check the Nvidia container registry catalog to look at the available Nvidia images: -# https://ngc.nvidia.com/catalog/containers/nvidia:cuda -FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 - -# The Python version of the Dockerfile MUST match the Python version you use -# to launch the Dataflow job. -ARG python_version=3.8 - -WORKDIR /root - -# Copy the Apache Beam worker files and the requirements.txt file. -COPY --from=apache/beam_python3.8_sdk:2.29.0 /opt/apache/beam /opt/apache/beam -COPY requirements.txt . - -# Update PATH so we find our new Conda and Python installations. -ENV PATH=/opt/python/bin:/opt/conda/bin:$PATH - -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get install -y wget \ - && rm -rf /var/lib/apt/lists/* \ - # The nvidia image doesn't come with Python pre-installed. - # We use Miniconda to install the Python version of our choice. - && wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - && sh Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda \ - && rm Miniconda3-latest-Linux-x86_64.sh \ - # Create a new Python environment and install our requirements. - # We don't need to update $PATH since /usr/local is already in $PATH. - && conda create -y -p /opt/python python=$python_version pip \ - && pip install --no-cache-dir -U pip \ - && pip install --no-cache-dir -r requirements.txt \ - && conda clean -y --all --force-pkgs-dirs \ - # Beam workers looks for pip at /usr/local/bin/pip by default. - # This can be omitted in Beam 2.30.0 and later versions. - && ln -s $(which pip) /usr/local/bin/pip - -# Set the entrypoint to Apache Beam SDK worker launcher. 
-ENTRYPOINT [ "/opt/apache/beam/boot" ] diff --git a/dataflow/gpu-workers/README.md b/dataflow/gpu-workers/README.md deleted file mode 100644 index a71f0da3e95..00000000000 --- a/dataflow/gpu-workers/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Workers with GPUs - -[![Open in Cloud Shell](http://gstatic.com/cloudssh/images/open-btn.svg)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dataflow/gpu-workers/README.md) - -📝 Tutorial: [Processing Landsat satellite images with GPUs](https://cloud.google.com/dataflow/docs/samples/satellite-images-gpus) diff --git a/dataflow/gpu-workers/pytorch-minimal/.dockerignore b/dataflow/gpu-workers/pytorch-minimal/.dockerignore new file mode 100644 index 00000000000..775d845fa58 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/.dockerignore @@ -0,0 +1,5 @@ +# Ignore everything except the source files. +**/* +!Dockerfile +!requirements.txt +!*.py diff --git a/dataflow/gpu-workers/pytorch-minimal/.gcloudignore b/dataflow/gpu-workers/pytorch-minimal/.gcloudignore new file mode 100644 index 00000000000..775d845fa58 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/.gcloudignore @@ -0,0 +1,5 @@ +# Ignore everything except the source files. +**/* +!Dockerfile +!requirements.txt +!*.py diff --git a/dataflow/gpu-workers/pytorch-minimal/Dockerfile b/dataflow/gpu-workers/pytorch-minimal/Dockerfile new file mode 100644 index 00000000000..4bee40c9d13 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/Dockerfile @@ -0,0 +1,30 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM pytorch/pytorch:1.8.1-cuda11.1-cudnn8-runtime + +WORKDIR /pipeline + +# Copy the Apache Beam worker files and the pipeline source files. +COPY --from=apache/beam_python3.8_sdk:2.29.0 /opt/apache/beam /opt/apache/beam +COPY requirements.txt . +COPY *.py ./ + +# Install the pipeline requirements and check that there are no conflicts. +RUN pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir -r requirements.txt \ + && pip check + +# Set the entrypoint to Apache Beam SDK worker launcher. +ENTRYPOINT [ "/opt/apache/beam/boot" ] diff --git a/dataflow/gpu-workers/pytorch-minimal/README.md b/dataflow/gpu-workers/pytorch-minimal/README.md new file mode 100644 index 00000000000..15a81d95fc7 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/README.md @@ -0,0 +1,45 @@ +# PyTorch GPU minimal pipeline + +## Before you begin + +Make sure you have followed the +[Dataflow setup instructions](../../README.md). + +Finally, save your resource names in environment variables. + +```sh +export PROJECT=$(gcloud config get-value project) +``` + +## Building the Docker image + +We use Cloud Build to build the container image for the workers. + +```sh +gcloud builds submit --config build.yaml +``` + +## Running the Dataflow job with GPUs + +We use Cloud Build to run the Dataflow job. 
+We launch the job using the worker image to make sure the job launches +with the same Python version as the workers. + +```sh +export REGION="us-central1" +export WORKER_ZONE="us-central1-f" +export GPU_TYPE="nvidia-tesla-t4" + +gcloud beta builds submit \ + --config run.yaml \ + --substitutions _REGION=$REGION,_WORKER_ZONE=$WORKER_ZONE,_GPU_TYPE=$GPU_TYPE \ + --no-source +``` + +> ℹ️ Make sure the GPU type you choose is available in the worker zone for the job. +> For more information, see [GPU availability](https://cloud.google.com/dataflow/docs/resources/locations#gpu_availability). + +## What's next? + +For a more complete example, take a look at +📝 [Processing Landsat satellite images with GPUs](https://cloud.google.com/dataflow/docs/samples/satellite-images-gpus). diff --git a/dataflow/gpu-workers/pytorch-minimal/build.yaml b/dataflow/gpu-workers/pytorch-minimal/build.yaml new file mode 100644 index 00000000000..c72876e2623 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/build.yaml @@ -0,0 +1,32 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ----------------------------------------------------------------------------- +# This Cloud Build config file builds and pushes the image for the workers. +# +# To learn more about this file: +# https://cloud.google.com/build/docs/build-config +# ----------------------------------------------------------------------------- + +steps: +- name: gcr.io/cloud-builders/docker + args: + - build + - --tag=gcr.io/$PROJECT_ID/samples/dataflow/pytorch-gpu:latest + - . + +images: [gcr.io/$PROJECT_ID/samples/dataflow/pytorch-gpu:latest] + +options: + machineType: E2_HIGHCPU_8 diff --git a/dataflow/gpu-workers/pytorch-minimal/main.py b/dataflow/gpu-workers/pytorch-minimal/main.py new file mode 100644 index 00000000000..19b5a740fba --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/main.py @@ -0,0 +1,61 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
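+
+# This minimal pipeline does one thing: before applying a trivial transform
+# to the input text, it checks (via torch.cuda) that the Dataflow worker can
+# actually see a GPU, and fails fast if it cannot.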
+ +import argparse +import logging +from typing import Any, List, Optional + +import apache_beam as beam +from apache_beam.options.pipeline_options import PipelineOptions +import torch + + +def check_gpus(element: Any, gpus_optional: bool = False) -> Any: + """Validates that we are detecting GPUs, otherwise raise a RuntimeError.""" + if torch.cuda.is_available(): + logging.info(f"Using GPU: {torch.cuda.get_device_name(0)}") + elif gpus_optional: + logging.warning("No GPUs found, defaulting to CPU.") + else: + raise RuntimeError("No GPUs found.") + return element + + +def run(input_text: str, beam_args: Optional[List[str]] = None) -> None: + beam_options = PipelineOptions(beam_args, save_main_session=True) + + # We currently cannot use the `with` statement to run without waiting. + # https://issues.apache.org/jira/browse/BEAM-12455 + pipeline = beam.Pipeline(options=beam_options) + ( + pipeline + | "Create data" >> beam.Create([input_text]) + | "Check GPU availability" >> beam.Map(check_gpus) + | "My transform" >> beam.Map(logging.info) + ) + pipeline.run() + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--input-text", + default="Hello!", + help="Input text to display.", + ) + args, beam_args = parser.parse_known_args() + + run(args.input_text, beam_args) diff --git a/dataflow/gpu-workers/pytorch-minimal/requirements.txt b/dataflow/gpu-workers/pytorch-minimal/requirements.txt new file mode 100644 index 00000000000..530aa4098e7 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/requirements.txt @@ -0,0 +1,2 @@ +apache-beam[gcp]==2.29.0 +torch==1.8.1 diff --git a/dataflow/gpu-workers/pytorch-minimal/run.yaml b/dataflow/gpu-workers/pytorch-minimal/run.yaml new file mode 100644 index 00000000000..83858d79f36 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/run.yaml @@ -0,0 +1,51 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This Cloud Build config runs a Dataflow job using GPUs. +# We use the same worker image to launch the job. +# That way we guarantee the same Python version for the workers. +# It also already has all the requirements installed. 
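+#
+# A rough equivalent without Cloud Build (a sketch; it assumes the image from
+# build.yaml exists and that the container is given Google Cloud credentials,
+# which Cloud Build otherwise provides automatically):
+#
+#   docker run --rm --entrypoint python \
+#       gcr.io/<PROJECT>/samples/dataflow/pytorch-gpu:latest \
+#       /pipeline/main.py --runner=DataflowRunner --project=<PROJECT> \
+#       <remaining flags as in the step below>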
+ +# ----------------------------------------------------------------------------- +# To learn more about this file: +# https://cloud.google.com/build/docs/build-config +# +# To learn more about Cloud Build variable substitutions: +# https://cloud.google.com/build/docs/configuring-builds/substitute-variable-values#using_user-defined_substitutions +# ----------------------------------------------------------------------------- + +substitutions: + _REGION: us-central1 + _WORKER_ZONE: us-central1-f + _GPU_TYPE: nvidia-tesla-t4 + _GPU_COUNT: '1' + +steps: +- name: gcr.io/$PROJECT_ID/samples/dataflow/pytorch-gpu:latest + entrypoint: python + args: + - /pipeline/main.py + - --runner=DataflowRunner + - --project=$PROJECT_ID + - --region=$_REGION + - --worker_harness_container_image=gcr.io/$PROJECT_ID/samples/dataflow/pytorch-gpu:latest + - --worker_zone=$_WORKER_ZONE + - --disk_size_gb=100 + - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver + - --experiment=use_runner_v2 + +options: + logging: CLOUD_LOGGING_ONLY + +serviceAccount: projects/$PROJECT_ID/serviceAccounts/$PROJECT_NUMBER-compute@developer.gserviceaccount.com diff --git a/dataflow/gpu-workers/.dockerignore b/dataflow/gpu-workers/tensorflow-landsat/.dockerignore similarity index 100% rename from dataflow/gpu-workers/.dockerignore rename to dataflow/gpu-workers/tensorflow-landsat/.dockerignore diff --git a/dataflow/gpu-workers/.gcloudignore b/dataflow/gpu-workers/tensorflow-landsat/.gcloudignore similarity index 100% rename from dataflow/gpu-workers/.gcloudignore rename to dataflow/gpu-workers/tensorflow-landsat/.gcloudignore diff --git a/dataflow/gpu-workers/tensorflow-landsat/Dockerfile b/dataflow/gpu-workers/tensorflow-landsat/Dockerfile new file mode 100644 index 00000000000..a8686076460 --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-landsat/Dockerfile @@ -0,0 +1,41 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Each version of TensorFlow requires a specific CUDA/cuDNN version: +# https://www.tensorflow.org/install/source#gpu +# For a list of all the nvidia images: +# https://ngc.nvidia.com/catalog/containers/nvidia:cuda/tags +FROM nvcr.io/nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 + +WORKDIR /pipeline + +# Copy the Apache Beam worker files and the pipeline source files. +COPY --from=apache/beam_python3.8_sdk:2.29.0 /opt/apache/beam /opt/apache/beam +COPY requirements.txt . +COPY *.py ./ + +# If you need a different Python version, consider: +# https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa +RUN apt-get update \ + && apt-get install -y curl python3.8 python3-distutils \ + && rm -rf /var/lib/apt/lists/* \ + && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \ + && curl https://bootstrap.pypa.io/get-pip.py | python \ + # Install the pipeline requirements and check that there are no conflicts. 
+ && pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir -r requirements.txt \ + && pip check + +# Set the entrypoint to Apache Beam SDK worker launcher. +ENTRYPOINT [ "/opt/apache/beam/boot" ] diff --git a/dataflow/gpu-workers/tensorflow-landsat/README.md b/dataflow/gpu-workers/tensorflow-landsat/README.md new file mode 100644 index 00000000000..4d87ed8f622 --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-landsat/README.md @@ -0,0 +1,48 @@ +# Workers with GPUs + +[![Open in Cloud Shell](http://gstatic.com/cloudssh/images/open-btn.svg)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dataflow/gpu-workers/README.md) + +📝 Tutorial: [Processing Landsat satellite images with GPUs](https://cloud.google.com/dataflow/docs/samples/satellite-images-gpus) + +## Before you begin + +Make sure you have followed the +[Dataflow setup instructions](../../README.md), and additionally: + +* Use or [create a Cloud Storage bucket](https://console.cloud.google.com/storage/create-bucket). + +Finally, save your resource names in environment variables. + +```sh +export PROJECT=$(gcloud config get-value project) +export BUCKET="my-bucket-name" +``` + +## Building the Docker image + +We use Cloud Build to build the container image for the workers. + +```sh +gcloud builds submit --config build.yaml +``` + +## Running the Dataflow job with GPUs + +We use Cloud Build to run the Dataflow job. +We launch the job using the worker image to make sure the job launches +with the same Python version as the workers. + +```sh +export OUTPUT_PATH="gs://$BUCKET/samples/dataflow/landsat/" +export REGION="us-central1" +export WORKER_ZONE="us-central1-f" +export GPU_TYPE="nvidia-tesla-t4" + +gcloud beta builds submit \ + --config run.yaml \ + --substitutions _OUTPUT_PATH=$OUTPUT_PATH,_REGION=$REGION,_WORKER_ZONE=$WORKER_ZONE,_GPU_TYPE=$GPU_TYPE \ + --no-source +``` + +> ℹ️ Make sure the GPU type you choose is available in the worker zone for the job. +> For more information, see [GPU availability](https://cloud.google.com/dataflow/docs/resources/locations#gpu_availability). diff --git a/dataflow/gpu-workers/tensorflow-landsat/build.yaml b/dataflow/gpu-workers/tensorflow-landsat/build.yaml new file mode 100644 index 00000000000..b2b81b8f92d --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-landsat/build.yaml @@ -0,0 +1,32 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ----------------------------------------------------------------------------- +# This Cloud Build config file builds and pushes the image for the workers. +# +# To learn more about this file: +# https://cloud.google.com/build/docs/build-config +# ----------------------------------------------------------------------------- + +steps: +- name: gcr.io/cloud-builders/docker + args: + - build + - --tag=gcr.io/$PROJECT_ID/samples/dataflow/landsat-gpu:latest + - . 
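+
+# The `images` field below tells Cloud Build to push the built image to
+# Container Registry. A rough local equivalent (a sketch, assuming Docker is
+# installed and `gcloud auth configure-docker` has been run):
+#   docker build --tag=gcr.io/<PROJECT_ID>/samples/dataflow/landsat-gpu:latest .
+#   docker push gcr.io/<PROJECT_ID>/samples/dataflow/landsat-gpu:latest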
+ +images: [gcr.io/$PROJECT_ID/samples/dataflow/landsat-gpu:latest] + +options: + machineType: E2_HIGHCPU_8 diff --git a/dataflow/gpu-workers/cloudbuild.yaml b/dataflow/gpu-workers/tensorflow-landsat/cloudbuild.yaml similarity index 100% rename from dataflow/gpu-workers/cloudbuild.yaml rename to dataflow/gpu-workers/tensorflow-landsat/cloudbuild.yaml diff --git a/dataflow/gpu-workers/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py similarity index 100% rename from dataflow/gpu-workers/e2e_test.py rename to dataflow/gpu-workers/tensorflow-landsat/e2e_test.py diff --git a/dataflow/gpu-workers/landsat_view.py b/dataflow/gpu-workers/tensorflow-landsat/main.py similarity index 90% rename from dataflow/gpu-workers/landsat_view.py rename to dataflow/gpu-workers/tensorflow-landsat/main.py index 9e61016eabf..00216422633 100644 --- a/dataflow/gpu-workers/landsat_view.py +++ b/dataflow/gpu-workers/tensorflow-landsat/main.py @@ -276,39 +276,42 @@ def run( max_value = vis_params["max"] gamma = vis_params["gamma"] - options = PipelineOptions(beam_args, save_main_session=True) - with beam.Pipeline(options=options) as pipeline: - # Optionally, validate that the workers are using GPUs. - gpu_check = ( - pipeline - | beam.Create([None]) - | "Check GPU availability" >> beam.Map(check_gpus, gpus_optional) + beam_options = PipelineOptions(beam_args, save_main_session=True) + + # We currently cannot use the `with` statement to run without waiting. + # https://issues.apache.org/jira/browse/BEAM-12455 + pipeline = beam.Pipeline(options=beam_options) + + # Convert Landsat 8 scenes into images. + # ℹ️ We pass `gpu_check` as an unused side input to force that step in + # the pipeline to wait for the check before continuing. + ( + pipeline + | "Create scene IDs" >> beam.Create(scenes) + | "Get RGB band paths" + >> beam.Map( + get_band_paths, + rgb_band_names, + unused_side_input=beam.pvalue.AsSingleton( + pipeline + | beam.Create([None]) + | "Check GPUs" >> beam.Map(check_gpus, gpus_optional) + ), ) - - # Convert Landsat 8 scenes into images. - # ℹ️ We pass `gpu_check` as an unused side input to force that step in - # the pipeline to wait for the check before continuing. 
- ( - pipeline - | "Create scene IDs" >> beam.Create(scenes) - | "Get RGB band paths" - >> beam.Map( - get_band_paths, - rgb_band_names, - unused_side_input=beam.pvalue.AsSingleton(gpu_check), - ) - | "Load RGB band values" >> beam.MapTuple(load_values) - | "Preprocess pixels" - >> beam.MapTuple(preprocess_pixels, min_value, max_value, gamma) - | "Convert to image" - >> beam.MapTuple( - lambda scene, rgb_pixels: ( - scene, - Image.fromarray(rgb_pixels.numpy(), mode="RGB"), - ) + | "Load RGB band values" >> beam.MapTuple(load_values) + | "Preprocess pixels" + >> beam.MapTuple(preprocess_pixels, min_value, max_value, gamma) + | "Convert to image" + >> beam.MapTuple( + lambda scene, rgb_pixels: ( + scene, + Image.fromarray(rgb_pixels.numpy(), mode="RGB"), ) - | "Save to Cloud Storage" >> beam.MapTuple(save_to_gcs, output_path_prefix) ) + | "Save to Cloud Storage" >> beam.MapTuple(save_to_gcs, output_path_prefix) + ) + + pipeline.run() if __name__ == "__main__": diff --git a/dataflow/gpu-workers/noxfile_config.py b/dataflow/gpu-workers/tensorflow-landsat/noxfile_config.py similarity index 100% rename from dataflow/gpu-workers/noxfile_config.py rename to dataflow/gpu-workers/tensorflow-landsat/noxfile_config.py diff --git a/dataflow/gpu-workers/requirements-test.txt b/dataflow/gpu-workers/tensorflow-landsat/requirements-test.txt similarity index 100% rename from dataflow/gpu-workers/requirements-test.txt rename to dataflow/gpu-workers/tensorflow-landsat/requirements-test.txt diff --git a/dataflow/gpu-workers/requirements.txt b/dataflow/gpu-workers/tensorflow-landsat/requirements.txt similarity index 100% rename from dataflow/gpu-workers/requirements.txt rename to dataflow/gpu-workers/tensorflow-landsat/requirements.txt diff --git a/dataflow/gpu-workers/tensorflow-landsat/run.yaml b/dataflow/gpu-workers/tensorflow-landsat/run.yaml new file mode 100644 index 00000000000..f447bfcbbc6 --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-landsat/run.yaml @@ -0,0 +1,53 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This Cloud Build config runs a Dataflow job using GPUs. +# We use the same worker image to launch the job. +# That way we guarantee the same Python version for the workers. +# It also already has all the requirements installed. 
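+#
+# Note: the step below requests `custom-1-13312-ext` workers. That is a
+# Compute Engine custom machine type (format: custom-<vCPUs>-<memoryMB>,
+# where the `-ext` suffix allows extended memory beyond the standard
+# per-vCPU ratio), here 1 vCPU with 13312 MB of memory for TensorFlow.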
+ +# ----------------------------------------------------------------------------- +# To learn more about this file: +# https://cloud.google.com/build/docs/build-config +# +# To learn more about Cloud Build variable substitutions: +# https://cloud.google.com/build/docs/configuring-builds/substitute-variable-values#using_user-defined_substitutions +# ----------------------------------------------------------------------------- + +substitutions: + _OUTPUT_PATH: please set --substitutions _OUTPUT_PATH=gs://my-bucket/output/path + _REGION: us-central1 + _WORKER_ZONE: us-central1-f + _GPU_TYPE: nvidia-tesla-t4 + _GPU_COUNT: '1' + +steps: +- name: gcr.io/$PROJECT_ID/samples/dataflow/landsat-gpu:latest + entrypoint: python + args: + - /pipeline/main.py + - --output-path-prefix=$_OUTPUT_PATH + - --runner=DataflowRunner + - --project=$PROJECT_ID + - --region=$_REGION + - --worker_machine_type=custom-1-13312-ext + - --worker_harness_container_image=gcr.io/$PROJECT_ID/samples/dataflow/landsat-gpu:latest + - --worker_zone=$_WORKER_ZONE + - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver + - --experiment=use_runner_v2 + +options: + logging: CLOUD_LOGGING_ONLY + +serviceAccount: projects/$PROJECT_ID/serviceAccounts/$PROJECT_NUMBER-compute@developer.gserviceaccount.com diff --git a/dataflow/gpu-workers/tensorflow-minimal/.dockerignore b/dataflow/gpu-workers/tensorflow-minimal/.dockerignore new file mode 100644 index 00000000000..775d845fa58 --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-minimal/.dockerignore @@ -0,0 +1,5 @@ +# Ignore everything except the source files. +**/* +!Dockerfile +!requirements.txt +!*.py diff --git a/dataflow/gpu-workers/tensorflow-minimal/.gcloudignore b/dataflow/gpu-workers/tensorflow-minimal/.gcloudignore new file mode 100644 index 00000000000..775d845fa58 --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-minimal/.gcloudignore @@ -0,0 +1,5 @@ +# Ignore everything except the source files. +**/* +!Dockerfile +!requirements.txt +!*.py diff --git a/dataflow/gpu-workers/tensorflow-minimal/Dockerfile b/dataflow/gpu-workers/tensorflow-minimal/Dockerfile new file mode 100644 index 00000000000..e892d4c28a8 --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-minimal/Dockerfile @@ -0,0 +1,41 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Each version of TensorFlow requires a specific CUDA/cuDNN version: +# https://www.tensorflow.org/install/source#gpu +# For a list of all the nvidia images: +# https://ngc.nvidia.com/catalog/containers/nvidia:cuda/tags +FROM nvcr.io/nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 + +WORKDIR /pipeline + +# Copy the Apache Beam worker files and the pipeline source files. +COPY --from=apache/beam_python3.8_sdk:2.29.0 /opt/apache/beam /opt/apache/beam +COPY requirements.txt . 
+COPY *.py ./
+
+# If you need a different Python version, consider:
+#   https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa
+RUN apt-get update \
+    && apt-get install -y curl python3.8 python3-distutils \
+    && rm -rf /var/lib/apt/lists/* \
+    && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \
+    && curl https://bootstrap.pypa.io/get-pip.py | python \
+    # Install the pipeline requirements and check that there are no conflicts.
+    && pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt \
+    && pip check
+
+# Set the entrypoint to Apache Beam SDK worker launcher.
+ENTRYPOINT [ "/opt/apache/beam/boot" ]
diff --git a/dataflow/gpu-workers/tensorflow-minimal/README.md b/dataflow/gpu-workers/tensorflow-minimal/README.md
new file mode 100644
index 00000000000..15a81d95fc7
--- /dev/null
+++ b/dataflow/gpu-workers/tensorflow-minimal/README.md
@@ -0,0 +1,45 @@
+# TensorFlow GPU minimal pipeline
+
+## Before you begin
+
+Make sure you have followed the
+[Dataflow setup instructions](../../README.md).
+
+Finally, save your resource names in environment variables.
+
+```sh
+export PROJECT=$(gcloud config get-value project)
+```
+
+## Building the Docker image
+
+We use Cloud Build to build the container image for the workers.
+
+```sh
+gcloud builds submit --config build.yaml
+```
+
+## Running the Dataflow job with GPUs
+
+We use Cloud Build to run the Dataflow job.
+We launch the job using the worker image to make sure the job launches
+with the same Python version as the workers.
+
+```sh
+export REGION="us-central1"
+export WORKER_ZONE="us-central1-f"
+export GPU_TYPE="nvidia-tesla-t4"
+
+gcloud beta builds submit \
+    --config run.yaml \
+    --substitutions _REGION=$REGION,_WORKER_ZONE=$WORKER_ZONE,_GPU_TYPE=$GPU_TYPE \
+    --no-source
+```
+
+> ℹ️ Make sure the GPU type you choose is available in the worker zone for the job.
+> For more information, see [GPU availability](https://cloud.google.com/dataflow/docs/resources/locations#gpu_availability).
+
+## What's next?
+
+For a more complete example, take a look at
+📝 [Processing Landsat satellite images with GPUs](https://cloud.google.com/dataflow/docs/samples/satellite-images-gpus).
diff --git a/dataflow/gpu-workers/tensorflow-minimal/build.yaml b/dataflow/gpu-workers/tensorflow-minimal/build.yaml
new file mode 100644
index 00000000000..84f60f90255
--- /dev/null
+++ b/dataflow/gpu-workers/tensorflow-minimal/build.yaml
@@ -0,0 +1,32 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -----------------------------------------------------------------------------
+# This Cloud Build config file builds and pushes the image for the workers.
+#
+# To learn more about this file:
+#   https://cloud.google.com/build/docs/build-config
+# -----------------------------------------------------------------------------
+
+steps:
+- name: gcr.io/cloud-builders/docker
+  args:
+  - build
+  - --tag=gcr.io/$PROJECT_ID/samples/dataflow/tensorflow-gpu:latest
+  - .
+ +images: [gcr.io/$PROJECT_ID/samples/dataflow/tensorflow-gpu:latest] + +options: + machineType: E2_HIGHCPU_8 diff --git a/dataflow/gpu-workers/tensorflow-minimal/main.py b/dataflow/gpu-workers/tensorflow-minimal/main.py new file mode 100644 index 00000000000..924f4e0935d --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-minimal/main.py @@ -0,0 +1,62 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +from typing import Any, List, Optional + +import apache_beam as beam +from apache_beam.options.pipeline_options import PipelineOptions +import tensorflow as tf + + +def check_gpus(element: Any, gpus_optional: bool = False) -> Any: + """Validates that we are detecting GPUs, otherwise raise a RuntimeError.""" + gpu_devices = tf.config.list_physical_devices("GPU") + if gpu_devices: + logging.info(f"Using GPU: {gpu_devices}") + elif gpus_optional: + logging.warning("No GPUs found, defaulting to CPU.") + else: + raise RuntimeError("No GPUs found.") + return element + + +def run(input_text: str, beam_args: Optional[List[str]] = None) -> None: + beam_options = PipelineOptions(beam_args, save_main_session=True) + + # We currently cannot use the `with` statement to run without waiting. + # https://issues.apache.org/jira/browse/BEAM-12455 + pipeline = beam.Pipeline(options=beam_options) + ( + pipeline + | "Create data" >> beam.Create([input_text]) + | "Check GPU availability" >> beam.Map(check_gpus) + | "My transform" >> beam.Map(logging.info) + ) + pipeline.run() + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--input-text", + default="Hello!", + help="Input text to display.", + ) + args, beam_args = parser.parse_known_args() + + run(args.input_text, beam_args) diff --git a/dataflow/gpu-workers/tensorflow-minimal/requirements.txt b/dataflow/gpu-workers/tensorflow-minimal/requirements.txt new file mode 100644 index 00000000000..f2f6e11354a --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-minimal/requirements.txt @@ -0,0 +1,2 @@ +apache-beam[gcp]==2.29.0 +tensorflow==2.5.0 diff --git a/dataflow/gpu-workers/tensorflow-minimal/run.yaml b/dataflow/gpu-workers/tensorflow-minimal/run.yaml new file mode 100644 index 00000000000..036db374e0d --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-minimal/run.yaml @@ -0,0 +1,50 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This Cloud Build config runs a Dataflow job using GPUs. +# We use the same worker image to launch the job. +# That way we guarantee the same Python version for the workers. +# It also already has all the requirements installed. + +# ----------------------------------------------------------------------------- +# To learn more about this file: +# https://cloud.google.com/build/docs/build-config +# +# To learn more about Cloud Build variable substitutions: +# https://cloud.google.com/build/docs/configuring-builds/substitute-variable-values#using_user-defined_substitutions +# ----------------------------------------------------------------------------- + +substitutions: + _REGION: us-central1 + _WORKER_ZONE: us-central1-f + _GPU_TYPE: nvidia-tesla-t4 + _GPU_COUNT: '1' + +steps: +- name: gcr.io/$PROJECT_ID/samples/dataflow/tensorflow-gpu:latest + entrypoint: python + args: + - /pipeline/main.py + - --runner=DataflowRunner + - --project=$PROJECT_ID + - --region=$_REGION + - --worker_harness_container_image=gcr.io/$PROJECT_ID/samples/dataflow/tensorflow-gpu:latest + - --worker_zone=$_WORKER_ZONE + - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver + - --experiment=use_runner_v2 + +options: + logging: CLOUD_LOGGING_ONLY + +serviceAccount: projects/$PROJECT_ID/serviceAccounts/$PROJECT_NUMBER-compute@developer.gserviceaccount.com From d8bba19ed35afa73678e9693165111137843c144 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 7 Jun 2021 14:56:46 -0700 Subject: [PATCH 02/87] added tests --- dataflow/conftest.py | 80 +++++++++------ .../flex-templates/streaming_beam/e2e_test.py | 10 +- .../gpu-workers/pytorch-minimal/e2e_test.py | 44 +++++++++ dataflow/gpu-workers/pytorch-minimal/run.yaml | 4 +- .../gpu-workers/tensorflow-landsat/README.md | 4 +- .../tensorflow-landsat/e2e_test.py | 99 +++---------------- .../gpu-workers/tensorflow-landsat/run.yaml | 5 +- .../gpu-workers/tensorflow-minimal/run.yaml | 2 + 8 files changed, 125 insertions(+), 123 deletions(-) create mode 100644 dataflow/gpu-workers/pytorch-minimal/e2e_test.py diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 13314bf86dd..f8191effd89 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -158,40 +158,64 @@ def _infinite_publish_job() -> None: p.terminate() @staticmethod - def container_image( - image_path: str, + def cloud_build_submit( + image_name: Optional[str] = None, + config: Optional[str] = None, + substitutions: Optional[Dict[str, str]] = None, project: str = PROJECT, - tag: str = "latest", - ) -> str: - image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}" + ) -> None: + """Sends a Cloud Build job, if an image_name is provided it will be deleted at teardown.""" cmd = ["gcloud", "auth", "configure-docker"] print(cmd) - subprocess.run(cmd, check=True) - cmd = [ - "gcloud", - "builds", - "submit", - f"--project={project}", - f"--tag={image_name}", - ".", - ] - print(cmd) - subprocess.run(cmd, check=True) - print(f"container_image: {image_name}") - yield image_name + if substitutions: + cmd_substitutions = [ + f"--substitutions={','.join([k + '=' + v for k, v in substitutions.items()])}" + ] + else: + cmd_substitutions = [] - cmd = [ - "gcloud", - "container", - "images", - "delete", - image_name, - f"--project={project}", - "--quiet", - ] - print(cmd) subprocess.run(cmd, check=True) + if config: + cmd = [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--config={config}", + *cmd_substitutions, + ] + print(cmd) + subprocess.run(cmd, 
check=True) + yield config + elif image_name: + cmd = [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--tag=gcr.io/{project}/{image_name}:latest", + *cmd_substitutions, + ".", + ] + print(cmd) + subprocess.run(cmd, check=True) + yield f"gcr.io/{project}/{image_name}:latest" + else: + raise ValueError("must specify either `config` or `image_name`") + + if image_name: + cmd = [ + "gcloud", + "container", + "images", + "delete", + f"gcr.io/{project}/{image_name}:latest", + f"--project={project}", + "--quiet", + ] + print(cmd) + subprocess.run(cmd, check=True) @staticmethod def dataflow_job_id_from_job_name( diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index e642306ed4b..08f0676b446 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -13,14 +13,12 @@ import json import time -# `conftest` cannot be imported when running in `nox`, but we still -# try to import it for the autocomplete when writing the tests. try: + # `conftest` cannot be imported when running in `nox`, but we still + # try to import it for the autocomplete when writing the tests. from conftest import Utils except ModuleNotFoundError: - from typing import Any - - Utils = Any + Utils = None import pytest NAME = "dataflow-flex-templates-streaming-beam" @@ -61,7 +59,7 @@ def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool: @pytest.fixture(scope="session") def flex_template_image(utils: Utils) -> str: - yield from utils.container_image(NAME) + yield from utils.cloud_build_submit(NAME) @pytest.fixture(scope="session") diff --git a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py new file mode 100644 index 00000000000..d0a25e927d9 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + # `conftest` cannot be imported when running in `nox`, but we still + # try to import it for the autocomplete when writing the tests. + from conftest import Utils +except ModuleNotFoundError: + Utils = None +from google.cloud import storage +import pytest + +NAME = "dataflow-gpu-pytorch" + + +@pytest.fixture(scope="session") +def bucket_name(utils: Utils) -> str: + yield from utils.storage_bucket(NAME) + + +@pytest.fixture(scope="session") +def worker_image(utils: Utils) -> str: + yield from utils.cloud_build_submit(NAME, config="build.yaml") + + +def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: + # Run the Beam pipeline in Dataflow making sure GPUs are used. 
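+    # The job is submitted through run.yaml; note that main.py calls
+    # pipeline.run() without waiting for the result, so this build step
+    # verifies that the job launches, while the `check_gpus` step (which
+    # raises a RuntimeError if no GPU is visible) runs on the workers.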
+ utils.cloud_build_submit( + config="run.yaml", + substitutions={"_TEMP_LOCATION": f"gs://{bucket_name}/temp"}, + ) diff --git a/dataflow/gpu-workers/pytorch-minimal/run.yaml b/dataflow/gpu-workers/pytorch-minimal/run.yaml index 83858d79f36..c5e74f4d77a 100644 --- a/dataflow/gpu-workers/pytorch-minimal/run.yaml +++ b/dataflow/gpu-workers/pytorch-minimal/run.yaml @@ -26,6 +26,7 @@ # ----------------------------------------------------------------------------- substitutions: + _TEMP_LOCATION: '' _REGION: us-central1 _WORKER_ZONE: us-central1-f _GPU_TYPE: nvidia-tesla-t4 @@ -39,9 +40,10 @@ steps: - --runner=DataflowRunner - --project=$PROJECT_ID - --region=$_REGION + - --temp_location=$_TEMP_LOCATION - --worker_harness_container_image=gcr.io/$PROJECT_ID/samples/dataflow/pytorch-gpu:latest - --worker_zone=$_WORKER_ZONE - - --disk_size_gb=100 + - --disk_size_gb=50 - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 diff --git a/dataflow/gpu-workers/tensorflow-landsat/README.md b/dataflow/gpu-workers/tensorflow-landsat/README.md index 4d87ed8f622..eeb91e7a5e6 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/README.md +++ b/dataflow/gpu-workers/tensorflow-landsat/README.md @@ -33,14 +33,14 @@ We launch the job using the worker image to make sure the job launches with the same Python version as the workers. ```sh -export OUTPUT_PATH="gs://$BUCKET/samples/dataflow/landsat/" +export GCS_PATH="gs://$BUCKET/samples/dataflow/landsat" export REGION="us-central1" export WORKER_ZONE="us-central1-f" export GPU_TYPE="nvidia-tesla-t4" gcloud beta builds submit \ --config run.yaml \ - --substitutions _OUTPUT_PATH=$OUTPUT_PATH,_REGION=$REGION,_WORKER_ZONE=$WORKER_ZONE,_GPU_TYPE=$GPU_TYPE \ + --substitutions _GCS_PATH=$GCS_PATH,_REGION=$REGION,_WORKER_ZONE=$WORKER_ZONE,_GPU_TYPE=$GPU_TYPE \ --no-source ``` diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index f3f105d7b6a..ae49e73f2fb 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -14,102 +14,33 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import platform -import subprocess -import uuid - +try: + # `conftest` cannot be imported when running in `nox`, but we still + # try to import it for the autocomplete when writing the tests. + from conftest import Utils +except ModuleNotFoundError: + Utils = None from google.cloud import storage import pytest -SUFFIX = uuid.uuid4().hex[0:6] -PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] -BUCKET_NAME = f"dataflow-gpu-test-{SUFFIX}" -IMAGE_NAME = f"dataflow/gpu-workers/test-{SUFFIX}:latest" -REGION = "us-central1" -ZONE = "us-central1-f" - - -@pytest.fixture(scope="session") -def bucket_name() -> str: - storage_client = storage.Client() - bucket = storage_client.create_bucket(BUCKET_NAME) - - yield BUCKET_NAME - - bucket.delete(force=True) +NAME = "dataflow-gpu-landsat" @pytest.fixture(scope="session") -def configure_docker() -> None: - subprocess.run( - [ - "gcloud", - "auth", - "configure-docker", - ] - ) +def bucket_name(utils: Utils) -> str: + yield from utils.storage_bucket(NAME) @pytest.fixture(scope="session") -def image_name(configure_docker: None) -> str: - # See the `cloudbuild.yaml` for the configuration for this build. 
- substitutions = { - "_PYTHON_VERSION": platform.python_version(), - "_IMAGE": IMAGE_NAME, - } - print(f"-- Cloud build substitutions: {substitutions}") - subprocess.run( - [ - "gcloud", - "builds", - "submit", - f"--project={PROJECT}", - f"--substitutions={','.join([k + '=' + v for k, v in substitutions.items()])}", - "--timeout=30m", - "--quiet", - ], - check=True, - ) - - yield f"gcr.io/{PROJECT}/{IMAGE_NAME}" - - # Delete the image when we're done. - subprocess.run( - [ - "gcloud", - "container", - "images", - "delete", - f"gcr.io/{PROJECT}/{IMAGE_NAME}", - f"--project={PROJECT}", - "--quiet", - ], - check=True, - ) +def worker_image(utils: Utils) -> str: + yield from utils.cloud_build_submit(NAME, config="build.yaml") -def test_end_to_end(bucket_name: str, image_name: str) -> None: +def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: # Run the Beam pipeline in Dataflow making sure GPUs are used. - gpu_type = "nvidia-tesla-t4" - subprocess.run( - [ - "python", - "landsat_view.py", - f"--output-path-prefix=gs://{bucket_name}/outputs/", - "--runner=DataflowRunner", - f"--job_name=gpu-workers-{SUFFIX}", - f"--project={PROJECT}", - f"--region={REGION}", - f"--temp_location=gs://{bucket_name}/temp", - "--worker_machine_type=custom-1-13312-ext", - "--disk_size_gb=300", - f"--worker_harness_container_image={image_name}", - f"--worker_zone={ZONE}", - f"--experiments=worker_accelerator=type={gpu_type},count=1,install-nvidia-driver", - "--experiments=use_runner_v2", - ], - check=True, + utils.cloud_build_submit( + config="run.yaml", + substitutions={"_GCS_PATH": f"gs://{bucket_name}"}, ) # Check that output files were created and are not empty. diff --git a/dataflow/gpu-workers/tensorflow-landsat/run.yaml b/dataflow/gpu-workers/tensorflow-landsat/run.yaml index f447bfcbbc6..8b8a6b655cc 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/run.yaml +++ b/dataflow/gpu-workers/tensorflow-landsat/run.yaml @@ -26,7 +26,7 @@ # ----------------------------------------------------------------------------- substitutions: - _OUTPUT_PATH: please set --substitutions _OUTPUT_PATH=gs://my-bucket/output/path + _GCS_PATH: please set --substitutions _GCS_PATH=gs://my-bucket/samples/dataflow/landsat _REGION: us-central1 _WORKER_ZONE: us-central1-f _GPU_TYPE: nvidia-tesla-t4 @@ -37,10 +37,11 @@ steps: entrypoint: python args: - /pipeline/main.py - - --output-path-prefix=$_OUTPUT_PATH + - --output-path-prefix=$_GCS_PATH/outputs/ - --runner=DataflowRunner - --project=$PROJECT_ID - --region=$_REGION + - --temp_location=$_GCS_PATH/temp - --worker_machine_type=custom-1-13312-ext - --worker_harness_container_image=gcr.io/$PROJECT_ID/samples/dataflow/landsat-gpu:latest - --worker_zone=$_WORKER_ZONE diff --git a/dataflow/gpu-workers/tensorflow-minimal/run.yaml b/dataflow/gpu-workers/tensorflow-minimal/run.yaml index 036db374e0d..769ca1eae3f 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/run.yaml +++ b/dataflow/gpu-workers/tensorflow-minimal/run.yaml @@ -26,6 +26,7 @@ # ----------------------------------------------------------------------------- substitutions: + _TEMP_LOCATION: '' _REGION: us-central1 _WORKER_ZONE: us-central1-f _GPU_TYPE: nvidia-tesla-t4 @@ -39,6 +40,7 @@ steps: - --runner=DataflowRunner - --project=$PROJECT_ID - --region=$_REGION + - --temp_location=$_TEMP_LOCATION - --worker_harness_container_image=gcr.io/$PROJECT_ID/samples/dataflow/tensorflow-gpu:latest - --worker_zone=$_WORKER_ZONE - 
--experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver From ed2d91d9675e66bfa302d0b4917264dbae9a3f13 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 7 Jun 2021 14:59:12 -0700 Subject: [PATCH 03/87] update dockerignore and cloudignore --- dataflow/gpu-workers/tensorflow-landsat/.dockerignore | 11 +++++------ dataflow/gpu-workers/tensorflow-landsat/.gcloudignore | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/dataflow/gpu-workers/tensorflow-landsat/.dockerignore b/dataflow/gpu-workers/tensorflow-landsat/.dockerignore index 04f5ec66ca6..775d845fa58 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/.dockerignore +++ b/dataflow/gpu-workers/tensorflow-landsat/.dockerignore @@ -1,6 +1,5 @@ -# Ignore files for docker. -.mypy_cache/ -.nox/ -__pycache__/ -env/ -outputs/ +# Ignore everything except the source files. +**/* +!Dockerfile +!requirements.txt +!*.py diff --git a/dataflow/gpu-workers/tensorflow-landsat/.gcloudignore b/dataflow/gpu-workers/tensorflow-landsat/.gcloudignore index cda483971fd..775d845fa58 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/.gcloudignore +++ b/dataflow/gpu-workers/tensorflow-landsat/.gcloudignore @@ -1,6 +1,5 @@ -# Ignore files for gcloud like Cloud Build. -.mypy_cache/ -.nox/ -__pycache__/ -env/ -outputs/ +# Ignore everything except the source files. +**/* +!Dockerfile +!requirements.txt +!*.py From 7969369e9efe3487a657bccc6cf047639989ccf3 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 7 Jun 2021 14:59:53 -0700 Subject: [PATCH 04/87] remove old config --- .../tensorflow-landsat/cloudbuild.yaml | 35 ------------------- 1 file changed, 35 deletions(-) delete mode 100644 dataflow/gpu-workers/tensorflow-landsat/cloudbuild.yaml diff --git a/dataflow/gpu-workers/tensorflow-landsat/cloudbuild.yaml b/dataflow/gpu-workers/tensorflow-landsat/cloudbuild.yaml deleted file mode 100644 index dec3d7aabb8..00000000000 --- a/dataflow/gpu-workers/tensorflow-landsat/cloudbuild.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# To build the container image: -# PYTHON_VERSION=`python -c 'import platform; print(platform.python_version())'` -# gcloud builds submit --substitutions _PYTHON_VERSION=$PYTHON_VERSION . --timeout 20m - -steps: - # Build the container image with the Python version of our choice. - - name: gcr.io/cloud-builders/docker - args: - [ 'build' - , '--build-arg=python_version=$_PYTHON_VERSION' - , '--tag=gcr.io/$PROJECT_ID/$_IMAGE' - , '.' - ] - - # Push the image to Container Registry. 
- - name: gcr.io/cloud-builders/docker - args: [ 'push', 'gcr.io/$PROJECT_ID/$_IMAGE' ] - -substitutions: - _PYTHON_VERSION: '3.8' - _IMAGE: samples/dataflow/tensorflow-gpu:latest From d34a2cac36d87590f67f8ed335867498516616e0 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 7 Jun 2021 15:02:55 -0700 Subject: [PATCH 05/87] update header year --- dataflow/gpu-workers/tensorflow-landsat/e2e_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index ae49e73f2fb..d8e93467fb4 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 193ce0e0b726014494aef70ae4dfd27ec3731679 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 7 Jun 2021 15:09:28 -0700 Subject: [PATCH 06/87] removed unused import --- dataflow/gpu-workers/pytorch-minimal/e2e_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py index d0a25e927d9..b1d612593f6 100644 --- a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py +++ b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py @@ -20,7 +20,6 @@ from conftest import Utils except ModuleNotFoundError: Utils = None -from google.cloud import storage import pytest NAME = "dataflow-gpu-pytorch" From abcbaaed0f9b822e6dea8e81dc6fa3cbd918ff7c Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 7 Jun 2021 15:09:37 -0700 Subject: [PATCH 07/87] add test --- .../tensorflow-minimal/e2e_test.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 dataflow/gpu-workers/tensorflow-minimal/e2e_test.py diff --git a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py new file mode 100644 index 00000000000..de5b46738ae --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + # `conftest` cannot be imported when running in `nox`, but we still + # try to import it for the autocomplete when writing the tests. + from conftest import Utils +except ModuleNotFoundError: + Utils = None +import pytest + +NAME = "dataflow-gpu-tensorflow" + + +@pytest.fixture(scope="session") +def bucket_name(utils: Utils) -> str: + yield from utils.storage_bucket(NAME) + + +@pytest.fixture(scope="session") +def worker_image(utils: Utils) -> str: + yield from utils.cloud_build_submit(NAME, config="build.yaml") + + +def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: + # Run the Beam pipeline in Dataflow making sure GPUs are used. 
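+    # `_TEMP_LOCATION` feeds the --temp_location option declared in
+    # run.yaml; using the per-test bucket from the `bucket_name` fixture
+    # keeps runs isolated.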
+ utils.cloud_build_submit( + config="run.yaml", + substitutions={"_TEMP_LOCATION": f"gs://{bucket_name}/temp"}, + ) From f6af3384e2b343e249a6506b9341c9ececc5bb56 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 7 Jun 2021 16:42:34 -0700 Subject: [PATCH 08/87] make image name unique --- dataflow/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index f8191effd89..97675101450 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -200,7 +200,7 @@ def cloud_build_submit( ] print(cmd) subprocess.run(cmd, check=True) - yield f"gcr.io/{project}/{image_name}:latest" + yield f"gcr.io/{project}/{image_name}-{UUID}:latest" else: raise ValueError("must specify either `config` or `image_name`") @@ -210,7 +210,7 @@ def cloud_build_submit( "container", "images", "delete", - f"gcr.io/{project}/{image_name}:latest", + f"gcr.io/{project}/{image_name}-{UUID}:latest", f"--project={project}", "--quiet", ] From ba72e54eae3017f900c346f3b9aa6a4318f1e7c0 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 7 Jun 2021 17:15:11 -0700 Subject: [PATCH 09/87] fix image name --- dataflow/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 97675101450..01917e47bc1 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -194,7 +194,7 @@ def cloud_build_submit( "builds", "submit", f"--project={project}", - f"--tag=gcr.io/{project}/{image_name}:latest", + f"--tag=gcr.io/{project}/{image_name}-{UUID}:latest", *cmd_substitutions, ".", ] From ec4bcdedbdd90b221b291f7c43157507e6f57750 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 9 Jun 2021 12:25:42 -0700 Subject: [PATCH 10/87] fix resource name issues --- dataflow/conftest.py | 82 ++++++------------- .../flex-templates/streaming_beam/e2e_test.py | 4 +- .../gpu-workers/pytorch-minimal/README.md | 3 +- .../gpu-workers/pytorch-minimal/build.yaml | 10 +-- .../gpu-workers/pytorch-minimal/e2e_test.py | 7 +- dataflow/gpu-workers/pytorch-minimal/main.py | 9 +- dataflow/gpu-workers/pytorch-minimal/run.yaml | 9 +- .../gpu-workers/tensorflow-landsat/README.md | 5 +- .../gpu-workers/tensorflow-landsat/build.yaml | 10 +-- .../tensorflow-landsat/e2e_test.py | 10 ++- .../gpu-workers/tensorflow-landsat/main.py | 15 ++-- .../gpu-workers/tensorflow-landsat/run.yaml | 14 ++-- .../gpu-workers/tensorflow-minimal/README.md | 3 +- .../gpu-workers/tensorflow-minimal/build.yaml | 10 +-- .../tensorflow-minimal/e2e_test.py | 7 +- .../gpu-workers/tensorflow-minimal/main.py | 8 +- .../gpu-workers/tensorflow-minimal/run.yaml | 7 +- 17 files changed, 99 insertions(+), 114 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 01917e47bc1..046afb39daa 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -15,6 +15,7 @@ import json import multiprocessing as mp import os +import re import subprocess import sys import time @@ -31,6 +32,9 @@ RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds +HYPHEN_NAME_RE = re.compile(r"[^\w\d-]+") +UNDERSCORE_NAME_RE = re.compile(r"[^\w\d_]+") + @dataclass class Utils: @@ -40,25 +44,33 @@ class Utils: zone: str = ZONE @staticmethod - def storage_bucket(bucket_name: str) -> str: + def hyphen_name(name: str) -> str: + return f"{HYPHEN_NAME_RE.sub('-', name)}-{UUID}" + + @staticmethod + def underscore_name(name: str) -> str: + return f"{UNDERSCORE_NAME_RE.sub('_', name)}-{UUID}" + + @staticmethod + def storage_bucket(name: str) -> str: from google.cloud import 
storage storage_client = storage.Client() - bucket_unique_name = f"{bucket_name}-{UUID}" - bucket = storage_client.create_bucket(bucket_unique_name) + bucket = storage_client.create_bucket(Utils.hyphen_name(name)) - print(f"storage_bucket: {bucket_unique_name}") - yield bucket_unique_name + print(f"storage_bucket: {bucket.name}") + yield bucket.name bucket.delete(force=True) @staticmethod - def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: + def bigquery_dataset(name: str, project: str = PROJECT) -> str: from google.cloud import bigquery bigquery_client = bigquery.Client() + dataset = bigquery_client.create_dataset( - bigquery.Dataset(f"{project}.{dataset_name.replace('-', '_')}_{UUID}") + bigquery.Dataset(f"{project}.{Utils.underscore_name(name)}") ) print(f"bigquery_dataset: {dataset.full_dataset_id}") @@ -77,11 +89,11 @@ def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: yield dict(row) @staticmethod - def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: + def pubsub_topic(name: str, project: str = PROJECT) -> str: from google.cloud import pubsub publisher_client = pubsub.PublisherClient() - topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}") + topic_path = publisher_client.topic_path(project, Utils.hyphen_name(name)) topic = publisher_client.create_topic(topic_path) print(f"pubsub_topic: {topic.name}") @@ -98,14 +110,14 @@ def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: @staticmethod def pubsub_subscription( topic_path: str, - subscription_name: str, + name: str, project: str = PROJECT, ) -> str: from google.cloud import pubsub subscriber = pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( - project, f"{subscription_name}-{UUID}" + project, Utils.hyphen_name(name) ) subscription = subscriber.create_subscription(subscription_path, topic_path) @@ -200,7 +212,7 @@ def cloud_build_submit( ] print(cmd) subprocess.run(cmd, check=True) - yield f"gcr.io/{project}/{image_name}-{UUID}:latest" + yield f"{image_name}-{UUID}:latest" else: raise ValueError("must specify either `config` or `image_name`") @@ -217,36 +229,6 @@ def cloud_build_submit( "container", "images", "delete", f"gcr.io/{project}/{image_name}-{UUID}:latest", f"--project={project}", "--quiet", ] print(cmd) subprocess.run(cmd, check=True) - @staticmethod - def dataflow_job_id_from_job_name( - job_name: str, - project: str = PROJECT, - ) -> Optional[str]: - from googleapiclient.discovery import build - - dataflow = build("dataflow", "v1b3") - - # Only return the 50 most recent results - our job is likely to be in here. - # If the job is not found, first try increasing this number. - # For more info see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list - jobs_request = ( - dataflow.projects() - .jobs() - .list( - projectId=project, - filter="ACTIVE", - pageSize=50, - ) - ) - response = jobs_request.execute() - - # Search for the job in the list that has our name (names are unique) - for job in response["jobs"]: - if job["name"] == job_name: - return job["id"] - return None - @staticmethod def dataflow_jobs_wait( job_id: str, @@ -303,20 +285,10 @@ def dataflow_jobs_cancel_by_job_id( ] subprocess.run(cmd, check=True) - @staticmethod - def dataflow_jobs_cancel_by_job_name( - job_name: str, project: str = PROJECT, region: str = REGION - ) -> None: - # To cancel a dataflow job, we need its ID, not its name. - # If it doesn't exist, job_id will be equal to None.
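The payoff of the new `hyphen_name` and `underscore_name` helpers above is that any sample path becomes a valid, collision-free resource name. A self-contained illustration (the UUID value is made up; conftest.py generates one per test session from the `uuid` module):

```python
import re

HYPHEN_NAME_RE = re.compile(r"[^\w\d-]+")
UUID = "1a2b3c"  # hypothetical; the real value comes from uuid.uuid4()

def hyphen_name(name: str) -> str:
    # Collapse each run of disallowed characters into a single hyphen,
    # then append the session UUID so parallel test runs never collide.
    return f"{HYPHEN_NAME_RE.sub('-', name)}-{UUID}"

print(hyphen_name("dataflow/gpu-workers/pytorch-minimal"))
# dataflow-gpu-workers-pytorch-minimal-1a2b3c
```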
- job_id = Utils.dataflow_job_id_from_job_name(project, job_name) - if job_id is not None: - Utils.dataflow_jobs_cancel_by_job_id(job_id, project, region) - @staticmethod def dataflow_flex_template_build( bucket_name: str, - template_image: str, + image_name: str, metadata_file: str, project: str = PROJECT, template_file: str = "template.json", @@ -330,7 +302,7 @@ def dataflow_flex_template_build( "build", template_gcs_path, f"--project={project}", - f"--image={template_image}", + f"--image=gcr.io/{project}/{image_name}", "--sdk-language=PYTHON", f"--metadata-file={metadata_file}", ] @@ -353,7 +325,7 @@ def dataflow_flex_template_run( import yaml # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run - unique_job_name = f"{job_name}-{UUID}" + unique_job_name = Utils.hyphen_name(job_name) print(f"dataflow_job_name: {unique_job_name}") cmd = [ "gcloud", diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 08f0676b446..ce0ba9193fc 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -21,7 +21,7 @@ Utils = None import pytest -NAME = "dataflow-flex-templates-streaming-beam" +NAME = "dataflow/flex-templates/streaming-beam" @pytest.fixture(scope="session") @@ -66,7 +66,7 @@ def flex_template_image(utils: Utils) -> str: def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str: yield from utils.dataflow_flex_template_build( bucket_name=bucket_name, - template_image=flex_template_image, + image_name=flex_template_image, metadata_file="metadata.json", ) diff --git a/dataflow/gpu-workers/pytorch-minimal/README.md b/dataflow/gpu-workers/pytorch-minimal/README.md index 15a81d95fc7..4c8b7fa8370 100644 --- a/dataflow/gpu-workers/pytorch-minimal/README.md +++ b/dataflow/gpu-workers/pytorch-minimal/README.md @@ -27,12 +27,11 @@ with the same Python version as the workers. ```sh export REGION="us-central1" -export WORKER_ZONE="us-central1-f" export GPU_TYPE="nvidia-tesla-t4" gcloud beta builds submit \ --config run.yaml \ - --substitutions _REGION=$REGION,_WORKER_ZONE=$WORKER_ZONE,_GPU_TYPE=$GPU_TYPE \ + --substitutions _REGION=$REGION,_GPU_TYPE=$GPU_TYPE \ --no-source ``` diff --git a/dataflow/gpu-workers/pytorch-minimal/build.yaml b/dataflow/gpu-workers/pytorch-minimal/build.yaml index c72876e2623..eed5c16aa70 100644 --- a/dataflow/gpu-workers/pytorch-minimal/build.yaml +++ b/dataflow/gpu-workers/pytorch-minimal/build.yaml @@ -19,14 +19,14 @@ # https://cloud.google.com/build/docs/build-config # ----------------------------------------------------------------------------- +substitutions: + _IMAGE: samples/dataflow/pytorch-gpu:latest + steps: - name: gcr.io/cloud-builders/docker - args: - - build - - --tag=gcr.io/$PROJECT_ID/samples/dataflow/pytorch-gpu:latest - - . + args: [ build, --tag=gcr.io/$PROJECT_ID/$_IMAGE, . 
] -images: [gcr.io/$PROJECT_ID/samples/dataflow/pytorch-gpu:latest] +images: [ gcr.io/$PROJECT_ID/$_IMAGE ] options: machineType: E2_HIGHCPU_8 diff --git a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py index b1d612593f6..84a965d4a1f 100644 --- a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py +++ b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py @@ -22,7 +22,7 @@ Utils = None import pytest -NAME = "dataflow-gpu-pytorch" +NAME = "dataflow/gpu-workers/pytorch-minimal" @pytest.fixture(scope="session") @@ -39,5 +39,8 @@ def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: # Run the Beam pipeline in Dataflow making sure GPUs are used. utils.cloud_build_submit( config="run.yaml", - substitutions={"_TEMP_LOCATION": f"gs://{bucket_name}/temp"}, + substitutions={ + "_IMAGE": worker_image, + "_TEMP_LOCATION": f"gs://{bucket_name}/temp", + }, ) diff --git a/dataflow/gpu-workers/pytorch-minimal/main.py b/dataflow/gpu-workers/pytorch-minimal/main.py index 19b5a740fba..5ae070d7948 100644 --- a/dataflow/gpu-workers/pytorch-minimal/main.py +++ b/dataflow/gpu-workers/pytorch-minimal/main.py @@ -18,6 +18,7 @@ import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.pvalue import AsSingleton import torch @@ -41,7 +42,13 @@ def run(input_text: str, beam_args: Optional[List[str]] = None) -> None: ( pipeline | "Create data" >> beam.Create([input_text]) - | "Check GPU availability" >> beam.Map(check_gpus) + | "Check GPU availability" + >> beam.Map( + lambda x, unused_side_input: x, + unused_side_input=beam.pvalue.AsSingleton( + pipeline | beam.Create([None]) | beam.Map(check_gpus) + ), + ) | "My transform" >> beam.Map(logging.info) ) pipeline.run() diff --git a/dataflow/gpu-workers/pytorch-minimal/run.yaml b/dataflow/gpu-workers/pytorch-minimal/run.yaml index c5e74f4d77a..fb915831fe1 100644 --- a/dataflow/gpu-workers/pytorch-minimal/run.yaml +++ b/dataflow/gpu-workers/pytorch-minimal/run.yaml @@ -26,14 +26,14 @@ # ----------------------------------------------------------------------------- substitutions: + _IMAGE: samples/dataflow/pytorch-gpu:latest _TEMP_LOCATION: '' _REGION: us-central1 - _WORKER_ZONE: us-central1-f _GPU_TYPE: nvidia-tesla-t4 _GPU_COUNT: '1' steps: -- name: gcr.io/$PROJECT_ID/samples/dataflow/pytorch-gpu:latest +- name: gcr.io/$PROJECT_ID/$_IMAGE entrypoint: python args: - /pipeline/main.py @@ -41,9 +41,8 @@ steps: - --project=$PROJECT_ID - --region=$_REGION - --temp_location=$_TEMP_LOCATION - - --worker_harness_container_image=gcr.io/$PROJECT_ID/samples/dataflow/pytorch-gpu:latest - - --worker_zone=$_WORKER_ZONE - - --disk_size_gb=50 + - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE + - --disk_size_gb=20 - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 diff --git a/dataflow/gpu-workers/tensorflow-landsat/README.md b/dataflow/gpu-workers/tensorflow-landsat/README.md index eeb91e7a5e6..a9b0fd7aa2a 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/README.md +++ b/dataflow/gpu-workers/tensorflow-landsat/README.md @@ -33,14 +33,13 @@ We launch the job using the worker image to make sure the job launches with the same Python version as the workers. 
```sh -export GCS_PATH="gs://$BUCKET/samples/dataflow/landsat" +export OUTPUT_PATH="gs://$BUCKET/samples/dataflow/landsat/output-images/" export REGION="us-central1" -export WORKER_ZONE="us-central1-f" export GPU_TYPE="nvidia-tesla-t4" gcloud beta builds submit \ --config run.yaml \ - --substitutions _GCS_PATH=$GCS_PATH,_REGION=$REGION,_WORKER_ZONE=$WORKER_ZONE,_GPU_TYPE=$GPU_TYPE \ + --substitutions _GCS_PATH=$GCS_PATH,_REGION=$REGION,_GPU_TYPE=$GPU_TYPE \ --no-source ``` diff --git a/dataflow/gpu-workers/tensorflow-landsat/build.yaml b/dataflow/gpu-workers/tensorflow-landsat/build.yaml index b2b81b8f92d..559452e3868 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/build.yaml +++ b/dataflow/gpu-workers/tensorflow-landsat/build.yaml @@ -19,14 +19,14 @@ # https://cloud.google.com/build/docs/build-config # ----------------------------------------------------------------------------- +substitutions: + _IMAGE: samples/dataflow/landsat-gpu:latest + steps: - name: gcr.io/cloud-builders/docker - args: - - build - - --tag=gcr.io/$PROJECT_ID/samples/dataflow/landsat-gpu:latest - - . + args: [ build, --tag=gcr.io/$PROJECT_ID/$_IMAGE, . ] -images: [gcr.io/$PROJECT_ID/samples/dataflow/landsat-gpu:latest] +images: [ gcr.io/$PROJECT_ID/$_IMAGE ] options: machineType: E2_HIGHCPU_8 diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index d8e93467fb4..f35530be4f7 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -23,7 +23,7 @@ from google.cloud import storage import pytest -NAME = "dataflow-gpu-landsat" +NAME = "dataflow/gpu-workers/tensorflow-landsat" @pytest.fixture(scope="session") @@ -40,12 +40,16 @@ def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: # Run the Beam pipeline in Dataflow making sure GPUs are used. utils.cloud_build_submit( config="run.yaml", - substitutions={"_GCS_PATH": f"gs://{bucket_name}"}, + substitutions={ + "_IMAGE": worker_image, + "_TEMP_LOCATION": f"gs://{bucket_name}/temp", + "_OUTPUT_PATH": f"gs://{bucket_name}/outputs/", + }, ) # Check that output files were created and are not empty. storage_client = storage.Client() output_files = list(storage_client.list_blobs(bucket_name, prefix="outputs/")) - assert len(output_files) > 0, "No output files found" + assert len(output_files) > 0, f"No files found in gs://{bucket_name}/outputs/" for output_file in output_files: assert output_file.size > 0, f"Output file is empty: {output_file.name}" diff --git a/dataflow/gpu-workers/tensorflow-landsat/main.py b/dataflow/gpu-workers/tensorflow-landsat/main.py index 00216422633..ac28f5ecec2 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/main.py +++ b/dataflow/gpu-workers/tensorflow-landsat/main.py @@ -138,15 +138,12 @@ def check_gpus(element: Any, gpus_optional: bool) -> Any: return element -def get_band_paths( - scene: str, band_names: List[str], unused_side_input: Any -) -> Tuple[str, List[str]]: +def get_band_paths(scene: str, band_names: List[str]) -> Tuple[str, List[str]]: """Gets the Cloud Storage paths for each band in a Landsat scene. Args: scene: Landsat 8 scene ID. band_names: List of the band names corresponding to [Red, Green, Blue] channels. - unused_side_input: Used to wait for the GPU check, can be safely ignored. Returns: A (scene, band_paths) pair. 
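The hunk that follows moves the GPU check into a side input. Stripped of the Landsat specifics, the gating idiom looks roughly like this (with `check_gpus` stubbed out): a `beam.Map` cannot fire until its `AsSingleton` side input has been computed, so every element implicitly waits for the one-off check.

```python
import apache_beam as beam

def check_gpus(element):
    # Stand-in for the real check, which raises or warns when the
    # worker has no visible GPU.
    return element

with beam.Pipeline() as pipeline:
    gpu_check = (
        pipeline
        | "Seed" >> beam.Create([None])
        | "Check GPUs" >> beam.Map(check_gpus)
    )
    (
        pipeline
        | "Data" >> beam.Create(["scene-1", "scene-2"])
        | "Gate on GPU check"
        >> beam.Map(
            lambda x, unused_side_input: x,
            unused_side_input=beam.pvalue.AsSingleton(gpu_check),
        )
        | "Process" >> beam.Map(print)
    )
```

Because the side-input branch is seeded with a single element, the check effectively runs once per pipeline rather than once per element.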
@@ -288,16 +285,14 @@ def run( ( pipeline | "Create scene IDs" >> beam.Create(scenes) - | "Get RGB band paths" + | "Check GPU availability" >> beam.Map( - get_band_paths, - rgb_band_names, + lambda x, unused_side_input: x, unused_side_input=beam.pvalue.AsSingleton( - pipeline - | beam.Create([None]) - | "Check GPUs" >> beam.Map(check_gpus, gpus_optional) + pipeline | beam.Create([None]) | beam.Map(check_gpus) ), ) + | "Get RGB band paths" >> beam.Map(get_band_paths, rgb_band_names) | "Load RGB band values" >> beam.MapTuple(load_values) | "Preprocess pixels" >> beam.MapTuple(preprocess_pixels, min_value, max_value, gamma) diff --git a/dataflow/gpu-workers/tensorflow-landsat/run.yaml b/dataflow/gpu-workers/tensorflow-landsat/run.yaml index 8b8a6b655cc..022d180ee34 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/run.yaml +++ b/dataflow/gpu-workers/tensorflow-landsat/run.yaml @@ -26,25 +26,25 @@ # ----------------------------------------------------------------------------- substitutions: - _GCS_PATH: please set --substitutions _GCS_PATH=gs://my-bucket/samples/dataflow/landsat + _OUTPUT_PATH: please run with --substitutions _OUTPUT_PATH=gs://$BUCKET/samples/dataflow/landsat/outputs/ + _IMAGE: samples/dataflow/landsat-gpu:latest + _TEMP_LOCATION: '' _REGION: us-central1 - _WORKER_ZONE: us-central1-f _GPU_TYPE: nvidia-tesla-t4 _GPU_COUNT: '1' steps: -- name: gcr.io/$PROJECT_ID/samples/dataflow/landsat-gpu:latest +- name: gcr.io/$PROJECT_ID/$_IMAGE entrypoint: python args: - /pipeline/main.py - - --output-path-prefix=$_GCS_PATH/outputs/ + - --output-path-prefix=$_OUTPUT_PATH - --runner=DataflowRunner - --project=$PROJECT_ID - --region=$_REGION - - --temp_location=$_GCS_PATH/temp + - --temp_location=$_TEMP_LOCATION - --worker_machine_type=custom-1-13312-ext - - --worker_harness_container_image=gcr.io/$PROJECT_ID/samples/dataflow/landsat-gpu:latest - - --worker_zone=$_WORKER_ZONE + - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 diff --git a/dataflow/gpu-workers/tensorflow-minimal/README.md b/dataflow/gpu-workers/tensorflow-minimal/README.md index 15a81d95fc7..4c8b7fa8370 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/README.md +++ b/dataflow/gpu-workers/tensorflow-minimal/README.md @@ -27,12 +27,11 @@ with the same Python version as the workers. ```sh export REGION="us-central1" -export WORKER_ZONE="us-central1-f" export GPU_TYPE="nvidia-tesla-t4" gcloud beta builds submit \ --config run.yaml \ - --substitutions _REGION=$REGION,_WORKER_ZONE=$WORKER_ZONE,_GPU_TYPE=$GPU_TYPE \ + --substitutions _REGION=$REGION,_GPU_TYPE=$GPU_TYPE \ --no-source ``` diff --git a/dataflow/gpu-workers/tensorflow-minimal/build.yaml b/dataflow/gpu-workers/tensorflow-minimal/build.yaml index 84f60f90255..9362f3c57cb 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/build.yaml +++ b/dataflow/gpu-workers/tensorflow-minimal/build.yaml @@ -19,14 +19,14 @@ # https://cloud.google.com/build/docs/build-config # ----------------------------------------------------------------------------- +substitutions: + _IMAGE: samples/dataflow/tensorflow-gpu:latest + steps: - name: gcr.io/cloud-builders/docker - args: - - build - - --tag=gcr.io/$PROJECT_ID/samples/dataflow/tensorflow-gpu:latest - - . + args: [ build, --tag=gcr.io/$PROJECT_ID/$_IMAGE, . 
] -images: [gcr.io/$PROJECT_ID/samples/dataflow/tensorflow-gpu:latest] +images: [ gcr.io/$PROJECT_ID/$_IMAGE ] options: machineType: E2_HIGHCPU_8 diff --git a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py index de5b46738ae..73547f1a8bf 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py @@ -22,7 +22,7 @@ Utils = None import pytest -NAME = "dataflow-gpu-tensorflow" +NAME = "dataflow/gpu-workers/tensorflow-minimal" @pytest.fixture(scope="session") @@ -39,5 +39,8 @@ def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: # Run the Beam pipeline in Dataflow making sure GPUs are used. utils.cloud_build_submit( config="run.yaml", - substitutions={"_TEMP_LOCATION": f"gs://{bucket_name}/temp"}, + substitutions={ + "_IMAGE": worker_image, + "_TEMP_LOCATION": f"gs://{bucket_name}/temp", + }, ) diff --git a/dataflow/gpu-workers/tensorflow-minimal/main.py b/dataflow/gpu-workers/tensorflow-minimal/main.py index 924f4e0935d..33b295ac69b 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/main.py +++ b/dataflow/gpu-workers/tensorflow-minimal/main.py @@ -42,7 +42,13 @@ def run(input_text: str, beam_args: Optional[List[str]] = None) -> None: ( pipeline | "Create data" >> beam.Create([input_text]) - | "Check GPU availability" >> beam.Map(check_gpus) + | "Check GPU availability" + >> beam.Map( + lambda x, unused_side_input: x, + unused_side_input=beam.pvalue.AsSingleton( + pipeline | beam.Create([None]) | beam.Map(check_gpus) + ), + ) | "My transform" >> beam.Map(logging.info) ) pipeline.run() diff --git a/dataflow/gpu-workers/tensorflow-minimal/run.yaml b/dataflow/gpu-workers/tensorflow-minimal/run.yaml index 769ca1eae3f..ca69dcaa353 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/run.yaml +++ b/dataflow/gpu-workers/tensorflow-minimal/run.yaml @@ -26,14 +26,14 @@ # ----------------------------------------------------------------------------- substitutions: + _IMAGE: samples/dataflow/tensorflow-gpu:latest _TEMP_LOCATION: '' _REGION: us-central1 - _WORKER_ZONE: us-central1-f _GPU_TYPE: nvidia-tesla-t4 _GPU_COUNT: '1' steps: -- name: gcr.io/$PROJECT_ID/samples/dataflow/tensorflow-gpu:latest +- name: gcr.io/$PROJECT_ID/$_IMAGE entrypoint: python args: - /pipeline/main.py @@ -41,8 +41,7 @@ steps: - --project=$PROJECT_ID - --region=$_REGION - --temp_location=$_TEMP_LOCATION - - --worker_harness_container_image=gcr.io/$PROJECT_ID/samples/dataflow/tensorflow-gpu:latest - - --worker_zone=$_WORKER_ZONE + - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 From d24fbf9ebe7e51ff2bf971821af1890f21d2ebc1 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 9 Jun 2021 12:37:45 -0700 Subject: [PATCH 11/87] fix lint issues --- dataflow/conftest.py | 2 +- dataflow/flex-templates/__init__.py | 1 - dataflow/gpu-workers/pytorch-minimal/README.md | 3 --- dataflow/gpu-workers/pytorch-minimal/main.py | 4 ---- dataflow/gpu-workers/tensorflow-landsat/README.md | 3 --- dataflow/gpu-workers/tensorflow-landsat/main.py | 5 ----- dataflow/gpu-workers/tensorflow-minimal/README.md | 3 --- dataflow/gpu-workers/tensorflow-minimal/main.py | 3 --- 8 files changed, 1 insertion(+), 23 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 046afb39daa..bd3e08349be 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -260,7 
+260,7 @@ def dataflow_jobs_wait( print(response) if response["currentState"] == status: return True - except: + except Exception: pass time.sleep(sleep_time_seconds) return False diff --git a/dataflow/flex-templates/__init__.py b/dataflow/flex-templates/__init__.py index 8b137891791..e69de29bb2d 100644 --- a/dataflow/flex-templates/__init__.py +++ b/dataflow/flex-templates/__init__.py @@ -1 +0,0 @@ - diff --git a/dataflow/gpu-workers/pytorch-minimal/README.md b/dataflow/gpu-workers/pytorch-minimal/README.md index 4c8b7fa8370..3ec270791f9 100644 --- a/dataflow/gpu-workers/pytorch-minimal/README.md +++ b/dataflow/gpu-workers/pytorch-minimal/README.md @@ -35,9 +35,6 @@ gcloud beta builds submit \ --no-source ``` -> ℹ️ Make sure the GPU type you choose is available in the worker zone for the job. -> For more information, see [GPU availability](https://cloud.google.com/dataflow/docs/resources/locations#gpu_availability). - ## What's next? For a more complete example, take a look at diff --git a/dataflow/gpu-workers/pytorch-minimal/main.py b/dataflow/gpu-workers/pytorch-minimal/main.py index 5ae070d7948..3b36cc0fcd5 100644 --- a/dataflow/gpu-workers/pytorch-minimal/main.py +++ b/dataflow/gpu-workers/pytorch-minimal/main.py @@ -18,7 +18,6 @@ import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions -from apache_beam.pvalue import AsSingleton import torch @@ -35,9 +34,6 @@ def check_gpus(element: Any, gpus_optional: bool = False) -> Any: def run(input_text: str, beam_args: Optional[List[str]] = None) -> None: beam_options = PipelineOptions(beam_args, save_main_session=True) - - # We currently cannot use the `with` statement to run without waiting. - # https://issues.apache.org/jira/browse/BEAM-12455 pipeline = beam.Pipeline(options=beam_options) ( pipeline diff --git a/dataflow/gpu-workers/tensorflow-landsat/README.md b/dataflow/gpu-workers/tensorflow-landsat/README.md index a9b0fd7aa2a..e91193b3a48 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/README.md +++ b/dataflow/gpu-workers/tensorflow-landsat/README.md @@ -42,6 +42,3 @@ gcloud beta builds submit \ --substitutions _GCS_PATH=$GCS_PATH,_REGION=$REGION,_GPU_TYPE=$GPU_TYPE \ --no-source ``` - -> ℹ️ Make sure the GPU type you choose is available in the worker zone for the job. -> For more information, see [GPU availability](https://cloud.google.com/dataflow/docs/resources/locations#gpu_availability). diff --git a/dataflow/gpu-workers/tensorflow-landsat/main.py b/dataflow/gpu-workers/tensorflow-landsat/main.py index ac28f5ecec2..6691c457d9d 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/main.py +++ b/dataflow/gpu-workers/tensorflow-landsat/main.py @@ -278,10 +278,6 @@ def run( # We currently cannot use the `with` statement to run without waiting. # https://issues.apache.org/jira/browse/BEAM-12455 pipeline = beam.Pipeline(options=beam_options) - - # Convert Landsat 8 scenes into images. - # ℹ️ We pass `gpu_check` as an unused side input to force that step in - # the pipeline to wait for the check before continuing. 
( pipeline | "Create scene IDs" >> beam.Create(scenes) @@ -305,7 +301,6 @@ def run( ) | "Save to Cloud Storage" >> beam.MapTuple(save_to_gcs, output_path_prefix) ) - pipeline.run() diff --git a/dataflow/gpu-workers/tensorflow-minimal/README.md b/dataflow/gpu-workers/tensorflow-minimal/README.md index 4c8b7fa8370..3ec270791f9 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/README.md +++ b/dataflow/gpu-workers/tensorflow-minimal/README.md @@ -35,9 +35,6 @@ gcloud beta builds submit \ --no-source ``` -> ℹ️ Make sure the GPU type you choose is available in the worker zone for the job. -> For more information, see [GPU availability](https://cloud.google.com/dataflow/docs/resources/locations#gpu_availability). - ## What's next? For a more complete example, take a look at diff --git a/dataflow/gpu-workers/tensorflow-minimal/main.py b/dataflow/gpu-workers/tensorflow-minimal/main.py index 33b295ac69b..f039034349c 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/main.py +++ b/dataflow/gpu-workers/tensorflow-minimal/main.py @@ -35,9 +35,6 @@ def check_gpus(element: Any, gpus_optional: bool = False) -> Any: def run(input_text: str, beam_args: Optional[List[str]] = None) -> None: beam_options = PipelineOptions(beam_args, save_main_session=True) - - # We currently cannot use the `with` statement to run without waiting. - # https://issues.apache.org/jira/browse/BEAM-12455 pipeline = beam.Pipeline(options=beam_options) ( pipeline From d0593cbc79e239b16e95ec6c6ef10eee13d8cfac Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 9 Jun 2021 13:08:01 -0700 Subject: [PATCH 12/87] fix underscore_name --- dataflow/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index bd3e08349be..7042d3f0327 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -49,7 +49,7 @@ def hyphen_name(name: str) -> str: @staticmethod def underscore_name(name: str) -> str: - return f"{UNDERSCORE_NAME_RE.sub('_', name)}-{UUID}" + return f"{UNDERSCORE_NAME_RE.sub('_', name)}_{UUID}" @staticmethod def storage_bucket(name: str) -> str: From 07a4b444f6eef3c0113f4f3db22850f5e155201d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 9 Jun 2021 13:45:20 -0700 Subject: [PATCH 13/87] wait for jobs --- dataflow/conftest.py | 104 +++++++++++++----- .../flex-templates/streaming_beam/e2e_test.py | 2 +- .../gpu-workers/pytorch-minimal/e2e_test.py | 6 + dataflow/gpu-workers/pytorch-minimal/run.yaml | 2 + .../tensorflow-landsat/e2e_test.py | 6 + .../gpu-workers/tensorflow-landsat/run.yaml | 2 + .../tensorflow-minimal/e2e_test.py | 6 + .../gpu-workers/tensorflow-minimal/run.yaml | 2 + 8 files changed, 102 insertions(+), 28 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 7042d3f0327..c5e9623889a 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -13,13 +13,14 @@ from dataclasses import dataclass import itertools import json +import logging import multiprocessing as mp import os import re import subprocess import sys import time -from typing import Any, Callable, Dict, Iterable, Optional +from typing import Any, Callable, Dict, Iterable, List, Optional, Union import uuid import pytest @@ -230,40 +231,89 @@ def cloud_build_submit( subprocess.run(cmd, check=True) @staticmethod - def dataflow_jobs_wait( - job_id: str, + def dataflow_jobs_get( + job_id: Optional[str] = None, + job_name: Optional[str] = None, project: str = PROJECT, - status: str = "JOB_STATE_RUNNING", - ) -> bool: + region: str = REGION, + 
list_page_size=100, + ) -> Optional[Dict[str, Any]]: from googleapiclient.discovery import build dataflow = build("dataflow", "v1b3") - sleep_time_seconds = 30 - max_sleep_time = 10 * 60 + if job_id: + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get + request = ( + dataflow.projects() + .jobs() + .get( + projectId=project, + jobId=job_id, + view="JOB_VIEW_SUMMARY", + ) + ) + job = request.execute() + print(job) + return job + + elif job_name: + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + request = ( + dataflow.projects() + .jobs() + .list( + projectId=project, + filter="ACTIVE", + pageSize=list_page_size, + location=region, + ) + ) + for job in request.execute()["jobs"]: + if job["name"] == job_name: + print(job) + return job + return None + + else: + raise ValueError("must specify either `job_id` or `job_name`") - print(f"Waiting for Dataflow job ID: {job_id} (until status {status})") - for _ in range(0, max_sleep_time, sleep_time_seconds): + @staticmethod + def dataflow_jobs_wait( + job_id: Optional[str] = None, + job_name: Optional[str] = None, + project: str = PROJECT, + region: str = REGION, + until_status: Union[str, Iterable[str]] = { + "JOB_STATE_DONE", + "JOB_STATE_FAILED", + "JOB_STATE_CANCELLED", + }, + timeout_sec: str = 600, + poll_interval_sec=30, + list_page_size=100, + ) -> Optional[str]: + """For a list of all the valid states: + https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs#Job.JobState + """ + target_status = ( + {until_status} if isinstance(until_status, str) else set(until_status) + ) + print(f"Waiting for Dataflow job until {target_status}") + status = None + for _ in range(0, timeout_sec, poll_interval_sec): try: - # For more info see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get - jobs_request = ( - dataflow.projects() - .jobs() - .get( - projectId=project, - jobId=job_id, - view="JOB_VIEW_SUMMARY", - ) + status = Utils.dataflow_jobs_get( + job_id, job_name, project, region, list_page_size ) - response = jobs_request.execute() - print(response) - if response["currentState"] == status: - return True - except Exception: - pass - time.sleep(sleep_time_seconds) - return False + if status in target_status: + return status + except Exception as e: + logging.warning(e) + time.sleep(poll_interval_sec) + return status @staticmethod def dataflow_jobs_cancel_by_job_id( diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index ce0ba9193fc..b5816c58c78 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -93,7 +93,7 @@ def test_flex_template_run( # Since this is a streaming job, it will never finish running. # First, lets wait until the job is running. - utils.dataflow_jobs_wait(job_id) + utils.dataflow_jobs_wait(job_id, until_status="JOB_STATE_RUNNING") # Then, wait a minute for data to arrive, get processed, and cancel it. 
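At its core, the `dataflow_jobs_wait` helper added above is a bounded polling loop over the Dataflow REST API. Reduced to essentials (a sketch: the job-name lookup, retries, and logging are omitted, and the state field is `currentState`, the name this series eventually settles on):

```python
import time
from googleapiclient.discovery import build

def wait_for_state(project: str, job_id: str, targets: set,
                   timeout_sec: int = 600, poll_sec: int = 30) -> str:
    dataflow = build("dataflow", "v1b3")
    for _ in range(0, timeout_sec, poll_sec):
        job = (
            dataflow.projects()
            .jobs()
            .get(projectId=project, jobId=job_id, view="JOB_VIEW_SUMMARY")
            .execute()
        )
        if job["currentState"] in targets:
            return job["currentState"]
        time.sleep(poll_sec)
    raise TimeoutError(f"job {job_id} never reached {targets} in {timeout_sec}s")
```

The real helper layers name-based lookup and exception handling on top of this loop.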
time.sleep(60) diff --git a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py index 84a965d4a1f..8d6eee7253e 100644 --- a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py +++ b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py @@ -37,10 +37,16 @@ def worker_image(utils: Utils) -> str: def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: # Run the Beam pipeline in Dataflow making sure GPUs are used. + job_name = utils.hyphen_name(NAME) utils.cloud_build_submit( config="run.yaml", substitutions={ "_IMAGE": worker_image, + "_JOB_NAME": job_name, "_TEMP_LOCATION": f"gs://{bucket_name}/temp", }, ) + + # Wait until the job finishes. + status = utils.dataflow_jobs_wait(job_name=job_name) + assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" diff --git a/dataflow/gpu-workers/pytorch-minimal/run.yaml b/dataflow/gpu-workers/pytorch-minimal/run.yaml index fb915831fe1..7e352a7c924 100644 --- a/dataflow/gpu-workers/pytorch-minimal/run.yaml +++ b/dataflow/gpu-workers/pytorch-minimal/run.yaml @@ -27,6 +27,7 @@ substitutions: _IMAGE: samples/dataflow/pytorch-gpu:latest + _JOB_NAME: '' _TEMP_LOCATION: '' _REGION: us-central1 _GPU_TYPE: nvidia-tesla-t4 @@ -40,6 +41,7 @@ steps: - --runner=DataflowRunner - --project=$PROJECT_ID - --region=$_REGION + - --job_name=$_JOB_NAME - --temp_location=$_TEMP_LOCATION - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --disk_size_gb=20 diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index f35530be4f7..9b4007ce9c6 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -38,15 +38,21 @@ def worker_image(utils: Utils) -> str: def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: # Run the Beam pipeline in Dataflow making sure GPUs are used. + job_name = utils.hyphen_name(NAME) utils.cloud_build_submit( config="run.yaml", substitutions={ "_IMAGE": worker_image, + "_JOB_NAME": job_name, "_TEMP_LOCATION": f"gs://{bucket_name}/temp", "_OUTPUT_PATH": f"gs://{bucket_name}/outputs/", }, ) + # Wait until the job finishes. + status = utils.dataflow_jobs_wait(job_name=job_name) + assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" + # Check that output files were created and are not empty. 
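A side note on why every `_JOB_NAME` is derived with `hyphen_name` instead of the raw sample path: Dataflow job names may only contain lowercase letters, digits, and hyphens, and must start with a letter. Roughly (the length limit is not checked here):

```python
import re

# Approximate Dataflow job-name rule.
JOB_NAME_RE = re.compile(r"^[a-z]([-a-z0-9]*[a-z0-9])?$")

assert JOB_NAME_RE.match("dataflow-gpu-workers-pytorch-minimal-1a2b3c")
assert not JOB_NAME_RE.match("dataflow/gpu-workers/pytorch-minimal")  # slashes are invalid
```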
storage_client = storage.Client() output_files = list(storage_client.list_blobs(bucket_name, prefix="outputs/")) diff --git a/dataflow/gpu-workers/tensorflow-landsat/run.yaml b/dataflow/gpu-workers/tensorflow-landsat/run.yaml index 022d180ee34..6eb8264e7db 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/run.yaml +++ b/dataflow/gpu-workers/tensorflow-landsat/run.yaml @@ -28,6 +28,7 @@ substitutions: _OUTPUT_PATH: please run with --substitutions _OUTPUT_PATH=gs://$BUCKET/samples/dataflow/landsat/outputs/ _IMAGE: samples/dataflow/landsat-gpu:latest + _JOB_NAME: '' _TEMP_LOCATION: '' _REGION: us-central1 _GPU_TYPE: nvidia-tesla-t4 @@ -42,6 +43,7 @@ steps: - --runner=DataflowRunner - --project=$PROJECT_ID - --region=$_REGION + - --job_name=$_JOB_NAME - --temp_location=$_TEMP_LOCATION - --worker_machine_type=custom-1-13312-ext - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE diff --git a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py index 73547f1a8bf..bce098c9cef 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py @@ -37,10 +37,16 @@ def worker_image(utils: Utils) -> str: def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: # Run the Beam pipeline in Dataflow making sure GPUs are used. + job_name = utils.hyphen_name(NAME) utils.cloud_build_submit( config="run.yaml", substitutions={ "_IMAGE": worker_image, + "_JOB_NAME": job_name, "_TEMP_LOCATION": f"gs://{bucket_name}/temp", }, ) + + # Wait until the job finishes. + status = utils.dataflow_jobs_wait(job_name=job_name) + assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" diff --git a/dataflow/gpu-workers/tensorflow-minimal/run.yaml b/dataflow/gpu-workers/tensorflow-minimal/run.yaml index ca69dcaa353..75178e5d54c 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/run.yaml +++ b/dataflow/gpu-workers/tensorflow-minimal/run.yaml @@ -27,6 +27,7 @@ substitutions: _IMAGE: samples/dataflow/tensorflow-gpu:latest + _JOB_NAME: '' _TEMP_LOCATION: '' _REGION: us-central1 _GPU_TYPE: nvidia-tesla-t4 @@ -40,6 +41,7 @@ steps: - --runner=DataflowRunner - --project=$PROJECT_ID - --region=$_REGION + - --job_name=$_JOB_NAME - --temp_location=$_TEMP_LOCATION - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver From 17258d4d7ee27fcac7032ba16e5544af214a7d96 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 9 Jun 2021 14:32:37 -0700 Subject: [PATCH 14/87] fix test requirements --- dataflow/conftest.py | 6 +++--- dataflow/gpu-workers/pytorch-minimal/requirements-test.txt | 3 +++ .../gpu-workers/tensorflow-landsat/requirements-test.txt | 1 + .../gpu-workers/tensorflow-minimal/requirements-test.txt | 3 +++ 4 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 dataflow/gpu-workers/pytorch-minimal/requirements-test.txt create mode 100644 dataflow/gpu-workers/tensorflow-minimal/requirements-test.txt diff --git a/dataflow/conftest.py b/dataflow/conftest.py index c5e9623889a..c92e1288556 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -207,7 +207,7 @@ def cloud_build_submit( "builds", "submit", f"--project={project}", - f"--tag=gcr.io/{project}/{image_name}-{UUID}:latest", + f"--tag=gcr.io//{project}/{image_name}-{UUID}:latest", *cmd_substitutions, ".", ] @@ -223,7 +223,7 @@ def cloud_build_submit( "container", "images", "delete", - 
f"gcr.io/{project}/{image_name}-{UUID}:latest", + f"gcr.io//{project}/{image_name}-{UUID}:latest", f"--project={project}", "--quiet", ] @@ -352,7 +352,7 @@ def dataflow_flex_template_build( "build", template_gcs_path, f"--project={project}", - f"--image=gcr.io/{project}/{image_name}", + f"--image=gcr.io//{project}/{image_name}", "--sdk-language=PYTHON", f"--metadata-file={metadata_file}", ] diff --git a/dataflow/gpu-workers/pytorch-minimal/requirements-test.txt b/dataflow/gpu-workers/pytorch-minimal/requirements-test.txt new file mode 100644 index 00000000000..4a9e35e0e25 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/requirements-test.txt @@ -0,0 +1,3 @@ +google-api-python-client==2.1.0 +google-cloud-storage==1.38.0 +pytest==6.2.4 diff --git a/dataflow/gpu-workers/tensorflow-landsat/requirements-test.txt b/dataflow/gpu-workers/tensorflow-landsat/requirements-test.txt index 9782f5d8d54..4a9e35e0e25 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/requirements-test.txt +++ b/dataflow/gpu-workers/tensorflow-landsat/requirements-test.txt @@ -1,2 +1,3 @@ +google-api-python-client==2.1.0 google-cloud-storage==1.38.0 pytest==6.2.4 diff --git a/dataflow/gpu-workers/tensorflow-minimal/requirements-test.txt b/dataflow/gpu-workers/tensorflow-minimal/requirements-test.txt new file mode 100644 index 00000000000..4a9e35e0e25 --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-minimal/requirements-test.txt @@ -0,0 +1,3 @@ +google-api-python-client==2.1.0 +google-cloud-storage==1.38.0 +pytest==6.2.4 From dabb8ffe2ca5e81f05ce53d20632c32c495c1207 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 9 Jun 2021 15:46:09 -0700 Subject: [PATCH 15/87] fix image prefix --- dataflow/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index c92e1288556..c5e9623889a 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -207,7 +207,7 @@ def cloud_build_submit( "builds", "submit", f"--project={project}", - f"--tag=gcr.io//{project}/{image_name}-{UUID}:latest", + f"--tag=gcr.io/{project}/{image_name}-{UUID}:latest", *cmd_substitutions, ".", ] @@ -223,7 +223,7 @@ def cloud_build_submit( "container", "images", "delete", - f"gcr.io//{project}/{image_name}-{UUID}:latest", + f"gcr.io/{project}/{image_name}-{UUID}:latest", f"--project={project}", "--quiet", ] @@ -352,7 +352,7 @@ def dataflow_flex_template_build( "build", template_gcs_path, f"--project={project}", - f"--image=gcr.io//{project}/{image_name}", + f"--image=gcr.io/{project}/{image_name}", "--sdk-language=PYTHON", f"--metadata-file={metadata_file}", ] From ffa0476abf4e57a39844588d3783ee953e4663ad Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 9 Jun 2021 17:20:55 -0700 Subject: [PATCH 16/87] get job status for wait --- dataflow/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index c5e9623889a..2b8729fcbe0 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -305,9 +305,10 @@ def dataflow_jobs_wait( status = None for _ in range(0, timeout_sec, poll_interval_sec): try: - status = Utils.dataflow_jobs_get( + job = Utils.dataflow_jobs_get( job_id, job_name, project, region, list_page_size ) + status = job["currentStatus"] if status in target_status: return status except Exception as e: From 07d1395d850ff2433e0f9c2baa57facf4cd7550a Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 10 Jun 2021 11:29:03 -0700 Subject: [PATCH 17/87] use correct region --- 
dataflow/conftest.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 2b8729fcbe0..5c8260dd2dd 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -225,6 +225,7 @@ def cloud_build_submit( "delete", f"gcr.io/{project}/{image_name}-{UUID}:latest", f"--project={project}", + "--force-delete-tags", "--quiet", ] print(cmd) @@ -271,7 +272,9 @@ def dataflow_jobs_get( location=region, ) ) - for job in request.execute()["jobs"]: + response = request.execute() + print(response) + for job in response["jobs"]: if job["name"] == job_name: print(job) return job @@ -312,7 +315,7 @@ def dataflow_jobs_wait( if status in target_status: return status except Exception as e: - logging.warning(e) + logging.exception(e) time.sleep(poll_interval_sec) return status From 6c87851c2ad7bae9ddd8a9468cd477d0a43f2e62 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 10 Jun 2021 11:29:56 -0700 Subject: [PATCH 18/87] make jobs list not region dependent --- dataflow/conftest.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 5c8260dd2dd..a876a054b7a 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -236,7 +236,6 @@ def dataflow_jobs_get( job_id: Optional[str] = None, job_name: Optional[str] = None, project: str = PROJECT, - region: str = REGION, list_page_size=100, ) -> Optional[Dict[str, Any]]: from googleapiclient.discovery import build @@ -269,7 +268,6 @@ def dataflow_jobs_get( projectId=project, filter="ACTIVE", pageSize=list_page_size, - location=region, ) ) response = request.execute() From 0d76f0beebf7758b1c3b302bc8d8c7df1ac88b18 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 10 Jun 2021 11:40:00 -0700 Subject: [PATCH 19/87] use uuid for image tags --- dataflow/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index a876a054b7a..90271c45f61 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -207,13 +207,13 @@ def cloud_build_submit( "builds", "submit", f"--project={project}", - f"--tag=gcr.io/{project}/{image_name}-{UUID}:latest", + f"--tag=gcr.io/{project}/{image_name}:{UUID}", *cmd_substitutions, ".", ] print(cmd) subprocess.run(cmd, check=True) - yield f"{image_name}-{UUID}:latest" + yield f"{image_name}:{UUID}" else: raise ValueError("must specify either `config` or `image_name`") @@ -223,7 +223,7 @@ def cloud_build_submit( "container", "images", "delete", - f"gcr.io/{project}/{image_name}-{UUID}:latest", + f"gcr.io/{project}/{image_name}:{UUID}", f"--project={project}", "--force-delete-tags", "--quiet", From ab9d03924fd61d4e84b2b9d4f6b0a717a47f2a1a Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 10 Jun 2021 11:42:32 -0700 Subject: [PATCH 20/87] update title --- dataflow/gpu-workers/tensorflow-landsat/README.md | 2 +- dataflow/gpu-workers/tensorflow-minimal/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dataflow/gpu-workers/tensorflow-landsat/README.md b/dataflow/gpu-workers/tensorflow-landsat/README.md index e91193b3a48..003dfef7294 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/README.md +++ b/dataflow/gpu-workers/tensorflow-landsat/README.md @@ -1,4 +1,4 @@ -# Workers with GPUs +# Processing Landsat satellite images with GPUs [![Open in Cloud 
Shell](http://gstatic.com/cloudssh/images/open-btn.svg)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dataflow/gpu-workers/README.md) diff --git a/dataflow/gpu-workers/tensorflow-minimal/README.md b/dataflow/gpu-workers/tensorflow-minimal/README.md index 3ec270791f9..a645dbb411b 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/README.md +++ b/dataflow/gpu-workers/tensorflow-minimal/README.md @@ -1,4 +1,4 @@ -# PyTorch GPU minimal pipeline +# TensorFlow GPU minimal pipeline ## Before you begin From 37816360002adef1ebe01d9dcf3cdaad42b6ec98 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 10 Jun 2021 11:44:06 -0700 Subject: [PATCH 21/87] fix run command --- dataflow/gpu-workers/tensorflow-landsat/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/gpu-workers/tensorflow-landsat/README.md b/dataflow/gpu-workers/tensorflow-landsat/README.md index 003dfef7294..7f826e9c00c 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/README.md +++ b/dataflow/gpu-workers/tensorflow-landsat/README.md @@ -39,6 +39,6 @@ export GPU_TYPE="nvidia-tesla-t4" gcloud beta builds submit \ --config run.yaml \ - --substitutions _GCS_PATH=$GCS_PATH,_REGION=$REGION,_GPU_TYPE=$GPU_TYPE \ + --substitutions _OUTPUT_PATH=$OUTPUT_PATH,_REGION=$REGION,_GPU_TYPE=$GPU_TYPE \ --no-source ``` From f1bd64ec62f502f3cc648f1a7c0d78c7ce5dbb9a Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 10 Jun 2021 12:16:48 -0700 Subject: [PATCH 22/87] remove region from job get call --- dataflow/conftest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 90271c45f61..faa5e60a308 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -307,7 +307,10 @@ def dataflow_jobs_wait( for _ in range(0, timeout_sec, poll_interval_sec): try: job = Utils.dataflow_jobs_get( - job_id, job_name, project, region, list_page_size + job_id=job_id, + job_name=job_name, + project=project, + list_page_size=list_page_size, ) status = job["currentStatus"] if status in target_status: From 6e3a4f69b4e14b92f080db724eb816bf7ea31247 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 10 Jun 2021 13:50:44 -0700 Subject: [PATCH 23/87] launch jobs from fixtures --- dataflow/conftest.py | 6 ++++-- dataflow/gpu-workers/pytorch-minimal/e2e_test.py | 15 +++++++++++---- .../gpu-workers/tensorflow-landsat/e2e_test.py | 15 +++++++++++---- .../gpu-workers/tensorflow-minimal/e2e_test.py | 15 +++++++++++---- 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index faa5e60a308..f32907442c9 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -180,6 +180,7 @@ def cloud_build_submit( """Sends a Cloud Build job, if an image_name is provided it will be deleted at teardown.""" cmd = ["gcloud", "auth", "configure-docker"] print(cmd) + subprocess.run(cmd, check=True) if substitutions: cmd_substitutions = [ @@ -188,7 +189,6 @@ def cloud_build_submit( else: cmd_substitutions = [] - subprocess.run(cmd, check=True) if config: cmd = [ "gcloud", @@ -318,7 +318,9 @@ def dataflow_jobs_wait( except Exception as e: logging.exception(e) time.sleep(poll_interval_sec) - return status + raise RuntimeError( + f"Dataflow job not found, job_id={job_id}, job_name={job_name}" + ) @staticmethod def dataflow_jobs_cancel_by_job_id( diff --git a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py 
b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py index 8d6eee7253e..b40cd9a2fe8 100644 --- a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py +++ b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py @@ -35,18 +35,25 @@ def worker_image(utils: Utils) -> str: yield from utils.cloud_build_submit(NAME, config="build.yaml") -def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: +@pytest.fixture(scope="session") +def job_name(utils: Utils) -> str: + yield utils.hyphen_name(NAME) + + +@pytest.fixture(scope="session") +def run_job(utils: Utils, job_name: str, bucket_name: str, worker_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. - job_name = utils.hyphen_name(NAME) - utils.cloud_build_submit( + yield from utils.cloud_build_submit( config="run.yaml", substitutions={ - "_IMAGE": worker_image, "_JOB_NAME": job_name, + "_IMAGE": worker_image, "_TEMP_LOCATION": f"gs://{bucket_name}/temp", }, ) + +def test_pytorch_minimal(utils: Utils, job_name: str, run_job: str) -> None: # Wait until the job finishes. status = utils.dataflow_jobs_wait(job_name=job_name) assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index 9b4007ce9c6..555a99bab42 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -36,19 +36,26 @@ def worker_image(utils: Utils) -> str: yield from utils.cloud_build_submit(NAME, config="build.yaml") -def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: +@pytest.fixture(scope="session") +def job_name(utils: Utils) -> str: + yield utils.hyphen_name(NAME) + + +@pytest.fixture(scope="session") +def run_job(utils: Utils, job_name: str, bucket_name: str, worker_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. - job_name = utils.hyphen_name(NAME) - utils.cloud_build_submit( + yield from utils.cloud_build_submit( config="run.yaml", substitutions={ - "_IMAGE": worker_image, "_JOB_NAME": job_name, + "_IMAGE": worker_image, "_TEMP_LOCATION": f"gs://{bucket_name}/temp", "_OUTPUT_PATH": f"gs://{bucket_name}/outputs/", }, ) + +def test_tensorflow_landsat(utils: Utils, job_name: str, run_job: str) -> None: # Wait until the job finishes. status = utils.dataflow_jobs_wait(job_name=job_name) assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" diff --git a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py index bce098c9cef..6b5025dd5ad 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py @@ -35,18 +35,25 @@ def worker_image(utils: Utils) -> str: yield from utils.cloud_build_submit(NAME, config="build.yaml") -def test_end_to_end(utils: Utils, bucket_name: str, worker_image: str) -> None: +@pytest.fixture(scope="session") +def job_name(utils: Utils) -> str: + yield utils.hyphen_name(NAME) + + +@pytest.fixture(scope="session") +def run_job(utils: Utils, job_name: str, bucket_name: str, worker_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. 
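These refactored tests lean on pytest's session-scoped yield fixtures: everything before the `yield` is setup, everything after it is teardown, and fixture dependencies impose a build-then-run ordering. A stripped-down sketch of the shape (the helper bodies are stand-ins):

```python
import pytest

def build_image() -> str:
    return "gcr.io/my-project/sample:1a2b3c"  # stand-in for a Cloud Build submit

@pytest.fixture(scope="session")
def worker_image() -> str:
    image = build_image()                # setup: runs once per test session
    yield image                          # dependent fixtures and tests run here
    print(f"teardown: delete {image}")   # cleanup: runs after the session ends

@pytest.fixture(scope="session")
def run_job(worker_image: str) -> str:
    # Depending on worker_image guarantees the image exists before the job
    # launches, and that it is only cleaned up after the job finishes.
    yield f"job-using-{worker_image}"
```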
- job_name = utils.hyphen_name(NAME) - utils.cloud_build_submit( + yield from utils.cloud_build_submit( config="run.yaml", substitutions={ - "_IMAGE": worker_image, "_JOB_NAME": job_name, + "_IMAGE": worker_image, "_TEMP_LOCATION": f"gs://{bucket_name}/temp", }, ) + +def test_tensorflow_minimal(utils: Utils, job_name: str, run_job: str) -> None: # Wait until the job finishes. status = utils.dataflow_jobs_wait(job_name=job_name) assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" From b66bf1ef518c839ddc949e7d57b6246e2e68e9d2 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 15 Jun 2021 10:13:51 -0700 Subject: [PATCH 24/87] fix resource names --- dataflow/conftest.py | 5 +++-- dataflow/flex-templates/streaming_beam/e2e_test.py | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index f32907442c9..b0d4d093e60 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -312,7 +312,7 @@ def dataflow_jobs_wait( project=project, list_page_size=list_page_size, ) - status = job["currentStatus"] + status = job["currentState"] if status in target_status: return status except Exception as e: @@ -323,7 +323,7 @@ def dataflow_jobs_wait( ) @staticmethod - def dataflow_jobs_cancel_by_job_id( + def dataflow_jobs_cancel( job_id: str, project: str = PROJECT, region: str = REGION ) -> None: print(f"Canceling Dataflow job ID: {job_id}") @@ -340,6 +340,7 @@ def dataflow_jobs_cancel_by_job_id( job_id, f"--region={region}", ] + print(cmd) subprocess.run(cmd, check=True) @staticmethod diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index b5816c58c78..e1f6badfa0f 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -10,6 +10,7 @@ # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from conftest import PROJECT import json import time @@ -78,6 +79,7 @@ def test_flex_template_run( pubsub_subscription: str, flex_template_path: str, bigquery_dataset: str, + project: str = PROJECT, ) -> None: bigquery_table = "output_table" @@ -86,8 +88,8 @@ def test_flex_template_run( template_path=flex_template_path, bucket_name=bucket_name, parameters={ - "input_subscription": pubsub_subscription, - "output_table": f"{bigquery_dataset}.{bigquery_table}", + "input_subscription": f"projects/{project}/subscriptions/{pubsub_subscription}", + "output_table": f"{project}:{bigquery_dataset}.{bigquery_table}", }, ) @@ -97,7 +99,7 @@ def test_flex_template_run( # Then, wait a minute for data to arrive, get processed, and cancel it. time.sleep(60) - utils.dataflow_jobs_cancel_by_job_id(job_id) + utils.dataflow_jobs_cancel(job_id) # Check for the output data in BigQuery. 
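The parameter strings assembled above follow the fully qualified formats the Flex Template and BigQuery expect. Spelled out with hypothetical values (note the colon in BigQuery's legacy table form, which is why the query below swaps it for a dot):

```python
project = "my-project"  # hypothetical values throughout
subscription = "messages-1a2b3c"
dataset, table = "beam_samples_1a2b3c", "output_table"

input_subscription = f"projects/{project}/subscriptions/{subscription}"
output_table = f"{project}:{dataset}.{table}"  # legacy colon-separated form
query_table = output_table.replace(":", ".")   # Standard SQL wants dots only
print(query_table)  # my-project.beam_samples_1a2b3c.output_table
```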
query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}" From 45ba754baf94bcdee732d55b6df10a9c83439328 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 15 Jun 2021 10:53:43 -0700 Subject: [PATCH 25/87] use unique images --- dataflow/conftest.py | 23 ++++++++++--------- .../gpu-workers/pytorch-minimal/e2e_test.py | 23 +++++++++---------- .../tensorflow-landsat/e2e_test.py | 23 +++++++++---------- .../tensorflow-minimal/e2e_test.py | 23 +++++++++---------- 4 files changed, 45 insertions(+), 47 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index b0d4d093e60..1c6cf467dce 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -190,17 +190,18 @@ def cloud_build_submit( cmd_substitutions = [] if config: - cmd = [ - "gcloud", - "builds", - "submit", - f"--project={project}", - f"--config={config}", - *cmd_substitutions, - ] - print(cmd) - subprocess.run(cmd, check=True) - yield config + with open(config) as f: + cmd = [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--config={config}", + *cmd_substitutions, + ] + print(cmd) + subprocess.run(cmd, check=True) + yield f.read() elif image_name: cmd = [ "gcloud", diff --git a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py index b40cd9a2fe8..baf6e161617 100644 --- a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py +++ b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py @@ -31,29 +31,28 @@ def bucket_name(utils: Utils) -> str: @pytest.fixture(scope="session") -def worker_image(utils: Utils) -> str: - yield from utils.cloud_build_submit(NAME, config="build.yaml") - - -@pytest.fixture(scope="session") -def job_name(utils: Utils) -> str: - yield utils.hyphen_name(NAME) +def build_image(utils: Utils) -> str: + yield from utils.cloud_build_submit( + NAME, + config="build.yaml", + substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"}, + ) @pytest.fixture(scope="session") -def run_job(utils: Utils, job_name: str, bucket_name: str, worker_image: str) -> str: +def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. yield from utils.cloud_build_submit( config="run.yaml", substitutions={ - "_JOB_NAME": job_name, - "_IMAGE": worker_image, + "_JOB_NAME": utils.hyphen_name(NAME), + "_IMAGE": f"{NAME}:{utils.uuid}", "_TEMP_LOCATION": f"gs://{bucket_name}/temp", }, ) -def test_pytorch_minimal(utils: Utils, job_name: str, run_job: str) -> None: +def test_pytorch_minimal(utils: Utils, run_job: str) -> None: # Wait until the job finishes. 
- status = utils.dataflow_jobs_wait(job_name=job_name) + status = utils.dataflow_jobs_wait(job_name=utils.hyphen_name(NAME)) assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index 555a99bab42..f94c9764be4 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -32,32 +32,31 @@ def bucket_name(utils: Utils) -> str: @pytest.fixture(scope="session") -def worker_image(utils: Utils) -> str: - yield from utils.cloud_build_submit(NAME, config="build.yaml") - - -@pytest.fixture(scope="session") -def job_name(utils: Utils) -> str: - yield utils.hyphen_name(NAME) +def build_image(utils: Utils) -> str: + yield from utils.cloud_build_submit( + NAME, + config="build.yaml", + substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"}, + ) @pytest.fixture(scope="session") -def run_job(utils: Utils, job_name: str, bucket_name: str, worker_image: str) -> str: +def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. yield from utils.cloud_build_submit( config="run.yaml", substitutions={ - "_JOB_NAME": job_name, - "_IMAGE": worker_image, + "_JOB_NAME": utils.hyphen_name(NAME), + "_IMAGE": f"{NAME}:{utils.uuid}", "_TEMP_LOCATION": f"gs://{bucket_name}/temp", "_OUTPUT_PATH": f"gs://{bucket_name}/outputs/", }, ) -def test_tensorflow_landsat(utils: Utils, job_name: str, run_job: str) -> None: +def test_tensorflow_landsat(utils: Utils, run_job: str) -> None: # Wait until the job finishes. - status = utils.dataflow_jobs_wait(job_name=job_name) + status = utils.dataflow_jobs_wait(job_name=utils.hyphen_name(NAME)) assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" # Check that output files were created and are not empty. diff --git a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py index 6b5025dd5ad..33ddfc71463 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py @@ -31,29 +31,28 @@ def bucket_name(utils: Utils) -> str: @pytest.fixture(scope="session") -def worker_image(utils: Utils) -> str: - yield from utils.cloud_build_submit(NAME, config="build.yaml") - - -@pytest.fixture(scope="session") -def job_name(utils: Utils) -> str: - yield utils.hyphen_name(NAME) +def build_image(utils: Utils) -> str: + yield from utils.cloud_build_submit( + NAME, + config="build.yaml", + substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"}, + ) @pytest.fixture(scope="session") -def run_job(utils: Utils, job_name: str, bucket_name: str, worker_image: str) -> str: +def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. yield from utils.cloud_build_submit( config="run.yaml", substitutions={ - "_JOB_NAME": job_name, - "_IMAGE": worker_image, + "_JOB_NAME": utils.hyphen_name(NAME), + "_IMAGE": f"{NAME}:{utils.uuid}", "_TEMP_LOCATION": f"gs://{bucket_name}/temp", }, ) -def test_tensorflow_minimal(utils: Utils, job_name: str, run_job: str) -> None: +def test_tensorflow_minimal(utils: Utils, run_job: str) -> None: # Wait until the job finishes. 
- status = utils.dataflow_jobs_wait(job_name=job_name) + status = utils.dataflow_jobs_wait(job_name=utils.hyphen_name(NAME)) assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" From c452cc41630ed922624151e9e77c77edaf5eeda4 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 15 Jun 2021 12:19:39 -0700 Subject: [PATCH 26/87] fix lint issues --- dataflow/conftest.py | 2 +- dataflow/flex-templates/streaming_beam/e2e_test.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 1c6cf467dce..2794a92a9f8 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -20,7 +20,7 @@ import subprocess import sys import time -from typing import Any, Callable, Dict, Iterable, List, Optional, Union +from typing import Any, Callable, Dict, Iterable, Optional, Union import uuid import pytest diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index e1f6badfa0f..bebd2eb08ac 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -10,7 +10,6 @@ # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from conftest import PROJECT import json import time @@ -79,7 +78,6 @@ def test_flex_template_run( pubsub_subscription: str, flex_template_path: str, bigquery_dataset: str, - project: str = PROJECT, ) -> None: bigquery_table = "output_table" @@ -88,8 +86,8 @@ def test_flex_template_run( template_path=flex_template_path, bucket_name=bucket_name, parameters={ - "input_subscription": f"projects/{project}/subscriptions/{pubsub_subscription}", - "output_table": f"{project}:{bigquery_dataset}.{bigquery_table}", + "input_subscription": f"projects/{utils.project}/subscriptions/{pubsub_subscription}", + "output_table": f"{utils.kproject}:{bigquery_dataset}.{bigquery_table}", }, ) From 67fa556420ec1702253c9e3622ae61abd83f8820 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 15 Jun 2021 13:16:30 -0700 Subject: [PATCH 27/87] pass --no-source when running the job --- dataflow/conftest.py | 4 +++- dataflow/gpu-workers/pytorch-minimal/e2e_test.py | 4 +++- dataflow/gpu-workers/pytorch-minimal/run.yaml | 1 + dataflow/gpu-workers/tensorflow-landsat/e2e_test.py | 4 +++- dataflow/gpu-workers/tensorflow-landsat/run.yaml | 1 + dataflow/gpu-workers/tensorflow-minimal/e2e_test.py | 4 +++- dataflow/gpu-workers/tensorflow-minimal/run.yaml | 1 + 7 files changed, 15 insertions(+), 4 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 2794a92a9f8..bf1918f67de 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -174,6 +174,7 @@ def _infinite_publish_job() -> None: def cloud_build_submit( image_name: Optional[str] = None, config: Optional[str] = None, + source: str = ".", substitutions: Optional[Dict[str, str]] = None, project: str = PROJECT, ) -> None: @@ -198,6 +199,7 @@ def cloud_build_submit( f"--project={project}", f"--config={config}", *cmd_substitutions, + source, ] print(cmd) subprocess.run(cmd, check=True) @@ -210,7 +212,7 @@ def cloud_build_submit( f"--project={project}", f"--tag=gcr.io/{project}/{image_name}:{UUID}", *cmd_substitutions, - ".", + source, ] print(cmd) subprocess.run(cmd, check=True) diff --git a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py index baf6e161617..c24ee471e5d 100644 --- 
a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py +++ b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py @@ -33,7 +33,7 @@ def bucket_name(utils: Utils) -> str: @pytest.fixture(scope="session") def build_image(utils: Utils) -> str: yield from utils.cloud_build_submit( - NAME, + image_name=NAME, config="build.yaml", substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"}, ) @@ -48,7 +48,9 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: "_JOB_NAME": utils.hyphen_name(NAME), "_IMAGE": f"{NAME}:{utils.uuid}", "_TEMP_LOCATION": f"gs://{bucket_name}/temp", + "_REGION": utils.region, }, + source="--no-source", ) diff --git a/dataflow/gpu-workers/pytorch-minimal/run.yaml b/dataflow/gpu-workers/pytorch-minimal/run.yaml index 7e352a7c924..95508c9fb86 100644 --- a/dataflow/gpu-workers/pytorch-minimal/run.yaml +++ b/dataflow/gpu-workers/pytorch-minimal/run.yaml @@ -51,4 +51,5 @@ steps: options: logging: CLOUD_LOGGING_ONLY +# Use the Compute Engine default service account to launch the job. serviceAccount: projects/$PROJECT_ID/serviceAccounts/$PROJECT_NUMBER-compute@developer.gserviceaccount.com diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index f94c9764be4..6f9d58a5760 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -34,7 +34,7 @@ def bucket_name(utils: Utils) -> str: @pytest.fixture(scope="session") def build_image(utils: Utils) -> str: yield from utils.cloud_build_submit( - NAME, + image_name=NAME, config="build.yaml", substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"}, ) @@ -49,8 +49,10 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: "_JOB_NAME": utils.hyphen_name(NAME), "_IMAGE": f"{NAME}:{utils.uuid}", "_TEMP_LOCATION": f"gs://{bucket_name}/temp", + "_REGION": utils.region, "_OUTPUT_PATH": f"gs://{bucket_name}/outputs/", }, + source="--no-source", ) diff --git a/dataflow/gpu-workers/tensorflow-landsat/run.yaml b/dataflow/gpu-workers/tensorflow-landsat/run.yaml index 6eb8264e7db..416a2f9e6e1 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/run.yaml +++ b/dataflow/gpu-workers/tensorflow-landsat/run.yaml @@ -53,4 +53,5 @@ steps: options: logging: CLOUD_LOGGING_ONLY +# Use the Compute Engine default service account to launch the job. 
serviceAccount: projects/$PROJECT_ID/serviceAccounts/$PROJECT_NUMBER-compute@developer.gserviceaccount.com diff --git a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py index 33ddfc71463..ebac5c00dd7 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py @@ -33,7 +33,7 @@ def bucket_name(utils: Utils) -> str: @pytest.fixture(scope="session") def build_image(utils: Utils) -> str: yield from utils.cloud_build_submit( - NAME, + image_name=NAME, config="build.yaml", substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"}, ) @@ -48,7 +48,9 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: "_JOB_NAME": utils.hyphen_name(NAME), "_IMAGE": f"{NAME}:{utils.uuid}", "_TEMP_LOCATION": f"gs://{bucket_name}/temp", + "_REGION": utils.region, }, + source="--no-source", ) diff --git a/dataflow/gpu-workers/tensorflow-minimal/run.yaml b/dataflow/gpu-workers/tensorflow-minimal/run.yaml index 75178e5d54c..337634629e8 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/run.yaml +++ b/dataflow/gpu-workers/tensorflow-minimal/run.yaml @@ -50,4 +50,5 @@ steps: options: logging: CLOUD_LOGGING_ONLY +# Use the Compute Engine default service account to launch the job. serviceAccount: projects/$PROJECT_ID/serviceAccounts/$PROJECT_NUMBER-compute@developer.gserviceaccount.com From 7aabf8b53dd7a618f9e0e1d30a9cce4c56b58665 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 15 Jun 2021 14:58:18 -0700 Subject: [PATCH 28/87] more logging and changed region --- dataflow/conftest.py | 30 ++++++++++++------- .../flex-templates/streaming_beam/e2e_test.py | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index bf1918f67de..f6a5bb977ca 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -28,8 +28,7 @@ # Default options. 
UUID = uuid.uuid4().hex[0:6] PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] -REGION = "us-west1" -ZONE = "us-west1-b" +REGION = "us-central1" RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds @@ -42,7 +41,6 @@ class Utils: uuid: str = UUID project: str = PROJECT region: str = REGION - zone: str = ZONE @staticmethod def hyphen_name(name: str) -> str: @@ -59,10 +57,11 @@ def storage_bucket(name: str) -> str: storage_client = storage.Client() bucket = storage_client.create_bucket(Utils.hyphen_name(name)) - print(f"storage_bucket: {bucket.name}") + print(f"Created storage_bucket: {bucket.name}") yield bucket.name bucket.delete(force=True) + print(f"Deleted storage_bucket: {bucket.name}") @staticmethod def bigquery_dataset(name: str, project: str = PROJECT) -> str: @@ -74,12 +73,13 @@ def bigquery_dataset(name: str, project: str = PROJECT) -> str: bigquery.Dataset(f"{project}.{Utils.underscore_name(name)}") ) - print(f"bigquery_dataset: {dataset.full_dataset_id}") + print(f"Created bigquery_dataset: {dataset.full_dataset_id}") yield dataset.full_dataset_id bigquery_client.delete_dataset( dataset.full_dataset_id.replace(":", "."), delete_contents=True ) + print(f"Deleted bigquery_dataset: {dataset.full_dataset_id}") @staticmethod def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: @@ -97,7 +97,7 @@ def pubsub_topic(name: str, project: str = PROJECT) -> str: topic_path = publisher_client.topic_path(project, Utils.hyphen_name(name)) topic = publisher_client.create_topic(topic_path) - print(f"pubsub_topic: {topic.name}") + print(f"Created pubsub_topic: {topic.name}") yield topic.name # Due to the pinned library dependencies in apache-beam, client @@ -107,6 +107,7 @@ def pubsub_topic(name: str, project: str = PROJECT) -> str: cmd = ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name] print(cmd) subprocess.run(cmd, check=True) + print(f"Deleted pubsub_topic: {topic.name}") @staticmethod def pubsub_subscription( @@ -122,7 +123,7 @@ def pubsub_subscription( ) subscription = subscriber.create_subscription(subscription_path, topic_path) - print(f"pubsub_subscription: {subscription.name}") + print(f"Created pubsub_subscription: {subscription.name}") yield subscription.name # Due to the pinned library dependencies in apache-beam, client @@ -140,6 +141,7 @@ def pubsub_subscription( ] print(cmd) subprocess.run(cmd, check=True) + print(f"Deleted pubsub_subscription: {subscription.name}") @staticmethod def pubsub_publisher( @@ -203,6 +205,7 @@ def cloud_build_submit( ] print(cmd) subprocess.run(cmd, check=True) + print(f"Cloud build finished successfully: {config}") yield f.read() elif image_name: cmd = [ @@ -216,6 +219,7 @@ def cloud_build_submit( ] print(cmd) subprocess.run(cmd, check=True) + print(f"Created image: gcr.io/{project}/{image_name}:{UUID}") yield f"{image_name}:{UUID}" else: raise ValueError("must specify either `config` or `image_name`") @@ -233,6 +237,7 @@ def cloud_build_submit( ] print(cmd) subprocess.run(cmd, check=True) + print(f"Deleted image: gcr.io/{project}/{image_name}:{UUID}") @staticmethod def dataflow_jobs_get( @@ -258,7 +263,7 @@ def dataflow_jobs_get( ) ) job = request.execute() - print(job) + print(f"Dataflow job: {job}") return job elif job_name: @@ -274,10 +279,10 @@ def dataflow_jobs_get( ) ) response = request.execute() - print(response) + print(f"Finding job {job_name}, response={response}") for job in response["jobs"]: if job["name"] == job_name: - print(job) + print(f"Dataflow job: {job}") return job return None @@ -305,7 +310,9 @@ def 
dataflow_jobs_wait( target_status = ( {until_status} if isinstance(until_status, str) else set(until_status) ) - print(f"Waiting for Dataflow job until {target_status}") + print( + f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" + ) status = None for _ in range(0, timeout_sec, poll_interval_sec): try: @@ -345,6 +352,7 @@ def dataflow_jobs_cancel( ] print(cmd) subprocess.run(cmd, check=True) + print(f"Cancelled Dataflow job: {job_id}") @staticmethod def dataflow_flex_template_build( diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index bebd2eb08ac..561f0c9af10 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -87,7 +87,7 @@ def test_flex_template_run( bucket_name=bucket_name, parameters={ "input_subscription": f"projects/{utils.project}/subscriptions/{pubsub_subscription}", - "output_table": f"{utils.kproject}:{bigquery_dataset}.{bigquery_table}", + "output_table": f"{utils.project}:{bigquery_dataset}.{bigquery_table}", }, ) From 5f58ba746cbff034404ce71149f6575c495ee31f Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 15 Jun 2021 15:48:38 -0700 Subject: [PATCH 29/87] simplified type hints --- dataflow/gpu-workers/pytorch-minimal/main.py | 5 +-- .../pytorch-minimal/noxfile_config.py | 38 +++++++++++++++++++ .../gpu-workers/tensorflow-landsat/main.py | 35 ++++------------- .../gpu-workers/tensorflow-minimal/main.py | 5 +-- .../tensorflow-minimal/noxfile_config.py | 38 +++++++++++++++++++ 5 files changed, 88 insertions(+), 33 deletions(-) create mode 100644 dataflow/gpu-workers/pytorch-minimal/noxfile_config.py create mode 100644 dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py diff --git a/dataflow/gpu-workers/pytorch-minimal/main.py b/dataflow/gpu-workers/pytorch-minimal/main.py index 3b36cc0fcd5..b939b33fa53 100644 --- a/dataflow/gpu-workers/pytorch-minimal/main.py +++ b/dataflow/gpu-workers/pytorch-minimal/main.py @@ -14,14 +14,14 @@ import argparse import logging -from typing import Any, List, Optional +from typing import List, Optional import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions import torch -def check_gpus(element: Any, gpus_optional: bool = False) -> Any: +def check_gpus(_: None, gpus_optional: bool = False) -> None: """Validates that we are detecting GPUs, otherwise raise a RuntimeError.""" if torch.cuda.is_available(): logging.info(f"Using GPU: {torch.cuda.get_device_name(0)}") @@ -29,7 +29,6 @@ def check_gpus(element: Any, gpus_optional: bool = False) -> Any: logging.warning("No GPUs found, defaulting to CPU.") else: raise RuntimeError("No GPUs found.") - return element def run(input_text: str, beam_args: Optional[List[str]] = None) -> None: diff --git a/dataflow/gpu-workers/pytorch-minimal/noxfile_config.py b/dataflow/gpu-workers/pytorch-minimal/noxfile_config.py new file mode 100644 index 00000000000..74d736256c6 --- /dev/null +++ b/dataflow/gpu-workers/pytorch-minimal/noxfile_config.py @@ -0,0 +1,38 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default TEST_CONFIG_OVERRIDE for python repos. + +# You can copy this file into your directory, then it will be imported from +# the noxfile.py. + +# The source of truth: +# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/noxfile_config.py + +TEST_CONFIG_OVERRIDE = { + # You can opt out from the test for specific Python versions. + "ignored_versions": ["2.7", "3.9"], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": True, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + "envs": {}, +} diff --git a/dataflow/gpu-workers/tensorflow-landsat/main.py b/dataflow/gpu-workers/tensorflow-landsat/main.py index 6691c457d9d..c35cf1cca0f 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/main.py +++ b/dataflow/gpu-workers/tensorflow-landsat/main.py @@ -111,31 +111,15 @@ ) -def check_gpus(element: Any, gpus_optional: bool) -> Any: - """Makes sure TensorFlow detects GPUs, otherwise raise a RuntimeError. - - Note that this function must be run within a PTransform like beam.Map so - we are sure it's run by the workers, and not the launcher process. - - Args: - element: An element - gpus_optional: If True, the pipeline won't crash if GPUs are not found. - - Returns: - The same element it received as is. - - Raises: - RuntimeError: If no GPUs were found by TensorFlow. - """ - # Make sure we have a GPU available. +def check_gpus(_: None, gpus_optional: bool = False) -> None: + """Validates that we are detecting GPUs, otherwise raise a RuntimeError.""" gpu_devices = tf.config.list_physical_devices("GPU") - logging.info(f"GPU devices: {gpu_devices}") - if len(gpu_devices) == 0: - if gpus_optional: - logging.warning("No GPUs found, defaulting to CPU.") - else: - raise RuntimeError("No GPUs found.") - return element + if gpu_devices: + logging.info(f"Using GPU: {gpu_devices}") + elif gpus_optional: + logging.warning("No GPUs found, defaulting to CPU.") + else: + raise RuntimeError("No GPUs found.") def get_band_paths(scene: str, band_names: List[str]) -> Tuple[str, List[str]]: @@ -274,9 +258,6 @@ def run( gamma = vis_params["gamma"] beam_options = PipelineOptions(beam_args, save_main_session=True) - - # We currently cannot use the `with` statement to run without waiting. 
- # https://issues.apache.org/jira/browse/BEAM-12455 pipeline = beam.Pipeline(options=beam_options) ( pipeline diff --git a/dataflow/gpu-workers/tensorflow-minimal/main.py b/dataflow/gpu-workers/tensorflow-minimal/main.py index f039034349c..6732d95392f 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/main.py +++ b/dataflow/gpu-workers/tensorflow-minimal/main.py @@ -14,14 +14,14 @@ import argparse import logging -from typing import Any, List, Optional +from typing import List, Optional import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions import tensorflow as tf -def check_gpus(element: Any, gpus_optional: bool = False) -> Any: +def check_gpus(_: None, gpus_optional: bool = False) -> None: """Validates that we are detecting GPUs, otherwise raise a RuntimeError.""" gpu_devices = tf.config.list_physical_devices("GPU") if gpu_devices: @@ -30,7 +30,6 @@ def check_gpus(element: Any, gpus_optional: bool = False) -> Any: logging.warning("No GPUs found, defaulting to CPU.") else: raise RuntimeError("No GPUs found.") - return element def run(input_text: str, beam_args: Optional[List[str]] = None) -> None: diff --git a/dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py b/dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py new file mode 100644 index 00000000000..74d736256c6 --- /dev/null +++ b/dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py @@ -0,0 +1,38 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default TEST_CONFIG_OVERRIDE for python repos. + +# You can copy this file into your directory, then it will be imported from +# the noxfile.py. + +# The source of truth: +# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/noxfile_config.py + +TEST_CONFIG_OVERRIDE = { + # You can opt out from the test for specific Python versions. + "ignored_versions": ["2.7", "3.9"], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": True, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. 
+ "envs": {}, +} From c59bbe66d5d4167a8a00a9b9d9333a5d5dd61886 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 15 Jun 2021 15:58:09 -0700 Subject: [PATCH 30/87] get all jobs and update wait job status --- dataflow/conftest.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index f6a5bb977ca..359184a6df0 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -273,8 +273,9 @@ def dataflow_jobs_get( dataflow.projects() .jobs() .list( + # We don't filter="ACTIVE" because we still want to return the + # job if it failed, is already done, or was cancelled. projectId=project, - filter="ACTIVE", pageSize=list_page_size, ) ) @@ -295,11 +296,7 @@ def dataflow_jobs_wait( job_name: Optional[str] = None, project: str = PROJECT, region: str = REGION, - until_status: Union[str, Iterable[str]] = { - "JOB_STATE_DONE", - "JOB_STATE_FAILED", - "JOB_STATE_CANCELLED", - }, + until_status: str = "JOB_STATE_DONE", timeout_sec: str = 600, poll_interval_sec=30, list_page_size=100, @@ -307,9 +304,14 @@ def dataflow_jobs_wait( """For a list of all the valid states: https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs#Job.JobState """ - target_status = ( - {until_status} if isinstance(until_status, str) else set(until_status) - ) + + # Wait until we reach the desired status, or the job finished in some way. + target_status = { + until_status, + "JOB_STATE_DONE", + "JOB_STATE_FAILED", + "JOB_STATE_CANCELLED", + } print( f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" ) From 9ae84e7046ffcdffaa1fb0ccd7d31ef455086e56 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 15 Jun 2021 16:00:11 -0700 Subject: [PATCH 31/87] update header years --- dataflow/gpu-workers/pytorch-minimal/noxfile_config.py | 2 +- dataflow/gpu-workers/tensorflow-landsat/noxfile_config.py | 2 +- dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dataflow/gpu-workers/pytorch-minimal/noxfile_config.py b/dataflow/gpu-workers/pytorch-minimal/noxfile_config.py index 74d736256c6..d8e9aba4fdd 100644 --- a/dataflow/gpu-workers/pytorch-minimal/noxfile_config.py +++ b/dataflow/gpu-workers/pytorch-minimal/noxfile_config.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/dataflow/gpu-workers/tensorflow-landsat/noxfile_config.py b/dataflow/gpu-workers/tensorflow-landsat/noxfile_config.py index 74d736256c6..d8e9aba4fdd 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/noxfile_config.py +++ b/dataflow/gpu-workers/tensorflow-landsat/noxfile_config.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py b/dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py index 74d736256c6..d8e9aba4fdd 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py +++ b/dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 95320731ae9cf640c92f39c3abdc51d06751ded9 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 15 Jun 2021 17:44:06 -0700 Subject: [PATCH 32/87] tune logging --- dataflow/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 359184a6df0..eaa14dc99f7 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -263,7 +263,7 @@ def dataflow_jobs_get( ) ) job = request.execute() - print(f"Dataflow job: {job}") + print(f"Found Dataflow job: {job}") return job elif job_name: @@ -280,10 +280,10 @@ def dataflow_jobs_get( ) ) response = request.execute() - print(f"Finding job {job_name}, response={response}") + print(f"Finding Dataflow job {job_name}") for job in response["jobs"]: if job["name"] == job_name: - print(f"Dataflow job: {job}") + print(f"Found job: {job}") return job return None From 10214b5408587cdd4f35f4edc48e1bd5cc8ba932 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 09:15:59 -0700 Subject: [PATCH 33/87] fix tests --- dataflow/conftest.py | 4 ++-- dataflow/flex-templates/streaming_beam/e2e_test.py | 4 ++-- dataflow/gpu-workers/pytorch-minimal/run.yaml | 1 - dataflow/gpu-workers/tensorflow-landsat/e2e_test.py | 2 +- dataflow/gpu-workers/tensorflow-landsat/main.py | 9 +-------- 5 files changed, 6 insertions(+), 14 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index eaa14dc99f7..281d712cdb6 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -427,8 +427,8 @@ def dataflow_flex_template_run( print(f"Launched Dataflow Flex Template job: {unique_job_name}") except subprocess.CalledProcessError as e: print(e, file=sys.stderr) - stdout = stdout.decode("utf-8") - stderr = stderr.decode("utf-8") + stdout = e.stdout.decode("utf-8") + stderr = e.stderr.decode("utf-8") finally: print("--- stderr ---") print(stderr) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 561f0c9af10..3d592c8eddd 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -86,8 +86,8 @@ def test_flex_template_run( template_path=flex_template_path, bucket_name=bucket_name, parameters={ - "input_subscription": f"projects/{utils.project}/subscriptions/{pubsub_subscription}", - "output_table": f"{utils.project}:{bigquery_dataset}.{bigquery_table}", + "input_subscription": pubsub_subscription, + "output_table": f"{bigquery_dataset}.{bigquery_table}", }, ) diff --git a/dataflow/gpu-workers/pytorch-minimal/run.yaml b/dataflow/gpu-workers/pytorch-minimal/run.yaml index 95508c9fb86..7873f59e857 100644 --- a/dataflow/gpu-workers/pytorch-minimal/run.yaml +++ b/dataflow/gpu-workers/pytorch-minimal/run.yaml @@ -44,7 +44,6 @@ steps: - --job_name=$_JOB_NAME - --temp_location=$_TEMP_LOCATION - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE - - --disk_size_gb=20 - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index 6f9d58a5760..e50e20a88f9 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -50,7 +50,7 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: "_IMAGE": f"{NAME}:{utils.uuid}", "_TEMP_LOCATION": f"gs://{bucket_name}/temp", "_REGION": utils.region, - 
"_OUTPUT_PATH": f"gs://{bucket_name}/outputs/", + "_OUTPUT_PATH": f"gs://{bucket_name}/outputs", }, source="--no-source", ) diff --git a/dataflow/gpu-workers/tensorflow-landsat/main.py b/dataflow/gpu-workers/tensorflow-landsat/main.py index c35cf1cca0f..21a31c69d48 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/main.py +++ b/dataflow/gpu-workers/tensorflow-landsat/main.py @@ -240,7 +240,6 @@ def run( scenes: List[str], output_path_prefix: str, vis_params: Dict[str, Any], - gpus_optional: bool, beam_args: Optional[List[str]] = None, ) -> None: """Load multiple Landsat scenes and render them as JPEG files. @@ -249,7 +248,6 @@ def run( scenes: List of Landsat 8 scene IDs. output_path_prefix: Path prefix to save the output files. vis_params: Visualization parameters including {rgb_bands, min, max, gamma}. - gpus_optional: If True, the pipeline won't crash if GPUs are not found. beam_args: Optional list of arguments for Beam pipeline options. """ rgb_band_names = vis_params["rgb_band_names"] @@ -325,11 +323,6 @@ def run( parser.add_argument( "--gamma", type=float, default=DEFAULT_GAMMA, help="Gamma correction factor." ) - parser.add_argument( - "--gpus-optional", - action="store_true", - help="If set, the pipeline won't crash if GPUs are not found.", - ) args, beam_args = parser.parse_known_args() scenes = args.scenes or DEFAULT_SCENES @@ -339,4 +332,4 @@ def run( "max": args.max, "gamma": args.gamma, } - run(scenes, args.output_path_prefix, vis_params, args.gpus_optional, beam_args) + run(scenes, args.output_path_prefix, vis_params, beam_args) From 41501a9b41af2537f4d70a8acf8a56b6184f1120 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 10:45:38 -0700 Subject: [PATCH 34/87] made logging more explicit --- dataflow/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 281d712cdb6..22e13492060 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -331,7 +331,7 @@ def dataflow_jobs_wait( logging.exception(e) time.sleep(poll_interval_sec) raise RuntimeError( - f"Dataflow job not found, job_id={job_id}, job_name={job_name}" + f"Dataflow job not found in status {target_status}: job_id={job_id}, job_name={job_name}" ) @staticmethod From 3c6883856c2b983417a06e373d50eee0bbd1af58 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 10:45:55 -0700 Subject: [PATCH 35/87] use disk_size_gb --- dataflow/gpu-workers/pytorch-minimal/run.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dataflow/gpu-workers/pytorch-minimal/run.yaml b/dataflow/gpu-workers/pytorch-minimal/run.yaml index 7873f59e857..2933dcfcf52 100644 --- a/dataflow/gpu-workers/pytorch-minimal/run.yaml +++ b/dataflow/gpu-workers/pytorch-minimal/run.yaml @@ -46,6 +46,7 @@ steps: - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 + - --disk_size_gb=50 options: logging: CLOUD_LOGGING_ONLY From 2081cadc86f9b174c7ab8dde6a38c589350e31ba Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 10:59:15 -0700 Subject: [PATCH 36/87] use larger disk_size_gb for landsat --- dataflow/gpu-workers/tensorflow-landsat/run.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dataflow/gpu-workers/tensorflow-landsat/run.yaml b/dataflow/gpu-workers/tensorflow-landsat/run.yaml index 416a2f9e6e1..9147814d954 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/run.yaml +++ 
b/dataflow/gpu-workers/tensorflow-landsat/run.yaml @@ -49,6 +49,7 @@ steps: - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 + - --disk_size_gb=50 options: logging: CLOUD_LOGGING_ONLY From 314ffbde966414a1397891909e776983a480df19 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 12:16:54 -0700 Subject: [PATCH 37/87] list all jobs --- dataflow/conftest.py | 51 +++++++++++-------- .../tensorflow-landsat/e2e_test.py | 5 +- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 22e13492060..abf8544eb97 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -239,12 +239,37 @@ def cloud_build_submit( subprocess.run(cmd, check=True) print(f"Deleted image: gcr.io/{project}/{image_name}:{UUID}") + @staticmethod + def dataflow_jobs_list( + project: str = PROJECT, page_size: int = 30 + ) -> Iterable[dict]: + from googleapiclient.discovery import build + + dataflow = build("dataflow", "v1b3") + + response = {"nextPageToken": None} + while "nextPageToken" in response: + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + request = ( + dataflow.projects() + .jobs() + .list( + projectId=project, + pageToken=response["nextPageToken"], + pageSize=page_size, + ) + ) + response = request.execute() + for job in response["jobs"]: + yield job + @staticmethod def dataflow_jobs_get( job_id: Optional[str] = None, job_name: Optional[str] = None, project: str = PROJECT, - list_page_size=100, + list_page_size=30, ) -> Optional[Dict[str, Any]]: from googleapiclient.discovery import build @@ -262,30 +287,17 @@ def dataflow_jobs_get( view="JOB_VIEW_SUMMARY", ) ) + # If the job is not found, this throws an HttpError exception. job = request.execute() print(f"Found Dataflow job: {job}") return job elif job_name: - # For more info see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list - request = ( - dataflow.projects() - .jobs() - .list( - # We don't filter="ACTIVE" because we still want to return the - # job if it failed, is already done, or was cancelled. - projectId=project, - pageSize=list_page_size, - ) - ) - response = request.execute() - print(f"Finding Dataflow job {job_name}") - for job in response["jobs"]: + for job in Utils.dataflow_jobs_list(project, list_page_size): if job["name"] == job_name: - print(f"Found job: {job}") + print(f"Found Dataflow job: {job}") return job - return None + raise ValueError(f"Dataflow job not found: job_name={job_name}") else: raise ValueError("must specify either `job_id` or `job_name`") @@ -295,9 +307,8 @@ def dataflow_jobs_wait( job_id: Optional[str] = None, job_name: Optional[str] = None, project: str = PROJECT, - region: str = REGION, until_status: str = "JOB_STATE_DONE", - timeout_sec: str = 600, + timeout_sec: str = 600, # defaults to 10 minutes poll_interval_sec=30, list_page_size=100, ) -> Optional[str]: diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index e50e20a88f9..f7a6c681fb9 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -58,7 +58,10 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: def test_tensorflow_landsat(utils: Utils, run_job: str) -> None: # Wait until the job finishes. 
- status = utils.dataflow_jobs_wait(job_name=utils.hyphen_name(NAME)) + timeout = 20 * 60 # 20 minutes + status = utils.dataflow_jobs_wait( + job_name=utils.hyphen_name(NAME), timeout_sec=timeout + ) assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" # Check that output files were created and are not empty. From 83de54822c32689732fa6a0ad0f0de802a1e5690 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 12:38:13 -0700 Subject: [PATCH 38/87] wait before querying --- dataflow/flex-templates/streaming_beam/e2e_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 3d592c8eddd..503ce6ef65e 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -99,6 +99,10 @@ def test_flex_template_run( time.sleep(60) utils.dataflow_jobs_cancel(job_id) + # After cancelling, wait a minute to make sure the table is created in BigQuery. + # TODO: poll for this with a timeout inside `bigquery_query` + time.sleep(60) + # Check for the output data in BigQuery. query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}" rows = list(utils.bigquery_query(query)) From c1ee4e9e835dcc2dd5f423d313983ae78b73c401 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 12:53:26 -0700 Subject: [PATCH 39/87] include python version on resource names --- dataflow/conftest.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index abf8544eb97..3c0c533fc08 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -17,6 +17,7 @@ import multiprocessing as mp import os import re +import platform import subprocess import sys import time @@ -35,6 +36,8 @@ HYPHEN_NAME_RE = re.compile(r"[^\w\d-]+") UNDERSCORE_NAME_RE = re.compile(r"[^\w\d_]+") +PYTHON_VERSION = "".join(platform.python_version_tuple()[0:2]) + @dataclass class Utils: @@ -44,11 +47,12 @@ class Utils: @staticmethod def hyphen_name(name: str) -> str: - return f"{HYPHEN_NAME_RE.sub('-', name)}-{UUID}" + unique_name = f"{name}-{PYTHON_VERSION}-{UUID}" + return HYPHEN_NAME_RE.sub("-", unique_name) @staticmethod def underscore_name(name: str) -> str: - return f"{UNDERSCORE_NAME_RE.sub('_', name)}_{UUID}" + return UNDERSCORE_NAME_RE.sub("_", Utils.hyphen_name(name)) @staticmethod def storage_bucket(name: str) -> str: From d0c4533f4a760762afb84b4d90f6871d94d4e191 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 12:54:50 -0700 Subject: [PATCH 40/87] update beam version --- dataflow/flex-templates/streaming_beam/requirements.txt | 2 +- dataflow/gpu-workers/pytorch-minimal/requirements.txt | 2 +- dataflow/gpu-workers/tensorflow-minimal/requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dataflow/flex-templates/streaming_beam/requirements.txt b/dataflow/flex-templates/streaming_beam/requirements.txt index 7c934ad8979..009bc29b6d8 100644 --- a/dataflow/flex-templates/streaming_beam/requirements.txt +++ b/dataflow/flex-templates/streaming_beam/requirements.txt @@ -1 +1 @@ -apache-beam[gcp]==2.29.0 +apache-beam[gcp]==2.30.0 diff --git a/dataflow/gpu-workers/pytorch-minimal/requirements.txt b/dataflow/gpu-workers/pytorch-minimal/requirements.txt index 530aa4098e7..ad5777b6ca6 100644 --- a/dataflow/gpu-workers/pytorch-minimal/requirements.txt +++ b/dataflow/gpu-workers/pytorch-minimal/requirements.txt @@ 
-1,2 +1,2 @@
-apache-beam[gcp]==2.29.0
+apache-beam[gcp]==2.30.0
 torch==1.8.1
diff --git a/dataflow/gpu-workers/tensorflow-minimal/requirements.txt b/dataflow/gpu-workers/tensorflow-minimal/requirements.txt
index f2f6e11354a..aa9e7e634f5 100644
--- a/dataflow/gpu-workers/tensorflow-minimal/requirements.txt
+++ b/dataflow/gpu-workers/tensorflow-minimal/requirements.txt
@@ -1,2 +1,2 @@
-apache-beam[gcp]==2.29.0
+apache-beam[gcp]==2.30.0
 tensorflow==2.5.0

From 8a10eb29841a3ba29840b9b4b12a52137a32db30 Mon Sep 17 00:00:00 2001
From: David Cavazos
Date: Wed, 16 Jun 2021 12:57:21 -0700
Subject: [PATCH 41/87] use 30m timeout

---
 dataflow/gpu-workers/tensorflow-landsat/e2e_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py
index f7a6c681fb9..d79af9cdae0 100644
--- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py
+++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py
@@ -58,7 +58,7 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str:

 def test_tensorflow_landsat(utils: Utils, run_job: str) -> None:
     # Wait until the job finishes.
-    timeout = 20 * 60  # 20 minutes
+    timeout = 30 * 60  # 30 minutes
     status = utils.dataflow_jobs_wait(
         job_name=utils.hyphen_name(NAME), timeout_sec=timeout
     )

From 35775b125c7e96c21eff5d7b1dea6fec36175fb9 Mon Sep 17 00:00:00 2001
From: David Cavazos
Date: Wed, 16 Jun 2021 14:12:02 -0700
Subject: [PATCH 42/87] more debugging and increase waiting time

---
 dataflow/conftest.py                                | 10 ++++++++++
 dataflow/flex-templates/streaming_beam/e2e_test.py  |  8 ++------
 dataflow/gpu-workers/tensorflow-landsat/e2e_test.py |  2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/dataflow/conftest.py b/dataflow/conftest.py
index 3c0c533fc08..3b1c0d52bbb 100644
--- a/dataflow/conftest.py
+++ b/dataflow/conftest.py
@@ -64,6 +64,16 @@ def storage_bucket(name: str) -> str:
         print(f"Created storage_bucket: {bucket.name}")
         yield bucket.name

+        # Print all the objects in the bucket before deleting for debugging.
+        print(f"Deleting bucket {bucket.name} with the following contents:")
+        total_files = 0
+        total_size = 0
+        for blob in bucket.list_blobs():
+            print(f"- {blob.name} ({blob.size} bytes)")
+            total_files += 1
+            total_size += blob.size
+        print(f"Total {total_files} files ({total_size} bytes)")
+
         bucket.delete(force=True)
         print(f"Deleted storage_bucket: {bucket.name}")

diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py
index 503ce6ef65e..fc2198bcbfa 100644
--- a/dataflow/flex-templates/streaming_beam/e2e_test.py
+++ b/dataflow/flex-templates/streaming_beam/e2e_test.py
@@ -95,14 +95,10 @@ def test_flex_template_run(
     # First, let's wait until the job is running.
     utils.dataflow_jobs_wait(job_id, until_status="JOB_STATE_RUNNING")

-    # Then, wait a minute for data to arrive, get processed, and cancel it.
-    time.sleep(60)
+    # Then, wait a couple minutes for data to arrive, get processed, and cancel it.
+    time.sleep(2 * 60)
     utils.dataflow_jobs_cancel(job_id)

-    # After cancelling, wait a minute to make sure the table is created in BigQuery.
-    # TODO: poll for this with a timeout inside `bigquery_query`
-    time.sleep(60)
-
     # Check for the output data in BigQuery.
query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}" rows = list(utils.bigquery_query(query)) diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index d79af9cdae0..50f01202b1b 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -50,7 +50,7 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: "_IMAGE": f"{NAME}:{utils.uuid}", "_TEMP_LOCATION": f"gs://{bucket_name}/temp", "_REGION": utils.region, - "_OUTPUT_PATH": f"gs://{bucket_name}/outputs", + "_OUTPUT_PATH": f"gs://{bucket_name}/outputs/", }, source="--no-source", ) From 9484ce9c8df808c66fec1fd1988867b9ed45f7bf Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 14:35:04 -0700 Subject: [PATCH 43/87] update bq query --- dataflow/conftest.py | 1 + dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 3b1c0d52bbb..1de640c0bbf 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -100,6 +100,7 @@ def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: from google.cloud import bigquery bigquery_client = bigquery.Client() + print(f"Bigquery query: {query}") for row in bigquery_client.query(query): yield dict(row) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index fc2198bcbfa..c2c83ad02ea 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -100,7 +100,7 @@ def test_flex_template_run( utils.dataflow_jobs_cancel(job_id) # Check for the output data in BigQuery. 
- query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}" + query = f"SELECT * FROM `{bigquery_dataset.replace(':', '.')}.{bigquery_table}`" rows = list(utils.bigquery_query(query)) assert len(rows) > 0 for row in rows: From d94d6880e0be35cc884af83a4a7c258aaac08c6b Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 14:39:51 -0700 Subject: [PATCH 44/87] use shorter flag name alias --- dataflow/gpu-workers/pytorch-minimal/run.yaml | 2 +- dataflow/gpu-workers/tensorflow-landsat/run.yaml | 2 +- dataflow/gpu-workers/tensorflow-minimal/run.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dataflow/gpu-workers/pytorch-minimal/run.yaml b/dataflow/gpu-workers/pytorch-minimal/run.yaml index 2933dcfcf52..fe285533c54 100644 --- a/dataflow/gpu-workers/pytorch-minimal/run.yaml +++ b/dataflow/gpu-workers/pytorch-minimal/run.yaml @@ -43,7 +43,7 @@ steps: - --region=$_REGION - --job_name=$_JOB_NAME - --temp_location=$_TEMP_LOCATION - - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE + - --sdk_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 - --disk_size_gb=50 diff --git a/dataflow/gpu-workers/tensorflow-landsat/run.yaml b/dataflow/gpu-workers/tensorflow-landsat/run.yaml index 9147814d954..2b97dd28e48 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/run.yaml +++ b/dataflow/gpu-workers/tensorflow-landsat/run.yaml @@ -46,7 +46,7 @@ steps: - --job_name=$_JOB_NAME - --temp_location=$_TEMP_LOCATION - --worker_machine_type=custom-1-13312-ext - - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE + - --sdk_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 - --disk_size_gb=50 diff --git a/dataflow/gpu-workers/tensorflow-minimal/run.yaml b/dataflow/gpu-workers/tensorflow-minimal/run.yaml index 337634629e8..f5d2a77c0d8 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/run.yaml +++ b/dataflow/gpu-workers/tensorflow-minimal/run.yaml @@ -43,7 +43,7 @@ steps: - --region=$_REGION - --job_name=$_JOB_NAME - --temp_location=$_TEMP_LOCATION - - --worker_harness_container_image=gcr.io/$PROJECT_ID/$_IMAGE + - --sdk_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 From 1887870ad63a715b9372d86550e2f2d46212003f Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 15:13:14 -0700 Subject: [PATCH 45/87] made logs clearer --- dataflow/conftest.py | 72 ++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 1de640c0bbf..14fe00fbe82 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -47,7 +47,7 @@ class Utils: @staticmethod def hyphen_name(name: str) -> str: - unique_name = f"{name}-{PYTHON_VERSION}-{UUID}" + unique_name = f"{name}-py{PYTHON_VERSION}-{UUID}" return HYPHEN_NAME_RE.sub("-", unique_name) @staticmethod @@ -61,21 +61,21 @@ def storage_bucket(name: str) -> str: storage_client = storage.Client() bucket = storage_client.create_bucket(Utils.hyphen_name(name)) - print(f"Created storage_bucket: {bucket.name}") + print(f">> Created storage_bucket: {bucket.name}") yield bucket.name # Print all the objects in the bucket before deleting for debugging. 
- print(f"Deleting bucket {bucket.name} with the following contents:") + print(f">> Deleting bucket {bucket.name} with the following contents:") total_files = 0 total_size = 0 for blob in bucket.list_blobs(): - print(f"- {blob.name} ({blob.size} bytes)") + print(f" - {blob.name} ({blob.size} bytes)") total_files += 1 total_size += blob.size - print(f"Total {total_files} files ({total_size} bytes)") + print(f">> Total {total_files} files ({total_size} bytes)") bucket.delete(force=True) - print(f"Deleted storage_bucket: {bucket.name}") + print(f">> Deleted storage_bucket: {bucket.name}") @staticmethod def bigquery_dataset(name: str, project: str = PROJECT) -> str: @@ -87,20 +87,20 @@ def bigquery_dataset(name: str, project: str = PROJECT) -> str: bigquery.Dataset(f"{project}.{Utils.underscore_name(name)}") ) - print(f"Created bigquery_dataset: {dataset.full_dataset_id}") + print(f">> Created bigquery_dataset: {dataset.full_dataset_id}") yield dataset.full_dataset_id bigquery_client.delete_dataset( dataset.full_dataset_id.replace(":", "."), delete_contents=True ) - print(f"Deleted bigquery_dataset: {dataset.full_dataset_id}") + print(f">> Deleted bigquery_dataset: {dataset.full_dataset_id}") @staticmethod def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: from google.cloud import bigquery bigquery_client = bigquery.Client() - print(f"Bigquery query: {query}") + print(f">> Bigquery query: {query}") for row in bigquery_client.query(query): yield dict(row) @@ -112,7 +112,7 @@ def pubsub_topic(name: str, project: str = PROJECT) -> str: topic_path = publisher_client.topic_path(project, Utils.hyphen_name(name)) topic = publisher_client.create_topic(topic_path) - print(f"Created pubsub_topic: {topic.name}") + print(f">> Created pubsub_topic: {topic.name}") yield topic.name # Due to the pinned library dependencies in apache-beam, client @@ -120,9 +120,9 @@ def pubsub_topic(name: str, project: str = PROJECT) -> str: # We use gcloud for a workaround. See also: # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 cmd = ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name] - print(cmd) + print(f">> {cmd}") subprocess.run(cmd, check=True) - print(f"Deleted pubsub_topic: {topic.name}") + print(f">> Deleted pubsub_topic: {topic.name}") @staticmethod def pubsub_subscription( @@ -138,7 +138,7 @@ def pubsub_subscription( ) subscription = subscriber.create_subscription(subscription_path, topic_path) - print(f"Created pubsub_subscription: {subscription.name}") + print(f">> Created pubsub_subscription: {subscription.name}") yield subscription.name # Due to the pinned library dependencies in apache-beam, client @@ -154,9 +154,9 @@ def pubsub_subscription( "delete", subscription.name, ] - print(cmd) + print(f">> {cmd}") subprocess.run(cmd, check=True) - print(f"Deleted pubsub_subscription: {subscription.name}") + print(f">> Deleted pubsub_subscription: {subscription.name}") @staticmethod def pubsub_publisher( @@ -176,14 +176,14 @@ def _infinite_publish_job() -> None: time.sleep(sleep_sec) # Start a subprocess in the background to do the publishing. - print(f"Starting publisher on {topic_path}") + print(f">> Starting publisher on {topic_path}") p = mp.Process(target=_infinite_publish_job) p.start() yield p.is_alive() # For cleanup, terminate the background process. 
- print("Stopping publisher") + print(">> Stopping publisher") p.join(timeout=0) p.terminate() @@ -197,7 +197,7 @@ def cloud_build_submit( ) -> None: """Sends a Cloud Build job, if an image_name is provided it will be deleted at teardown.""" cmd = ["gcloud", "auth", "configure-docker"] - print(cmd) + print(f">> {cmd}") subprocess.run(cmd, check=True) if substitutions: @@ -218,9 +218,9 @@ def cloud_build_submit( *cmd_substitutions, source, ] - print(cmd) + print(f">> {cmd}") subprocess.run(cmd, check=True) - print(f"Cloud build finished successfully: {config}") + print(f">> Cloud build finished successfully: {config}") yield f.read() elif image_name: cmd = [ @@ -232,9 +232,9 @@ def cloud_build_submit( *cmd_substitutions, source, ] - print(cmd) + print(f">> {cmd}") subprocess.run(cmd, check=True) - print(f"Created image: gcr.io/{project}/{image_name}:{UUID}") + print(f">> Created image: gcr.io/{project}/{image_name}:{UUID}") yield f"{image_name}:{UUID}" else: raise ValueError("must specify either `config` or `image_name`") @@ -250,9 +250,9 @@ def cloud_build_submit( "--force-delete-tags", "--quiet", ] - print(cmd) + print(f">> {cmd}") subprocess.run(cmd, check=True) - print(f"Deleted image: gcr.io/{project}/{image_name}:{UUID}") + print(f">> Deleted image: gcr.io/{project}/{image_name}:{UUID}") @staticmethod def dataflow_jobs_list( @@ -304,13 +304,13 @@ def dataflow_jobs_get( ) # If the job is not found, this throws an HttpError exception. job = request.execute() - print(f"Found Dataflow job: {job}") + print(f">> Found Dataflow job: {job}") return job elif job_name: for job in Utils.dataflow_jobs_list(project, list_page_size): if job["name"] == job_name: - print(f"Found Dataflow job: {job}") + print(f">> Found Dataflow job: {job}") return job raise ValueError(f"Dataflow job not found: job_name={job_name}") @@ -339,7 +339,7 @@ def dataflow_jobs_wait( "JOB_STATE_CANCELLED", } print( - f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" + f">> Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" ) status = None for _ in range(0, timeout_sec, poll_interval_sec): @@ -364,7 +364,7 @@ def dataflow_jobs_wait( def dataflow_jobs_cancel( job_id: str, project: str = PROJECT, region: str = REGION ) -> None: - print(f"Canceling Dataflow job ID: {job_id}") + print(f">> Canceling Dataflow job ID: {job_id}") # We get an error using the googleapiclient.discovery APIs, probably # due to incompatible dependencies with apache-beam. # We use gcloud instead to cancel the job. @@ -378,9 +378,9 @@ def dataflow_jobs_cancel( job_id, f"--region={region}", ] - print(cmd) + print(f">> {cmd}") subprocess.run(cmd, check=True) - print(f"Cancelled Dataflow job: {job_id}") + print(f">> Cancelled Dataflow job: {job_id}") @staticmethod def dataflow_flex_template_build( @@ -403,10 +403,10 @@ def dataflow_flex_template_build( "--sdk-language=PYTHON", f"--metadata-file={metadata_file}", ] - print(cmd) + print(f">> {cmd}") subprocess.run(cmd, check=True) - print(f"dataflow_flex_template_build: {template_gcs_path}") + print(f">> dataflow_flex_template_build: {template_gcs_path}") yield template_gcs_path # The template file gets deleted when we delete the bucket. 
@@ -423,7 +423,7 @@ def dataflow_flex_template_run( # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run unique_job_name = Utils.hyphen_name(job_name) - print(f"dataflow_job_name: {unique_job_name}") + print(f">> dataflow_job_name: {unique_job_name}") cmd = [ "gcloud", "dataflow", @@ -440,7 +440,7 @@ def dataflow_flex_template_run( "temp_location": f"gs://{bucket_name}/temp", }.items() ] - print(cmd) + print(f">> {cmd}") try: # The `capture_output` option was added in Python 3.7, so we must # pass the `stdout` and `stderr` options explicitly to support 3.6. @@ -450,7 +450,7 @@ def dataflow_flex_template_run( ) stdout = p.stdout.decode("utf-8") stderr = p.stderr.decode("utf-8") - print(f"Launched Dataflow Flex Template job: {unique_job_name}") + print(f">> Launched Dataflow Flex Template job: {unique_job_name}") except subprocess.CalledProcessError as e: print(e, file=sys.stderr) stdout = e.stdout.decode("utf-8") @@ -466,6 +466,6 @@ def dataflow_flex_template_run( @pytest.fixture(scope="session") def utils() -> Utils: - print(f"Test unique identifier: {UUID}") + print(f">> Test unique identifier: {UUID}") subprocess.run(["gcloud", "version"]) return Utils() From afdfa0f35b7a172aa8317d8f8ab5e3e430482004 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 15:17:14 -0700 Subject: [PATCH 46/87] fix typo --- dataflow/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 14fe00fbe82..64cdf8c83b8 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -364,7 +364,7 @@ def dataflow_jobs_wait( def dataflow_jobs_cancel( job_id: str, project: str = PROJECT, region: str = REGION ) -> None: - print(f">> Canceling Dataflow job ID: {job_id}") + print(f">> Cancelling Dataflow job ID: {job_id}") # We get an error using the googleapiclient.discovery APIs, probably # due to incompatible dependencies with apache-beam. # We use gcloud instead to cancel the job. From 8d50016606f7a47a21dc05c25b8b85b92362e337 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 15:19:33 -0700 Subject: [PATCH 47/87] add more logging --- dataflow/flex-templates/streaming_beam/e2e_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index c2c83ad02ea..4b879cc3289 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -95,8 +95,9 @@ def test_flex_template_run( # First, lets wait until the job is running. utils.dataflow_jobs_wait(job_id, until_status="JOB_STATE_RUNNING") - # Then, wait a couple minutes for data to arrive, get processed, and cancel it. - time.sleep(2 * 60) + # Then, for a while for data to arrive, get processed, and cancel it. + print(f">> Pipeline is running, waiting for messages to arrive") + time.sleep(60) utils.dataflow_jobs_cancel(job_id) # Check for the output data in BigQuery. 
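The patch above, and several that follow (48, 54, 55, 58), keep retuning a fixed `time.sleep` before the BigQuery check. A hedged alternative, not part of this series: poll until output rows actually appear, reusing the `Utils.bigquery_query` helper from `dataflow/conftest.py`. The `wait_for_rows` name and its defaults are hypothetical.

```py
import time
from typing import Any, Dict, List


def wait_for_rows(utils, query: str, timeout_sec: int = 300, poll_sec: int = 30) -> List[Dict[str, Any]]:
    """Polls BigQuery until `query` returns rows, instead of sleeping a fixed time."""
    for _ in range(0, timeout_sec, poll_sec):
        try:
            rows = list(utils.bigquery_query(query))
        except Exception:
            rows = []  # the streaming pipeline may not have created the output table yet
        if rows:
            return rows
        time.sleep(poll_sec)
    raise TimeoutError(f"No rows after {timeout_sec}s for query: {query}")
```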
From 5880ce94d037609de365d4e9a3a37a61cdd13017 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 15:26:31 -0700 Subject: [PATCH 48/87] wait longer for table to be created --- dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 4b879cc3289..d48fe67a9d1 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -97,7 +97,7 @@ def test_flex_template_run( # Then, for a while for data to arrive, get processed, and cancel it. print(f">> Pipeline is running, waiting for messages to arrive") - time.sleep(60) + time.sleep(5 * 60) utils.dataflow_jobs_cancel(job_id) # Check for the output data in BigQuery. From 68d3ad53c0938f0bd705bc9d21382e76950803c0 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 15:35:27 -0700 Subject: [PATCH 49/87] remove spurious f-string --- dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index d48fe67a9d1..db93c832e16 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -96,7 +96,7 @@ def test_flex_template_run( utils.dataflow_jobs_wait(job_id, until_status="JOB_STATE_RUNNING") # Then, for a while for data to arrive, get processed, and cancel it. - print(f">> Pipeline is running, waiting for messages to arrive") + print(">> Pipeline is running, waiting for messages to arrive") time.sleep(5 * 60) utils.dataflow_jobs_cancel(job_id) From 958a56a38fe6dc6f14a2684a6d4a0a96ac6c15da Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 16 Jun 2021 16:26:01 -0700 Subject: [PATCH 50/87] add disk size --- dataflow/gpu-workers/tensorflow-minimal/run.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dataflow/gpu-workers/tensorflow-minimal/run.yaml b/dataflow/gpu-workers/tensorflow-minimal/run.yaml index f5d2a77c0d8..7cb9d389134 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/run.yaml +++ b/dataflow/gpu-workers/tensorflow-minimal/run.yaml @@ -46,6 +46,7 @@ steps: - --sdk_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 + - --disk_size_gb=50 options: logging: CLOUD_LOGGING_ONLY From 1f47a9243611e1d81d6f441b9a34001d5a7bcd2b Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 17 Jun 2021 10:20:08 -0700 Subject: [PATCH 51/87] updated beam version --- dataflow/gpu-workers/tensorflow-landsat/main.py | 3 +-- dataflow/gpu-workers/tensorflow-landsat/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/dataflow/gpu-workers/tensorflow-landsat/main.py b/dataflow/gpu-workers/tensorflow-landsat/main.py index 21a31c69d48..6afa31522c3 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/main.py +++ b/dataflow/gpu-workers/tensorflow-landsat/main.py @@ -52,11 +52,10 @@ import logging import os import re -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Tuple import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions -from apache_beam.typehints.typehints import Optional import numpy as np from PIL import Image import rasterio diff --git 
a/dataflow/gpu-workers/tensorflow-landsat/requirements.txt b/dataflow/gpu-workers/tensorflow-landsat/requirements.txt index 1823ef09b96..cbfaaf47f75 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/requirements.txt +++ b/dataflow/gpu-workers/tensorflow-landsat/requirements.txt @@ -1,4 +1,4 @@ Pillow==8.2.0 -apache-beam[gcp]==2.29.0 +apache-beam[gcp]==2.30.0 rasterio==1.2.4 tensorflow==2.5.0 From 84759a4d87ca95eb39fba08e558592adc951a445 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 17 Jun 2021 10:21:42 -0700 Subject: [PATCH 52/87] add comments --- dataflow/gpu-workers/pytorch-minimal/Dockerfile | 2 ++ dataflow/gpu-workers/tensorflow-landsat/Dockerfile | 2 ++ dataflow/gpu-workers/tensorflow-minimal/Dockerfile | 2 ++ 3 files changed, 6 insertions(+) diff --git a/dataflow/gpu-workers/pytorch-minimal/Dockerfile b/dataflow/gpu-workers/pytorch-minimal/Dockerfile index 4bee40c9d13..0dcc04d017d 100644 --- a/dataflow/gpu-workers/pytorch-minimal/Dockerfile +++ b/dataflow/gpu-workers/pytorch-minimal/Dockerfile @@ -22,6 +22,8 @@ COPY requirements.txt . COPY *.py ./ # Install the pipeline requirements and check that there are no conflicts. +# Since the image already has all the dependencies installed, +# there's no need to run with the --requirements_file option. RUN pip install --no-cache-dir --upgrade pip \ && pip install --no-cache-dir -r requirements.txt \ && pip check diff --git a/dataflow/gpu-workers/tensorflow-landsat/Dockerfile b/dataflow/gpu-workers/tensorflow-landsat/Dockerfile index a8686076460..cc2d7eba729 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/Dockerfile +++ b/dataflow/gpu-workers/tensorflow-landsat/Dockerfile @@ -33,6 +33,8 @@ RUN apt-get update \ && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \ && curl https://bootstrap.pypa.io/get-pip.py | python \ # Install the pipeline requirements and check that there are no conflicts. + # Since the image already has all the dependencies installed, + # there's no need to run with the --requirements_file option. && pip install --no-cache-dir --upgrade pip \ && pip install --no-cache-dir -r requirements.txt \ && pip check diff --git a/dataflow/gpu-workers/tensorflow-minimal/Dockerfile b/dataflow/gpu-workers/tensorflow-minimal/Dockerfile index e892d4c28a8..ff88332cd41 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/Dockerfile +++ b/dataflow/gpu-workers/tensorflow-minimal/Dockerfile @@ -33,6 +33,8 @@ RUN apt-get update \ && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \ && curl https://bootstrap.pypa.io/get-pip.py | python \ # Install the pipeline requirements and check that there are no conflicts. + # Since the image already has all the dependencies installed, + # there's no need to run with the --requirements_file option. 
&& pip install --no-cache-dir --upgrade pip \ && pip install --no-cache-dir -r requirements.txt \ && pip check From d11968803b6f3b30451ed276da40fee0fc091795 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 17 Jun 2021 11:26:00 -0700 Subject: [PATCH 53/87] add missing fixture --- dataflow/gpu-workers/tensorflow-landsat/e2e_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index 50f01202b1b..3576e3b781f 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -56,7 +56,7 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: ) -def test_tensorflow_landsat(utils: Utils, run_job: str) -> None: +def test_tensorflow_landsat(utils: Utils, bucket_name: str, run_job: str) -> None: # Wait until the job finishes. timeout = 30 * 60 # 30 minutes status = utils.dataflow_jobs_wait( @@ -66,6 +66,7 @@ def test_tensorflow_landsat(utils: Utils, run_job: str) -> None: # Check that output files were created and are not empty. storage_client = storage.Client() + print(f">> Checking for output files in: gs://{bucket_name}/outputs/") output_files = list(storage_client.list_blobs(bucket_name, prefix="outputs/")) assert len(output_files) > 0, f"No files found in gs://{bucket_name}/outputs/" for output_file in output_files: From fe197aef9c046f17fc5c060432425ffd9d27a15d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 17 Jun 2021 11:27:47 -0700 Subject: [PATCH 54/87] decrease waiting time --- dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index db93c832e16..53228d7effe 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -97,7 +97,7 @@ def test_flex_template_run( # Then, for a while for data to arrive, get processed, and cancel it. print(">> Pipeline is running, waiting for messages to arrive") - time.sleep(5 * 60) + time.sleep(60) utils.dataflow_jobs_cancel(job_id) # Check for the output data in BigQuery. From 303e0091918d79dea25827b29d3a0d274c2f7228 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 17 Jun 2021 12:22:17 -0700 Subject: [PATCH 55/87] increase waiting time --- dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 53228d7effe..db93c832e16 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -97,7 +97,7 @@ def test_flex_template_run( # Then, for a while for data to arrive, get processed, and cancel it. print(">> Pipeline is running, waiting for messages to arrive") - time.sleep(60) + time.sleep(5 * 60) utils.dataflow_jobs_cancel(job_id) # Check for the output data in BigQuery. 
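The next patch adds an explicit `--staging-location` to the Flex Template launch command that `dataflow_flex_template_run` in `dataflow/conftest.py` assembles. For context, a sketch of the resulting invocation; the project, region, bucket, and job names below are placeholders, not values from a real run:

```py
import subprocess

project = "my-project"      # placeholder
bucket_name = "my-bucket"   # placeholder
template_path = f"gs://{bucket_name}/template.json"

cmd = [
    "gcloud", "dataflow", "flex-template", "run", "my-job-name",
    f"--template-file-gcs-location={template_path}",
    f"--project={project}",
    "--region=us-central1",
    # Added by the next patch: where the launcher stages its files.
    f"--staging-location=gs://{bucket_name}/staging",
    # temp_location is forwarded to the pipeline as a template parameter.
    f"--parameters=temp_location=gs://{bucket_name}/temp",
]
subprocess.run(cmd, check=True)
```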
From 748345ff258a5df79d257d97b814228859eaaf7d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 17 Jun 2021 12:32:26 -0700 Subject: [PATCH 56/87] provide staging location --- dataflow/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 64cdf8c83b8..8a0aa35ac70 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -433,6 +433,7 @@ def dataflow_flex_template_run( f"--template-file-gcs-location={template_path}", f"--project={project}", f"--region={region}", + f"--staging-location=gs://{bucket_name}/staging", ] + [ f"--parameters={name}={value}" for name, value in { From 226d642ebbfa5a6d38709531f147960669b6c178 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 18 Jun 2021 13:12:09 -0700 Subject: [PATCH 57/87] drain before cancel --- dataflow/conftest.py | 110 ++++++++++-------- .../flex-templates/streaming_beam/e2e_test.py | 5 +- 2 files changed, 67 insertions(+), 48 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 8a0aa35ac70..ea31511615e 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -61,21 +61,21 @@ def storage_bucket(name: str) -> str: storage_client = storage.Client() bucket = storage_client.create_bucket(Utils.hyphen_name(name)) - print(f">> Created storage_bucket: {bucket.name}") + logging.info(f"Created storage_bucket: {bucket.name}") yield bucket.name # Print all the objects in the bucket before deleting for debugging. - print(f">> Deleting bucket {bucket.name} with the following contents:") + logging.info(f"Deleting bucket {bucket.name} with the following contents:") total_files = 0 total_size = 0 for blob in bucket.list_blobs(): - print(f" - {blob.name} ({blob.size} bytes)") + logging.info(f" - {blob.name} ({blob.size} bytes)") total_files += 1 total_size += blob.size - print(f">> Total {total_files} files ({total_size} bytes)") + logging.info(f"Total {total_files} files ({total_size} bytes)") bucket.delete(force=True) - print(f">> Deleted storage_bucket: {bucket.name}") + logging.info(f"Deleted storage_bucket: {bucket.name}") @staticmethod def bigquery_dataset(name: str, project: str = PROJECT) -> str: @@ -87,20 +87,20 @@ def bigquery_dataset(name: str, project: str = PROJECT) -> str: bigquery.Dataset(f"{project}.{Utils.underscore_name(name)}") ) - print(f">> Created bigquery_dataset: {dataset.full_dataset_id}") + logging.info(f"Created bigquery_dataset: {dataset.full_dataset_id}") yield dataset.full_dataset_id bigquery_client.delete_dataset( dataset.full_dataset_id.replace(":", "."), delete_contents=True ) - print(f">> Deleted bigquery_dataset: {dataset.full_dataset_id}") + logging.info(f"Deleted bigquery_dataset: {dataset.full_dataset_id}") @staticmethod def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: from google.cloud import bigquery bigquery_client = bigquery.Client() - print(f">> Bigquery query: {query}") + logging.info(f"Bigquery query: {query}") for row in bigquery_client.query(query): yield dict(row) @@ -112,7 +112,7 @@ def pubsub_topic(name: str, project: str = PROJECT) -> str: topic_path = publisher_client.topic_path(project, Utils.hyphen_name(name)) topic = publisher_client.create_topic(topic_path) - print(f">> Created pubsub_topic: {topic.name}") + logging.info(f"Created pubsub_topic: {topic.name}") yield topic.name # Due to the pinned library dependencies in apache-beam, client @@ -120,9 +120,9 @@ def pubsub_topic(name: str, project: str = PROJECT) -> str: # We use gcloud for a workaround. 
See also: # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 cmd = ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name] - print(f">> {cmd}") + logging.info(f"{cmd}") subprocess.run(cmd, check=True) - print(f">> Deleted pubsub_topic: {topic.name}") + logging.info(f"Deleted pubsub_topic: {topic.name}") @staticmethod def pubsub_subscription( @@ -138,7 +138,7 @@ def pubsub_subscription( ) subscription = subscriber.create_subscription(subscription_path, topic_path) - print(f">> Created pubsub_subscription: {subscription.name}") + logging.info(f"Created pubsub_subscription: {subscription.name}") yield subscription.name # Due to the pinned library dependencies in apache-beam, client @@ -154,9 +154,9 @@ def pubsub_subscription( "delete", subscription.name, ] - print(f">> {cmd}") + logging.info(f"{cmd}") subprocess.run(cmd, check=True) - print(f">> Deleted pubsub_subscription: {subscription.name}") + logging.info(f"Deleted pubsub_subscription: {subscription.name}") @staticmethod def pubsub_publisher( @@ -176,14 +176,14 @@ def _infinite_publish_job() -> None: time.sleep(sleep_sec) # Start a subprocess in the background to do the publishing. - print(f">> Starting publisher on {topic_path}") + logging.info(f"Starting publisher on {topic_path}") p = mp.Process(target=_infinite_publish_job) p.start() yield p.is_alive() # For cleanup, terminate the background process. - print(">> Stopping publisher") + logging.info("Stopping publisher") p.join(timeout=0) p.terminate() @@ -197,7 +197,7 @@ def cloud_build_submit( ) -> None: """Sends a Cloud Build job, if an image_name is provided it will be deleted at teardown.""" cmd = ["gcloud", "auth", "configure-docker"] - print(f">> {cmd}") + logging.info(f"{cmd}") subprocess.run(cmd, check=True) if substitutions: @@ -218,9 +218,9 @@ def cloud_build_submit( *cmd_substitutions, source, ] - print(f">> {cmd}") + logging.info(f"{cmd}") subprocess.run(cmd, check=True) - print(f">> Cloud build finished successfully: {config}") + logging.info(f"Cloud build finished successfully: {config}") yield f.read() elif image_name: cmd = [ @@ -232,9 +232,9 @@ def cloud_build_submit( *cmd_substitutions, source, ] - print(f">> {cmd}") + logging.info(f"{cmd}") subprocess.run(cmd, check=True) - print(f">> Created image: gcr.io/{project}/{image_name}:{UUID}") + logging.info(f"Created image: gcr.io/{project}/{image_name}:{UUID}") yield f"{image_name}:{UUID}" else: raise ValueError("must specify either `config` or `image_name`") @@ -250,9 +250,9 @@ def cloud_build_submit( "--force-delete-tags", "--quiet", ] - print(f">> {cmd}") + logging.info(f"{cmd}") subprocess.run(cmd, check=True) - print(f">> Deleted image: gcr.io/{project}/{image_name}:{UUID}") + logging.info(f"Deleted image: gcr.io/{project}/{image_name}:{UUID}") @staticmethod def dataflow_jobs_list( @@ -304,13 +304,13 @@ def dataflow_jobs_get( ) # If the job is not found, this throws an HttpError exception. 
job = request.execute() - print(f">> Found Dataflow job: {job}") + logging.info(f"Found Dataflow job: {job}") return job elif job_name: for job in Utils.dataflow_jobs_list(project, list_page_size): if job["name"] == job_name: - print(f">> Found Dataflow job: {job}") + logging.info(f"Found Dataflow job: {job}") return job raise ValueError(f"Dataflow job not found: job_name={job_name}") @@ -324,7 +324,7 @@ def dataflow_jobs_wait( project: str = PROJECT, until_status: str = "JOB_STATE_DONE", timeout_sec: str = 600, # defaults to 10 minutes - poll_interval_sec=30, + poll_interval_sec=60, list_page_size=100, ) -> Optional[str]: """For a list of all the valid states: @@ -338,10 +338,9 @@ def dataflow_jobs_wait( "JOB_STATE_FAILED", "JOB_STATE_CANCELLED", } - print( - f">> Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" + logging.info( + f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" ) - status = None for _ in range(0, timeout_sec, poll_interval_sec): try: job = Utils.dataflow_jobs_get( @@ -352,7 +351,13 @@ def dataflow_jobs_wait( ) status = job["currentState"] if status in target_status: + logging.info( + f"Job status {status} in {target_status}, done waiting" + ) return status + logging.info( + f"Job status {status} not in {target_status}, retrying in {poll_interval_sec} seconds" + ) except Exception as e: logging.exception(e) time.sleep(poll_interval_sec) @@ -364,11 +369,24 @@ def dataflow_jobs_wait( def dataflow_jobs_cancel( job_id: str, project: str = PROJECT, region: str = REGION ) -> None: - print(f">> Cancelling Dataflow job ID: {job_id}") + logging.info(f"Cancelling Dataflow job ID: {job_id}") # We get an error using the googleapiclient.discovery APIs, probably # due to incompatible dependencies with apache-beam. # We use gcloud instead to cancel the job. - # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/cancel + # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/drain + cmd = [ + "gcloud", + f"--project={project}", + "dataflow", + "jobs", + "drain", + job_id, + f"--region={region}", + ] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + + # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/cancel cmd = [ "gcloud", f"--project={project}", @@ -378,9 +396,9 @@ def dataflow_jobs_cancel( job_id, f"--region={region}", ] - print(f">> {cmd}") + logging.info(f"{cmd}") subprocess.run(cmd, check=True) - print(f">> Cancelled Dataflow job: {job_id}") + logging.info(f"Cancelled Dataflow job: {job_id}") @staticmethod def dataflow_flex_template_build( @@ -403,10 +421,10 @@ def dataflow_flex_template_build( "--sdk-language=PYTHON", f"--metadata-file={metadata_file}", ] - print(f">> {cmd}") + logging.info(f"{cmd}") subprocess.run(cmd, check=True) - print(f">> dataflow_flex_template_build: {template_gcs_path}") + logging.info(f"dataflow_flex_template_build: {template_gcs_path}") yield template_gcs_path # The template file gets deleted when we delete the bucket. 
@@ -423,7 +441,7 @@ def dataflow_flex_template_run( # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run unique_job_name = Utils.hyphen_name(job_name) - print(f">> dataflow_job_name: {unique_job_name}") + logging.info(f"dataflow_job_name: {unique_job_name}") cmd = [ "gcloud", "dataflow", @@ -438,10 +456,10 @@ def dataflow_flex_template_run( f"--parameters={name}={value}" for name, value in { **parameters, - "temp_location": f"gs://{bucket_name}/temp", }.items() ] - print(f">> {cmd}") + logging.info(f"{cmd}") + try: # The `capture_output` option was added in Python 3.7, so we must # pass the `stdout` and `stderr` options explicitly to support 3.6. @@ -451,22 +469,22 @@ def dataflow_flex_template_run( ) stdout = p.stdout.decode("utf-8") stderr = p.stderr.decode("utf-8") - print(f">> Launched Dataflow Flex Template job: {unique_job_name}") + logging.info(f"Launched Dataflow Flex Template job: {unique_job_name}") except subprocess.CalledProcessError as e: - print(e, file=sys.stderr) + logging.info(e, file=sys.stderr) stdout = e.stdout.decode("utf-8") stderr = e.stderr.decode("utf-8") finally: - print("--- stderr ---") - print(stderr) - print("--- stdout ---") - print(stdout) - print("--- end ---") + logging.info("--- stderr ---") + logging.info(stderr) + logging.info("--- stdout ---") + logging.info(stdout) + logging.info("--- end ---") return yaml.safe_load(stdout)["job"]["id"] @pytest.fixture(scope="session") def utils() -> Utils: - print(f">> Test unique identifier: {UUID}") + logging.info(f"Test unique identifier: {UUID}") subprocess.run(["gcloud", "version"]) return Utils() diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index db93c832e16..d2828d731a1 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. import json +import logging import time try: @@ -96,8 +97,8 @@ def test_flex_template_run( utils.dataflow_jobs_wait(job_id, until_status="JOB_STATE_RUNNING") # Then, for a while for data to arrive, get processed, and cancel it. - print(">> Pipeline is running, waiting for messages to arrive") - time.sleep(5 * 60) + logging.info("Pipeline is running, waiting for messages to arrive") + time.sleep(60) utils.dataflow_jobs_cancel(job_id) # Check for the output data in BigQuery. From 16e5d8e917bd6439ac74c8022de3f4fb73ff2c68 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 18 Jun 2021 13:39:36 -0700 Subject: [PATCH 58/87] adjust wait time --- dataflow/flex-templates/streaming_beam/e2e_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index d2828d731a1..4fb3c484491 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -98,7 +98,7 @@ def test_flex_template_run( # Then, for a while for data to arrive, get processed, and cancel it. logging.info("Pipeline is running, waiting for messages to arrive") - time.sleep(60) + time.sleep(5 * 60) utils.dataflow_jobs_cancel(job_id) # Check for the output data in BigQuery. 
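Patch 57 above drains the job before cancelling it, so in-flight elements finish processing instead of being dropped. Draining applies only to streaming pipelines, so a drain request is expected to fail for batch jobs; a defensive variant (a sketch, not the implementation used by this series) tolerates that and falls through to cancel:

```py
import subprocess


def drain_then_cancel(job_id: str, project: str, region: str) -> None:
    """Requests a drain, then cancels; the drain step may fail for batch jobs."""
    base = ["gcloud", f"--project={project}", "dataflow", "jobs"]
    # Ask the job to drain first (no check=True: batch jobs reject drain requests).
    drain = subprocess.run(base + ["drain", job_id, f"--region={region}"])
    if drain.returncode != 0:
        print(f"Drain failed or unsupported for job {job_id}; cancelling directly")
    subprocess.run(base + ["cancel", job_id, f"--region={region}"], check=True)
```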
From aa29c54962d200f7035d8f09bde744db7b7bfc88 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 18 Jun 2021 14:40:42 -0700 Subject: [PATCH 59/87] adjust timeout --- dataflow/conftest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index ea31511615e..b397bb848ed 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -323,7 +323,7 @@ def dataflow_jobs_wait( job_name: Optional[str] = None, project: str = PROJECT, until_status: str = "JOB_STATE_DONE", - timeout_sec: str = 600, # defaults to 10 minutes + timeout_sec: str = 20 * 60, # defaults to 20 minutes poll_interval_sec=60, list_page_size=100, ) -> Optional[str]: @@ -341,7 +341,7 @@ def dataflow_jobs_wait( logging.info( f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" ) - for _ in range(0, timeout_sec, poll_interval_sec): + for _ in range(0, timeout_sec + 1, poll_interval_sec): try: job = Utils.dataflow_jobs_get( job_id=job_id, @@ -485,6 +485,7 @@ def dataflow_flex_template_run( @pytest.fixture(scope="session") def utils() -> Utils: + logging.getLogger().setLevel(logging.info) logging.info(f"Test unique identifier: {UUID}") subprocess.run(["gcloud", "version"]) return Utils() From 57eb9ab49ad2147dc161a25db0977f9ecb40eefe Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 18 Jun 2021 14:57:00 -0700 Subject: [PATCH 60/87] adjust timeout --- dataflow/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index b397bb848ed..4e27e8df4f5 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -485,7 +485,7 @@ def dataflow_flex_template_run( @pytest.fixture(scope="session") def utils() -> Utils: - logging.getLogger().setLevel(logging.info) + logging.getLogger().setLevel(logging.INFO) logging.info(f"Test unique identifier: {UUID}") subprocess.run(["gcloud", "version"]) return Utils() From 3104e6370f70d95a27d7312c87f6f6dad8a314bc Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 18 Jun 2021 15:43:44 -0700 Subject: [PATCH 61/87] improve error messages --- dataflow/conftest.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 4e27e8df4f5..b69d3dabf35 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -341,6 +341,7 @@ def dataflow_jobs_wait( logging.info( f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" ) + status = None for _ in range(0, timeout_sec + 1, poll_interval_sec): try: job = Utils.dataflow_jobs_get( @@ -361,9 +362,14 @@ def dataflow_jobs_wait( except Exception as e: logging.exception(e) time.sleep(poll_interval_sec) - raise RuntimeError( - f"Dataflow job not found in status {target_status}: job_id={job_id}, job_name={job_name}" - ) + if status is None: + raise RuntimeError( + f"Dataflow job not found: timeout_sec={timeout_sec}, target_status={target_status}, job_id={job_id}, job_name={job_name}" + ) + else: + raise RuntimeError( + f"Dataflow job finished in status {status} but expected {target_status}: job_id={job_id}, job_name={job_name}" + ) @staticmethod def dataflow_jobs_cancel( From 60372bb6384588dabf185955a8f50bd793e5f368 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 21 Jun 2021 11:29:31 -0700 Subject: [PATCH 62/87] small refactorings --- dataflow/conftest.py | 8 ++++++- .../flex-templates/streaming_beam/e2e_test.py | 19 +++++++++------ .../gpu-workers/pytorch-minimal/README.md | 18 
+++++++-------- .../gpu-workers/pytorch-minimal/e2e_test.py | 4 ++-- .../gpu-workers/tensorflow-landsat/README.md | 23 ++++++++----------- .../tensorflow-landsat/e2e_test.py | 6 +++-- .../gpu-workers/tensorflow-minimal/README.md | 18 +++++++-------- .../tensorflow-minimal/e2e_test.py | 4 ++-- 8 files changed, 52 insertions(+), 48 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index b69d3dabf35..6dec741f172 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -322,8 +322,9 @@ def dataflow_jobs_wait( job_id: Optional[str] = None, job_name: Optional[str] = None, project: str = PROJECT, + region: str = REGION, until_status: str = "JOB_STATE_DONE", - timeout_sec: str = 20 * 60, # defaults to 20 minutes + timeout_sec: str = 30 * 60, poll_interval_sec=60, list_page_size=100, ) -> Optional[str]: @@ -356,6 +357,11 @@ def dataflow_jobs_wait( f"Job status {status} in {target_status}, done waiting" ) return status + elif status == "JOB_STATE_FAILED": + raise RuntimeError( + "Dataflow job failed:\n" + f"https://console.cloud.google.com/dataflow/jobs/{region}/{job_id}?project={project}" + ) logging.info( f"Job status {status} not in {target_status}, retrying in {poll_interval_sec} seconds" ) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 4fb3c484491..6faec89b8e5 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -72,23 +72,23 @@ def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) ) -def test_flex_template_run( +@pytest.fixture(scope="session") +def run_dataflow_job( utils: Utils, bucket_name: str, pubsub_publisher: str, pubsub_subscription: str, flex_template_path: str, bigquery_dataset: str, -) -> None: +) -> str: - bigquery_table = "output_table" job_id = utils.dataflow_flex_template_run( job_name=NAME, template_path=flex_template_path, bucket_name=bucket_name, parameters={ "input_subscription": pubsub_subscription, - "output_table": f"{bigquery_dataset}.{bigquery_table}", + "output_table": f"{bigquery_dataset}.output_table", }, ) @@ -96,13 +96,18 @@ def test_flex_template_run( # First, lets wait until the job is running. utils.dataflow_jobs_wait(job_id, until_status="JOB_STATE_RUNNING") - # Then, for a while for data to arrive, get processed, and cancel it. + yield job_id + + utils.dataflow_jobs_cancel(job_id) + + +def test_flex_template_run(utils: Utils, run_dataflow_job: str) -> None: + # Wait for a while for data to arrive and get processed. logging.info("Pipeline is running, waiting for messages to arrive") time.sleep(5 * 60) - utils.dataflow_jobs_cancel(job_id) # Check for the output data in BigQuery. - query = f"SELECT * FROM `{bigquery_dataset.replace(':', '.')}.{bigquery_table}`" + query = f"SELECT * FROM `{bigquery_dataset.replace(':', '.')}.output_table`" rows = list(utils.bigquery_query(query)) assert len(rows) > 0 for row in rows: diff --git a/dataflow/gpu-workers/pytorch-minimal/README.md b/dataflow/gpu-workers/pytorch-minimal/README.md index 3ec270791f9..43e24830529 100644 --- a/dataflow/gpu-workers/pytorch-minimal/README.md +++ b/dataflow/gpu-workers/pytorch-minimal/README.md @@ -5,15 +5,12 @@ Make sure you have followed the [Dataflow setup instructions](../../README.md). -Finally, save your resource names in environment variables. 
- -```sh -export PROJECT=$(gcloud config get-value project) -``` - ## Building the Docker image -We use Cloud Build to build the container image for the workers. +We use +[Cloud Build](https://cloud.google.com/build) +to build the container image for the workers and save it in +[Container Registry](https://cloud.google.com/container-registry/). ```sh gcloud builds submit --config build.yaml @@ -21,9 +18,10 @@ gcloud builds submit --config build.yaml ## Running the Dataflow job with GPUs -We use Cloud Build to run the Dataflow job. -We launch the job using the worker image to make sure the job launches -with the same Python version as the workers. +We use Cloud Build to run the [Dataflow](https://cloud.google.com/dataflow) job. + +> ℹ️ We launch the job using the worker image to make sure the job launches +> with the same Python version as the workers and all the dependencies installed. ```sh export REGION="us-central1" diff --git a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py index c24ee471e5d..52d6a2c7ab0 100644 --- a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py +++ b/dataflow/gpu-workers/pytorch-minimal/e2e_test.py @@ -40,7 +40,7 @@ def build_image(utils: Utils) -> str: @pytest.fixture(scope="session") -def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: +def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. yield from utils.cloud_build_submit( config="run.yaml", @@ -54,7 +54,7 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: ) -def test_pytorch_minimal(utils: Utils, run_job: str) -> None: +def test_pytorch_minimal(utils: Utils, run_dataflow_job: str) -> None: # Wait until the job finishes. status = utils.dataflow_jobs_wait(job_name=utils.hyphen_name(NAME)) assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" diff --git a/dataflow/gpu-workers/tensorflow-landsat/README.md b/dataflow/gpu-workers/tensorflow-landsat/README.md index 7f826e9c00c..dd5b8fadbc7 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/README.md +++ b/dataflow/gpu-workers/tensorflow-landsat/README.md @@ -7,20 +7,14 @@ ## Before you begin Make sure you have followed the -[Dataflow setup instructions](../../README.md), and additionally: - -* Use or [create a Cloud Storage bucket](https://console.cloud.google.com/storage/create-bucket). - -Finally, save your resource names in environment variables. - -```sh -export PROJECT=$(gcloud config get-value project) -export BUCKET="my-bucket-name" -``` +[Dataflow setup instructions](../../README.md). ## Building the Docker image -We use Cloud Build to build the container image for the workers. +We use +[Cloud Build](https://cloud.google.com/build) +to build the container image for the workers and save it in +[Container Registry](https://cloud.google.com/container-registry/). ```sh gcloud builds submit --config build.yaml @@ -28,9 +22,10 @@ gcloud builds submit --config build.yaml ## Running the Dataflow job with GPUs -We use Cloud Build to run the Dataflow job. -We launch the job using the worker image to make sure the job launches -with the same Python version as the workers. +We use Cloud Build to run the [Dataflow](https://cloud.google.com/dataflow) job. + +> ℹ️ We launch the job using the worker image to make sure the job launches +> with the same Python version as the workers and all the dependencies installed. 
```sh export OUTPUT_PATH="gs://$BUCKET/samples/dataflow/landsat/output-images/" diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py index 3576e3b781f..972fe627f42 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py @@ -41,7 +41,7 @@ def build_image(utils: Utils) -> str: @pytest.fixture(scope="session") -def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: +def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. yield from utils.cloud_build_submit( config="run.yaml", @@ -56,7 +56,9 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: ) -def test_tensorflow_landsat(utils: Utils, bucket_name: str, run_job: str) -> None: +def test_tensorflow_landsat( + utils: Utils, bucket_name: str, run_dataflow_job: str +) -> None: # Wait until the job finishes. timeout = 30 * 60 # 30 minutes status = utils.dataflow_jobs_wait( diff --git a/dataflow/gpu-workers/tensorflow-minimal/README.md b/dataflow/gpu-workers/tensorflow-minimal/README.md index a645dbb411b..debd86b0e91 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/README.md +++ b/dataflow/gpu-workers/tensorflow-minimal/README.md @@ -5,15 +5,12 @@ Make sure you have followed the [Dataflow setup instructions](../../README.md). -Finally, save your resource names in environment variables. - -```sh -export PROJECT=$(gcloud config get-value project) -``` - ## Building the Docker image -We use Cloud Build to build the container image for the workers. +We use +[Cloud Build](https://cloud.google.com/build) +to build the container image for the workers and save it in +[Container Registry](https://cloud.google.com/container-registry/). ```sh gcloud builds submit --config build.yaml @@ -21,9 +18,10 @@ gcloud builds submit --config build.yaml ## Running the Dataflow job with GPUs -We use Cloud Build to run the Dataflow job. -We launch the job using the worker image to make sure the job launches -with the same Python version as the workers. +We use Cloud Build to run the [Dataflow](https://cloud.google.com/dataflow) job. + +> ℹ️ We launch the job using the worker image to make sure the job launches +> with the same Python version as the workers and all the dependencies installed. ```sh export REGION="us-central1" diff --git a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py index ebac5c00dd7..6c890550d68 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py +++ b/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py @@ -40,7 +40,7 @@ def build_image(utils: Utils) -> str: @pytest.fixture(scope="session") -def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: +def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. yield from utils.cloud_build_submit( config="run.yaml", @@ -54,7 +54,7 @@ def run_job(utils: Utils, bucket_name: str, build_image: str) -> str: ) -def test_tensorflow_minimal(utils: Utils, run_job: str) -> None: +def test_tensorflow_minimal(utils: Utils, run_dataflow_job: str) -> None: # Wait until the job finishes. 
status = utils.dataflow_jobs_wait(job_name=utils.hyphen_name(NAME)) assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status" From 688865f804e1119f8be1e2eca172fe0de3397c2f Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 21 Jun 2021 12:04:27 -0700 Subject: [PATCH 63/87] add missing fixture --- dataflow/flex-templates/streaming_beam/e2e_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 6faec89b8e5..752fadf323b 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -101,7 +101,9 @@ def run_dataflow_job( utils.dataflow_jobs_cancel(job_id) -def test_flex_template_run(utils: Utils, run_dataflow_job: str) -> None: +def test_flex_template_run( + utils: Utils, bigquery_dataset: str, run_dataflow_job: str +) -> None: # Wait for a while for data to arrive and get processed. logging.info("Pipeline is running, waiting for messages to arrive") time.sleep(5 * 60) From 230fa94d137f3267dacd3fb900d3dd6552a74af1 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 23 Jun 2021 09:45:11 -0700 Subject: [PATCH 64/87] updated timeout --- dataflow/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 6dec741f172..41b3aea8490 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -284,7 +284,7 @@ def dataflow_jobs_get( job_id: Optional[str] = None, job_name: Optional[str] = None, project: str = PROJECT, - list_page_size=30, + list_page_size: int = 30, ) -> Optional[Dict[str, Any]]: from googleapiclient.discovery import build @@ -343,7 +343,7 @@ def dataflow_jobs_wait( f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" ) status = None - for _ in range(0, timeout_sec + 1, poll_interval_sec): + for _ in range(0, timeout_sec, poll_interval_sec): try: job = Utils.dataflow_jobs_get( job_id=job_id, From a34132759a0edeb7f60f57da0b483c3bcb57335a Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 23 Jun 2021 18:34:30 +0000 Subject: [PATCH 65/87] containerize gpu tests --- dataflow/conftest.py | 387 +++++--------- .../flex-templates/streaming_beam/e2e_test.py | 41 +- dataflow/gpu-workers/conftest.py | 503 ++++++++++++++++++ 3 files changed, 651 insertions(+), 280 deletions(-) create mode 100644 dataflow/gpu-workers/conftest.py diff --git a/dataflow/conftest.py b/dataflow/conftest.py index 41b3aea8490..13314bf86dd 100644 --- a/dataflow/conftest.py +++ b/dataflow/conftest.py @@ -13,15 +13,12 @@ from dataclasses import dataclass import itertools import json -import logging import multiprocessing as mp import os -import re -import platform import subprocess import sys import time -from typing import Any, Callable, Dict, Iterable, Optional, Union +from typing import Any, Callable, Dict, Iterable, Optional import uuid import pytest @@ -29,90 +26,65 @@ # Default options. 
UUID = uuid.uuid4().hex[0:6] PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] -REGION = "us-central1" +REGION = "us-west1" +ZONE = "us-west1-b" RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds -HYPHEN_NAME_RE = re.compile(r"[^\w\d-]+") -UNDERSCORE_NAME_RE = re.compile(r"[^\w\d_]+") - -PYTHON_VERSION = "".join(platform.python_version_tuple()[0:2]) - @dataclass class Utils: uuid: str = UUID project: str = PROJECT region: str = REGION + zone: str = ZONE @staticmethod - def hyphen_name(name: str) -> str: - unique_name = f"{name}-py{PYTHON_VERSION}-{UUID}" - return HYPHEN_NAME_RE.sub("-", unique_name) - - @staticmethod - def underscore_name(name: str) -> str: - return UNDERSCORE_NAME_RE.sub("_", Utils.hyphen_name(name)) - - @staticmethod - def storage_bucket(name: str) -> str: + def storage_bucket(bucket_name: str) -> str: from google.cloud import storage storage_client = storage.Client() - bucket = storage_client.create_bucket(Utils.hyphen_name(name)) + bucket_unique_name = f"{bucket_name}-{UUID}" + bucket = storage_client.create_bucket(bucket_unique_name) - logging.info(f"Created storage_bucket: {bucket.name}") - yield bucket.name - - # Print all the objects in the bucket before deleting for debugging. - logging.info(f"Deleting bucket {bucket.name} with the following contents:") - total_files = 0 - total_size = 0 - for blob in bucket.list_blobs(): - logging.info(f" - {blob.name} ({blob.size} bytes)") - total_files += 1 - total_size += blob.size - logging.info(f"Total {total_files} files ({total_size} bytes)") + print(f"storage_bucket: {bucket_unique_name}") + yield bucket_unique_name bucket.delete(force=True) - logging.info(f"Deleted storage_bucket: {bucket.name}") @staticmethod - def bigquery_dataset(name: str, project: str = PROJECT) -> str: + def bigquery_dataset(dataset_name: str, project: str = PROJECT) -> str: from google.cloud import bigquery bigquery_client = bigquery.Client() - dataset = bigquery_client.create_dataset( - bigquery.Dataset(f"{project}.{Utils.underscore_name(name)}") + bigquery.Dataset(f"{project}.{dataset_name.replace('-', '_')}_{UUID}") ) - logging.info(f"Created bigquery_dataset: {dataset.full_dataset_id}") + print(f"bigquery_dataset: {dataset.full_dataset_id}") yield dataset.full_dataset_id bigquery_client.delete_dataset( dataset.full_dataset_id.replace(":", "."), delete_contents=True ) - logging.info(f"Deleted bigquery_dataset: {dataset.full_dataset_id}") @staticmethod def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: from google.cloud import bigquery bigquery_client = bigquery.Client() - logging.info(f"Bigquery query: {query}") for row in bigquery_client.query(query): yield dict(row) @staticmethod - def pubsub_topic(name: str, project: str = PROJECT) -> str: + def pubsub_topic(topic_name: str, project: str = PROJECT) -> str: from google.cloud import pubsub publisher_client = pubsub.PublisherClient() - topic_path = publisher_client.topic_path(project, Utils.hyphen_name(name)) + topic_path = publisher_client.topic_path(project, f"{topic_name}-{UUID}") topic = publisher_client.create_topic(topic_path) - logging.info(f"Created pubsub_topic: {topic.name}") + print(f"pubsub_topic: {topic.name}") yield topic.name # Due to the pinned library dependencies in apache-beam, client @@ -120,25 +92,24 @@ def pubsub_topic(name: str, project: str = PROJECT) -> str: # We use gcloud for a workaround. 
See also: # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 cmd = ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name] - logging.info(f"{cmd}") + print(cmd) subprocess.run(cmd, check=True) - logging.info(f"Deleted pubsub_topic: {topic.name}") @staticmethod def pubsub_subscription( topic_path: str, - name: str, + subscription_name: str, project: str = PROJECT, ) -> str: from google.cloud import pubsub subscriber = pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( - project, Utils.hyphen_name(name) + project, f"{subscription_name}-{UUID}" ) subscription = subscriber.create_subscription(subscription_path, topic_path) - logging.info(f"Created pubsub_subscription: {subscription.name}") + print(f"pubsub_subscription: {subscription.name}") yield subscription.name # Due to the pinned library dependencies in apache-beam, client @@ -154,9 +125,8 @@ def pubsub_subscription( "delete", subscription.name, ] - logging.info(f"{cmd}") + print(cmd) subprocess.run(cmd, check=True) - logging.info(f"Deleted pubsub_subscription: {subscription.name}") @staticmethod def pubsub_publisher( @@ -176,229 +146,128 @@ def _infinite_publish_job() -> None: time.sleep(sleep_sec) # Start a subprocess in the background to do the publishing. - logging.info(f"Starting publisher on {topic_path}") + print(f"Starting publisher on {topic_path}") p = mp.Process(target=_infinite_publish_job) p.start() yield p.is_alive() # For cleanup, terminate the background process. - logging.info("Stopping publisher") + print("Stopping publisher") p.join(timeout=0) p.terminate() @staticmethod - def cloud_build_submit( - image_name: Optional[str] = None, - config: Optional[str] = None, - source: str = ".", - substitutions: Optional[Dict[str, str]] = None, + def container_image( + image_path: str, project: str = PROJECT, - ) -> None: - """Sends a Cloud Build job, if an image_name is provided it will be deleted at teardown.""" + tag: str = "latest", + ) -> str: + image_name = f"gcr.io/{project}/{image_path}-{UUID}:{tag}" cmd = ["gcloud", "auth", "configure-docker"] - logging.info(f"{cmd}") + print(cmd) + subprocess.run(cmd, check=True) + cmd = [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--tag={image_name}", + ".", + ] + print(cmd) subprocess.run(cmd, check=True) - if substitutions: - cmd_substitutions = [ - f"--substitutions={','.join([k + '=' + v for k, v in substitutions.items()])}" - ] - else: - cmd_substitutions = [] - - if config: - with open(config) as f: - cmd = [ - "gcloud", - "builds", - "submit", - f"--project={project}", - f"--config={config}", - *cmd_substitutions, - source, - ] - logging.info(f"{cmd}") - subprocess.run(cmd, check=True) - logging.info(f"Cloud build finished successfully: {config}") - yield f.read() - elif image_name: - cmd = [ - "gcloud", - "builds", - "submit", - f"--project={project}", - f"--tag=gcr.io/{project}/{image_name}:{UUID}", - *cmd_substitutions, - source, - ] - logging.info(f"{cmd}") - subprocess.run(cmd, check=True) - logging.info(f"Created image: gcr.io/{project}/{image_name}:{UUID}") - yield f"{image_name}:{UUID}" - else: - raise ValueError("must specify either `config` or `image_name`") - - if image_name: - cmd = [ - "gcloud", - "container", - "images", - "delete", - f"gcr.io/{project}/{image_name}:{UUID}", - f"--project={project}", - "--force-delete-tags", - "--quiet", - ] - logging.info(f"{cmd}") - subprocess.run(cmd, check=True) - logging.info(f"Deleted image: gcr.io/{project}/{image_name}:{UUID}") + 
print(f"container_image: {image_name}") + yield image_name + + cmd = [ + "gcloud", + "container", + "images", + "delete", + image_name, + f"--project={project}", + "--quiet", + ] + print(cmd) + subprocess.run(cmd, check=True) @staticmethod - def dataflow_jobs_list( - project: str = PROJECT, page_size: int = 30 - ) -> Iterable[dict]: + def dataflow_job_id_from_job_name( + job_name: str, + project: str = PROJECT, + ) -> Optional[str]: from googleapiclient.discovery import build dataflow = build("dataflow", "v1b3") - response = {"nextPageToken": None} - while "nextPageToken" in response: - # For more info see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list - request = ( - dataflow.projects() - .jobs() - .list( - projectId=project, - pageToken=response["nextPageToken"], - pageSize=page_size, - ) + # Only return the 50 most recent results - our job is likely to be in here. + # If the job is not found, first try increasing this number.[]''job_id + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + jobs_request = ( + dataflow.projects() + .jobs() + .list( + projectId=project, + filter="ACTIVE", + pageSize=50, ) - response = request.execute() - for job in response["jobs"]: - yield job + ) + response = jobs_request.execute() + + # Search for the job in the list that has our name (names are unique) + for job in response["jobs"]: + if job["name"] == job_name: + return job["id"] + return None @staticmethod - def dataflow_jobs_get( - job_id: Optional[str] = None, - job_name: Optional[str] = None, + def dataflow_jobs_wait( + job_id: str, project: str = PROJECT, - list_page_size: int = 30, - ) -> Optional[Dict[str, Any]]: + status: str = "JOB_STATE_RUNNING", + ) -> bool: from googleapiclient.discovery import build dataflow = build("dataflow", "v1b3") - if job_id: - # For more info see: - # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get - request = ( - dataflow.projects() - .jobs() - .get( - projectId=project, - jobId=job_id, - view="JOB_VIEW_SUMMARY", - ) - ) - # If the job is not found, this throws an HttpError exception. - job = request.execute() - logging.info(f"Found Dataflow job: {job}") - return job - - elif job_name: - for job in Utils.dataflow_jobs_list(project, list_page_size): - if job["name"] == job_name: - logging.info(f"Found Dataflow job: {job}") - return job - raise ValueError(f"Dataflow job not found: job_name={job_name}") + sleep_time_seconds = 30 + max_sleep_time = 10 * 60 - else: - raise ValueError("must specify either `job_id` or `job_name`") - - @staticmethod - def dataflow_jobs_wait( - job_id: Optional[str] = None, - job_name: Optional[str] = None, - project: str = PROJECT, - region: str = REGION, - until_status: str = "JOB_STATE_DONE", - timeout_sec: str = 30 * 60, - poll_interval_sec=60, - list_page_size=100, - ) -> Optional[str]: - """For a list of all the valid states: - https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs#Job.JobState - """ - - # Wait until we reach the desired status, or the job finished in some way. 
- target_status = { - until_status, - "JOB_STATE_DONE", - "JOB_STATE_FAILED", - "JOB_STATE_CANCELLED", - } - logging.info( - f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" - ) - status = None - for _ in range(0, timeout_sec, poll_interval_sec): + print(f"Waiting for Dataflow job ID: {job_id} (until status {status})") + for _ in range(0, max_sleep_time, sleep_time_seconds): try: - job = Utils.dataflow_jobs_get( - job_id=job_id, - job_name=job_name, - project=project, - list_page_size=list_page_size, - ) - status = job["currentState"] - if status in target_status: - logging.info( - f"Job status {status} in {target_status}, done waiting" + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get + jobs_request = ( + dataflow.projects() + .jobs() + .get( + projectId=project, + jobId=job_id, + view="JOB_VIEW_SUMMARY", ) - return status - elif status == "JOB_STATE_FAILED": - raise RuntimeError( - "Dataflow job failed:\n" - f"https://console.cloud.google.com/dataflow/jobs/{region}/{job_id}?project={project}" - ) - logging.info( - f"Job status {status} not in {target_status}, retrying in {poll_interval_sec} seconds" ) - except Exception as e: - logging.exception(e) - time.sleep(poll_interval_sec) - if status is None: - raise RuntimeError( - f"Dataflow job not found: timeout_sec={timeout_sec}, target_status={target_status}, job_id={job_id}, job_name={job_name}" - ) - else: - raise RuntimeError( - f"Dataflow job finished in status {status} but expected {target_status}: job_id={job_id}, job_name={job_name}" - ) + response = jobs_request.execute() + print(response) + if response["currentState"] == status: + return True + except Exception: + pass + time.sleep(sleep_time_seconds) + return False @staticmethod - def dataflow_jobs_cancel( + def dataflow_jobs_cancel_by_job_id( job_id: str, project: str = PROJECT, region: str = REGION ) -> None: - logging.info(f"Cancelling Dataflow job ID: {job_id}") + print(f"Canceling Dataflow job ID: {job_id}") # We get an error using the googleapiclient.discovery APIs, probably # due to incompatible dependencies with apache-beam. # We use gcloud instead to cancel the job. - # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/drain - cmd = [ - "gcloud", - f"--project={project}", - "dataflow", - "jobs", - "drain", - job_id, - f"--region={region}", - ] - logging.info(f"{cmd}") - subprocess.run(cmd, check=True) - - # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/cancel + # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/cancel cmd = [ "gcloud", f"--project={project}", @@ -408,14 +277,22 @@ def dataflow_jobs_cancel( job_id, f"--region={region}", ] - logging.info(f"{cmd}") subprocess.run(cmd, check=True) - logging.info(f"Cancelled Dataflow job: {job_id}") + + @staticmethod + def dataflow_jobs_cancel_by_job_name( + job_name: str, project: str = PROJECT, region: str = REGION + ) -> None: + # To cancel a dataflow job, we need its ID, not its name. + # If the job is not found, job_id will be None.
+ job_id = Utils.dataflow_job_id_from_job_name(job_name, project) + if job_id is not None: + Utils.dataflow_jobs_cancel_by_job_id(job_id, project, region) @staticmethod def dataflow_flex_template_build( bucket_name: str, - image_name: str, + template_image: str, metadata_file: str, project: str = PROJECT, template_file: str = "template.json", @@ -429,14 +306,14 @@ def dataflow_flex_template_build( "build", template_gcs_path, f"--project={project}", - f"--image=gcr.io/{project}/{image_name}", + f"--image={template_image}", "--sdk-language=PYTHON", f"--metadata-file={metadata_file}", ] - logging.info(f"{cmd}") + print(cmd) subprocess.run(cmd, check=True) - logging.info(f"dataflow_flex_template_build: {template_gcs_path}") + print(f"dataflow_flex_template_build: {template_gcs_path}") yield template_gcs_path # The template file gets deleted when we delete the bucket. @@ -452,8 +329,8 @@ def dataflow_flex_template_run( import yaml # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run - unique_job_name = Utils.hyphen_name(job_name) - logging.info(f"dataflow_job_name: {unique_job_name}") + unique_job_name = f"{job_name}-{UUID}" + print(f"dataflow_job_name: {unique_job_name}") cmd = [ "gcloud", "dataflow", @@ -463,15 +340,14 @@ def dataflow_flex_template_run( f"--template-file-gcs-location={template_path}", f"--project={project}", f"--region={region}", - f"--staging-location=gs://{bucket_name}/staging", ] + [ f"--parameters={name}={value}" for name, value in { **parameters, + "temp_location": f"gs://{bucket_name}/temp", }.items() ] - logging.info(f"{cmd}") - + print(cmd) try: # The `capture_output` option was added in Python 3.7, so we must # pass the `stdout` and `stderr` options explicitly to support 3.6. @@ -481,23 +357,22 @@ def dataflow_flex_template_run( ) stdout = p.stdout.decode("utf-8") stderr = p.stderr.decode("utf-8") - logging.info(f"Launched Dataflow Flex Template job: {unique_job_name}") + print(f"Launched Dataflow Flex Template job: {unique_job_name}") except subprocess.CalledProcessError as e: - logging.info(e, file=sys.stderr) + print(e, file=sys.stderr) stdout = e.stdout.decode("utf-8") stderr = e.stderr.decode("utf-8") finally: - logging.info("--- stderr ---") - logging.info(stderr) - logging.info("--- stdout ---") - logging.info(stdout) - logging.info("--- end ---") + print("--- stderr ---") + print(stderr) + print("--- stdout ---") + print(stdout) + print("--- end ---") return yaml.safe_load(stdout)["job"]["id"] @pytest.fixture(scope="session") def utils() -> Utils: - logging.getLogger().setLevel(logging.INFO) - logging.info(f"Test unique identifier: {UUID}") + print(f"Test unique identifier: {UUID}") subprocess.run(["gcloud", "version"]) return Utils() diff --git a/dataflow/flex-templates/streaming_beam/e2e_test.py b/dataflow/flex-templates/streaming_beam/e2e_test.py index 752fadf323b..e642306ed4b 100644 --- a/dataflow/flex-templates/streaming_beam/e2e_test.py +++ b/dataflow/flex-templates/streaming_beam/e2e_test.py @@ -11,18 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. import json -import logging import time +# `conftest` cannot be imported when running in `nox`, but we still +# try to import it for the autocomplete when writing the tests.
from conftest import Utils except ModuleNotFoundError: - Utils = None + from typing import Any + + Utils = Any import pytest -NAME = "dataflow/flex-templates/streaming-beam" +NAME = "dataflow-flex-templates-streaming-beam" @pytest.fixture(scope="session") @@ -60,56 +61,48 @@ def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool: @pytest.fixture(scope="session") def flex_template_image(utils: Utils) -> str: - yield from utils.cloud_build_submit(NAME) + yield from utils.container_image(NAME) @pytest.fixture(scope="session") def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str: yield from utils.dataflow_flex_template_build( bucket_name=bucket_name, - image_name=flex_template_image, + template_image=flex_template_image, metadata_file="metadata.json", ) -@pytest.fixture(scope="session") -def run_dataflow_job( +def test_flex_template_run( utils: Utils, bucket_name: str, pubsub_publisher: str, pubsub_subscription: str, flex_template_path: str, bigquery_dataset: str, -) -> str: +) -> None: + bigquery_table = "output_table" job_id = utils.dataflow_flex_template_run( job_name=NAME, template_path=flex_template_path, bucket_name=bucket_name, parameters={ "input_subscription": pubsub_subscription, - "output_table": f"{bigquery_dataset}.output_table", + "output_table": f"{bigquery_dataset}.{bigquery_table}", }, ) # Since this is a streaming job, it will never finish running. # First, lets wait until the job is running. - utils.dataflow_jobs_wait(job_id, until_status="JOB_STATE_RUNNING") - - yield job_id + utils.dataflow_jobs_wait(job_id) - utils.dataflow_jobs_cancel(job_id) - - -def test_flex_template_run( - utils: Utils, bigquery_dataset: str, run_dataflow_job: str -) -> None: - # Wait for a while for data to arrive and get processed. - logging.info("Pipeline is running, waiting for messages to arrive") - time.sleep(5 * 60) + # Then, wait a minute for data to arrive, get processed, and cancel it. + time.sleep(60) + utils.dataflow_jobs_cancel_by_job_id(job_id) # Check for the output data in BigQuery. - query = f"SELECT * FROM `{bigquery_dataset.replace(':', '.')}.output_table`" + query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}" rows = list(utils.bigquery_query(query)) assert len(rows) > 0 for row in rows: diff --git a/dataflow/gpu-workers/conftest.py b/dataflow/gpu-workers/conftest.py new file mode 100644 index 00000000000..41b3aea8490 --- /dev/null +++ b/dataflow/gpu-workers/conftest.py @@ -0,0 +1,503 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from dataclasses import dataclass +import itertools +import json +import logging +import multiprocessing as mp +import os +import re +import platform +import subprocess +import sys +import time +from typing import Any, Callable, Dict, Iterable, Optional, Union +import uuid + +import pytest + +# Default options. 
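
The constants defined next give every test session a short unique suffix and sanitize resource names for the services that use them. A rough illustration of the intent, using the same regular expressions (the sample name and suffix are made up):

```python
import re
import uuid

suffix = uuid.uuid4().hex[:6]  # e.g. "a1b2c3", unique per test session

# Bucket and Dataflow job names allow hyphens but not slashes or underscores;
# BigQuery dataset names allow underscores but not hyphens.
hyphen_re = re.compile(r"[^\w\d-]+")
underscore_re = re.compile(r"[^\w\d_]+")

job_name = hyphen_re.sub("-", f"dataflow/gpu-examples/pytorch-minimal-{suffix}")
dataset_name = underscore_re.sub("_", job_name)

print(job_name)      # dataflow-gpu-examples-pytorch-minimal-a1b2c3
print(dataset_name)  # dataflow_gpu_examples_pytorch_minimal_a1b2c3
```
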
+UUID = uuid.uuid4().hex[0:6] +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] +REGION = "us-central1" + +RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds + +HYPHEN_NAME_RE = re.compile(r"[^\w\d-]+") +UNDERSCORE_NAME_RE = re.compile(r"[^\w\d_]+") + +PYTHON_VERSION = "".join(platform.python_version_tuple()[0:2]) + + +@dataclass +class Utils: + uuid: str = UUID + project: str = PROJECT + region: str = REGION + + @staticmethod + def hyphen_name(name: str) -> str: + unique_name = f"{name}-py{PYTHON_VERSION}-{UUID}" + return HYPHEN_NAME_RE.sub("-", unique_name) + + @staticmethod + def underscore_name(name: str) -> str: + return UNDERSCORE_NAME_RE.sub("_", Utils.hyphen_name(name)) + + @staticmethod + def storage_bucket(name: str) -> str: + from google.cloud import storage + + storage_client = storage.Client() + bucket = storage_client.create_bucket(Utils.hyphen_name(name)) + + logging.info(f"Created storage_bucket: {bucket.name}") + yield bucket.name + + # Print all the objects in the bucket before deleting for debugging. + logging.info(f"Deleting bucket {bucket.name} with the following contents:") + total_files = 0 + total_size = 0 + for blob in bucket.list_blobs(): + logging.info(f" - {blob.name} ({blob.size} bytes)") + total_files += 1 + total_size += blob.size + logging.info(f"Total {total_files} files ({total_size} bytes)") + + bucket.delete(force=True) + logging.info(f"Deleted storage_bucket: {bucket.name}") + + @staticmethod + def bigquery_dataset(name: str, project: str = PROJECT) -> str: + from google.cloud import bigquery + + bigquery_client = bigquery.Client() + + dataset = bigquery_client.create_dataset( + bigquery.Dataset(f"{project}.{Utils.underscore_name(name)}") + ) + + logging.info(f"Created bigquery_dataset: {dataset.full_dataset_id}") + yield dataset.full_dataset_id + + bigquery_client.delete_dataset( + dataset.full_dataset_id.replace(":", "."), delete_contents=True + ) + logging.info(f"Deleted bigquery_dataset: {dataset.full_dataset_id}") + + @staticmethod + def bigquery_query(query: str) -> Iterable[Dict[str, Any]]: + from google.cloud import bigquery + + bigquery_client = bigquery.Client() + logging.info(f"Bigquery query: {query}") + for row in bigquery_client.query(query): + yield dict(row) + + @staticmethod + def pubsub_topic(name: str, project: str = PROJECT) -> str: + from google.cloud import pubsub + + publisher_client = pubsub.PublisherClient() + topic_path = publisher_client.topic_path(project, Utils.hyphen_name(name)) + topic = publisher_client.create_topic(topic_path) + + logging.info(f"Created pubsub_topic: {topic.name}") + yield topic.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. 
See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + cmd = ["gcloud", "pubsub", "--project", project, "topics", "delete", topic.name] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + logging.info(f"Deleted pubsub_topic: {topic.name}") + + @staticmethod + def pubsub_subscription( + topic_path: str, + name: str, + project: str = PROJECT, + ) -> str: + from google.cloud import pubsub + + subscriber = pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path( + project, Utils.hyphen_name(name) + ) + subscription = subscriber.create_subscription(subscription_path, topic_path) + + logging.info(f"Created pubsub_subscription: {subscription.name}") + yield subscription.name + + # Due to the pinned library dependencies in apache-beam, client + # library throws an error upon deletion. + # We use gcloud for a workaround. See also: + # https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4492 + cmd = [ + "gcloud", + "pubsub", + "--project", + project, + "subscriptions", + "delete", + subscription.name, + ] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + logging.info(f"Deleted pubsub_subscription: {subscription.name}") + + @staticmethod + def pubsub_publisher( + topic_path: str, + new_msg: Callable[[int], str] = lambda i: json.dumps( + {"id": i, "content": f"message {i}"} + ), + sleep_sec: int = 1, + ) -> bool: + from google.cloud import pubsub + + def _infinite_publish_job() -> None: + publisher_client = pubsub.PublisherClient() + for i in itertools.count(): + msg = new_msg(i) + publisher_client.publish(topic_path, msg.encode("utf-8")).result() + time.sleep(sleep_sec) + + # Start a subprocess in the background to do the publishing. + logging.info(f"Starting publisher on {topic_path}") + p = mp.Process(target=_infinite_publish_job) + p.start() + + yield p.is_alive() + + # For cleanup, terminate the background process. 
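
All of the fixtures in this file follow the same create, yield, clean-up shape. A stripped-down sketch of the pattern (the bucket name prefix is a placeholder):

```python
import uuid

import pytest
from google.cloud import storage


@pytest.fixture(scope="session")
def bucket_name() -> str:
    # Setup: create the resource once per test session.
    client = storage.Client()
    bucket = client.create_bucket(f"my-test-bucket-{uuid.uuid4().hex[:6]}")

    # Everything before the yield runs at setup; everything after runs at
    # teardown, even when the tests that used the fixture failed.
    yield bucket.name

    # Teardown: force=True also deletes any objects left in the bucket.
    bucket.delete(force=True)
```
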
+ logging.info("Stopping publisher") + p.join(timeout=0) + p.terminate() + + @staticmethod + def cloud_build_submit( + image_name: Optional[str] = None, + config: Optional[str] = None, + source: str = ".", + substitutions: Optional[Dict[str, str]] = None, + project: str = PROJECT, + ) -> None: + """Sends a Cloud Build job, if an image_name is provided it will be deleted at teardown.""" + cmd = ["gcloud", "auth", "configure-docker"] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + + if substitutions: + cmd_substitutions = [ + f"--substitutions={','.join([k + '=' + v for k, v in substitutions.items()])}" + ] + else: + cmd_substitutions = [] + + if config: + with open(config) as f: + cmd = [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--config={config}", + *cmd_substitutions, + source, + ] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + logging.info(f"Cloud build finished successfully: {config}") + yield f.read() + elif image_name: + cmd = [ + "gcloud", + "builds", + "submit", + f"--project={project}", + f"--tag=gcr.io/{project}/{image_name}:{UUID}", + *cmd_substitutions, + source, + ] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + logging.info(f"Created image: gcr.io/{project}/{image_name}:{UUID}") + yield f"{image_name}:{UUID}" + else: + raise ValueError("must specify either `config` or `image_name`") + + if image_name: + cmd = [ + "gcloud", + "container", + "images", + "delete", + f"gcr.io/{project}/{image_name}:{UUID}", + f"--project={project}", + "--force-delete-tags", + "--quiet", + ] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + logging.info(f"Deleted image: gcr.io/{project}/{image_name}:{UUID}") + + @staticmethod + def dataflow_jobs_list( + project: str = PROJECT, page_size: int = 30 + ) -> Iterable[dict]: + from googleapiclient.discovery import build + + dataflow = build("dataflow", "v1b3") + + response = {"nextPageToken": None} + while "nextPageToken" in response: + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list + request = ( + dataflow.projects() + .jobs() + .list( + projectId=project, + pageToken=response["nextPageToken"], + pageSize=page_size, + ) + ) + response = request.execute() + for job in response["jobs"]: + yield job + + @staticmethod + def dataflow_jobs_get( + job_id: Optional[str] = None, + job_name: Optional[str] = None, + project: str = PROJECT, + list_page_size: int = 30, + ) -> Optional[Dict[str, Any]]: + from googleapiclient.discovery import build + + dataflow = build("dataflow", "v1b3") + + if job_id: + # For more info see: + # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/get + request = ( + dataflow.projects() + .jobs() + .get( + projectId=project, + jobId=job_id, + view="JOB_VIEW_SUMMARY", + ) + ) + # If the job is not found, this throws an HttpError exception. 
+ job = request.execute() + logging.info(f"Found Dataflow job: {job}") + return job + + elif job_name: + for job in Utils.dataflow_jobs_list(project, list_page_size): + if job["name"] == job_name: + logging.info(f"Found Dataflow job: {job}") + return job + raise ValueError(f"Dataflow job not found: job_name={job_name}") + + else: + raise ValueError("must specify either `job_id` or `job_name`") + + @staticmethod + def dataflow_jobs_wait( + job_id: Optional[str] = None, + job_name: Optional[str] = None, + project: str = PROJECT, + region: str = REGION, + until_status: str = "JOB_STATE_DONE", + timeout_sec: str = 30 * 60, + poll_interval_sec=60, + list_page_size=100, + ) -> Optional[str]: + """For a list of all the valid states: + https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs#Job.JobState + """ + + # Wait until we reach the desired status, or the job finished in some way. + target_status = { + until_status, + "JOB_STATE_DONE", + "JOB_STATE_FAILED", + "JOB_STATE_CANCELLED", + } + logging.info( + f"Waiting for Dataflow job until {target_status}: job_id={job_id}, job_name={job_name}" + ) + status = None + for _ in range(0, timeout_sec, poll_interval_sec): + try: + job = Utils.dataflow_jobs_get( + job_id=job_id, + job_name=job_name, + project=project, + list_page_size=list_page_size, + ) + status = job["currentState"] + if status in target_status: + logging.info( + f"Job status {status} in {target_status}, done waiting" + ) + return status + elif status == "JOB_STATE_FAILED": + raise RuntimeError( + "Dataflow job failed:\n" + f"https://console.cloud.google.com/dataflow/jobs/{region}/{job_id}?project={project}" + ) + logging.info( + f"Job status {status} not in {target_status}, retrying in {poll_interval_sec} seconds" + ) + except Exception as e: + logging.exception(e) + time.sleep(poll_interval_sec) + if status is None: + raise RuntimeError( + f"Dataflow job not found: timeout_sec={timeout_sec}, target_status={target_status}, job_id={job_id}, job_name={job_name}" + ) + else: + raise RuntimeError( + f"Dataflow job finished in status {status} but expected {target_status}: job_id={job_id}, job_name={job_name}" + ) + + @staticmethod + def dataflow_jobs_cancel( + job_id: str, project: str = PROJECT, region: str = REGION + ) -> None: + logging.info(f"Cancelling Dataflow job ID: {job_id}") + # We get an error using the googleapiclient.discovery APIs, probably + # due to incompatible dependencies with apache-beam. + # We use gcloud instead to cancel the job. 
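
The wait loop in `dataflow_jobs_wait` above is an instance of a generic poll-until-status pattern. Distilled into a self-contained sketch:

```python
import time
from typing import Callable, Optional, Set


def wait_for_status(
    get_status: Callable[[], Optional[str]],
    target_status: Set[str],
    timeout_sec: int = 30 * 60,
    poll_interval_sec: int = 60,
) -> Optional[str]:
    """Poll until get_status() returns a value in target_status, or time out."""
    deadline = time.monotonic() + timeout_sec
    while time.monotonic() < deadline:
        status = get_status()
        if status in target_status:
            return status
        time.sleep(poll_interval_sec)
    return None  # Timed out without reaching a target status.
```

The real helper layers the Dataflow specifics on top of this: terminal states such as `JOB_STATE_DONE`, `JOB_STATE_FAILED`, and `JOB_STATE_CANCELLED` always count as targets, and a failed job raises a `RuntimeError` with a link to the job in the Cloud Console.
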
+ # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/drain + cmd = [ + "gcloud", + f"--project={project}", + "dataflow", + "jobs", + "drain", + job_id, + f"--region={region}", + ] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + + # https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/cancel + cmd = [ + "gcloud", + f"--project={project}", + "dataflow", + "jobs", + "cancel", + job_id, + f"--region={region}", + ] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + logging.info(f"Cancelled Dataflow job: {job_id}") + + @staticmethod + def dataflow_flex_template_build( + bucket_name: str, + image_name: str, + metadata_file: str, + project: str = PROJECT, + template_file: str = "template.json", + ) -> str: + # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/build + template_gcs_path = f"gs://{bucket_name}/{template_file}" + cmd = [ + "gcloud", + "dataflow", + "flex-template", + "build", + template_gcs_path, + f"--project={project}", + f"--image=gcr.io/{project}/{image_name}", + "--sdk-language=PYTHON", + f"--metadata-file={metadata_file}", + ] + logging.info(f"{cmd}") + subprocess.run(cmd, check=True) + + logging.info(f"dataflow_flex_template_build: {template_gcs_path}") + yield template_gcs_path + # The template file gets deleted when we delete the bucket. + + @staticmethod + def dataflow_flex_template_run( + job_name: str, + template_path: str, + bucket_name: str, + parameters: Dict[str, str] = {}, + project: str = PROJECT, + region: str = REGION, + ) -> str: + import yaml + + # https://cloud.google.com/sdk/gcloud/reference/dataflow/flex-template/run + unique_job_name = Utils.hyphen_name(job_name) + logging.info(f"dataflow_job_name: {unique_job_name}") + cmd = [ + "gcloud", + "dataflow", + "flex-template", + "run", + unique_job_name, + f"--template-file-gcs-location={template_path}", + f"--project={project}", + f"--region={region}", + f"--staging-location=gs://{bucket_name}/staging", + ] + [ + f"--parameters={name}={value}" + for name, value in { + **parameters, + }.items() + ] + logging.info(f"{cmd}") + + try: + # The `capture_output` option was added in Python 3.7, so we must + # pass the `stdout` and `stderr` options explicitly to support 3.6. 
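
For reference, the 3.6-compatible capture used below is equivalent to `capture_output=True` on Python 3.7 and later, and the YAML that gcloud prints parses into a plain dict, which is how `dataflow_flex_template_run` extracts the launched job's ID. A small standalone sketch (the gcloud invocation here is only an example command):

```python
import subprocess

import yaml

# Python 3.6-compatible capture; on 3.7+ this can be capture_output=True.
p = subprocess.run(
    ["gcloud", "version", "--format=yaml"],
    check=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

# gcloud output in YAML form loads into a regular Python dict.
info = yaml.safe_load(p.stdout.decode("utf-8"))
print(info)
```
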
+ # https://docs.python.org/3/library/subprocess.html#subprocess.run + p = subprocess.run( + cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout = p.stdout.decode("utf-8") + stderr = p.stderr.decode("utf-8") + logging.info(f"Launched Dataflow Flex Template job: {unique_job_name}") + except subprocess.CalledProcessError as e: + logging.info(e, file=sys.stderr) + stdout = e.stdout.decode("utf-8") + stderr = e.stderr.decode("utf-8") + finally: + logging.info("--- stderr ---") + logging.info(stderr) + logging.info("--- stdout ---") + logging.info(stdout) + logging.info("--- end ---") + return yaml.safe_load(stdout)["job"]["id"] + + +@pytest.fixture(scope="session") +def utils() -> Utils: + logging.getLogger().setLevel(logging.INFO) + logging.info(f"Test unique identifier: {UUID}") + subprocess.run(["gcloud", "version"]) + return Utils() From 33a9f09506952b9f7b71ca8c8e2d447fc9f9e0ed Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 23 Jun 2021 18:36:46 +0000 Subject: [PATCH 66/87] remove unused import --- dataflow/gpu-workers/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/gpu-workers/conftest.py b/dataflow/gpu-workers/conftest.py index 41b3aea8490..14c568548af 100644 --- a/dataflow/gpu-workers/conftest.py +++ b/dataflow/gpu-workers/conftest.py @@ -21,7 +21,7 @@ import subprocess import sys import time -from typing import Any, Callable, Dict, Iterable, Optional, Union +from typing import Any, Callable, Dict, Iterable, Optional import uuid import pytest From d9f776aa8cc389ebd4ee3eaa1169e9fab57fb837 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 23 Jun 2021 19:04:37 +0000 Subject: [PATCH 67/87] reverted streaming-beam changes --- dataflow/flex-templates/streaming_beam/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/flex-templates/streaming_beam/requirements.txt b/dataflow/flex-templates/streaming_beam/requirements.txt index 009bc29b6d8..7c934ad8979 100644 --- a/dataflow/flex-templates/streaming_beam/requirements.txt +++ b/dataflow/flex-templates/streaming_beam/requirements.txt @@ -1 +1 @@ -apache-beam[gcp]==2.30.0 +apache-beam[gcp]==2.29.0 From 25303eebcd94791076ede70e91b663b8a36a2937 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 23 Jun 2021 20:34:57 +0000 Subject: [PATCH 68/87] updated image versions --- dataflow/gpu-workers/pytorch-minimal/Dockerfile | 2 +- dataflow/gpu-workers/tensorflow-landsat/Dockerfile | 2 +- dataflow/gpu-workers/tensorflow-landsat/run.yaml | 1 + dataflow/gpu-workers/tensorflow-minimal/Dockerfile | 2 +- dataflow/gpu-workers/tensorflow-minimal/run.yaml | 1 + 5 files changed, 5 insertions(+), 3 deletions(-) diff --git a/dataflow/gpu-workers/pytorch-minimal/Dockerfile b/dataflow/gpu-workers/pytorch-minimal/Dockerfile index 0dcc04d017d..94aa381e344 100644 --- a/dataflow/gpu-workers/pytorch-minimal/Dockerfile +++ b/dataflow/gpu-workers/pytorch-minimal/Dockerfile @@ -17,7 +17,7 @@ FROM pytorch/pytorch:1.8.1-cuda11.1-cudnn8-runtime WORKDIR /pipeline # Copy the Apache Beam worker files and the pipeline source files. -COPY --from=apache/beam_python3.8_sdk:2.29.0 /opt/apache/beam /opt/apache/beam +COPY --from=apache/beam_python3.8_sdk:2.30.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . 
COPY *.py ./ diff --git a/dataflow/gpu-workers/tensorflow-landsat/Dockerfile b/dataflow/gpu-workers/tensorflow-landsat/Dockerfile index cc2d7eba729..7a50a862756 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/Dockerfile +++ b/dataflow/gpu-workers/tensorflow-landsat/Dockerfile @@ -21,7 +21,7 @@ FROM nvcr.io/nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 WORKDIR /pipeline # Copy the Apache Beam worker files and the pipeline source files. -COPY --from=apache/beam_python3.8_sdk:2.29.0 /opt/apache/beam /opt/apache/beam +COPY --from=apache/beam_python3.8_sdk:2.30.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . COPY *.py ./ diff --git a/dataflow/gpu-workers/tensorflow-landsat/run.yaml b/dataflow/gpu-workers/tensorflow-landsat/run.yaml index 2b97dd28e48..1ac286736e1 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/run.yaml +++ b/dataflow/gpu-workers/tensorflow-landsat/run.yaml @@ -49,6 +49,7 @@ steps: - --sdk_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 + - --experiment=no_use_multiple_sdk_containers - --disk_size_gb=50 options: diff --git a/dataflow/gpu-workers/tensorflow-minimal/Dockerfile b/dataflow/gpu-workers/tensorflow-minimal/Dockerfile index ff88332cd41..48b4b390eeb 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/Dockerfile +++ b/dataflow/gpu-workers/tensorflow-minimal/Dockerfile @@ -21,7 +21,7 @@ FROM nvcr.io/nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 WORKDIR /pipeline # Copy the Apache Beam worker files and the pipeline source files. -COPY --from=apache/beam_python3.8_sdk:2.29.0 /opt/apache/beam /opt/apache/beam +COPY --from=apache/beam_python3.8_sdk:2.30.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . 
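
The `run.yaml` flags in these hunks ultimately become Beam pipeline options. Roughly the same configuration expressed directly in Python would look like the sketch below; the project and image names are placeholders, and in the samples the GPU type and count come from Cloud Build substitutions.

```python
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(
    [
        "--runner=DataflowRunner",
        "--sdk_container_image=gcr.io/my-project/my-image",  # placeholder
        "--experiments=worker_accelerator=type=nvidia-tesla-t4,count=1,install-nvidia-driver",
        "--experiments=use_runner_v2",
        "--experiments=no_use_multiple_sdk_containers",
        "--disk_size_gb=50",
    ]
)
```
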
COPY *.py ./ diff --git a/dataflow/gpu-workers/tensorflow-minimal/run.yaml b/dataflow/gpu-workers/tensorflow-minimal/run.yaml index 7cb9d389134..320c5359fe6 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/run.yaml +++ b/dataflow/gpu-workers/tensorflow-minimal/run.yaml @@ -46,6 +46,7 @@ steps: - --sdk_container_image=gcr.io/$PROJECT_ID/$_IMAGE - --experiment=worker_accelerator=type:$_GPU_TYPE;count:$_GPU_COUNT;install-nvidia-driver - --experiment=use_runner_v2 + - --experiment=no_use_multiple_sdk_containers - --disk_size_gb=50 options: From fdb2610603e89c4b806a26b504c4a5bb888190b4 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 23 Jun 2021 22:10:08 +0000 Subject: [PATCH 69/87] restore old sample to avoid breaking docs --- dataflow/gpu-workers/.dockerignore | 6 + dataflow/gpu-workers/.gcloudignore | 6 + dataflow/gpu-workers/Dockerfile | 54 +++ dataflow/gpu-workers/README.md | 5 + dataflow/gpu-workers/cloudbuild.yaml | 35 ++ dataflow/gpu-workers/e2e_test.py | 120 +++++++ dataflow/gpu-workers/landsat_view.py | 368 +++++++++++++++++++++ dataflow/gpu-workers/noxfile_config.py | 38 +++ dataflow/gpu-workers/requirements-test.txt | 2 + dataflow/gpu-workers/requirements.txt | 4 + 10 files changed, 638 insertions(+) create mode 100644 dataflow/gpu-workers/.dockerignore create mode 100644 dataflow/gpu-workers/.gcloudignore create mode 100644 dataflow/gpu-workers/Dockerfile create mode 100644 dataflow/gpu-workers/README.md create mode 100644 dataflow/gpu-workers/cloudbuild.yaml create mode 100644 dataflow/gpu-workers/e2e_test.py create mode 100644 dataflow/gpu-workers/landsat_view.py create mode 100644 dataflow/gpu-workers/noxfile_config.py create mode 100644 dataflow/gpu-workers/requirements-test.txt create mode 100644 dataflow/gpu-workers/requirements.txt diff --git a/dataflow/gpu-workers/.dockerignore b/dataflow/gpu-workers/.dockerignore new file mode 100644 index 00000000000..04f5ec66ca6 --- /dev/null +++ b/dataflow/gpu-workers/.dockerignore @@ -0,0 +1,6 @@ +# Ignore files for docker. +.mypy_cache/ +.nox/ +__pycache__/ +env/ +outputs/ diff --git a/dataflow/gpu-workers/.gcloudignore b/dataflow/gpu-workers/.gcloudignore new file mode 100644 index 00000000000..cda483971fd --- /dev/null +++ b/dataflow/gpu-workers/.gcloudignore @@ -0,0 +1,6 @@ +# Ignore files for gcloud like Cloud Build. +.mypy_cache/ +.nox/ +__pycache__/ +env/ +outputs/ diff --git a/dataflow/gpu-workers/Dockerfile b/dataflow/gpu-workers/Dockerfile new file mode 100644 index 00000000000..d9003409717 --- /dev/null +++ b/dataflow/gpu-workers/Dockerfile @@ -0,0 +1,54 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Make sure the CUDA and cuDNN versions are compatible with your TensorFlow version. 
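
A quick way to confirm which CUDA and cuDNN versions a given TensorFlow wheel expects, and whether it can see a GPU at all, is from Python itself. This is a sketch; `tf.sysconfig.get_build_info()` is assumed available, as it is in recent TensorFlow 2.x releases.

```python
import tensorflow as tf

# The CUDA/cuDNN versions this TensorFlow wheel was built against.
build_info = tf.sysconfig.get_build_info()
print(build_info.get("cuda_version"))   # e.g. "11.2"
print(build_info.get("cudnn_version"))  # e.g. "8"

# On a correctly configured GPU worker this lists at least one device.
print(tf.config.list_physical_devices("GPU"))
```
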
+# https://www.tensorflow.org/install/source#gpu +# Check the Nvidia container registry catalog to look at the available Nvidia images: +# https://ngc.nvidia.com/catalog/containers/nvidia:cuda +FROM nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu20.04 + +# The Python version of the Dockerfile MUST match the Python version you use +# to launch the Dataflow job. +ARG python_version=3.8 + +WORKDIR /root + +# Copy the Apache Beam worker files and the requirements.txt file. +COPY --from=apache/beam_python3.8_sdk:2.30.0 /opt/apache/beam /opt/apache/beam +COPY requirements.txt . + +# Update PATH so we find our new Conda and Python installations. +ENV PATH=/opt/python/bin:/opt/conda/bin:$PATH + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get install -y wget \ + && rm -rf /var/lib/apt/lists/* \ + # The nvidia image doesn't come with Python pre-installed. + # We use Miniconda to install the Python version of our choice. + && wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + && sh Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda \ + && rm Miniconda3-latest-Linux-x86_64.sh \ + # Create a new Python environment and install our requirements. + # We don't need to update $PATH since /usr/local is already in $PATH. + && conda create -y -p /opt/python python=$python_version pip \ + && pip install --no-cache-dir -U pip \ + && pip install --no-cache-dir -r requirements.txt \ + && conda clean -y --all --force-pkgs-dirs \ + # Beam workers looks for pip at /usr/local/bin/pip by default. + # This can be omitted in Beam 2.30.0 and later versions. + && ln -s $(which pip) /usr/local/bin/pip + +# Set the entrypoint to Apache Beam SDK worker launcher. +ENTRYPOINT [ "/opt/apache/beam/boot" ] diff --git a/dataflow/gpu-workers/README.md b/dataflow/gpu-workers/README.md new file mode 100644 index 00000000000..a71f0da3e95 --- /dev/null +++ b/dataflow/gpu-workers/README.md @@ -0,0 +1,5 @@ +# Workers with GPUs + +[![Open in Cloud Shell](http://gstatic.com/cloudssh/images/open-btn.svg)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dataflow/gpu-workers/README.md) + +📝 Tutorial: [Processing Landsat satellite images with GPUs](https://cloud.google.com/dataflow/docs/samples/satellite-images-gpus) diff --git a/dataflow/gpu-workers/cloudbuild.yaml b/dataflow/gpu-workers/cloudbuild.yaml new file mode 100644 index 00000000000..dec3d7aabb8 --- /dev/null +++ b/dataflow/gpu-workers/cloudbuild.yaml @@ -0,0 +1,35 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# To build the container image: +# PYTHON_VERSION=`python -c 'import platform; print(platform.python_version())'` +# gcloud builds submit --substitutions _PYTHON_VERSION=$PYTHON_VERSION . --timeout 20m + +steps: + # Build the container image with the Python version of our choice. 
+ - name: gcr.io/cloud-builders/docker + args: + [ 'build' + , '--build-arg=python_version=$_PYTHON_VERSION' + , '--tag=gcr.io/$PROJECT_ID/$_IMAGE' + , '.' + ] + + # Push the image to Container Registry. + - name: gcr.io/cloud-builders/docker + args: [ 'push', 'gcr.io/$PROJECT_ID/$_IMAGE' ] + +substitutions: + _PYTHON_VERSION: '3.8' + _IMAGE: samples/dataflow/tensorflow-gpu:latest diff --git a/dataflow/gpu-workers/e2e_test.py b/dataflow/gpu-workers/e2e_test.py new file mode 100644 index 00000000000..f3f105d7b6a --- /dev/null +++ b/dataflow/gpu-workers/e2e_test.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python + +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import platform +import subprocess +import uuid + +from google.cloud import storage +import pytest + +SUFFIX = uuid.uuid4().hex[0:6] +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] +BUCKET_NAME = f"dataflow-gpu-test-{SUFFIX}" +IMAGE_NAME = f"dataflow/gpu-workers/test-{SUFFIX}:latest" +REGION = "us-central1" +ZONE = "us-central1-f" + + +@pytest.fixture(scope="session") +def bucket_name() -> str: + storage_client = storage.Client() + bucket = storage_client.create_bucket(BUCKET_NAME) + + yield BUCKET_NAME + + bucket.delete(force=True) + + +@pytest.fixture(scope="session") +def configure_docker() -> None: + subprocess.run( + [ + "gcloud", + "auth", + "configure-docker", + ] + ) + + +@pytest.fixture(scope="session") +def image_name(configure_docker: None) -> str: + # See the `cloudbuild.yaml` for the configuration for this build. + substitutions = { + "_PYTHON_VERSION": platform.python_version(), + "_IMAGE": IMAGE_NAME, + } + print(f"-- Cloud build substitutions: {substitutions}") + subprocess.run( + [ + "gcloud", + "builds", + "submit", + f"--project={PROJECT}", + f"--substitutions={','.join([k + '=' + v for k, v in substitutions.items()])}", + "--timeout=30m", + "--quiet", + ], + check=True, + ) + + yield f"gcr.io/{PROJECT}/{IMAGE_NAME}" + + # Delete the image when we're done. + subprocess.run( + [ + "gcloud", + "container", + "images", + "delete", + f"gcr.io/{PROJECT}/{IMAGE_NAME}", + f"--project={PROJECT}", + "--quiet", + ], + check=True, + ) + + +def test_end_to_end(bucket_name: str, image_name: str) -> None: + # Run the Beam pipeline in Dataflow making sure GPUs are used. + gpu_type = "nvidia-tesla-t4" + subprocess.run( + [ + "python", + "landsat_view.py", + f"--output-path-prefix=gs://{bucket_name}/outputs/", + "--runner=DataflowRunner", + f"--job_name=gpu-workers-{SUFFIX}", + f"--project={PROJECT}", + f"--region={REGION}", + f"--temp_location=gs://{bucket_name}/temp", + "--worker_machine_type=custom-1-13312-ext", + "--disk_size_gb=300", + f"--worker_harness_container_image={image_name}", + f"--worker_zone={ZONE}", + f"--experiments=worker_accelerator=type={gpu_type},count=1,install-nvidia-driver", + "--experiments=use_runner_v2", + ], + check=True, + ) + + # Check that output files were created and are not empty. 
+ storage_client = storage.Client() + output_files = list(storage_client.list_blobs(bucket_name, prefix="outputs/")) + assert len(output_files) > 0, "No output files found" + for output_file in output_files: + assert output_file.size > 0, f"Output file is empty: {output_file.name}" diff --git a/dataflow/gpu-workers/landsat_view.py b/dataflow/gpu-workers/landsat_view.py new file mode 100644 index 00000000000..9e61016eabf --- /dev/null +++ b/dataflow/gpu-workers/landsat_view.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python + +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This Apache Beam pipeline processes Landsat 8 satellite images and renders +them as JPEG files. + +A Landsat 8 image consists of 11 bands. Each band contains the data for a +specific range of the electromagnetic spectrum. + +A JPEG image consists of three channels: Red, Green, and Blue. For Landsat 8 +images, these correspond to Band 4 (red), Band 3 (green), and Band 2 (blue). + +These bands contain the raw pixel data directly from the satellite sensors. The +values in each band can go from 0 to unbounded positive values. For a JPEG image +we need to clamp them into integers between 0 and 255 for each channel. + +For this, we supply visualization parameters, commonly called `vis_params`. +These visualization parameters include: + +- The bands for the RGB cannels, typically [B4, B3, B2] for Landsat 8. +- The minimum value in each band, typically 0 for Landsat 8. +- The maximum value in each band, this varies depending on the light exposure. +- A gamma value for gamma correction. + +The Landsat data is read from the Landsat public dataset in Cloud Storage. +For more information on the Landsat dataset: + https://cloud.google.com/storage/docs/public-datasets/landsat + +The overall workflow of the pipeline is the following: + +- Parse one or more Landsat scene IDs from user-provided flags.. +- Get the Cloud Storage paths of all the RGB bands. +- Load the pixel values for each band from Cloud Storage. +- Preprocess pixels: clamp values and apply gamma correction. +- Create a JPEG image and save it to Cloud Storage. 
+""" + +import argparse +import logging +import os +import re +from typing import Any, Dict, List, Tuple + +import apache_beam as beam +from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.typehints.typehints import Optional +import numpy as np +from PIL import Image +import rasterio +import tensorflow as tf + +DEFAULT_RGB_BAND_NAMES = ["B4", "B3", "B2"] +DEFAULT_MIN_BAND_VALUE = 0.0 +DEFAULT_MAX_BAND_VALUE = 12000.0 +DEFAULT_GAMMA = 0.5 + +DEFAULT_SCENES = [ + "LC08_L1TP_001067_20200727_20200807_01_T1", # Brazil-Bolivia boundary + "LC08_L1TP_019024_20190621_20190704_01_T1", # Nottaway river delta, Quebec + "LC08_L1TP_019046_20191214_20191226_01_T1", # Yucatan peninsula + "LC08_L1TP_037035_20191212_20191212_01_T1", # Grand canyon, Arizona + "LC08_L1TP_045031_20200715_20200722_01_T1", # Mount Shasta, California + "LC08_L1TP_064011_20200618_20200625_01_T1", # Mackenzie river delta, Canada + "LC08_L1TP_073087_20200516_20200527_01_T1", # Mt. Taranaki, New Zealand + "LC08_L1TP_083074_20180805_20180814_01_T1", # Nouvelle-Calédonie + "LC08_L1TP_098063_20200703_20200708_01_T1", # Manam volcano, Papua New Guinea + "LC08_L1TP_109078_20200411_20200422_01_T1", # Lake Carnegie, West Australia + "LC08_L1TP_110036_20191009_20191018_01_T1", # Osaka 大阪市, Japan + "LC08_L1TP_115078_20200608_20200625_01_T1", # Sediment deposits, West Australia + "LC08_L1TP_119038_20191109_20191115_01_T1", # Lake Tai 太湖, China + "LC08_L1TP_135040_20190314_20190325_01_T1", # Arunachal Pradesh, India + "LC08_L1TP_137045_20200211_20200225_01_T1", # Ganges river delta, India + "LC08_L1TP_166075_20180608_20180615_01_T1", # Bazaruto island, Mozambique + "LC08_L1TP_169034_20200720_20200807_01_T1", # Lake Urmia دریاچه ارومیه, Iran + "LC08_L1TP_170059_20200101_20200113_01_T1", # Mount Elgon, Uganda + "LC08_L1TP_175079_20200511_20200526_01_T1", # Sand dunes, South Africa + "LC08_L1TP_178069_20200804_20200821_01_T1", # Angola + "LC08_L1TP_178078_20200804_20200821_01_T1", # Sand dunes, Namibia + "LC08_L1TP_191020_20200815_20200822_01_T1", # Phytoplankton at Gotland, Sweden + "LC08_L1TP_195028_20200116_20200127_01_T1", # Swiss Alps + "LC08_L1TP_203045_20200108_20200114_01_T1", # Eye of the Sahara, Mauritania + "LC08_L1TP_231094_20190906_20190917_01_T1", # Patagonia, South America +] + +SCENE_RE = re.compile( + r"(?PL[COTEM]0[78])_" + r"(?PL1TP|L1GT|L1GS)_" + r"(?P\d\d\d)" + r"(?P\d\d\d)_" + r"(?P\d\d\d\d)" + r"(?P\d\d)" + r"(?P\d\d)_" + r"(?P\d\d\d\d)" + r"(?P\d\d)" + r"(?P\d\d)_" + r"(?P\d\d)_" + r"(?PRT|T1|T2)" +) + + +def check_gpus(element: Any, gpus_optional: bool) -> Any: + """Makes sure TensorFlow detects GPUs, otherwise raise a RuntimeError. + + Note that this function must be run within a PTransform like beam.Map so + we are sure it's run by the workers, and not the launcher process. + + Args: + element: An element + gpus_optional: If True, the pipeline won't crash if GPUs are not found. + + Returns: + The same element it received as is. + + Raises: + RuntimeError: If no GPUs were found by TensorFlow. + """ + # Make sure we have a GPU available. + gpu_devices = tf.config.list_physical_devices("GPU") + logging.info(f"GPU devices: {gpu_devices}") + if len(gpu_devices) == 0: + if gpus_optional: + logging.warning("No GPUs found, defaulting to CPU.") + else: + raise RuntimeError("No GPUs found.") + return element + + +def get_band_paths( + scene: str, band_names: List[str], unused_side_input: Any +) -> Tuple[str, List[str]]: + """Gets the Cloud Storage paths for each band in a Landsat scene. 
+ + Args: + scene: Landsat 8 scene ID. + band_names: List of the band names corresponding to [Red, Green, Blue] channels. + unused_side_input: Used to wait for the GPU check, can be safely ignored. + + Returns: + A (scene, band_paths) pair. + + Raises: + ValueError: If the scene or a band does not exist. + """ + # Extract the metadata from the scene ID using a regular expression. + m = SCENE_RE.match(scene) + if not m: + raise ValueError(f"invalid scene ID: {scene}") + + g = m.groupdict() + scene_dir = f"gs://gcp-public-data-landsat/{g['sensor']}/{g['collection']}/{g['wrs_path']}/{g['wrs_row']}/{scene}" + + band_paths = [f"{scene_dir}/{scene}_{band_name}.TIF" for band_name in band_names] + + for band_path in band_paths: + if not tf.io.gfile.exists(band_path): + raise ValueError(f"failed to load: {band_path}") + + return scene, band_paths + + +def load_values(scene: str, band_paths: List[str]) -> Tuple[str, np.ndarray]: + """Loads a scene's bands data as a numpy array. + + Args: + scene: Landsat 8 scene ID. + band_paths: A list of the [Red, Green, Blue] band paths. + + Returns: + A (scene, values) pair. + + The values are stored in a three-dimensional float32 array with shape: + (band, width, height) + """ + + def read_band(band_path: str) -> np.array: + # Use rasterio to read the GeoTIFF values from the band files. + with tf.io.gfile.GFile(band_path, "rb") as f, rasterio.open(f) as data: + return data.read(1) + + logging.info(f"{scene}: load_values({band_paths})") + values = [read_band(band_path) for band_path in band_paths] + return scene, np.array(values, np.float32) + + +def preprocess_pixels( + scene: str, + values: np.ndarray, + min_value: float = 0.0, + max_value: float = 1.0, + gamma: float = 1.0, +) -> Tuple[str, tf.Tensor]: + """Prepares the band data into a pixel-ready format for an RGB image. + + The input band values come in the shape (band, width, height) with + unbounded positive numbers depending on the sensor's exposure. + The values are reshaped into (width, height, band), the values are clamped + to integers between 0 and 255, and a gamma correction value is applied. + + Args: + scene: Landsat 8 scene ID. + values: Band values in the shape (band, width, height). + min_value: Minimum band value. + max_value: Maximum band value. + gamma: Gamma correction value. + + Returns: + A (scene, pixels) pair. The pixels are Image-ready values. + """ + logging.info( + f"{scene}: preprocess_pixels({values.shape}:{values.dtype}, min={min_value}, max={max_value}, gamma={gamma})" + ) + + # Reshape (band, width, height) into (width, height, band). + pixels = tf.transpose(values, (1, 2, 0)) + + # Rescale to values from 0.0 to 1.0 and clamp them into that range. + pixels -= min_value + pixels /= max_value + pixels = tf.clip_by_value(pixels, 0.0, 1.0) + + # Apply gamma correction. + pixels **= 1.0 / gamma + + # Return the pixel values as int8 in the range from 0 to 255, + # which is what PIL.Image expects. + return scene, tf.cast(pixels * 255.0, dtype=tf.uint8) + + +def save_to_gcs( + scene: str, image: Image.Image, output_path_prefix: str, format: str = "JPEG" +) -> None: + """Saves a PIL.Image as a JPEG file in the desired path. + + Args: + scene: Landsat 8 scene ID. + image: A PIL.Image object. + output_path_prefix: Path prefix to save the output files. + format: Image format to save files. + """ + filename = os.path.join(output_path_prefix, scene + "." 
+ format.lower()) + with tf.io.gfile.GFile(filename, "w") as f: + image.save(f, format) + + +def run( + scenes: List[str], + output_path_prefix: str, + vis_params: Dict[str, Any], + gpus_optional: bool, + beam_args: Optional[List[str]] = None, +) -> None: + """Load multiple Landsat scenes and render them as JPEG files. + + Args: + scenes: List of Landsat 8 scene IDs. + output_path_prefix: Path prefix to save the output files. + vis_params: Visualization parameters including {rgb_bands, min, max, gamma}. + gpus_optional: If True, the pipeline won't crash if GPUs are not found. + beam_args: Optional list of arguments for Beam pipeline options. + """ + rgb_band_names = vis_params["rgb_band_names"] + min_value = vis_params["min"] + max_value = vis_params["max"] + gamma = vis_params["gamma"] + + options = PipelineOptions(beam_args, save_main_session=True) + with beam.Pipeline(options=options) as pipeline: + # Optionally, validate that the workers are using GPUs. + gpu_check = ( + pipeline + | beam.Create([None]) + | "Check GPU availability" >> beam.Map(check_gpus, gpus_optional) + ) + + # Convert Landsat 8 scenes into images. + # ℹ️ We pass `gpu_check` as an unused side input to force that step in + # the pipeline to wait for the check before continuing. + ( + pipeline + | "Create scene IDs" >> beam.Create(scenes) + | "Get RGB band paths" + >> beam.Map( + get_band_paths, + rgb_band_names, + unused_side_input=beam.pvalue.AsSingleton(gpu_check), + ) + | "Load RGB band values" >> beam.MapTuple(load_values) + | "Preprocess pixels" + >> beam.MapTuple(preprocess_pixels, min_value, max_value, gamma) + | "Convert to image" + >> beam.MapTuple( + lambda scene, rgb_pixels: ( + scene, + Image.fromarray(rgb_pixels.numpy(), mode="RGB"), + ) + ) + | "Save to Cloud Storage" >> beam.MapTuple(save_to_gcs, output_path_prefix) + ) + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--output-path-prefix", + required=True, + help="Path prefix for output image files. " + "This can be a Google Cloud Storage path.", + ) + parser.add_argument( + "--scene", + dest="scenes", + action="append", + help="One or more Landsat scene IDs to process, for example " + "LC08_L1TP_109078_20200411_20200422_01_T1. " + "They must be in the format: " + "https://www.usgs.gov/faqs/what-naming-convention-landsat-collections-level-1-scenes", + ) + parser.add_argument( + "--rgb-band-names", + nargs=3, + default=DEFAULT_RGB_BAND_NAMES, + help="List of three band names to be mapped to the RGB channels.", + ) + parser.add_argument( + "--min", + type=float, + default=DEFAULT_MIN_BAND_VALUE, + help="Minimum value of the band value range.", + ) + parser.add_argument( + "--max", + type=float, + default=DEFAULT_MAX_BAND_VALUE, + help="Maximum value of the band value range.", + ) + parser.add_argument( + "--gamma", type=float, default=DEFAULT_GAMMA, help="Gamma correction factor." 
+ ) + parser.add_argument( + "--gpus-optional", + action="store_true", + help="If set, the pipeline won't crash if GPUs are not found.", + ) + args, beam_args = parser.parse_known_args() + + scenes = args.scenes or DEFAULT_SCENES + vis_params = { + "rgb_band_names": args.rgb_band_names, + "min": args.min, + "max": args.max, + "gamma": args.gamma, + } + run(scenes, args.output_path_prefix, vis_params, args.gpus_optional, beam_args) diff --git a/dataflow/gpu-workers/noxfile_config.py b/dataflow/gpu-workers/noxfile_config.py new file mode 100644 index 00000000000..74d736256c6 --- /dev/null +++ b/dataflow/gpu-workers/noxfile_config.py @@ -0,0 +1,38 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default TEST_CONFIG_OVERRIDE for python repos. + +# You can copy this file into your directory, then it will be imported from +# the noxfile.py. + +# The source of truth: +# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/noxfile_config.py + +TEST_CONFIG_OVERRIDE = { + # You can opt out from the test for specific Python versions. + "ignored_versions": ["2.7", "3.9"], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": True, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. 
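
To make the visualization math in `preprocess_pixels` above concrete, here is a self-contained worked example with a tiny 2x2 "image"; the band values and vis params are made up.

```python
import numpy as np
import tensorflow as tf

# Three identical 2x2 bands, shaped (band, width, height) like the pipeline uses.
values = np.array([[[0.0, 6000.0], [12000.0, 24000.0]]] * 3, np.float32)

pixels = tf.transpose(values, (1, 2, 0))               # -> (width, height, band)
pixels = tf.clip_by_value(pixels / 12000.0, 0.0, 1.0)  # min=0, max=12000
pixels = pixels ** (1.0 / 0.5)                         # gamma = 0.5
pixels = tf.cast(pixels * 255.0, tf.uint8)             # Image-ready values

print(pixels.numpy()[..., 0])  # [[  0  63] [255 255]] for the first channel
```
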
+ "envs": {}, +} diff --git a/dataflow/gpu-workers/requirements-test.txt b/dataflow/gpu-workers/requirements-test.txt new file mode 100644 index 00000000000..9782f5d8d54 --- /dev/null +++ b/dataflow/gpu-workers/requirements-test.txt @@ -0,0 +1,2 @@ +google-cloud-storage==1.38.0 +pytest==6.2.4 diff --git a/dataflow/gpu-workers/requirements.txt b/dataflow/gpu-workers/requirements.txt new file mode 100644 index 00000000000..1823ef09b96 --- /dev/null +++ b/dataflow/gpu-workers/requirements.txt @@ -0,0 +1,4 @@ +Pillow==8.2.0 +apache-beam[gcp]==2.29.0 +rasterio==1.2.4 +tensorflow==2.5.0 From 5fc27b432ed7ef0a39b44e9fed564ab607ce171d Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 23 Jun 2021 22:11:29 +0000 Subject: [PATCH 70/87] update copyright year --- dataflow/gpu-workers/tensorflow-landsat/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/gpu-workers/tensorflow-landsat/main.py b/dataflow/gpu-workers/tensorflow-landsat/main.py index 6afa31522c3..408268dfa6d 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/main.py +++ b/dataflow/gpu-workers/tensorflow-landsat/main.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 1f005aeae33a4ee0502f1378f1a149838b5a0d0c Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Wed, 23 Jun 2021 22:17:41 +0000 Subject: [PATCH 71/87] fix lint issues --- dataflow/gpu-workers/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataflow/gpu-workers/conftest.py b/dataflow/gpu-workers/conftest.py index 14c568548af..89b7ca2d6ea 100644 --- a/dataflow/gpu-workers/conftest.py +++ b/dataflow/gpu-workers/conftest.py @@ -16,8 +16,8 @@ import logging import multiprocessing as mp import os -import re import platform +import re import subprocess import sys import time @@ -325,8 +325,8 @@ def dataflow_jobs_wait( region: str = REGION, until_status: str = "JOB_STATE_DONE", timeout_sec: str = 30 * 60, - poll_interval_sec=60, - list_page_size=100, + poll_interval_sec: int = 60, + list_page_size: int = 100, ) -> Optional[str]: """For a list of all the valid states: https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs#Job.JobState From ea834469a7b7d81d6fe9385665652d06b8ca4d8e Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 24 Jun 2021 15:57:13 +0000 Subject: [PATCH 72/87] renamed test file --- dataflow/gpu-workers/{e2e_test.py => landsat_test.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dataflow/gpu-workers/{e2e_test.py => landsat_test.py} (100%) diff --git a/dataflow/gpu-workers/e2e_test.py b/dataflow/gpu-workers/landsat_test.py similarity index 100% rename from dataflow/gpu-workers/e2e_test.py rename to dataflow/gpu-workers/landsat_test.py From 83622c09479e39b5e14cf9626915b267e355cd12 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 24 Jun 2021 15:57:56 +0000 Subject: [PATCH 73/87] update beam version --- dataflow/gpu-workers/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/gpu-workers/requirements.txt b/dataflow/gpu-workers/requirements.txt index 1823ef09b96..cbfaaf47f75 100644 --- a/dataflow/gpu-workers/requirements.txt +++ b/dataflow/gpu-workers/requirements.txt @@ -1,4 +1,4 @@ Pillow==8.2.0 -apache-beam[gcp]==2.29.0 +apache-beam[gcp]==2.30.0 rasterio==1.2.4 tensorflow==2.5.0 From 3765572eaf334242c11ef86392b850912c33d81d Mon Sep 17 
00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Thu, 24 Jun 2021 18:10:34 +0000
Subject: [PATCH 74/87] renamed test files

---
 dataflow/gpu-workers/{landsat_test.py => e2e_test.py}            | 0
 .../gpu-workers/pytorch-minimal/{e2e_test.py => pytorch_test.py} | 0
 .../tensorflow-landsat/{e2e_test.py => landsat_test.py}          | 0
 .../tensorflow-minimal/{e2e_test.py => tensorflow_test.py}       | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename dataflow/gpu-workers/{landsat_test.py => e2e_test.py} (100%)
 rename dataflow/gpu-workers/pytorch-minimal/{e2e_test.py => pytorch_test.py} (100%)
 rename dataflow/gpu-workers/tensorflow-landsat/{e2e_test.py => landsat_test.py} (100%)
 rename dataflow/gpu-workers/tensorflow-minimal/{e2e_test.py => tensorflow_test.py} (100%)

diff --git a/dataflow/gpu-workers/landsat_test.py b/dataflow/gpu-workers/e2e_test.py
similarity index 100%
rename from dataflow/gpu-workers/landsat_test.py
rename to dataflow/gpu-workers/e2e_test.py
diff --git a/dataflow/gpu-workers/pytorch-minimal/e2e_test.py b/dataflow/gpu-workers/pytorch-minimal/pytorch_test.py
similarity index 100%
rename from dataflow/gpu-workers/pytorch-minimal/e2e_test.py
rename to dataflow/gpu-workers/pytorch-minimal/pytorch_test.py
diff --git a/dataflow/gpu-workers/tensorflow-landsat/e2e_test.py b/dataflow/gpu-workers/tensorflow-landsat/landsat_test.py
similarity index 100%
rename from dataflow/gpu-workers/tensorflow-landsat/e2e_test.py
rename to dataflow/gpu-workers/tensorflow-landsat/landsat_test.py
diff --git a/dataflow/gpu-workers/tensorflow-minimal/e2e_test.py b/dataflow/gpu-workers/tensorflow-minimal/tensorflow_test.py
similarity index 100%
rename from dataflow/gpu-workers/tensorflow-minimal/e2e_test.py
rename to dataflow/gpu-workers/tensorflow-minimal/tensorflow_test.py

From 16f207bec899de426c4bddb688aad1bad2ff91b8 Mon Sep 17 00:00:00 2001
From: David Cavazos <dcavazos@google.com>
Date: Thu, 24 Jun 2021 19:51:42 +0000
Subject: [PATCH 75/87] adjust to conftest behaving differently with tests in current directory

---
 dataflow/gpu-workers/conftest.py          | 33 +++++++++++--------
 .../pytorch-minimal/pytorch_test.py       |  4 +--
 .../tensorflow-landsat/landsat_test.py    |  4 +--
 .../tensorflow-minimal/tensorflow_test.py |  4 +--
 4 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/dataflow/gpu-workers/conftest.py b/dataflow/gpu-workers/conftest.py
index 89b7ca2d6ea..ed7387d209f 100644
--- a/dataflow/gpu-workers/conftest.py
+++ b/dataflow/gpu-workers/conftest.py
@@ -208,20 +208,25 @@ def cloud_build_submit(
             cmd_substitutions = []
 
         if config:
-            with open(config) as f:
-                cmd = [
-                    "gcloud",
-                    "builds",
-                    "submit",
-                    f"--project={project}",
-                    f"--config={config}",
-                    *cmd_substitutions,
-                    source,
-                ]
-                logging.info(f"{cmd}")
-                subprocess.run(cmd, check=True)
-                logging.info(f"Cloud build finished successfully: {config}")
-                yield f.read()
+            try:
+                with open(config) as f:
+                    cmd = [
+                        "gcloud",
+                        "builds",
+                        "submit",
+                        f"--project={project}",
+                        f"--config={config}",
+                        *cmd_substitutions,
+                        source,
+                    ]
+                    logging.info(f"{cmd}")
+                    subprocess.run(cmd, check=True)
+                    logging.info(f"Cloud build finished successfully: {config}")
+                    yield f.read()
+            except Exception as e:
+                logging.exception(e)
+                logging.warning(f'Current directory: {os.getcwd()}')
+                yield config
         elif image_name:
             cmd = [
                 "gcloud",
diff --git a/dataflow/gpu-workers/pytorch-minimal/pytorch_test.py b/dataflow/gpu-workers/pytorch-minimal/pytorch_test.py
index 52d6a2c7ab0..039f7b70b32 100644
--- a/dataflow/gpu-workers/pytorch-minimal/pytorch_test.py
+++ b/dataflow/gpu-workers/pytorch-minimal/pytorch_test.py
@@ -34,7 +34,7 @@ def bucket_name(utils: Utils) -> str:
 def build_image(utils: Utils) -> str:
     yield from utils.cloud_build_submit(
         image_name=NAME,
-        config="build.yaml",
+        config="pytorch-minimal/build.yaml",
         substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"},
     )
 
@@ -43,7 +43,7 @@ def build_image(utils: Utils) -> str:
 def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str:
     # Run the Beam pipeline in Dataflow making sure GPUs are used.
     yield from utils.cloud_build_submit(
-        config="run.yaml",
+        config="pytorch-minimal/run.yaml",
         substitutions={
             "_JOB_NAME": utils.hyphen_name(NAME),
             "_IMAGE": f"{NAME}:{utils.uuid}",
diff --git a/dataflow/gpu-workers/tensorflow-landsat/landsat_test.py b/dataflow/gpu-workers/tensorflow-landsat/landsat_test.py
index 972fe627f42..d6eab544cf9 100644
--- a/dataflow/gpu-workers/tensorflow-landsat/landsat_test.py
+++ b/dataflow/gpu-workers/tensorflow-landsat/landsat_test.py
@@ -35,7 +35,7 @@ def bucket_name(utils: Utils) -> str:
 def build_image(utils: Utils) -> str:
     yield from utils.cloud_build_submit(
         image_name=NAME,
-        config="build.yaml",
+        config="tensorflow-landsat/build.yaml",
         substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"},
     )
 
@@ -44,7 +44,7 @@ def build_image(utils: Utils) -> str:
 def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str:
     # Run the Beam pipeline in Dataflow making sure GPUs are used.
     yield from utils.cloud_build_submit(
-        config="run.yaml",
+        config="tensorflow-landsat/run.yaml",
         substitutions={
             "_JOB_NAME": utils.hyphen_name(NAME),
             "_IMAGE": f"{NAME}:{utils.uuid}",
diff --git a/dataflow/gpu-workers/tensorflow-minimal/tensorflow_test.py b/dataflow/gpu-workers/tensorflow-minimal/tensorflow_test.py
index 6c890550d68..dfd9236bad3 100644
--- a/dataflow/gpu-workers/tensorflow-minimal/tensorflow_test.py
+++ b/dataflow/gpu-workers/tensorflow-minimal/tensorflow_test.py
@@ -34,7 +34,7 @@ def bucket_name(utils: Utils) -> str:
 def build_image(utils: Utils) -> str:
     yield from utils.cloud_build_submit(
         image_name=NAME,
-        config="build.yaml",
+        config="tensorflow-minimal/build.yaml",
         substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"},
     )
 
@@ -43,7 +43,7 @@ def build_image(utils: Utils) -> str:
 def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str:
     # Run the Beam pipeline in Dataflow making sure GPUs are used.
yield from utils.cloud_build_submit( - config="run.yaml", + config="tensorflow-minimal/run.yaml", substitutions={ "_JOB_NAME": utils.hyphen_name(NAME), "_IMAGE": f"{NAME}:{utils.uuid}", From 51fdb72ffa591eb64f504542b2ac86d1b2583019 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 25 Jun 2021 15:48:42 +0000 Subject: [PATCH 76/87] moved new samples to another directory --- dataflow/{gpu-workers => gpu-examples}/conftest.py | 0 .../pytorch-minimal/.dockerignore | 0 .../pytorch-minimal/.gcloudignore | 0 .../pytorch-minimal/Dockerfile | 0 .../{gpu-workers => gpu-examples}/pytorch-minimal/README.md | 0 .../pytorch-minimal/build.yaml | 0 .../pytorch-minimal/e2e_test.py} | 6 +++--- .../{gpu-workers => gpu-examples}/pytorch-minimal/main.py | 0 .../pytorch-minimal/noxfile_config.py | 0 .../pytorch-minimal/requirements-test.txt | 0 .../pytorch-minimal/requirements.txt | 0 .../{gpu-workers => gpu-examples}/pytorch-minimal/run.yaml | 0 .../tensorflow-landsat/.dockerignore | 0 .../tensorflow-landsat/.gcloudignore | 0 .../tensorflow-landsat/Dockerfile | 0 .../tensorflow-landsat/README.md | 0 .../tensorflow-landsat/build.yaml | 0 .../tensorflow-landsat/e2e_test.py} | 6 +++--- .../tensorflow-landsat/main.py | 0 .../tensorflow-landsat/noxfile_config.py | 0 .../tensorflow-landsat/requirements-test.txt | 0 .../tensorflow-landsat/requirements.txt | 0 .../tensorflow-landsat/run.yaml | 0 .../tensorflow-minimal/.dockerignore | 0 .../tensorflow-minimal/.gcloudignore | 0 .../tensorflow-minimal/Dockerfile | 0 .../tensorflow-minimal/README.md | 0 .../tensorflow-minimal/build.yaml | 0 .../tensorflow-minimal/e2e_test.py} | 6 +++--- .../tensorflow-minimal/main.py | 0 .../tensorflow-minimal/noxfile_config.py | 0 .../tensorflow-minimal/requirements-test.txt | 0 .../tensorflow-minimal/requirements.txt | 0 .../tensorflow-minimal/run.yaml | 0 34 files changed, 9 insertions(+), 9 deletions(-) rename dataflow/{gpu-workers => gpu-examples}/conftest.py (100%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/.dockerignore (100%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/.gcloudignore (100%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/Dockerfile (100%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/README.md (100%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/build.yaml (100%) rename dataflow/{gpu-workers/pytorch-minimal/pytorch_test.py => gpu-examples/pytorch-minimal/e2e_test.py} (93%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/main.py (100%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/noxfile_config.py (100%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/requirements-test.txt (100%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/requirements.txt (100%) rename dataflow/{gpu-workers => gpu-examples}/pytorch-minimal/run.yaml (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-landsat/.dockerignore (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-landsat/.gcloudignore (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-landsat/Dockerfile (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-landsat/README.md (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-landsat/build.yaml (100%) rename dataflow/{gpu-workers/tensorflow-landsat/landsat_test.py => gpu-examples/tensorflow-landsat/e2e_test.py} (94%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-landsat/main.py (100%) rename dataflow/{gpu-workers => 
gpu-examples}/tensorflow-landsat/noxfile_config.py (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-landsat/requirements-test.txt (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-landsat/requirements.txt (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-landsat/run.yaml (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/.dockerignore (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/.gcloudignore (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/Dockerfile (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/README.md (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/build.yaml (100%) rename dataflow/{gpu-workers/tensorflow-minimal/tensorflow_test.py => gpu-examples/tensorflow-minimal/e2e_test.py} (93%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/main.py (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/noxfile_config.py (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/requirements-test.txt (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/requirements.txt (100%) rename dataflow/{gpu-workers => gpu-examples}/tensorflow-minimal/run.yaml (100%) diff --git a/dataflow/gpu-workers/conftest.py b/dataflow/gpu-examples/conftest.py similarity index 100% rename from dataflow/gpu-workers/conftest.py rename to dataflow/gpu-examples/conftest.py diff --git a/dataflow/gpu-workers/pytorch-minimal/.dockerignore b/dataflow/gpu-examples/pytorch-minimal/.dockerignore similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/.dockerignore rename to dataflow/gpu-examples/pytorch-minimal/.dockerignore diff --git a/dataflow/gpu-workers/pytorch-minimal/.gcloudignore b/dataflow/gpu-examples/pytorch-minimal/.gcloudignore similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/.gcloudignore rename to dataflow/gpu-examples/pytorch-minimal/.gcloudignore diff --git a/dataflow/gpu-workers/pytorch-minimal/Dockerfile b/dataflow/gpu-examples/pytorch-minimal/Dockerfile similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/Dockerfile rename to dataflow/gpu-examples/pytorch-minimal/Dockerfile diff --git a/dataflow/gpu-workers/pytorch-minimal/README.md b/dataflow/gpu-examples/pytorch-minimal/README.md similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/README.md rename to dataflow/gpu-examples/pytorch-minimal/README.md diff --git a/dataflow/gpu-workers/pytorch-minimal/build.yaml b/dataflow/gpu-examples/pytorch-minimal/build.yaml similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/build.yaml rename to dataflow/gpu-examples/pytorch-minimal/build.yaml diff --git a/dataflow/gpu-workers/pytorch-minimal/pytorch_test.py b/dataflow/gpu-examples/pytorch-minimal/e2e_test.py similarity index 93% rename from dataflow/gpu-workers/pytorch-minimal/pytorch_test.py rename to dataflow/gpu-examples/pytorch-minimal/e2e_test.py index 039f7b70b32..41a127e4fee 100644 --- a/dataflow/gpu-workers/pytorch-minimal/pytorch_test.py +++ b/dataflow/gpu-examples/pytorch-minimal/e2e_test.py @@ -22,7 +22,7 @@ Utils = None import pytest -NAME = "dataflow/gpu-workers/pytorch-minimal" +NAME = "dataflow/gpu-examples/pytorch-minimal" @pytest.fixture(scope="session") @@ -34,7 +34,7 @@ def bucket_name(utils: Utils) -> str: def build_image(utils: Utils) -> str: yield from utils.cloud_build_submit( image_name=NAME, - config="pytorch-minimal/build.yaml", + 
config="build.yaml", substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"}, ) @@ -43,7 +43,7 @@ def build_image(utils: Utils) -> str: def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. yield from utils.cloud_build_submit( - config="pytorch-minimal/run.yaml", + config="run.yaml", substitutions={ "_JOB_NAME": utils.hyphen_name(NAME), "_IMAGE": f"{NAME}:{utils.uuid}", diff --git a/dataflow/gpu-workers/pytorch-minimal/main.py b/dataflow/gpu-examples/pytorch-minimal/main.py similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/main.py rename to dataflow/gpu-examples/pytorch-minimal/main.py diff --git a/dataflow/gpu-workers/pytorch-minimal/noxfile_config.py b/dataflow/gpu-examples/pytorch-minimal/noxfile_config.py similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/noxfile_config.py rename to dataflow/gpu-examples/pytorch-minimal/noxfile_config.py diff --git a/dataflow/gpu-workers/pytorch-minimal/requirements-test.txt b/dataflow/gpu-examples/pytorch-minimal/requirements-test.txt similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/requirements-test.txt rename to dataflow/gpu-examples/pytorch-minimal/requirements-test.txt diff --git a/dataflow/gpu-workers/pytorch-minimal/requirements.txt b/dataflow/gpu-examples/pytorch-minimal/requirements.txt similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/requirements.txt rename to dataflow/gpu-examples/pytorch-minimal/requirements.txt diff --git a/dataflow/gpu-workers/pytorch-minimal/run.yaml b/dataflow/gpu-examples/pytorch-minimal/run.yaml similarity index 100% rename from dataflow/gpu-workers/pytorch-minimal/run.yaml rename to dataflow/gpu-examples/pytorch-minimal/run.yaml diff --git a/dataflow/gpu-workers/tensorflow-landsat/.dockerignore b/dataflow/gpu-examples/tensorflow-landsat/.dockerignore similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/.dockerignore rename to dataflow/gpu-examples/tensorflow-landsat/.dockerignore diff --git a/dataflow/gpu-workers/tensorflow-landsat/.gcloudignore b/dataflow/gpu-examples/tensorflow-landsat/.gcloudignore similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/.gcloudignore rename to dataflow/gpu-examples/tensorflow-landsat/.gcloudignore diff --git a/dataflow/gpu-workers/tensorflow-landsat/Dockerfile b/dataflow/gpu-examples/tensorflow-landsat/Dockerfile similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/Dockerfile rename to dataflow/gpu-examples/tensorflow-landsat/Dockerfile diff --git a/dataflow/gpu-workers/tensorflow-landsat/README.md b/dataflow/gpu-examples/tensorflow-landsat/README.md similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/README.md rename to dataflow/gpu-examples/tensorflow-landsat/README.md diff --git a/dataflow/gpu-workers/tensorflow-landsat/build.yaml b/dataflow/gpu-examples/tensorflow-landsat/build.yaml similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/build.yaml rename to dataflow/gpu-examples/tensorflow-landsat/build.yaml diff --git a/dataflow/gpu-workers/tensorflow-landsat/landsat_test.py b/dataflow/gpu-examples/tensorflow-landsat/e2e_test.py similarity index 94% rename from dataflow/gpu-workers/tensorflow-landsat/landsat_test.py rename to dataflow/gpu-examples/tensorflow-landsat/e2e_test.py index d6eab544cf9..21de08de240 100644 --- a/dataflow/gpu-workers/tensorflow-landsat/landsat_test.py +++ 
b/dataflow/gpu-examples/tensorflow-landsat/e2e_test.py @@ -23,7 +23,7 @@ from google.cloud import storage import pytest -NAME = "dataflow/gpu-workers/tensorflow-landsat" +NAME = "dataflow/gpu-examples/tensorflow-landsat" @pytest.fixture(scope="session") @@ -35,7 +35,7 @@ def bucket_name(utils: Utils) -> str: def build_image(utils: Utils) -> str: yield from utils.cloud_build_submit( image_name=NAME, - config="tensorflow-landsat/build.yaml", + config="build.yaml", substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"}, ) @@ -44,7 +44,7 @@ def build_image(utils: Utils) -> str: def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. yield from utils.cloud_build_submit( - config="tensorflow-landsat/run.yaml", + config="run.yaml", substitutions={ "_JOB_NAME": utils.hyphen_name(NAME), "_IMAGE": f"{NAME}:{utils.uuid}", diff --git a/dataflow/gpu-workers/tensorflow-landsat/main.py b/dataflow/gpu-examples/tensorflow-landsat/main.py similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/main.py rename to dataflow/gpu-examples/tensorflow-landsat/main.py diff --git a/dataflow/gpu-workers/tensorflow-landsat/noxfile_config.py b/dataflow/gpu-examples/tensorflow-landsat/noxfile_config.py similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/noxfile_config.py rename to dataflow/gpu-examples/tensorflow-landsat/noxfile_config.py diff --git a/dataflow/gpu-workers/tensorflow-landsat/requirements-test.txt b/dataflow/gpu-examples/tensorflow-landsat/requirements-test.txt similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/requirements-test.txt rename to dataflow/gpu-examples/tensorflow-landsat/requirements-test.txt diff --git a/dataflow/gpu-workers/tensorflow-landsat/requirements.txt b/dataflow/gpu-examples/tensorflow-landsat/requirements.txt similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/requirements.txt rename to dataflow/gpu-examples/tensorflow-landsat/requirements.txt diff --git a/dataflow/gpu-workers/tensorflow-landsat/run.yaml b/dataflow/gpu-examples/tensorflow-landsat/run.yaml similarity index 100% rename from dataflow/gpu-workers/tensorflow-landsat/run.yaml rename to dataflow/gpu-examples/tensorflow-landsat/run.yaml diff --git a/dataflow/gpu-workers/tensorflow-minimal/.dockerignore b/dataflow/gpu-examples/tensorflow-minimal/.dockerignore similarity index 100% rename from dataflow/gpu-workers/tensorflow-minimal/.dockerignore rename to dataflow/gpu-examples/tensorflow-minimal/.dockerignore diff --git a/dataflow/gpu-workers/tensorflow-minimal/.gcloudignore b/dataflow/gpu-examples/tensorflow-minimal/.gcloudignore similarity index 100% rename from dataflow/gpu-workers/tensorflow-minimal/.gcloudignore rename to dataflow/gpu-examples/tensorflow-minimal/.gcloudignore diff --git a/dataflow/gpu-workers/tensorflow-minimal/Dockerfile b/dataflow/gpu-examples/tensorflow-minimal/Dockerfile similarity index 100% rename from dataflow/gpu-workers/tensorflow-minimal/Dockerfile rename to dataflow/gpu-examples/tensorflow-minimal/Dockerfile diff --git a/dataflow/gpu-workers/tensorflow-minimal/README.md b/dataflow/gpu-examples/tensorflow-minimal/README.md similarity index 100% rename from dataflow/gpu-workers/tensorflow-minimal/README.md rename to dataflow/gpu-examples/tensorflow-minimal/README.md diff --git a/dataflow/gpu-workers/tensorflow-minimal/build.yaml b/dataflow/gpu-examples/tensorflow-minimal/build.yaml similarity index 100% rename from 
dataflow/gpu-workers/tensorflow-minimal/build.yaml rename to dataflow/gpu-examples/tensorflow-minimal/build.yaml diff --git a/dataflow/gpu-workers/tensorflow-minimal/tensorflow_test.py b/dataflow/gpu-examples/tensorflow-minimal/e2e_test.py similarity index 93% rename from dataflow/gpu-workers/tensorflow-minimal/tensorflow_test.py rename to dataflow/gpu-examples/tensorflow-minimal/e2e_test.py index dfd9236bad3..b78d9c49b88 100644 --- a/dataflow/gpu-workers/tensorflow-minimal/tensorflow_test.py +++ b/dataflow/gpu-examples/tensorflow-minimal/e2e_test.py @@ -22,7 +22,7 @@ Utils = None import pytest -NAME = "dataflow/gpu-workers/tensorflow-minimal" +NAME = "dataflow/gpu-examples/tensorflow-minimal" @pytest.fixture(scope="session") @@ -34,7 +34,7 @@ def bucket_name(utils: Utils) -> str: def build_image(utils: Utils) -> str: yield from utils.cloud_build_submit( image_name=NAME, - config="tensorflow-minimal/build.yaml", + config="build.yaml", substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"}, ) @@ -43,7 +43,7 @@ def build_image(utils: Utils) -> str: def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str: # Run the Beam pipeline in Dataflow making sure GPUs are used. yield from utils.cloud_build_submit( - config="tensorflow-minimal/run.yaml", + config="run.yaml", substitutions={ "_JOB_NAME": utils.hyphen_name(NAME), "_IMAGE": f"{NAME}:{utils.uuid}", diff --git a/dataflow/gpu-workers/tensorflow-minimal/main.py b/dataflow/gpu-examples/tensorflow-minimal/main.py similarity index 100% rename from dataflow/gpu-workers/tensorflow-minimal/main.py rename to dataflow/gpu-examples/tensorflow-minimal/main.py diff --git a/dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py b/dataflow/gpu-examples/tensorflow-minimal/noxfile_config.py similarity index 100% rename from dataflow/gpu-workers/tensorflow-minimal/noxfile_config.py rename to dataflow/gpu-examples/tensorflow-minimal/noxfile_config.py diff --git a/dataflow/gpu-workers/tensorflow-minimal/requirements-test.txt b/dataflow/gpu-examples/tensorflow-minimal/requirements-test.txt similarity index 100% rename from dataflow/gpu-workers/tensorflow-minimal/requirements-test.txt rename to dataflow/gpu-examples/tensorflow-minimal/requirements-test.txt diff --git a/dataflow/gpu-workers/tensorflow-minimal/requirements.txt b/dataflow/gpu-examples/tensorflow-minimal/requirements.txt similarity index 100% rename from dataflow/gpu-workers/tensorflow-minimal/requirements.txt rename to dataflow/gpu-examples/tensorflow-minimal/requirements.txt diff --git a/dataflow/gpu-workers/tensorflow-minimal/run.yaml b/dataflow/gpu-examples/tensorflow-minimal/run.yaml similarity index 100% rename from dataflow/gpu-workers/tensorflow-minimal/run.yaml rename to dataflow/gpu-examples/tensorflow-minimal/run.yaml From 50f3d44c43c1d19288d41fc63ccec018bce484d4 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 25 Jun 2021 18:22:18 +0000 Subject: [PATCH 77/87] simplified instructions --- dataflow/gpu-examples/pytorch-minimal/.dockerignore | 5 ----- dataflow/gpu-examples/pytorch-minimal/.gcloudignore | 5 ----- dataflow/gpu-examples/pytorch-minimal/README.md | 2 +- dataflow/gpu-examples/tensorflow-landsat/.dockerignore | 5 ----- dataflow/gpu-examples/tensorflow-landsat/.gcloudignore | 5 ----- dataflow/gpu-examples/tensorflow-landsat/README.md | 2 +- dataflow/gpu-examples/tensorflow-minimal/.dockerignore | 5 ----- dataflow/gpu-examples/tensorflow-minimal/.gcloudignore | 5 ----- dataflow/gpu-examples/tensorflow-minimal/README.md | 2 +- 9 files changed, 3 insertions(+), 
33 deletions(-) delete mode 100644 dataflow/gpu-examples/pytorch-minimal/.dockerignore delete mode 100644 dataflow/gpu-examples/pytorch-minimal/.gcloudignore delete mode 100644 dataflow/gpu-examples/tensorflow-landsat/.dockerignore delete mode 100644 dataflow/gpu-examples/tensorflow-landsat/.gcloudignore delete mode 100644 dataflow/gpu-examples/tensorflow-minimal/.dockerignore delete mode 100644 dataflow/gpu-examples/tensorflow-minimal/.gcloudignore diff --git a/dataflow/gpu-examples/pytorch-minimal/.dockerignore b/dataflow/gpu-examples/pytorch-minimal/.dockerignore deleted file mode 100644 index 775d845fa58..00000000000 --- a/dataflow/gpu-examples/pytorch-minimal/.dockerignore +++ /dev/null @@ -1,5 +0,0 @@ -# Ignore everything except the source files. -**/* -!Dockerfile -!requirements.txt -!*.py diff --git a/dataflow/gpu-examples/pytorch-minimal/.gcloudignore b/dataflow/gpu-examples/pytorch-minimal/.gcloudignore deleted file mode 100644 index 775d845fa58..00000000000 --- a/dataflow/gpu-examples/pytorch-minimal/.gcloudignore +++ /dev/null @@ -1,5 +0,0 @@ -# Ignore everything except the source files. -**/* -!Dockerfile -!requirements.txt -!*.py diff --git a/dataflow/gpu-examples/pytorch-minimal/README.md b/dataflow/gpu-examples/pytorch-minimal/README.md index 43e24830529..cb638cad308 100644 --- a/dataflow/gpu-examples/pytorch-minimal/README.md +++ b/dataflow/gpu-examples/pytorch-minimal/README.md @@ -27,7 +27,7 @@ We use Cloud Build to run the [Dataflow](https://cloud.google.com/dataflow) job. export REGION="us-central1" export GPU_TYPE="nvidia-tesla-t4" -gcloud beta builds submit \ +gcloud builds submit \ --config run.yaml \ --substitutions _REGION=$REGION,_GPU_TYPE=$GPU_TYPE \ --no-source diff --git a/dataflow/gpu-examples/tensorflow-landsat/.dockerignore b/dataflow/gpu-examples/tensorflow-landsat/.dockerignore deleted file mode 100644 index 775d845fa58..00000000000 --- a/dataflow/gpu-examples/tensorflow-landsat/.dockerignore +++ /dev/null @@ -1,5 +0,0 @@ -# Ignore everything except the source files. -**/* -!Dockerfile -!requirements.txt -!*.py diff --git a/dataflow/gpu-examples/tensorflow-landsat/.gcloudignore b/dataflow/gpu-examples/tensorflow-landsat/.gcloudignore deleted file mode 100644 index 775d845fa58..00000000000 --- a/dataflow/gpu-examples/tensorflow-landsat/.gcloudignore +++ /dev/null @@ -1,5 +0,0 @@ -# Ignore everything except the source files. -**/* -!Dockerfile -!requirements.txt -!*.py diff --git a/dataflow/gpu-examples/tensorflow-landsat/README.md b/dataflow/gpu-examples/tensorflow-landsat/README.md index dd5b8fadbc7..a89183c977c 100644 --- a/dataflow/gpu-examples/tensorflow-landsat/README.md +++ b/dataflow/gpu-examples/tensorflow-landsat/README.md @@ -32,7 +32,7 @@ export OUTPUT_PATH="gs://$BUCKET/samples/dataflow/landsat/output-images/" export REGION="us-central1" export GPU_TYPE="nvidia-tesla-t4" -gcloud beta builds submit \ +gcloud builds submit \ --config run.yaml \ --substitutions _OUTPUT_PATH=$OUTPUT_PATH,_REGION=$REGION,_GPU_TYPE=$GPU_TYPE \ --no-source diff --git a/dataflow/gpu-examples/tensorflow-minimal/.dockerignore b/dataflow/gpu-examples/tensorflow-minimal/.dockerignore deleted file mode 100644 index 775d845fa58..00000000000 --- a/dataflow/gpu-examples/tensorflow-minimal/.dockerignore +++ /dev/null @@ -1,5 +0,0 @@ -# Ignore everything except the source files. 
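The README edits in this patch drop the `beta` release track: `gcloud builds submit` is available in the GA command surface, so the instructions no longer require the beta component. The same invocation is easy to script; below is a sketch mirroring the README's flags (a hypothetical wrapper, with defaults taken from the README's environment variables; `--no-source` is used because `run.yaml` only launches the Dataflow job and uploads nothing).

```python
import subprocess


def submit_run(region: str = "us-central1", gpu_type: str = "nvidia-tesla-t4") -> None:
    """Launch the sample's Dataflow job through Cloud Build."""
    subprocess.run(
        [
            "gcloud", "builds", "submit",
            "--config=run.yaml",
            f"--substitutions=_REGION={region},_GPU_TYPE={gpu_type}",
            "--no-source",  # the run config needs no local files
        ],
        check=True,
    )
```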
-**/* -!Dockerfile -!requirements.txt -!*.py diff --git a/dataflow/gpu-examples/tensorflow-minimal/.gcloudignore b/dataflow/gpu-examples/tensorflow-minimal/.gcloudignore deleted file mode 100644 index 775d845fa58..00000000000 --- a/dataflow/gpu-examples/tensorflow-minimal/.gcloudignore +++ /dev/null @@ -1,5 +0,0 @@ -# Ignore everything except the source files. -**/* -!Dockerfile -!requirements.txt -!*.py diff --git a/dataflow/gpu-examples/tensorflow-minimal/README.md b/dataflow/gpu-examples/tensorflow-minimal/README.md index debd86b0e91..9a457deeeba 100644 --- a/dataflow/gpu-examples/tensorflow-minimal/README.md +++ b/dataflow/gpu-examples/tensorflow-minimal/README.md @@ -27,7 +27,7 @@ We use Cloud Build to run the [Dataflow](https://cloud.google.com/dataflow) job. export REGION="us-central1" export GPU_TYPE="nvidia-tesla-t4" -gcloud beta builds submit \ +gcloud builds submit \ --config run.yaml \ --substitutions _REGION=$REGION,_GPU_TYPE=$GPU_TYPE \ --no-source From 085feeb504b47b41e501d7397f875e6fcc93f24f Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Fri, 25 Jun 2021 19:06:22 +0000 Subject: [PATCH 78/87] add gcloudignore to reduce test time --- dataflow/gpu-examples/pytorch-minimal/.gcloudignore | 5 +++++ dataflow/gpu-examples/tensorflow-landsat/.gcloudignore | 5 +++++ dataflow/gpu-examples/tensorflow-minimal/.gcloudignore | 5 +++++ 3 files changed, 15 insertions(+) create mode 100644 dataflow/gpu-examples/pytorch-minimal/.gcloudignore create mode 100644 dataflow/gpu-examples/tensorflow-landsat/.gcloudignore create mode 100644 dataflow/gpu-examples/tensorflow-minimal/.gcloudignore diff --git a/dataflow/gpu-examples/pytorch-minimal/.gcloudignore b/dataflow/gpu-examples/pytorch-minimal/.gcloudignore new file mode 100644 index 00000000000..775d845fa58 --- /dev/null +++ b/dataflow/gpu-examples/pytorch-minimal/.gcloudignore @@ -0,0 +1,5 @@ +# Ignore everything except the source files. +**/* +!Dockerfile +!requirements.txt +!*.py diff --git a/dataflow/gpu-examples/tensorflow-landsat/.gcloudignore b/dataflow/gpu-examples/tensorflow-landsat/.gcloudignore new file mode 100644 index 00000000000..775d845fa58 --- /dev/null +++ b/dataflow/gpu-examples/tensorflow-landsat/.gcloudignore @@ -0,0 +1,5 @@ +# Ignore everything except the source files. +**/* +!Dockerfile +!requirements.txt +!*.py diff --git a/dataflow/gpu-examples/tensorflow-minimal/.gcloudignore b/dataflow/gpu-examples/tensorflow-minimal/.gcloudignore new file mode 100644 index 00000000000..775d845fa58 --- /dev/null +++ b/dataflow/gpu-examples/tensorflow-minimal/.gcloudignore @@ -0,0 +1,5 @@ +# Ignore everything except the source files. 
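These ignore files use gitignore-style wildmatch semantics: `**/*` first excludes everything, then each `!` pattern re-includes only the files the build actually needs, so `gcloud builds submit` uploads just the Dockerfile, the requirements file, and the Python sources, which is what cuts the test time. One way to sanity-check such rules is the third-party `pathspec` package; this is purely for illustration, the repo's tooling doesn't depend on it.

```python
import pathspec  # pip install pathspec

rules = [
    "**/*",              # ignore everything...
    "!Dockerfile",       # ...except what the image build needs
    "!requirements.txt",
    "!*.py",
]
spec = pathspec.PathSpec.from_lines("gitwildmatch", rules)

assert spec.match_file("README.md")       # ignored, so not uploaded
assert not spec.match_file("main.py")     # re-included by !*.py
assert not spec.match_file("Dockerfile")  # re-included explicitly
```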
+**/* +!Dockerfile +!requirements.txt +!*.py From dccd745695212eb6bf251d71620b5fb8cf0a59be Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 28 Jun 2021 18:21:59 +0000 Subject: [PATCH 79/87] reset gpu-workers entirely --- dataflow/gpu-workers/requirements.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dataflow/gpu-workers/requirements.txt b/dataflow/gpu-workers/requirements.txt index 00058684873..6c5d4338609 100644 --- a/dataflow/gpu-workers/requirements.txt +++ b/dataflow/gpu-workers/requirements.txt @@ -1,9 +1,4 @@ Pillow==8.2.0 -<<<<<<< HEAD -apache-beam[gcp]==2.30.0 -rasterio==1.2.4 -======= apache-beam[gcp]==2.29.0 rasterio==1.2.6 ->>>>>>> d41f2ed158db07c062562d4cf93d626781466d9a tensorflow==2.5.0 From 7ccbdcabc6e90c14bb98b8c1adf547623b4fc9aa Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Thu, 1 Jul 2021 18:33:43 +0000 Subject: [PATCH 80/87] test in a single Python version --- dataflow/gpu-examples/pytorch-minimal/noxfile_config.py | 5 ++++- dataflow/gpu-examples/tensorflow-landsat/noxfile_config.py | 5 ++++- dataflow/gpu-examples/tensorflow-minimal/noxfile_config.py | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/dataflow/gpu-examples/pytorch-minimal/noxfile_config.py b/dataflow/gpu-examples/pytorch-minimal/noxfile_config.py index d8e9aba4fdd..627ee2bc4b8 100644 --- a/dataflow/gpu-examples/pytorch-minimal/noxfile_config.py +++ b/dataflow/gpu-examples/pytorch-minimal/noxfile_config.py @@ -22,7 +22,10 @@ TEST_CONFIG_OVERRIDE = { # You can opt out from the test for specific Python versions. - "ignored_versions": ["2.7", "3.9"], + # > ℹ️ We're opting out of all Python versions except 3.8. + # > The Python version used is defined by the Dockerfile, so it's redundant + # > to run multiple tests since they would all be running the same Dockerfile. + "ignored_versions": ["2.7", "3.6", "3.7", "3.9"], # Old samples are opted out of enforcing Python type hints # All new samples should feature them "enforce_type_hints": True, diff --git a/dataflow/gpu-examples/tensorflow-landsat/noxfile_config.py b/dataflow/gpu-examples/tensorflow-landsat/noxfile_config.py index d8e9aba4fdd..627ee2bc4b8 100644 --- a/dataflow/gpu-examples/tensorflow-landsat/noxfile_config.py +++ b/dataflow/gpu-examples/tensorflow-landsat/noxfile_config.py @@ -22,7 +22,10 @@ TEST_CONFIG_OVERRIDE = { # You can opt out from the test for specific Python versions. - "ignored_versions": ["2.7", "3.9"], + # > ℹ️ We're opting out of all Python versions except 3.8. + # > The Python version used is defined by the Dockerfile, so it's redundant + # > to run multiple tests since they would all be running the same Dockerfile. + "ignored_versions": ["2.7", "3.6", "3.7", "3.9"], # Old samples are opted out of enforcing Python type hints # All new samples should feature them "enforce_type_hints": True, diff --git a/dataflow/gpu-examples/tensorflow-minimal/noxfile_config.py b/dataflow/gpu-examples/tensorflow-minimal/noxfile_config.py index d8e9aba4fdd..627ee2bc4b8 100644 --- a/dataflow/gpu-examples/tensorflow-minimal/noxfile_config.py +++ b/dataflow/gpu-examples/tensorflow-minimal/noxfile_config.py @@ -22,7 +22,10 @@ TEST_CONFIG_OVERRIDE = { # You can opt out from the test for specific Python versions. - "ignored_versions": ["2.7", "3.9"], + # > ℹ️ We're opting out of all Python versions except 3.8. + # > The Python version used is defined by the Dockerfile, so it's redundant + # > to run multiple tests since they would all be running the same Dockerfile. 
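The three `noxfile_config.py` overrides now pin testing to Python 3.8 only: the worker's Python version is fixed by each sample's Dockerfile, so running the identical container build under several interpreter versions adds time without adding coverage. The shared nox harness consumes `ignored_versions` roughly as in the sketch below (an illustrative reduction; the real template lives elsewhere in the repo).

```python
ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8", "3.9"]

TEST_CONFIG_OVERRIDE = {
    "ignored_versions": ["2.7", "3.6", "3.7", "3.9"],
}


def versions_to_test(config: dict) -> list:
    """Keep only the interpreter versions not opted out by the sample."""
    ignored = set(config.get("ignored_versions", []))
    return [v for v in ALL_VERSIONS if v not in ignored]


assert versions_to_test(TEST_CONFIG_OVERRIDE) == ["3.8"]
```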
+ "ignored_versions": ["2.7", "3.6", "3.7", "3.9"], # Old samples are opted out of enforcing Python type hints # All new samples should feature them "enforce_type_hints": True, From 5142c9ebdbb6f5a3a5cee715850dcc76cf83358e Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 12 Jul 2021 19:43:02 +0000 Subject: [PATCH 81/87] run subprocess as daemon --- dataflow/gpu-examples/conftest.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/dataflow/gpu-examples/conftest.py b/dataflow/gpu-examples/conftest.py index ed7387d209f..05e292fffca 100644 --- a/dataflow/gpu-examples/conftest.py +++ b/dataflow/gpu-examples/conftest.py @@ -31,7 +31,8 @@ PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] REGION = "us-central1" -RETRY_MAX_TIME = 5 * 60 # 5 minutes in seconds +TIMEOUT_SEC = 30 * 60 # 30 minutes in seconds +POLL_INTERVAL_SEC = 60 # 1 minute in seconds HYPHEN_NAME_RE = re.compile(r"[^\w\d-]+") UNDERSCORE_NAME_RE = re.compile(r"[^\w\d_]+") @@ -178,13 +179,20 @@ def _infinite_publish_job() -> None: # Start a subprocess in the background to do the publishing. logging.info(f"Starting publisher on {topic_path}") p = mp.Process(target=_infinite_publish_job) + + # We set the subprocess as a daemon so the main process doesn't wait for + # the subprocess to finish. Since this is an infinite loop, it will + # never finish, so it would cause the whole test to hang. + # Typically, `terminate` should stop the subprocess during the fixture + # cleanup phase, but we've had cases where the tests hang, most likely + # due to concurrency issues with pytest running in parallel. + p.daemon = True p.start() yield p.is_alive() # For cleanup, terminate the background process. logging.info("Stopping publisher") - p.join(timeout=0) p.terminate() @staticmethod @@ -225,7 +233,7 @@ def cloud_build_submit( yield f.read() except Exception as e: logging.exception(e) - logging.warning(f'Current directory: {os.getcwd()}') + logging.warning(f"Current directory: {os.getcwd()}") yield config elif image_name: cmd = [ @@ -329,9 +337,9 @@ def dataflow_jobs_wait( project: str = PROJECT, region: str = REGION, until_status: str = "JOB_STATE_DONE", - timeout_sec: str = 30 * 60, - poll_interval_sec: int = 60, list_page_size: int = 100, + timeout_sec: str = TIMEOUT_SEC, + poll_interval_sec: int = POLL_INTERVAL_SEC, ) -> Optional[str]: """For a list of all the valid states: https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs#Job.JobState From 7caca190c3fb95810c239fc636f90da3694471f4 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 12 Jul 2021 19:43:16 +0000 Subject: [PATCH 82/87] update beam version --- dataflow/gpu-examples/pytorch-minimal/Dockerfile | 2 +- dataflow/gpu-examples/pytorch-minimal/requirements.txt | 2 +- dataflow/gpu-examples/tensorflow-landsat/Dockerfile | 2 +- dataflow/gpu-examples/tensorflow-landsat/requirements.txt | 2 +- dataflow/gpu-examples/tensorflow-minimal/Dockerfile | 2 +- dataflow/gpu-examples/tensorflow-minimal/requirements.txt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dataflow/gpu-examples/pytorch-minimal/Dockerfile b/dataflow/gpu-examples/pytorch-minimal/Dockerfile index 94aa381e344..ce52169aa54 100644 --- a/dataflow/gpu-examples/pytorch-minimal/Dockerfile +++ b/dataflow/gpu-examples/pytorch-minimal/Dockerfile @@ -17,7 +17,7 @@ FROM pytorch/pytorch:1.8.1-cuda11.1-cudnn8-runtime WORKDIR /pipeline # Copy the Apache Beam worker files and the pipeline source files. 
-COPY --from=apache/beam_python3.8_sdk:2.30.0 /opt/apache/beam /opt/apache/beam +COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . COPY *.py ./ diff --git a/dataflow/gpu-examples/pytorch-minimal/requirements.txt b/dataflow/gpu-examples/pytorch-minimal/requirements.txt index ad5777b6ca6..b766a3c2a56 100644 --- a/dataflow/gpu-examples/pytorch-minimal/requirements.txt +++ b/dataflow/gpu-examples/pytorch-minimal/requirements.txt @@ -1,2 +1,2 @@ -apache-beam[gcp]==2.30.0 +apache-beam[gcp]==2.31.0 torch==1.8.1 diff --git a/dataflow/gpu-examples/tensorflow-landsat/Dockerfile b/dataflow/gpu-examples/tensorflow-landsat/Dockerfile index 7a50a862756..7bf18507825 100644 --- a/dataflow/gpu-examples/tensorflow-landsat/Dockerfile +++ b/dataflow/gpu-examples/tensorflow-landsat/Dockerfile @@ -21,7 +21,7 @@ FROM nvcr.io/nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 WORKDIR /pipeline # Copy the Apache Beam worker files and the pipeline source files. -COPY --from=apache/beam_python3.8_sdk:2.30.0 /opt/apache/beam /opt/apache/beam +COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . COPY *.py ./ diff --git a/dataflow/gpu-examples/tensorflow-landsat/requirements.txt b/dataflow/gpu-examples/tensorflow-landsat/requirements.txt index cbfaaf47f75..17a0da72dd9 100644 --- a/dataflow/gpu-examples/tensorflow-landsat/requirements.txt +++ b/dataflow/gpu-examples/tensorflow-landsat/requirements.txt @@ -1,4 +1,4 @@ Pillow==8.2.0 -apache-beam[gcp]==2.30.0 +apache-beam[gcp]==2.31.0 rasterio==1.2.4 tensorflow==2.5.0 diff --git a/dataflow/gpu-examples/tensorflow-minimal/Dockerfile b/dataflow/gpu-examples/tensorflow-minimal/Dockerfile index 48b4b390eeb..11bd4840cb5 100644 --- a/dataflow/gpu-examples/tensorflow-minimal/Dockerfile +++ b/dataflow/gpu-examples/tensorflow-minimal/Dockerfile @@ -21,7 +21,7 @@ FROM nvcr.io/nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 WORKDIR /pipeline # Copy the Apache Beam worker files and the pipeline source files. -COPY --from=apache/beam_python3.8_sdk:2.30.0 /opt/apache/beam /opt/apache/beam +COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . COPY *.py ./ diff --git a/dataflow/gpu-examples/tensorflow-minimal/requirements.txt b/dataflow/gpu-examples/tensorflow-minimal/requirements.txt index aa9e7e634f5..f9d374a91aa 100644 --- a/dataflow/gpu-examples/tensorflow-minimal/requirements.txt +++ b/dataflow/gpu-examples/tensorflow-minimal/requirements.txt @@ -1,2 +1,2 @@ -apache-beam[gcp]==2.30.0 +apache-beam[gcp]==2.31.0 tensorflow==2.5.0 From e6876817fa77fde386c227396a6cb38b736e9446 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 12 Jul 2021 22:14:38 +0000 Subject: [PATCH 83/87] install g++ to compile google-cloud-profiler --- dataflow/gpu-examples/pytorch-minimal/Dockerfile | 12 ++++++++---- dataflow/gpu-examples/tensorflow-landsat/Dockerfile | 3 ++- dataflow/gpu-examples/tensorflow-minimal/Dockerfile | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dataflow/gpu-examples/pytorch-minimal/Dockerfile b/dataflow/gpu-examples/pytorch-minimal/Dockerfile index ce52169aa54..08cc9c6fe64 100644 --- a/dataflow/gpu-examples/pytorch-minimal/Dockerfile +++ b/dataflow/gpu-examples/pytorch-minimal/Dockerfile @@ -21,10 +21,14 @@ COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . COPY *.py ./ -# Install the pipeline requirements and check that there are no conflicts. 
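The Beam version bump touches two places per sample: the `apache/beam_python3.8_sdk` tag in the Dockerfile's `COPY --from` line and the `apache-beam[gcp]` pin in `requirements.txt`. The two are expected to move together, since the boot entrypoint copied from the SDK image launches whatever Beam version pip installed into the container, and a mismatch can surface as hard-to-debug worker failures. A small illustrative check (a hypothetical helper, not part of the repo) can guard that invariant:

```python
import re
from pathlib import Path


def beam_versions_in_sync(sample_dir: str) -> bool:
    """True if the requirements.txt pin matches the Dockerfile's SDK image tag."""
    reqs = Path(sample_dir, "requirements.txt").read_text()
    dockerfile = Path(sample_dir, "Dockerfile").read_text()
    pin = re.search(r"apache-beam\[gcp\]==([\d.]+)", reqs)
    tag = re.search(r"apache/beam_python3\.8_sdk:([\d.]+)", dockerfile)
    return bool(pin and tag and pin.group(1) == tag.group(1))


# Assumes the current directory is the repository root.
for sample in ("pytorch-minimal", "tensorflow-landsat", "tensorflow-minimal"):
    assert beam_versions_in_sync(f"dataflow/gpu-examples/{sample}")
```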
-# Since the image already has all the dependencies installed, -# there's no need to run with the --requirements_file option. -RUN pip install --no-cache-dir --upgrade pip \ +RUN apt-get update \ + # Since Apache Beam 2.31.0, we need g++ to compile google-cloud-profiler. + && apt-get -y install g++ \ + && rm -rf /var/lib/apt/lists/* \ + # Install the pipeline requirements and check that there are no conflicts. + # Since the image already has all the dependencies installed, + # there's no need to run with the --requirements_file option. + && pip install --no-cache-dir --upgrade pip \ && pip install --no-cache-dir -r requirements.txt \ && pip check diff --git a/dataflow/gpu-examples/tensorflow-landsat/Dockerfile b/dataflow/gpu-examples/tensorflow-landsat/Dockerfile index 7bf18507825..e0298b63c2e 100644 --- a/dataflow/gpu-examples/tensorflow-landsat/Dockerfile +++ b/dataflow/gpu-examples/tensorflow-landsat/Dockerfile @@ -28,7 +28,8 @@ COPY *.py ./ # If you need a different Python version, consider: # https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa RUN apt-get update \ - && apt-get install -y curl python3.8 python3-distutils \ + # Since Apache Beam 2.31.0, we need g++ to compile google-cloud-profiler. + && apt-get install -y curl g++ python3.8 python3-distutils \ && rm -rf /var/lib/apt/lists/* \ && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \ && curl https://bootstrap.pypa.io/get-pip.py | python \ diff --git a/dataflow/gpu-examples/tensorflow-minimal/Dockerfile b/dataflow/gpu-examples/tensorflow-minimal/Dockerfile index 11bd4840cb5..b2598fdfee0 100644 --- a/dataflow/gpu-examples/tensorflow-minimal/Dockerfile +++ b/dataflow/gpu-examples/tensorflow-minimal/Dockerfile @@ -28,7 +28,8 @@ COPY *.py ./ # If you need a different Python version, consider: # https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa RUN apt-get update \ - && apt-get install -y curl python3.8 python3-distutils \ + # Since Apache Beam 2.31.0, we need g++ to compile google-cloud-profiler. + && apt-get install -y curl g++ python3.8 python3-distutils \ && rm -rf /var/lib/apt/lists/* \ && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \ && curl https://bootstrap.pypa.io/get-pip.py | python \ From 9c11b461c74dc5edb7f7519e6ad5493c74f019a0 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 13 Jul 2021 00:30:51 +0000 Subject: [PATCH 84/87] fix build dependencies --- dataflow/gpu-examples/pytorch-minimal/Dockerfile | 8 +++----- dataflow/gpu-examples/pytorch-minimal/requirements.txt | 2 +- dataflow/gpu-examples/tensorflow-landsat/Dockerfile | 7 ++----- dataflow/gpu-examples/tensorflow-minimal/Dockerfile | 7 ++----- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/dataflow/gpu-examples/pytorch-minimal/Dockerfile b/dataflow/gpu-examples/pytorch-minimal/Dockerfile index 08cc9c6fe64..52427ced6a8 100644 --- a/dataflow/gpu-examples/pytorch-minimal/Dockerfile +++ b/dataflow/gpu-examples/pytorch-minimal/Dockerfile @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM pytorch/pytorch:1.8.1-cuda11.1-cudnn8-runtime +FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime WORKDIR /pipeline -# Copy the Apache Beam worker files and the pipeline source files. -COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . COPY *.py ./ RUN apt-get update \ - # Since Apache Beam 2.31.0, we need g++ to compile google-cloud-profiler. 
- && apt-get -y install g++ \ + && apt-get install -y --no-install-recommends g++ \ && rm -rf /var/lib/apt/lists/* \ # Install the pipeline requirements and check that there are no conflicts. # Since the image already has all the dependencies installed, @@ -33,4 +30,5 @@ RUN apt-get update \ && pip check # Set the entrypoint to Apache Beam SDK worker launcher. +COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam ENTRYPOINT [ "/opt/apache/beam/boot" ] diff --git a/dataflow/gpu-examples/pytorch-minimal/requirements.txt b/dataflow/gpu-examples/pytorch-minimal/requirements.txt index b766a3c2a56..fbf5febe81a 100644 --- a/dataflow/gpu-examples/pytorch-minimal/requirements.txt +++ b/dataflow/gpu-examples/pytorch-minimal/requirements.txt @@ -1,2 +1,2 @@ apache-beam[gcp]==2.31.0 -torch==1.8.1 +torch==1.9.0 diff --git a/dataflow/gpu-examples/tensorflow-landsat/Dockerfile b/dataflow/gpu-examples/tensorflow-landsat/Dockerfile index e0298b63c2e..85ea20c7d93 100644 --- a/dataflow/gpu-examples/tensorflow-landsat/Dockerfile +++ b/dataflow/gpu-examples/tensorflow-landsat/Dockerfile @@ -20,25 +20,22 @@ FROM nvcr.io/nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 WORKDIR /pipeline -# Copy the Apache Beam worker files and the pipeline source files. -COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . COPY *.py ./ # If you need a different Python version, consider: # https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa RUN apt-get update \ - # Since Apache Beam 2.31.0, we need g++ to compile google-cloud-profiler. - && apt-get install -y curl g++ python3.8 python3-distutils \ + && apt-get install -y --no-install-recommends curl g++ python3.8-dev python3-distutils \ && rm -rf /var/lib/apt/lists/* \ && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \ && curl https://bootstrap.pypa.io/get-pip.py | python \ # Install the pipeline requirements and check that there are no conflicts. # Since the image already has all the dependencies installed, # there's no need to run with the --requirements_file option. - && pip install --no-cache-dir --upgrade pip \ && pip install --no-cache-dir -r requirements.txt \ && pip check # Set the entrypoint to Apache Beam SDK worker launcher. +COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam ENTRYPOINT [ "/opt/apache/beam/boot" ] diff --git a/dataflow/gpu-examples/tensorflow-minimal/Dockerfile b/dataflow/gpu-examples/tensorflow-minimal/Dockerfile index b2598fdfee0..39c185782bd 100644 --- a/dataflow/gpu-examples/tensorflow-minimal/Dockerfile +++ b/dataflow/gpu-examples/tensorflow-minimal/Dockerfile @@ -20,25 +20,22 @@ FROM nvcr.io/nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04 WORKDIR /pipeline -# Copy the Apache Beam worker files and the pipeline source files. -COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam COPY requirements.txt . COPY *.py ./ # If you need a different Python version, consider: # https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa RUN apt-get update \ - # Since Apache Beam 2.31.0, we need g++ to compile google-cloud-profiler. 
- && apt-get install -y curl g++ python3.8 python3-distutils \ + && apt-get install -y --no-install-recommends curl g++ python3.8-dev python3-distutils \ && rm -rf /var/lib/apt/lists/* \ && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \ && curl https://bootstrap.pypa.io/get-pip.py | python \ # Install the pipeline requirements and check that there are no conflicts. # Since the image already has all the dependencies installed, # there's no need to run with the --requirements_file option. - && pip install --no-cache-dir --upgrade pip \ && pip install --no-cache-dir -r requirements.txt \ && pip check # Set the entrypoint to Apache Beam SDK worker launcher. +COPY --from=apache/beam_python3.8_sdk:2.31.0 /opt/apache/beam /opt/apache/beam ENTRYPOINT [ "/opt/apache/beam/boot" ] From cb5b4dd020754feb606d45c04a704f9c7054dee8 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 13 Jul 2021 00:52:13 +0000 Subject: [PATCH 85/87] fix build dependencies --- dataflow/gpu-examples/pytorch-minimal/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/gpu-examples/pytorch-minimal/Dockerfile b/dataflow/gpu-examples/pytorch-minimal/Dockerfile index 52427ced6a8..c59eb99a09e 100644 --- a/dataflow/gpu-examples/pytorch-minimal/Dockerfile +++ b/dataflow/gpu-examples/pytorch-minimal/Dockerfile @@ -20,7 +20,7 @@ COPY requirements.txt . COPY *.py ./ RUN apt-get update \ - && apt-get install -y --no-install-recommends g++ \ + && apt-get install -y --no-install-recommends g++ python3-dev \ && rm -rf /var/lib/apt/lists/* \ # Install the pipeline requirements and check that there are no conflicts. # Since the image already has all the dependencies installed, From a503021009b5a2ba742d1ff54f5516c3ba020f62 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 13 Jul 2021 00:56:26 +0000 Subject: [PATCH 86/87] fix build dependencies --- dataflow/gpu-examples/pytorch-minimal/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataflow/gpu-examples/pytorch-minimal/Dockerfile b/dataflow/gpu-examples/pytorch-minimal/Dockerfile index c59eb99a09e..52427ced6a8 100644 --- a/dataflow/gpu-examples/pytorch-minimal/Dockerfile +++ b/dataflow/gpu-examples/pytorch-minimal/Dockerfile @@ -20,7 +20,7 @@ COPY requirements.txt . COPY *.py ./ RUN apt-get update \ - && apt-get install -y --no-install-recommends g++ python3-dev \ + && apt-get install -y --no-install-recommends g++ \ && rm -rf /var/lib/apt/lists/* \ # Install the pipeline requirements and check that there are no conflicts. # Since the image already has all the dependencies installed, From 0c8c2c15bca31c74c7839e9dc7993b7ea0aaf40c Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 13 Jul 2021 00:58:27 +0000 Subject: [PATCH 87/87] adjust timeout --- dataflow/gpu-examples/pytorch-minimal/build.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dataflow/gpu-examples/pytorch-minimal/build.yaml b/dataflow/gpu-examples/pytorch-minimal/build.yaml index eed5c16aa70..24a5b57e772 100644 --- a/dataflow/gpu-examples/pytorch-minimal/build.yaml +++ b/dataflow/gpu-examples/pytorch-minimal/build.yaml @@ -30,3 +30,5 @@ images: [ gcr.io/$PROJECT_ID/$_IMAGE ] options: machineType: E2_HIGHCPU_8 + +timeout: 1200s