From 3a9e7ed54941d239222954418655a6270784193f Mon Sep 17 00:00:00 2001
From: Bhavya Bahl
Date: Thu, 25 Jul 2024 20:10:06 +0000
Subject: [PATCH 01/67] Bump PyTorch version to 2.4 (#1414)

---
 tpu/config.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tpu/config.txt b/tpu/config.txt
index b495597c..a74310d2 100644
--- a/tpu/config.txt
+++ b/tpu/config.txt
@@ -9,11 +9,11 @@ TF_LINUX_WHEEL_VERSION=manylinux_2_17_x86_64.manylinux2014_x86_64
 JAX_VERSION=0.4.23
 # gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep -v -E ".*rc[0-9].*"
 # Supports nightly
-TORCH_VERSION=2.3.0
+TORCH_VERSION=2.4.0
 # https://github.com/pytorch/audio supports nightly
-TORCHAUDIO_VERSION=2.3.0
+TORCHAUDIO_VERSION=2.4.0
 # https://github.com/pytorch/text supports main
 TORCHTEXT_VERSION=0.18.0
 # https://github.com/pytorch/vision supports nightly
-TORCHVISION_VERSION=0.18.0
+TORCHVISION_VERSION=0.19.0
 TORCH_LINUX_WHEEL_VERSION=manylinux_2_28_x86_64

From 9587f69ad46937f6c71c6984983cc9e116823084 Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Fri, 16 Aug 2024 09:18:05 -0700
Subject: [PATCH 02/67] TF 2.16 with Torch 2.4.0 (#1415)

Upgraded our CPU and GPU base images:
- updated the torch ecosystem
- removed patches and pins placed for TF 2.15
- removed torchtext; it is no longer maintained and is incompatible with torch 2.4.0
- geopandas deprecated its datasets methods, so the test needed to be updated

---
 Dockerfile.tmpl            | 34 ++++++++++---------------------
 Jenkinsfile                |  1 -
 config.txt                 | 15 +++++++-------
 packages/jaxlib.Dockerfile |  6 ++++--
 packages/torch.Dockerfile  | 16 +--------------
 patches/keras_patch.sh     | 41 --------------------------------------
 tests/test_geopandas.py    | 17 ++++++----------
 tests/test_torchtext.py    | 12 -----------
 tpu/Dockerfile             |  3 +--
 tpu/config.txt             |  2 --
 10 files changed, 29 insertions(+), 118 deletions(-)
 delete mode 100644 patches/keras_patch.sh
 delete mode 100644 tests/test_torchtext.py

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index f79a7e1e..42f198d7 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -5,7 +5,6 @@ ARG GPU_BASE_IMAGE_NAME
 ARG LIGHTGBM_VERSION
 ARG TORCH_VERSION
 ARG TORCHAUDIO_VERSION
-ARG TORCHTEXT_VERSION
 ARG TORCHVISION_VERSION
 ARG JAX_VERSION

@@ -38,16 +37,15 @@ RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 {{ end }}

 # Keep these variables in sync if base image is updated.
-ENV TENSORFLOW_VERSION=2.15.0
+ENV TENSORFLOW_VERSION=2.16.1
 # See https://github.com/tensorflow/io#tensorflow-version-compatibility
-ENV TENSORFLOW_IO_VERSION=0.35.0
+ENV TENSORFLOW_IO_VERSION=0.37.0

 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact ARG LIGHTGBM_VERSION ARG TORCH_VERSION ARG TORCHAUDIO_VERSION -ARG TORCHTEXT_VERSION ARG TORCHVISION_VERSION ARG JAX_VERSION @@ -62,7 +60,6 @@ ENV KMP_SETTINGS=false ENV PIP_ROOT_USER_ACTION=ignore ADD clean-layer.sh /tmp/clean-layer.sh -ADD patches/keras_patch.sh /tmp/keras_patch.sh ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl ADD patches/template_conf.json /opt/kaggle/conf.json @@ -122,12 +119,12 @@ RUN pip install spacy && \ {{ end}} # Install PyTorch +# b/356397043: magma-cuda121 is the latest version {{ if eq .Accelerator "gpu" }} COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ -RUN mamba install -y -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \ +RUN mamba install -y -c pytorch magma-cuda121 && \ pip install /tmp/torch/*.whl && \ - # b/255757999 openmp (libomp.so) is an dependency of libtorchtext and libtorchaudio but - mamba install -y openmp && \ + sudo apt -y install libsox-dev && \ rm -rf /tmp/torch && \ /tmp/clean-layer.sh {{ else }} @@ -135,8 +132,7 @@ RUN pip install \ torch==$TORCH_VERSION+cpu \ torchvision==$TORCHVISION_VERSION+cpu \ torchaudio==$TORCHAUDIO_VERSION+cpu \ - torchtext==$TORCHTEXT_VERSION \ - -f https://download.pytorch.org/whl/torch_stable.html && \ + --index-url https://download.pytorch.org/whl/cpu && \ /tmp/clean-layer.sh {{ end }} @@ -199,32 +195,22 @@ RUN apt-get update && \ RUN pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && /tmp/clean-layer.sh -# b/318672158 Use simply tensorflow-probability once > 0.23.0 is released. RUN pip install \ "tensorflow==${TENSORFLOW_VERSION}" \ "tensorflow-io==${TENSORFLOW_IO_VERSION}" \ - git+https://github.com/tensorflow/probability.git@fbc5ebe9b1d343113fb917010096cfd88b32eecf \ - tensorflow_text \ + tensorflow-probability \ + tensorflow_decision_forests \ + tensorflow-text \ "tensorflow_hub>=0.16.0" \ # b/331799280 remove once other packages over to dm-tre optree \ tf-keras && \ /tmp/clean-layer.sh -# b/318672158 Use simply tensorflow_decision_forests on next release, expected with tf 2.16 -RUN pip install tensorflow_decision_forests==1.8.1 --no-deps && \ - /tmp/clean-layer.sh - -RUN chmod +x /tmp/keras_patch.sh && \ - /tmp/keras_patch.sh - ADD patches/keras_internal.py /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_internal.py ADD patches/keras_internal_test.py /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_internal_test.py -# Remove "--no-deps" flag and "namex" package once Keras 3.* is included in our base image. -# We ignore dependencies since tf2.15 and Keras 3.* should work despite pip saying it won't. -# Currently, keras tries to install a nightly version of tf 2.16: https://github.com/keras-team/keras/blob/fe2f54aa5bc42fb23a96449cf90434ab9bb6a2cd/requirements.txt#L2 -RUN pip install --no-deps "keras>3" keras-cv keras-nlp namex && \ +RUN pip install "keras>3" keras-cv keras-nlp && \ /tmp/clean-layer.sh # b/328788268 libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. 
Did you mean: 'geometry'" diff --git a/Jenkinsfile b/Jenkinsfile index 5137c675..93f4753d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -36,7 +36,6 @@ pipeline { --package torch \ --version $TORCH_VERSION \ --build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \ - --build-arg TORCHTEXT_VERSION=$TORCHTEXT_VERSION \ --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \ --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ diff --git a/config.txt b/config.txt index 6afee191..e95a1af1 100644 --- a/config.txt +++ b/config.txt @@ -1,12 +1,11 @@ BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m114 -CPU_BASE_IMAGE_NAME=tf2-cpu.2-15.py310 -GPU_BASE_IMAGE_NAME=tf2-gpu.2-15.py310 +BASE_IMAGE_TAG=m122 +CPU_BASE_IMAGE_NAME=tf2-cpu.2-16.py310 +GPU_BASE_IMAGE_NAME=tf2-gpu.2-16.py310 LIGHTGBM_VERSION=4.2.0 -TORCH_VERSION=2.1.2 -TORCHAUDIO_VERSION=2.1.2 -TORCHTEXT_VERSION=0.16.2 -TORCHVISION_VERSION=0.16.2 +TORCH_VERSION=2.4.0 +TORCHAUDIO_VERSION=2.4.0 +TORCHVISION_VERSION=0.19.0 JAX_VERSION=0.4.26 CUDA_MAJOR_VERSION=12 -CUDA_MINOR_VERSION=1 +CUDA_MINOR_VERSION=3 diff --git a/packages/jaxlib.Dockerfile b/packages/jaxlib.Dockerfile index cc4e5fe9..ed73991c 100644 --- a/packages/jaxlib.Dockerfile +++ b/packages/jaxlib.Dockerfile @@ -15,8 +15,10 @@ ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" # Instructions: https://jax.readthedocs.io/en/latest/developer.html#building-jaxlib-from-source -RUN apt-get update && \ - apt-get install -y g++ python python3-dev +RUN sudo ln -s /usr/bin/python3 /usr/bin/python + +RUN apt-get update && \ + apt-get install -y g++ python3 python3-dev RUN pip install numpy wheel build diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index f9579675..68c1eff3 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -4,7 +4,6 @@ FROM ${BASE_IMAGE} AS builder ARG PACKAGE_VERSION ARG TORCHAUDIO_VERSION -ARG TORCHTEXT_VERSION ARG TORCHVISION_VERSION ARG CUDA_MAJOR_VERSION ARG CUDA_MINOR_VERSION @@ -20,7 +19,7 @@ RUN conda install -c conda-forge mamba # Build instructions: https://github.com/pytorch/pytorch#from-source RUN mamba install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses -RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} +RUN mamba install -c pytorch magma-cuda121 # By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash. # This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000 @@ -63,18 +62,6 @@ RUN sudo apt-get update && \ RUN sed -i 's/set(envs/set(envs\n "LIBS=-ltinfo"/' /usr/local/src/audio/third_party/sox/CMakeLists.txt RUN cd /usr/local/src/audio && python setup.py bdist_wheel -# Build torchtext -# Instructions: https://github.com/pytorch/text#building-from-source -# See comment above for PYTORCH_BUILD_VERSION. -ENV BUILD_VERSION=$TORCHTEXT_VERSION -RUN cd /usr/local/src && \ - git clone https://github.com/pytorch/text && \ - cd text && \ - git checkout tags/v$TORCHTEXT_VERSION && \ - git submodule sync && \ - git submodule update --init --recursive --jobs 1 && \ - python setup.py bdist_wheel - # Build torchvision. # Instructions: https://github.com/pytorch/vision/tree/main#installation # See comment above for PYTORCH_BUILD_VERSION. 
@@ -93,7 +80,6 @@ FROM alpine:latest RUN mkdir -p /tmp/whl/ COPY --from=builder /usr/local/src/pytorch/dist/*.whl /tmp/whl COPY --from=builder /usr/local/src/audio/dist/*.whl /tmp/whl -COPY --from=builder /usr/local/src/text/dist/*.whl /tmp/whl COPY --from=builder /usr/local/src/vision/dist/*.whl /tmp/whl # Print out the built .whl file. diff --git a/patches/keras_patch.sh b/patches/keras_patch.sh deleted file mode 100644 index 9f219026..00000000 --- a/patches/keras_patch.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# The following "sed" are to patch the current version of tf-df with -# a fix for keras 3. In essence, replaces the use of package name "tf.keras" with -# "tf_keras" - -sed -i "/import tensorflow_decision_forests as tfdf/a import tf_keras" /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/__init__.py && \ -sed -i -e "/import tensorflow as tf/a import tf_keras" \ - -e "/from yggdrasil_decision_forests.utils.distribute.implementations.grpc/a from tensorflow_decision_forests.keras import keras_internal" \ - -e '/try:/{:a;N;/backend = tf.keras.backend/!ba;d}'\ - /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/core.py && \ -sed -i -e "s/from typing import Optional, List, Dict, Any, Union, NamedTuple/from typing import Any, Dict, List, NamedTuple, Optional, Union/g" \ - -e "/import tensorflow as tf/a from tensorflow_decision_forests.keras import keras_internal" \ - -e "/import tensorflow as tf/a import tf_keras" \ - -e '/layers = tf.keras.layers/{:a;N;/backend = tf.keras.backend/!ba;d}' \ - /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/core_inference.py && \ -find /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests -type f -exec sed -i \ - -e "s/get_data_handler/keras_internal.get_data_handler/g" \ - -e 's/"models.Functional"/keras_internal.Functional/g' \ - -e "s/tf.keras.utils.unpack_x_y_sample_weight/keras_internal.unpack_x_y_sample_weight/g" \ - -e "s/tf.keras.utils.experimental/keras_internal/g" \ - {} \; && \ -sed -i -e "/import tensorflow as tf/a import tf_keras" \ - -e "/from tensorflow_decision_forests.keras import core/a from tensorflow_decision_forests.keras import keras_internal" \ - -e '/layers = tf.keras.layers/{:a;N;/callbacks = tf.keras.callbacks/!ba;d}' \ - /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_test.py && \ -find /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras -type f -exec sed -i \ - -e "s/ layers.Input/ tf_keras.layers.Input/g" \ - -e "s/layers.minimum/tf_keras.layers.minimum/g" \ - -e "s/layers.Concatenate/tf_keras.layers.Concatenate/g" \ - -e "s/layers.Dense/tf_keras.layers.Dense/g" \ - -e "s/layers.experimental.preprocessing./tf_keras.layers./g" \ - -e "s/layers.DenseFeatures/keras_internal.layers.DenseFeatures/g" \ - -e "s/models.Model/tf_keras.models.Model/g" {} \; && \ -sed -i "s/ models.load_model/ tf_keras.models.load_model/g" /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_test.py && \ -sed -i "/import tensorflow as tf/a import tf_keras" /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/test_runner.py && \ -sed -i "/import tensorflow as tf/a import tf_keras" /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/wrappers.py && \ -sed -i -e "/import tensorflow as tf/a import tf_keras" \ - -e "s/optimizer=optimizers.Adam()/optimizer=tf_keras.optimizers.Adam()/g" \ - 
/opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/wrappers_pre_generated.py && \ -find /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests -type f -exec sed -i "s/tf.keras./tf_keras./g" {} \; diff --git a/tests/test_geopandas.py b/tests/test_geopandas.py index e2bb4583..62a3f418 100644 --- a/tests/test_geopandas.py +++ b/tests/test_geopandas.py @@ -1,16 +1,11 @@ import unittest import geopandas +from shapely.geometry import Polygon class TestGeopandas(unittest.TestCase): - def test_read(self): - df = geopandas.read_file(geopandas.datasets.get_path('nybb')) - self.assertTrue(df.size > 1) - - def test_spatial_join(self): - cities = geopandas.read_file(geopandas.datasets.get_path('naturalearth_cities')) - world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) - countries = world[['geometry', 'name']] - countries = countries.rename(columns={'name':'country'}) - cities_with_country = geopandas.sjoin(cities, countries, how="inner", op='intersects') - self.assertTrue(cities_with_country.size > 1) \ No newline at end of file + def test_GeoSeries(self): + p1 = Polygon([(0, 0), (1, 0), (1, 1)]) + p2 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]) + p3 = Polygon([(2, 0), (3, 0), (3, 1), (2, 1)]) + g = geopandas.GeoSeries([p1, p2, p3]) \ No newline at end of file diff --git a/tests/test_torchtext.py b/tests/test_torchtext.py deleted file mode 100644 index f9fbf76f..00000000 --- a/tests/test_torchtext.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest - -from torchtext.data.metrics import bleu_score - - -class TestTorchtext(unittest.TestCase): - def test_bleu_score(self): - candidate = [['I', 'love', 'Kaggle', 'Notebooks']] - refs = [[['Completely', 'Different']]] - - self.assertEqual(0, bleu_score(candidate, refs)) - diff --git a/tpu/Dockerfile b/tpu/Dockerfile index ed9040a8..b94619da 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -13,7 +13,6 @@ ARG TENSORFLOW_VERSION ARG TF_LIBTPU_VERSION ARG JAX_VERSION ARG TORCHVISION_VERSION -ARG TORCHTEXT_VERSION ARG TORCHAUDIO_VERSION ENV ISTPUVM=1 @@ -60,7 +59,7 @@ RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y # Additional useful packages should be added here RUN pip install tensorflow_hub https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TF_LINUX_WHEEL_VERSION}.whl tensorflow-probability tensorflow-io \ - torch~=${TORCH_VERSION} https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}+libtpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl torchvision==${TORCHVISION_VERSION} torchtext==${TORCHTEXT_VERSION} torchaudio==${TORCHAUDIO_VERSION} \ + torch~=${TORCH_VERSION} https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}+libtpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl torchvision==${TORCHVISION_VERSION} torchaudio==${TORCHAUDIO_VERSION} \ jax[tpu]==${JAX_VERSION} -f https://storage.googleapis.com/jax-releases/libtpu_releases.html trax flax optax git+https://github.com/deepmind/dm-haiku jraph distrax \ papermill jupyterlab python-lsp-server[all] "jupyter-lsp==1.5.1" \ pandas matplotlib opencv-python-headless librosa accelerate diffusers scikit-learn transformers \ diff --git a/tpu/config.txt b/tpu/config.txt index a74310d2..4ce1c196 100644 --- a/tpu/config.txt +++ b/tpu/config.txt @@ -12,8 +12,6 @@ 
JAX_VERSION=0.4.23 TORCH_VERSION=2.4.0 # https://github.com/pytorch/audio supports nightly TORCHAUDIO_VERSION=2.4.0 -# https://github.com/pytorch/text supports main -TORCHTEXT_VERSION=0.18.0 # https://github.com/pytorch/vision supports nightly TORCHVISION_VERSION=0.19.0 TORCH_LINUX_WHEEL_VERSION=manylinux_2_28_x86_64 From ba786ed972fffe8aa0502cd341652c564ac461c6 Mon Sep 17 00:00:00 2001 From: Johnny Chavez <64660690+calderjo@users.noreply.github.com> Date: Fri, 16 Aug 2024 16:24:35 -0700 Subject: [PATCH 03/67] Learntool fix gp (#1417) --- Dockerfile.tmpl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 42f198d7..963d5b1c 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -122,6 +122,8 @@ RUN pip install spacy && \ # b/356397043: magma-cuda121 is the latest version {{ if eq .Accelerator "gpu" }} COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ +# b/356397043: We are currently using cuda 12.3, +# but magma-cuda121 is the latest compatible version RUN mamba install -y -c pytorch magma-cuda121 && \ pip install /tmp/torch/*.whl && \ sudo apt -y install libsox-dev && \ @@ -535,7 +537,8 @@ RUN pip install pytorch-ignite \ s3fs \ gcsfs \ kaggle-environments \ - geopandas \ + # geopandas > v0.14.4 breaks learn tools + geopandas==v0.14.4 \ "shapely<2" \ vowpalwabbit \ pydub \ From 15139332f5d247cd2df390cd31f9f0b724984f61 Mon Sep 17 00:00:00 2001 From: Johnny Chavez <64660690+calderjo@users.noreply.github.com> Date: Mon, 19 Aug 2024 11:57:09 -0700 Subject: [PATCH 04/67] fix geopandas again (#1418) This should fix the failing learn tools test. --- Dockerfile.tmpl | 5 +++-- tests/test_geopandas.py | 17 +++++++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 963d5b1c..11205a0d 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -89,8 +89,8 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list & apt-get install -y graphviz && pip install graphviz && \ /tmp/clean-layer.sh -# b/128333086: Set PROJ_LIB to points to the proj4 cartographic library. -ENV PROJ_LIB=/opt/conda/share/proj +# b/128333086: Set PROJ_DATA to points to the proj4 cartographic library. +ENV PROJ_DATA=/opt/conda/share/proj # Install conda packages not available on pip. # When using pip in a conda environment, conda commands should be ran first and then @@ -103,6 +103,7 @@ RUN conda config --add channels nvidia && \ # Base image channel order: conda-forge (highest priority), defaults. # End state: rapidsai (highest priority), nvidia, conda-forge, defaults. 
     mamba install -y mkl cartopy imagemagick pyproj "shapely<2" && \
+    rm -rf /opt/conda/lib/python3.10/site-packages/pyproj/proj_dir/ && \
     /tmp/clean-layer.sh

 # Install spacy
diff --git a/tests/test_geopandas.py b/tests/test_geopandas.py
index 62a3f418..4c0106b2 100644
--- a/tests/test_geopandas.py
+++ b/tests/test_geopandas.py
@@ -1,11 +1,16 @@
 import unittest

 import geopandas
-from shapely.geometry import Polygon

 class TestGeopandas(unittest.TestCase):
-    def test_GeoSeries(self):
-        p1 = Polygon([(0, 0), (1, 0), (1, 1)])
-        p2 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
-        p3 = Polygon([(2, 0), (3, 0), (3, 1), (2, 1)])
-        g = geopandas.GeoSeries([p1, p2, p3])
\ No newline at end of file
+    def test_read(self):
+        df = geopandas.read_file(geopandas.datasets.get_path('nybb'))
+        self.assertTrue(df.size > 1)
+
+    def test_spatial_join(self):
+        cities = geopandas.read_file(geopandas.datasets.get_path('naturalearth_cities'))
+        world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
+        countries = world[['geometry', 'name']]
+        countries = countries.rename(columns={'name':'country'})
+        cities_with_country = geopandas.sjoin(cities, countries, how="inner", op='intersects')
+        self.assertTrue(cities_with_country.size > 1)

From 846647d98e4999e565155e3324afbefd41bdd468 Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Tue, 20 Aug 2024 01:24:41 -0700
Subject: [PATCH 05/67] fix (#1419)

---
 Dockerfile.tmpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index 11205a0d..d1f2ee39 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -183,7 +183,7 @@ RUN export PATH=/usr/local/cuda/bin:$PATH && \
 RUN JAXVER=$(pip freeze | grep -e "^jax==") && \
     pip install --upgrade \
     "matplotlib<3.8.0" \
-    seaborn \
+    "seaborn==0.12.2" \
     python-dateutil dask dask-expr igraph \
     pyyaml joblib geopy mne pyshp \
     pandas \

From 9e8425833c5d0cc34143d49881ce71b94df43974 Mon Sep 17 00:00:00 2001
From: Anil Ozyalcin
Date: Fri, 23 Aug 2024 11:02:25 -0700
Subject: [PATCH 06/67] removed fixing pillow to 9.5.0 (#1422)

Update Pillow to the latest version so it can handle opening JPG images.

---
 Dockerfile.tmpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index d1f2ee39..8b3c3382 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -448,7 +448,7 @@ RUN pip install bleach \
     pexpect \
     pickleshare \
     # TODO(b/290035631) unpin when EasyOCR did a release.
-    Pillow==9.5.0 && \
+    Pillow && \

From f13f634100bccdf249d8d83f8f99536f3ea57f3b Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Thu, 19 Sep 2024 09:40:29 -0700
Subject: [PATCH 07/67] Mixed bag clean up (#1425)

A handful of things:
1) Combined some layers.
2) Made comments referring to bugs use a consistent format.
3) Removed low-usage packages and their tests. We recently launched the
package manager, so we can confidently remove more unused packages.
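If a notebook still needs one of the removed packages listed below, it
should still be installable at runtime. A minimal sketch, assuming a
notebook session with internet access (pykalman is just one example
taken from the list; any of the others should work the same way):

    import subprocess, sys

    # Pull a package that was dropped from the base image back in on demand.
    subprocess.run([sys.executable, "-m", "pip", "install", "pykalman"], check=True)

    import pykalman  # after the install, it imports as usual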
The following packages are to be removed: - pykalman - preprocessing - hmmlearn - gplearn - geoplot - polyglot - ggplot - descartes - fitter - imagecodecs - wfdb - hpsklearn - cleverhans - osmnx - pysal - wordsegment - vowpalwabbit - feather - kmodes - ortools - mlens - vecstack - Geohash - geoviews - s2sphere - flashtext - kmapper - stemming - hunspell - spectral - essentia - hypertools - stop_words - scattertext - vaex - blake3 - catalyst (note no longer maintained since 2022) --- Dockerfile.tmpl | 293 ++++++++++++++----------------------- tests/test_catalyst.py | 158 -------------------- tests/test_essentia.py | 7 - tests/test_geoviews.py | 17 --- tests/test_ggplot.py | 12 -- tests/test_imports.py | 1 - tests/test_kmapper.py | 7 - tests/test_matplotlib.py | 7 + tests/test_pykalman.py | 47 ------ tests/test_vaex.py | 10 -- tests/test_vowpalwabbit.py | 10 -- 11 files changed, 119 insertions(+), 450 deletions(-) delete mode 100644 tests/test_catalyst.py delete mode 100644 tests/test_essentia.py delete mode 100644 tests/test_geoviews.py delete mode 100644 tests/test_ggplot.py delete mode 100644 tests/test_kmapper.py delete mode 100644 tests/test_pykalman.py delete mode 100644 tests/test_vaex.py delete mode 100644 tests/test_vowpalwabbit.py diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 8b3c3382..c0037839 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -1,12 +1,12 @@ -ARG BASE_IMAGE_REPO -ARG BASE_IMAGE_TAG -ARG CPU_BASE_IMAGE_NAME -ARG GPU_BASE_IMAGE_NAME -ARG LIGHTGBM_VERSION -ARG TORCH_VERSION -ARG TORCHAUDIO_VERSION -ARG TORCHVISION_VERSION -ARG JAX_VERSION +ARG BASE_IMAGE_REPO \ + BASE_IMAGE_TAG \ + CPU_BASE_IMAGE_NAME \ + GPU_BASE_IMAGE_NAME \ + LIGHTGBM_VERSION \ + TORCH_VERSION \ + TORCHAUDIO_VERSION \ + TORCHVISION_VERSION \ + JAX_VERSION {{ if eq .Accelerator "gpu" }} FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl @@ -18,61 +18,50 @@ FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} {{ end }} # Ensures shared libraries installed with conda can be found by the dynamic link loader. -ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" +ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" \ + LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" {{ if eq .Accelerator "gpu" }} -ARG CUDA_MAJOR_VERSION -ARG CUDA_MINOR_VERSION -ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} -ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION} +ARG CUDA_MAJOR_VERSION \ + CUDA_MINOR_VERSION +ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \ + CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION} # Make sure we are on the right version of CUDA RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION # NVIDIA binaries from the host are mounted to /opt/bin. -ENV PATH=/opt/bin:${PATH} -# Add CUDA stubs to LD_LIBRARY_PATH to support building the GPU image on a CPU machine. -ENV LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH" -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs" +ENV PATH=/opt/bin:${PATH} \ + # Add CUDA stubs to LD_LIBRARY_PATH to support building the GPU image on a CPU machine. + LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH" \ + LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs" RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 {{ end }} # Keep these variables in sync if base image is updated. 
-ENV TENSORFLOW_VERSION=2.16.1 -# See https://github.com/tensorflow/io#tensorflow-version-compatibility -ENV TENSORFLOW_IO_VERSION=0.37.0 +ENV TENSORFLOW_VERSION=2.16.1 \ + # See https://github.com/tensorflow/io#tensorflow-version-compatibility + TENSORFLOW_IO_VERSION=0.37.0 # We need to redefine the ARG here to get the ARG value defined above the FROM instruction. # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact -ARG LIGHTGBM_VERSION -ARG TORCH_VERSION -ARG TORCHAUDIO_VERSION -ARG TORCHVISION_VERSION -ARG JAX_VERSION +ARG LIGHTGBM_VERSION \ + TORCH_VERSION \ + TORCHAUDIO_VERSION \ + TORCHVISION_VERSION \ + JAX_VERSION # Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0 # See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information -ENV KMP_WARNINGS=0 -# Also make the KMP logs noverbose. -# https://stackoverflow.com/questions/70250304/stop-tensorflow-from-printing-warning-message -ENV KMP_SETTINGS=false - -# Remove the pip as the root user warning. -ENV PIP_ROOT_USER_ACTION=ignore +ENV KMP_WARNINGS=0 \ + # Also make the KMP logs noverbose. + # https://stackoverflow.com/questions/70250304/stop-tensorflow-from-printing-warning-message + KMP_SETTINGS=false \ + # Remove the pip as the root user warning. + PIP_ROOT_USER_ACTION=ignore ADD clean-layer.sh /tmp/clean-layer.sh ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl ADD patches/template_conf.json /opt/kaggle/conf.json -# b/276344496: Install specific version of boto3, because 1.26.103 is broken. -RUN pip install boto3==1.26.100 && \ - /tmp/clean-layer.sh - -{{ if eq .Accelerator "gpu" }} -# b/200968891 Keeps horovod once torch is upgraded. -RUN pip uninstall -y horovod && \ - /tmp/clean-layer.sh -{{ end }} - # Update GPG key per documentation at https://cloud.google.com/compute/docs/troubleshooting/known-issues RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - @@ -98,7 +87,7 @@ ENV PROJ_DATA=/opt/conda/share/proj RUN conda config --add channels nvidia && \ conda config --add channels rapidsai && \ conda config --set solver libmamba && \ - # b/299991198 remove curl/libcurl install once DLVM base image includes version >= 7.86 + # b/299991198: remove curl/libcurl install once DLVM base image includes version >= 7.86 conda install -c conda-forge mamba curl libcurl && \ # Base image channel order: conda-forge (highest priority), defaults. # End state: rapidsai (highest priority), nvidia, conda-forge, defaults. @@ -178,12 +167,15 @@ RUN export PATH=/usr/local/cuda/bin:$PATH && \ /tmp/clean-layer.sh {{ end }} -# (b/308525631) Pin Matplotlib until seaborn can be upgraded +# b/308525631: Pin Matplotlib until seaborn can be upgraded # to >0.13.0 (now it's stuck by a package conflict with ydata-profiling 4.5.1). 
RUN JAXVER=$(pip freeze | grep -e "^jax==") && \ pip install --upgrade \ "matplotlib<3.8.0" \ + # ipympl adds interactive widget support for matplotlib + ipympl==0.7.0 \ "seaborn==0.12.2" \ + pyupset \ python-dateutil dask dask-expr igraph \ pyyaml joblib geopy mne pyshp \ pandas \ @@ -205,21 +197,17 @@ RUN pip install \ tensorflow_decision_forests \ tensorflow-text \ "tensorflow_hub>=0.16.0" \ - # b/331799280 remove once other packages over to dm-tre - optree \ - tf-keras && \ + tf-keras \ + "keras>3" \ + keras-cv \ + keras-nlp && \ /tmp/clean-layer.sh -ADD patches/keras_internal.py /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_internal.py -ADD patches/keras_internal_test.py /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_internal_test.py +ADD patches/keras_internal.py \ + patches/keras_internal_test.py \ + /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/ -RUN pip install "keras>3" keras-cv keras-nlp && \ - /tmp/clean-layer.sh - -# b/328788268 libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'" -RUN pip install pysal "libpysal==4.9.2" - -# b/350573866 xgboost v2.1.0 breaks learntools +# b/350573866: xgboost v2.1.0 breaks learntools RUN apt-get install -y libfreetype6-dev && \ apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \ pip install gensim \ @@ -247,16 +235,15 @@ RUN apt-get install -y libfreetype6-dev && \ state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \ - # Stop-words - pip install stop-words \ - scikit-image && \ + pip install scikit-image && \ + pip install opencv-contrib-python opencv-python && \ /tmp/clean-layer.sh -RUN pip install opencv-contrib-python opencv-python && \ - /tmp/clean-layer.sh - -# Pin scipy until we update JAX b/335003097 -RUN pip install "scipy==1.12.0" \ +RUN pip install cython \ + fasttext \ + opencv-contrib-python \ + opencv-python \ + "scipy<1.14.0" \ # Scikit-learn accelerated library for x86 "scikit-learn-intelex>=2023.0.1" \ # HDF5 support @@ -269,17 +256,18 @@ RUN pip install "scipy==1.12.0" \ bokeh \ numba \ datashader \ - # Boruta (python implementation) + # b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'" + "libpysal==4.9.2" \ + # b/276344496: Install specific version of boto3, because 1.26.103 is broken. 
+ "boto3==1.26.100" \ Boruta && \ # Pandoc is a dependency of deap apt-get install -y pandoc && \ - pip install essentia - -RUN apt-get install -y git-lfs && \ /tmp/clean-layer.sh -# vtk with dependencies -RUN apt-get install -y libgl1-mesa-glx && \ +RUN apt-get install -y git-lfs && \ + # vtk with dependencies + apt-get install -y libgl1-mesa-glx && \ pip install vtk && \ # xvfbwrapper with dependencies apt-get install -y xvfb && \ @@ -295,22 +283,19 @@ RUN pip install mpld3 \ nibabel \ imgaug \ preprocessing \ - path.py \ - Geohash && \ + path.py && \ pip install deap \ - # b/302136621 Fix eli5 import for learntools, newer version require scikit-learn > 1.3 + # b/302136621: Fix eli5 import for learntools, newer version require scikit-learn > 1.3 "tpot==0.12.1" \ scikit-optimize \ haversine \ toolz cytoolz \ plotly \ hyperopt \ - fitter \ langid \ # Useful data exploration libraries (for missing data and generating reports) missingno \ pandas-profiling \ - s2sphere \ bayesian-optimization \ matplotlib-venn \ pyldavis \ @@ -320,32 +305,20 @@ RUN pip install mpld3 \ ecos \ CVXcanon \ pymc3 \ - imagecodecs \ tifffile \ - spectral \ - descartes \ geojson \ pydicom \ wavio \ SimpleITK \ - hmmlearn \ - gplearn \ squarify \ fuzzywuzzy \ python-louvain \ pyexcel-ods \ sklearn-pandas \ - stemming \ - # b/266272046 prophet 1.1.2 breaks the test - prophet==1.1.1 \ - # b/283847935 holidays >0.24 is broken - "holidays==0.24" \ + prophet \ + holidays \ holoviews \ - geoviews \ - hypertools \ - mlens \ scikit-multilearn \ - cleverhans \ leven \ catboost \ folium \ @@ -354,7 +327,6 @@ RUN pip install mpld3 \ plotnine \ scikit-surprise \ pymongo \ - geoplot \ eli5 \ kaggle \ kagglehub \ @@ -362,22 +334,16 @@ RUN pip install mpld3 \ pytest && \ /tmp/clean-layer.sh -RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy-1.23.5.dist-info* # Add google PAIR-code Facets RUN cd /opt/ && git clone https://github.com/PAIR-code/facets && cd facets/ && jupyter nbextension install facets-dist/ --user && \ export PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ && \ - pip install kmodes --no-dependencies && \ pip install librosa \ - polyglot \ sentencepiece \ cufflinks \ lime \ memory_profiler && \ /tmp/clean-layer.sh -RUN pip install cython \ - fasttext && \ - apt-get install -y libhunspell-dev && pip install hunspell RUN pip install annoy \ category_encoders && \ # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. @@ -390,33 +356,26 @@ RUN pip install annoy \ google-cloud-bigquery \ google-cloud-storage && \ # Split these installations to avoid `pip._vendor.resolvelib.resolvers.ResolutionTooDeep: 200000` - # TODO(b/315753846) Unpin translate package. + # b/315753846: Unpin translate package. pip install google-cloud-translate==3.12.1 \ google-cloud-language==2.* \ google-cloud-videointelligence==2.* \ google-cloud-vision==2.* \ protobuf==3.20.3 \ - ortools \ - scattertext \ # Pandas data reader pandas-datareader \ - wordsegment \ emoji \ # Add Japanese morphological analysis engine janome \ - wfdb \ - vecstack \ # yellowbrick machine learning visualization library yellowbrick \ mlcrate && \ /tmp/clean-layer.sh -# b/273059949 The pre-installed nbconvert is slow on html conversions and has to be force-uninstalled. -# b/274619697 learntools also requires a specific nbconvert right now +# b/273059949: The pre-installed nbconvert is slow on html conversions and has to be force-uninstalled. 
+# b/274619697: learntools also requires a specific nbconvert right now RUN rm -rf /opt/conda/lib/python3.10/site-packages/{nbconvert,nbclient,mistune,platformdirs}* -# Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376 -# allennlp \ RUN pip install bleach \ certifi \ cycler \ @@ -426,6 +385,7 @@ RUN pip install bleach \ ipykernel \ ipython \ ipython-genutils \ + # Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376 ipywidgets==7.7.1 \ isoweek \ jedi \ @@ -447,7 +407,6 @@ RUN pip install bleach \ pandocfilters \ pexpect \ pickleshare \ - # TODO(b/290035631) unpin when EasyOCR did a release. Pillow && \ # Install openslide and its python binding apt-get install -y openslide-tools && \ @@ -469,7 +428,6 @@ RUN pip install bleach \ widgetsnbextension \ # Require pyarrow newer than https://github.com/advisories/GHSA-5wvp-7f3h-6wmm {{ if eq .Accelerator "gpu" }} pyarrow {{ else }} "pyarrow>=14.0.1" {{ end }} \ - feather-format \ fastai RUN python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \ @@ -484,20 +442,14 @@ RUN python -m spacy download en_core_web_sm && python -m spacy download en_core_ # ########### -RUN rm /opt/conda/lib/python3.10/site-packages/google*/direct_url.json -RUN rm /opt/conda/lib/python3.10/site-packages/google*/REQUESTED - +RUN rm /opt/conda/lib/python3.10/site-packages/google*/direct_url.json && \ + rm /opt/conda/lib/python3.10/site-packages/google*/REQUESTED # dlib has a libmkl incompatibility: # test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8. # Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2. # nnabla breaks protobuf compatibiilty: -RUN pip install flashtext \ - wandb \ - # b/214080882 blake3 0.3.0 is not compatible with vaex. - blake3==0.2.1 \ - vaex \ +RUN pip install wandb \ pyemd \ - pyupset \ pympler \ featuretools \ #-e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper \ @@ -506,30 +458,24 @@ RUN pip install flashtext \ gym \ pyarabic \ pandasql \ - # b/302136621 Fix eli5 import for learntools + # b/302136621: Fix eli5 import for learntools scikit-learn==1.2.2 \ - hpsklearn \ - kmapper \ - # b/329869023 shap 0.45.0 breaks learntools + # b/329869023 shap 0.45.0 breaks learntools shap==0.44.1 \ cesium \ rgf_python \ jieba \ - # ggplot is broken and main repo does not merge and release https://github.com/yhat/ggpy/pull/668 - https://github.com/hbasria/ggpy/archive/0.11.5.zip \ tsfresh \ - pykalman \ optuna \ plotly_express \ albumentations \ - accelerate \ - # b/290207097 switch back to the pip catalyst package when bug fixed - # https://github.com/catalyst-team/catalyst/issues/1440 - git+https://github.com/Philmod/catalyst.git@fix-fp16#egg=catalyst \ - osmnx && \ + Rtree \ + accelerate && \ apt-get -y install libspatialindex-dev -RUN pip install pytorch-ignite \ +RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \ + pip install "numpy==1.26.4" && \ + pip install pytorch-ignite \ qgrid \ bqplot \ earthengine-api \ @@ -541,7 +487,6 @@ RUN pip install pytorch-ignite \ # geopandas > v0.14.4 breaks learn tools geopandas==v0.14.4 \ "shapely<2" \ - vowpalwabbit \ pydub \ pydegensac \ torchmetrics \ @@ -552,14 +497,12 @@ RUN pip install pytorch-ignite \ # pycrypto is used by competitions team. 
pycryptodome \ easyocr \ - # ipympl adds interactive widget support for matplotlib - ipympl==0.7.0 \ onnx \ tables \ openpyxl \ timm \ torchinfo && \ - pip install git+https://github.com/facebookresearch/segment-anything.git && \ + pip install git+https://github.com/facebookresearch/segment-anything.git && \ # b/343971718: remove duplicate aiohttp installs, and reinstall it rm -rf /opt/conda/lib/python3.10/site-packages/aiohttp* && \ mamba install --force-reinstall -y aiohttp && \ @@ -586,12 +529,12 @@ RUN apt-get install tesseract-ocr -y && \ pdf2image \ PyPDF && \ /tmp/clean-layer.sh -ENV TESSERACT_PATH=/usr/bin/tesseract -# For Facets -ENV PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ -# For Theano with MKL -ENV MKL_THREADING_LAYER=GNU +ENV TESSERACT_PATH=/usr/bin/tesseract \ + # For Facets + PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ \ + # For Theano with MKL + MKL_THREADING_LAYER=GNU # Temporary fixes and patches # Temporary patch for Dask getting downgraded, which breaks Keras @@ -605,7 +548,7 @@ RUN pip install --upgrade dask && \ mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ # Temporary patch for broken libpixman 0.38 in conda-forge, symlink to system libpixman 0.34 untile conda package gets updated to 0.38.5 or higher. ln -sf /usr/lib/x86_64-linux-gnu/libpixman-1.so.0.34.0 /opt/conda/lib/libpixman-1.so.0.38.0 && \ - # pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) + # b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) pip install --force-reinstall --no-deps jupyter_server==2.12.5 && \ /tmp/clean-layer.sh @@ -620,13 +563,15 @@ RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/ # Add BigQuery client proxy settings ENV PYTHONUSERBASE "/root/.local" -ADD patches/kaggle_gcp.py /root/.local/lib/python3.10/site-packages/kaggle_gcp.py -ADD patches/kaggle_secrets.py /root/.local/lib/python3.10/site-packages/kaggle_secrets.py -ADD patches/kaggle_session.py /root/.local/lib/python3.10/site-packages/kaggle_session.py -ADD patches/kaggle_web_client.py /root/.local/lib/python3.10/site-packages/kaggle_web_client.py -ADD patches/kaggle_datasets.py /root/.local/lib/python3.10/site-packages/kaggle_datasets.py -ADD patches/log.py /root/.local/lib/python3.10/site-packages/log.py -ADD patches/sitecustomize.py /root/.local/lib/python3.10/site-packages/sitecustomize.py +ADD patches/kaggle_gcp.py \ + patches/kaggle_secrets.py \ + patches/kaggle_session.py \ + patches/kaggle_web_client.py \ + patches/kaggle_datasets.py \ + patches/log.py \ + patches/sitecustomize.py \ + /root/.local/lib/python3.10/site-packages/ + # Override default imagemagick policies ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml @@ -635,20 +580,6 @@ ADD patches/kaggle_module_resolver.py /opt/conda/lib/python3.10/site-packages/te RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \ sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py -# TensorBoard Jupyter extension. Should be replaced with TensorBoard's provided magic once we have -# worker tunneling support in place. 
-# b/139212522 re-enable TensorBoard once solution for slowdown is implemented. -# ENV JUPYTER_CONFIG_DIR "/root/.jupyter/" -# RUN pip install jupyter_tensorboard && \ -# jupyter serverextension enable jupyter_tensorboard && \ -# jupyter tensorboard enable -# ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.10/site-packages/tensorboard/notebook.py - -# Disable unnecessary jupyter extensions -#RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \ -# jupyter-serverextension disable nb_conda --py --sys-prefix && \ -# python -m nb_conda_kernels.install --disable - # Disable preloaded jupyter modules (they add to startup, and break when they are missing) RUN sed -i /bq_stats/d /etc/ipython/ipython_kernel_config.py && \ sed -i /beatrix/d /etc/ipython/ipython_kernel_config.py && \ @@ -662,37 +593,37 @@ RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/li RUN ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 {{ end }} -# b/270147159 conda ships with a version of libtinfo which is missing version info causing warnings, replace it with a good version. +# b/270147159: conda ships with a version of libtinfo which is missing version info causing warnings, replace it with a good version. RUN rm /opt/conda/lib/libtinfo.so.6 && ln -s /usr/lib/x86_64-linux-gnu/libtinfo.so.6 /opt/conda/lib/libtinfo.so.6 -# b/276358430 fix Jupyter lsp freezing up the jupyter server +# b/276358430: fix Jupyter lsp freezing up the jupyter server RUN pip install "jupyter-lsp==1.5.1" # Set backend for matplotlib -ENV MPLBACKEND "agg" +ENV MPLBACKEND="agg" \ + # Set LC_ALL + # https://github.com/explosion/spaCy/issues/12872#issuecomment-1661847588 + LC_ALL="POSIX" -# Set LC_ALL -# https://github.com/explosion/spaCy/issues/12872#issuecomment-1661847588 -ENV LC_ALL "POSIX" +ARG GIT_COMMIT=unknown \ + BUILD_DATE=unknown -ARG GIT_COMMIT=unknown -ARG BUILD_DATE=unknown +LABEL git-commit=$GIT_COMMIT \ + build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT -LABEL build-date=$BUILD_DATE -ENV GIT_COMMIT=${GIT_COMMIT} -ENV BUILD_DATE=${BUILD_DATE} +ENV GIT_COMMIT=${GIT_COMMIT} \ + BUILD_DATE=${BUILD_DATE} -LABEL tensorflow-version=$TENSORFLOW_VERSION -# Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. -LABEL kaggle-lang=python +LABEL tensorflow-version=$TENSORFLOW_VERSION \ + # Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. + kaggle-lang=python # Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date {{ if eq .Accelerator "gpu" }} # Remove the CUDA stubs. -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" -# Add the CUDA home. -ENV CUDA_HOME=/usr/local/cuda +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" \ + # Add the CUDA home. 
+ CUDA_HOME=/usr/local/cuda {{ end }} diff --git a/tests/test_catalyst.py b/tests/test_catalyst.py deleted file mode 100644 index 3b9c97d4..00000000 --- a/tests/test_catalyst.py +++ /dev/null @@ -1,158 +0,0 @@ -import unittest -import collections -import json -import numpy as np - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchvision -import torchvision.transforms as transforms - -import catalyst -from catalyst.dl import SupervisedRunner, CheckpointCallback -from catalyst import utils - - -def _to_categorical(y, num_classes=None, dtype='float32'): - """ - Taken from - github.com/keras-team/keras/blob/master/keras/utils/np_utils.py - Converts a class vector (integers) to binary class matrix. - E.g. for use with categorical_crossentropy. - # Arguments - y: class vector to be converted into a matrix - (integers from 0 to num_classes). - num_classes: total number of classes. - dtype: The data type expected by the input, as a string - (`float32`, `float64`, `int32`...) - # Returns - A binary matrix representation of the input. The classes axis - is placed last. - # Example - ```python - # Consider an array of 5 labels out of a set of 3 classes {0, 1, 2}: - > labels - array([0, 2, 1, 2, 0]) - # `to_categorical` converts this into a matrix with as many - # columns as there are classes. The number of rows - # stays the same. - > to_categorical(labels) - array([[ 1., 0., 0.], - [ 0., 0., 1.], - [ 0., 1., 0.], - [ 0., 0., 1.], - [ 1., 0., 0.]], dtype=float32) - ``` - """ - - y = np.array(y, dtype='int') - input_shape = y.shape - if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: - input_shape = tuple(input_shape[:-1]) - y = y.ravel() - if not num_classes: - num_classes = np.max(y) + 1 - n = y.shape[0] - categorical = np.zeros((n, num_classes), dtype=dtype) - categorical[np.arange(n), y] = 1 - output_shape = input_shape + (num_classes,) - categorical = np.reshape(categorical, output_shape) - return categorical - - -class Net(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(1, 20, 5, 1) - self.conv2 = nn.Conv2d(20, 50, 5, 1) - self.fc1 = nn.Linear(4 * 4 * 50, 500) - self.fc2 = nn.Linear(500, 10) - - def forward(self, x): - x = F.relu(self.conv1(x)) - x = F.max_pool2d(x, 2, 2) - x = F.relu(self.conv2(x)) - x = F.max_pool2d(x, 2, 2) - x = x.view(-1, 4 * 4 * 50) - x = F.relu(self.fc1(x)) - x = self.fc2(x) - return x - - -class TestCatalyst(unittest.TestCase): - - def test_version(self): - self.assertIsNotNone(catalyst.__version__) - - def test_mnist(self): - utils.set_global_seed(42) - x_train = np.random.random((100, 1, 28, 28)).astype(np.float32) - y_train = _to_categorical( - np.random.randint(10, size=(100, 1)), - num_classes=10 - ).astype(np.float32) - x_valid = np.random.random((20, 1, 28, 28)).astype(np.float32) - y_valid = _to_categorical( - np.random.randint(10, size=(20, 1)), - num_classes=10 - ).astype(np.float32) - - x_train, y_train, x_valid, y_valid = \ - list(map(torch.tensor, [x_train, y_train, x_valid, y_valid])) - - bs = 32 - num_workers = 4 - data_transform = transforms.ToTensor() - - loaders = collections.OrderedDict() - - trainset = torch.utils.data.TensorDataset(x_train, y_train) - trainloader = torch.utils.data.DataLoader( - trainset, batch_size=bs, - shuffle=True, num_workers=num_workers) - - validset = torch.utils.data.TensorDataset(x_valid, y_valid) - validloader = torch.utils.data.DataLoader( - validset, batch_size=bs, - shuffle=False, num_workers=num_workers) - - loaders["train"] = trainloader - 
loaders["valid"] = validloader - - # experiment setup - num_epochs = 3 - logdir = "./logs" - - # model, criterion, optimizer - model = Net() - criterion = nn.BCEWithLogitsLoss() - optimizer = torch.optim.Adam(model.parameters()) - - # model runner - runner = SupervisedRunner() - - # model training - runner.train( - model=model, - criterion=criterion, - optimizer=optimizer, - loaders=loaders, - logdir=logdir, - num_epochs=num_epochs, - verbose=False, - callbacks=[CheckpointCallback( - logdir, - topk=3, - save_best=True, - loader_key="valid", - metric_key="loss", - minimize=True)] - ) - - with open('./logs/model.storage.json') as f: - metrics = json.load(f) - storage = metrics['storage'] - self.assertEqual(3, len(storage)) - self.assertTrue(storage[0]['metric'] < storage[2]['metric']) - self.assertTrue(storage[0]['metric']< 0.35) diff --git a/tests/test_essentia.py b/tests/test_essentia.py deleted file mode 100644 index 749b9466..00000000 --- a/tests/test_essentia.py +++ /dev/null @@ -1,7 +0,0 @@ -import unittest - -from essentia.standard import Windowing - -class TestEssentia(unittest.TestCase): - def test_windowing(self): - Windowing(type = 'hann') diff --git a/tests/test_geoviews.py b/tests/test_geoviews.py deleted file mode 100644 index 2636cc6f..00000000 --- a/tests/test_geoviews.py +++ /dev/null @@ -1,17 +0,0 @@ -import unittest - -from common import p100_exempt - -class TestGeoviews(unittest.TestCase): - - @p100_exempt # b/342143152: Uses cuDF(>=24.4v), which is no longer capitble with p100 GPUs. - - def test_viz(self): - import geoviews.feature as gf - import holoviews as hv - from cartopy import crs - - hv.extension('matplotlib') - (gf.ocean + gf.land + gf.ocean * gf.land * gf.coastline * gf.borders).options( - 'Feature', projection=crs.Geostationary(), global_extent=True - ).cols(3) diff --git a/tests/test_ggplot.py b/tests/test_ggplot.py deleted file mode 100644 index 30aec29f..00000000 --- a/tests/test_ggplot.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest -import os.path - -from ggplot import * - -class TestGgplot(unittest.TestCase): - - def test_plot(self): - p = ggplot(aes(x='mpg'), data=mtcars) + geom_histogram() - p.save("myplot.png") - - self.assertTrue(os.path.isfile("myplot.png")) diff --git a/tests/test_imports.py b/tests/test_imports.py index 4977ff9c..b22ebe7a 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -4,6 +4,5 @@ class TestImport(unittest.TestCase): # Basic import tests for packages without any. 
def test_basic(self): import bq_helper - import cleverhans import tensorflow_datasets import segment_anything diff --git a/tests/test_kmapper.py b/tests/test_kmapper.py deleted file mode 100644 index c75deea3..00000000 --- a/tests/test_kmapper.py +++ /dev/null @@ -1,7 +0,0 @@ -import unittest - -import kmapper as km - -class TestKMapper(unittest.TestCase): - def test_init(self): - km.KeplerMapper() diff --git a/tests/test_matplotlib.py b/tests/test_matplotlib.py index 1cbc939a..c04f3f23 100644 --- a/tests/test_matplotlib.py +++ b/tests/test_matplotlib.py @@ -1,10 +1,17 @@ import unittest import os.path +from distutils.version import StrictVersion + +import matplotlib import matplotlib.pyplot as plt import numpy as np class TestMatplotlib(unittest.TestCase): + def test_version(self): + # b/308525631: newer versions of Matplotlib causes learntools to fail + self.assertLess(StrictVersion(matplotlib.__version__), StrictVersion("3.8.0")) + def test_plot(self): plt.plot(np.linspace(0,1,50), np.random.rand(50)) plt.savefig("plot1.png") diff --git a/tests/test_pykalman.py b/tests/test_pykalman.py deleted file mode 100644 index 26d86003..00000000 --- a/tests/test_pykalman.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -import numpy as np -from pykalman import KalmanFilter -from pykalman import UnscentedKalmanFilter -from pykalman.sqrt import CholeskyKalmanFilter, AdditiveUnscentedKalmanFilter - -class TestPyKalman(unittest.TestCase): - def test_kalman_filter(self): - kf = KalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - measurements = np.asarray([[1,0], [0,0], [0,1]]) # 3 observations - kf = kf.em(measurements, n_iter=5) - (filtered_state_means, filtered_state_covariances) = kf.filter(measurements) - (smoothed_state_means, smoothed_state_covariances) = kf.smooth(measurements) - return filtered_state_means - - def test_kalman_missing(self): - kf = KalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - measurements = np.asarray([[1,0], [0,0], [0,1]]) # 3 observations - measurements = np.ma.asarray(measurements) - measurements[1] = np.ma.masked - kf = kf.em(measurements, n_iter=5) - (filtered_state_means, filtered_state_covariances) = kf.filter(measurements) - (smoothed_state_means, smoothed_state_covariances) = kf.smooth(measurements) - return filtered_state_means - - def test_unscented_kalman(self): - ukf = UnscentedKalmanFilter(lambda x, w: x + np.sin(w), lambda x, v: x + v, transition_covariance=0.1) - (filtered_state_means, filtered_state_covariances) = ukf.filter([0, 1, 2]) - (smoothed_state_means, smoothed_state_covariances) = ukf.smooth([0, 1, 2]) - return filtered_state_means - - def test_online_update(self): - kf = KalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - measurements = np.asarray([[1,0], [0,0], [0,1]]) # 3 observations - measurements = np.ma.asarray(measurements) - measurements[1] = np.ma.masked # measurement at timestep 1 is unobserved - kf = kf.em(measurements, n_iter=5) - (filtered_state_means, filtered_state_covariances) = kf.filter(measurements) - for t in range(1, 3): - filtered_state_means[t], filtered_state_covariances[t] = \ - kf.filter_update(filtered_state_means[t-1], filtered_state_covariances[t-1], measurements[t]) - return filtered_state_means - - def test_robust_sqrt(self): - kf = CholeskyKalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - ukf = 
AdditiveUnscentedKalmanFilter(lambda x, w: x + np.sin(w), lambda x, v: x + v, observation_covariance=0.1)
-
diff --git a/tests/test_vaex.py b/tests/test_vaex.py
deleted file mode 100644
index b64061b0..00000000
--- a/tests/test_vaex.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import unittest
-
-import vaex
-
-class TestVaex(unittest.TestCase):
-    def test_read_csv(self):
-        df = vaex.read_csv("/input/tests/data/train.csv")
-
-        self.assertEqual((100, 785), df.shape)
-        self.assertEqual(10, df['label'].nunique())
\ No newline at end of file
diff --git a/tests/test_vowpalwabbit.py b/tests/test_vowpalwabbit.py
deleted file mode 100644
index 839aed05..00000000
--- a/tests/test_vowpalwabbit.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import unittest
-
-from vowpalwabbit import pyvw
-
-class TestVowpalwabbit(unittest.TestCase):
-    def test_basic(self):
-        vw = pyvw.vw(quiet=True)
-        ex = vw.example('1 | a b c')
-        vw.learn(ex)
-        self.assertGreater(vw.predict(ex), 0)

From 84e11f1aea7cfa43688950ec4d304e3e02050aa8 Mon Sep 17 00:00:00 2001
From: DJ
Date: Fri, 27 Sep 2024 15:18:55 -0600
Subject: [PATCH 08/67] Add nbdev (#1427)

---
 Dockerfile.tmpl     | 1 +
 tests/test_nbdev.py | 8 ++++++++
 2 files changed, 9 insertions(+)
 create mode 100644 tests/test_nbdev.py

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index c0037839..d3458df4 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -496,6 +496,7 @@ RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
     flask \
     # pycrypto is used by competitions team.
     pycryptodome \
+    nbdev \
     easyocr \
     onnx \
     tables \
diff --git a/tests/test_nbdev.py b/tests/test_nbdev.py
new file mode 100644
index 00000000..d5c6b484
--- /dev/null
+++ b/tests/test_nbdev.py
@@ -0,0 +1,8 @@
+import unittest
+
+import nbdev
+
+class TestNbdev(unittest.TestCase):
+    def test(self):
+        self.assertGreater(len(nbdev.__version__), 0)
+

From 556593c8e897e125b3af375b51916d2f8bdd2593 Mon Sep 17 00:00:00 2001
From: DJ
Date: Fri, 27 Sep 2024 15:19:03 -0600
Subject: [PATCH 09/67] Add hvplot (#1426)

Recreate of https://github.com/Kaggle/docker-python/pull/1424, which got
borked.

This is a plotting library required for plotting from the popular
`polars` library.

---
 Dockerfile.tmpl      | 1 +
 tests/test_polars.py | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index d3458df4..44100f7e 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -484,6 +484,7 @@ RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
     s3fs \
     gcsfs \
     kaggle-environments \
+    hvplot \
     # geopandas > v0.14.4 breaks learn tools
     geopandas==v0.14.4 \
     "shapely<2" \
diff --git a/tests/test_polars.py b/tests/test_polars.py
index c81a0b80..8526bd29 100644
--- a/tests/test_polars.py
+++ b/tests/test_polars.py
@@ -2,9 +2,14 @@

 import polars as pl

-class TestPolars(unittest.TestCase):
+class TestPolars(unittest.TestCase):
     def test_read_csv(self):
-        data = pl.read_csv("/input/tests/data/train.csv")
+        data = pl.read_csv('/input/tests/data/train.csv')

         self.assertEqual(100, len(data))

+    def test_plot(self):
+        # This relies on the hvplot package
+        data = pl.read_csv('/input/tests/data/train.csv')
+        data.plot.line()
+

From d6f04b1c5ae572a6b09555f7d42c8340dac8edcb Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Mon, 30 Sep 2024 13:31:14 -0700
Subject: [PATCH 10/67] Remove hvplot (#1429)

Looks like hvplot isn't needed anymore with the newest polars release.
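For reference, a minimal sketch of what the test still exercises,
assuming a recent polars release where the `.plot` namespace is backed
by Altair rather than hvplot (the frame and column names here are made
up for illustration):

    import polars as pl

    df = pl.DataFrame({"epoch": [1, 2, 3], "loss": [0.9, 0.5, 0.3]})

    # On recent polars this returns an Altair chart; hvplot is not involved.
    chart = df.plot.line(x="epoch", y="loss")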
---
 Dockerfile.tmpl | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index 44100f7e..b58d26f2 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -274,8 +274,6 @@ RUN apt-get install -y git-lfs && \
     pip install xvfbwrapper && \
     /tmp/clean-layer.sh

-RUN rm -rf /opt/conda/lib/python3.10/site-packages/Shapely-1.8.5.post1.dist-info/
-
 RUN pip install mpld3 \
     gpxpy \
     arrow \
@@ -471,9 +469,8 @@ RUN pip install wandb \
     albumentations \
     Rtree \
     accelerate && \
-    apt-get -y install libspatialindex-dev
-
-RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
+    apt-get -y install libspatialindex-dev && \
+    rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
     pip install "numpy==1.26.4" && \
     pip install pytorch-ignite \
     qgrid \
@@ -484,7 +481,6 @@ RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
     s3fs \
     gcsfs \
     kaggle-environments \
-    hvplot \
     # geopandas > v0.14.4 breaks learn tools
     geopandas==v0.14.4 \
     "shapely<2" \
@@ -497,7 +493,7 @@ RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
     flask \
     # pycrypto is used by competitions team.
     pycryptodome \
-    nbdev \
+    nbdev \
     easyocr \
     onnx \
     tables \
@@ -580,10 +576,9 @@ ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
 # Add Kaggle module resolver
 ADD patches/kaggle_module_resolver.py /opt/conda/lib/python3.10/site-packages/tensorflow_hub/kaggle_module_resolver.py
 RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \
-    sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py
-
-# Disable preloaded jupyter modules (they add to startup, and break when they are missing)
-RUN sed -i /bq_stats/d /etc/ipython/ipython_kernel_config.py && \
+    sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \
+    # Disable preloaded jupyter modules (they add to startup, and break when they are missing)
+    sed -i /bq_stats/d /etc/ipython/ipython_kernel_config.py && \
     sed -i /beatrix/d /etc/ipython/ipython_kernel_config.py && \
     sed -i /bigquery/d /etc/ipython/ipython_kernel_config.py && \
     sed -i /sql/d /etc/ipython/ipython_kernel_config.py
@@ -596,10 +591,9 @@ RUN ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusol
 {{ end }}

 # b/270147159: conda ships with a version of libtinfo which is missing version info causing warnings, replace it with a good version.
-RUN rm /opt/conda/lib/libtinfo.so.6 && ln -s /usr/lib/x86_64-linux-gnu/libtinfo.so.6 /opt/conda/lib/libtinfo.so.6
-
-# b/276358430: fix Jupyter lsp freezing up the jupyter server
-RUN pip install "jupyter-lsp==1.5.1"
+RUN rm /opt/conda/lib/libtinfo.so.6 && ln -s /usr/lib/x86_64-linux-gnu/libtinfo.so.6 /opt/conda/lib/libtinfo.so.6 && \
+    # b/276358430: fix Jupyter lsp freezing up the jupyter server
+    pip install "jupyter-lsp==1.5.1"

 # Set backend for matplotlib
 ENV MPLBACKEND="agg" \

From 4e22769cbb4ee9a6a327d8097d4c2ac68984a35a Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Tue, 1 Oct 2024 23:01:52 -0700
Subject: [PATCH 11/67] Fix build due to numpy (#1430)

numpy 2.0 and greater is causing issues with a handful of packages.
---
 Dockerfile.tmpl      | 10 +++++++---
 tests/test_fastai.py |  7 +++++++
 tests/test_numpy.py  |  8 +++++++-
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index b58d26f2..e4bde450 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -210,6 +210,8 @@ ADD patches/keras_internal.py \
 # b/350573866: xgboost v2.1.0 breaks learntools
 RUN apt-get install -y libfreetype6-dev && \
     apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \
+    rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
+    pip install "numpy==1.26.4" && \
     pip install gensim \
     textblob \
     wordcloud \
@@ -425,8 +427,7 @@ RUN pip install bleach \
     webencodings \
     widgetsnbextension \
     # Require pyarrow newer than https://github.com/advisories/GHSA-5wvp-7f3h-6wmm
-    {{ if eq .Accelerator "gpu" }} pyarrow {{ else }} "pyarrow>=14.0.1" {{ end }} \
-    fastai
+    {{ if eq .Accelerator "gpu" }} pyarrow {{ else }} "pyarrow>=14.0.1" {{ end }}

 RUN python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \
     apt-get update && apt-get install -y ffmpeg && \
@@ -470,7 +471,8 @@ RUN pip install wandb \
     Rtree \
     accelerate && \
     apt-get -y install libspatialindex-dev && \
-    rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
+    # b/370860329: newer versions are not capable with current tensorflow
+    rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
     pip install "numpy==1.26.4" && \
     pip install pytorch-ignite \
     qgrid \
@@ -501,6 +503,8 @@ RUN pip install wandb \
     timm \
     torchinfo && \
     pip install git+https://github.com/facebookresearch/segment-anything.git && \
+    # b/370860329: newer versions are not capable with current tensorflow
+    pip install --no-dependencies fastai fastdownload && \
     # b/343971718: remove duplicate aiohttp installs, and reinstall it
     rm -rf /opt/conda/lib/python3.10/site-packages/aiohttp* && \
     mamba install --force-reinstall -y aiohttp && \
diff --git a/tests/test_fastai.py b/tests/test_fastai.py
index edfd402e..0de1f82f 100644
--- a/tests/test_fastai.py
+++ b/tests/test_fastai.py
@@ -5,6 +5,13 @@
 from fastai.tabular.all import *

 class TestFastAI(unittest.TestCase):
+    # Basic import
+    def test_basic(self):
+        import fastai
+        import fastcore
+        import fastprogress
+        import fastdownload
+
     def test_has_version(self):
         self.assertGreater(len(fastai.__version__), 2)

diff --git a/tests/test_numpy.py b/tests/test_numpy.py
index 18f74b8c..071c3d30 100644
--- a/tests/test_numpy.py
+++ b/tests/test_numpy.py
@@ -1,9 +1,15 @@
 import unittest

+from distutils.version import StrictVersion
+
 import numpy as np
 from numpy.distutils.system_info import get_info

-class TestNumpy(unittest.TestCase):
+class TestNumpy(unittest.TestCase):
     def test_version(self):
+        # b/370860329: newer versions are not capable with current tensorflow
+        self.assertEqual(StrictVersion(np.__version__), StrictVersion("1.26.4"))
+
     def test_array(self):
         array = np.array([1, 3])

From 34427a248a3703ad28292534774d3a5d86ca116a Mon Sep 17 00:00:00 2001
From: Dustin H
Date: Fri, 4 Oct 2024 13:39:23 -0400
Subject: [PATCH 12/67] Migrate to micromamba (#1431)

http://b/358349812
---
 Dockerfile.tmpl | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index e4bde450..11b05ebd 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -81,17 +81,23 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list &
 # b/128333086: Set PROJ_DATA to points to the proj4 cartographic library.
 ENV PROJ_DATA=/opt/conda/share/proj

+# Install micromamba, setup channels, and replace conda with micromamba
+ENV MAMBA_ROOT_PREFIX=/opt/conda
+RUN curl -L "https://micro.mamba.pm/install.sh" -o /tmp/micromamba-install.sh \
+    && bash /tmp/micromamba-install.sh \
+    && rm /tmp/micromamba-install.sh \
+    && mv ~/.local/bin/micromamba /usr/bin/micromamba \
+    && (!(which conda) || cp /usr/bin/micromamba $(which conda)) \
+    && micromamba config append channels nvidia \
+    && micromamba config append channels rapidsai \
+    && micromamba config append channels conda-forge \
+    && micromamba config set channel_priority flexible \
+    && python -m nb_conda_kernels.install --disable
+
 # Install conda packages not available on pip.
 # When using pip in a conda environment, conda commands should be ran first and then
 # the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/
-RUN conda config --add channels nvidia && \
-    conda config --add channels rapidsai && \
-    conda config --set solver libmamba && \
-    # b/299991198: remove curl/libcurl install once DLVM base image includes version >= 7.86
-    conda install -c conda-forge mamba curl libcurl && \
-    # Base image channel order: conda-forge (highest priority), defaults.
-    # End state: rapidsai (highest priority), nvidia, conda-forge, defaults.
-    mamba install -y mkl cartopy imagemagick pyproj "shapely<2" && \
+RUN micromamba install -y mkl cartopy imagemagick pyproj "shapely<2" && \
     rm -rf /opt/conda/lib/python3.10/site-packages/pyproj/proj_dir/ && \
     /tmp/clean-layer.sh

@@ -100,8 +106,7 @@ RUN conda config --add channels nvidia && \
 # Install spacy
 # b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version.
 # b/341938540: unistall grpc-cpp to allow >=v24.4 cudf and cuml to be installed.
 {{ if eq .Accelerator "gpu" }}
 RUN pip uninstall -y pyarrow && \
-    mamba remove -y --force grpc-cpp && \
-    mamba install -y -c conda-forge spacy cudf>=24.4 cuml>=24.4 cupy cuda-version=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
+    micromamba install -vvvy spacy "cudf>=24.4" "cuml>=24.4" cupy cuda-version=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
     /tmp/clean-layer.sh
 {{ else }}
 RUN pip install spacy && \
@@ -114,7 +119,7 @@ RUN pip install spacy && \
 COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/
 # b/356397043: We are currently using cuda 12.3,
 # but magma-cuda121 is the latest compatible version
-RUN mamba install -y -c pytorch magma-cuda121 && \
+RUN micromamba install -y -c pytorch magma-cuda121 && \
     pip install /tmp/torch/*.whl && \
     sudo apt -y install libsox-dev && \
     rm -rf /tmp/torch && \
@@ -507,7 +512,7 @@ RUN pip install wandb \
     pip install --no-dependencies fastai fastdownload && \
     # b/343971718: remove duplicate aiohttp installs, and reinstall it
     rm -rf /opt/conda/lib/python3.10/site-packages/aiohttp* && \
-    mamba install --force-reinstall -y aiohttp && \
+    micromamba install --force-reinstall -y aiohttp && \
     /tmp/clean-layer.sh

 # Download base easyocr models.

From 99374cb4db0fa47defc39de1facc9212a965a618 Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Mon, 25 Nov 2024 15:04:39 -0800
Subject: [PATCH 13/67] Fix Kagglehub test and Remove multiple version matplotlib files (#1441)

Fixing the main build.

- kagglehub login changes required updating our tests
- the latest matplotlib is needed by a handful of packages, but breaks
  learntools. I provided a workaround.
---
 Dockerfile.tmpl         | 39 +++++++++++++++++++++------------------
 tests/test_kagglehub.py | 10 ++++++----
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index 11b05ebd..ad6842ea 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -167,28 +167,12 @@ RUN pip install jax[cpu] && \
 RUN export PATH=/usr/local/cuda/bin:$PATH && \
     export CUDA_ROOT=/usr/local/cuda && \
     pip install pycuda \
-    pynvrtc \
+    # TODO(379932879): pip resolver fails when not specified.
+    pynvrtc==9.2 \
     pynvml && \
     /tmp/clean-layer.sh
 {{ end }}

-# b/308525631: Pin Matplotlib until seaborn can be upgraded
-# to >0.13.0 (now it's stuck by a package conflict with ydata-profiling 4.5.1).
-RUN JAXVER=$(pip freeze | grep -e "^jax==") && \
-    pip install --upgrade \
-    "matplotlib<3.8.0" \
-    # ipympl adds interactive widget support for matplotlib
-    ipympl==0.7.0 \
-    "seaborn==0.12.2" \
-    pyupset \
-    python-dateutil dask dask-expr igraph \
-    pyyaml joblib geopy mne pyshp \
-    pandas \
-    polars \
-    flax \
-    "${JAXVER}" && \
-    /tmp/clean-layer.sh
-
 RUN apt-get update && \
     apt-get install -y default-jre && \
     /tmp/clean-layer.sh
@@ -543,11 +527,30 @@ ENV TESSERACT_PATH=/usr/bin/tesseract \
     # For Theano with MKL
     MKL_THREADING_LAYER=GNU

+# b/308525631: Pin Matplotlib until seaborn can be upgraded
+# to >0.13.0 (now it's stuck by a package conflict with ydata-profiling 4.5.1).
+RUN JAXVER=$(pip freeze | grep -e "^jax==") && \ + pip install --upgrade \ + "matplotlib==3.7.5" \ + # ipympl adds interactive widget support for matplotlib + ipympl==0.7.0 \ + "seaborn==0.12.2" \ + pyupset \ + python-dateutil dask dask-expr igraph \ + pyyaml joblib geopy mne pyshp \ + pandas \ + polars \ + flax \ + "${JAXVER}" && \ + /tmp/clean-layer.sh + # Temporary fixes and patches # Temporary patch for Dask getting downgraded, which breaks Keras RUN pip install --upgrade dask && \ # Stop jupyter nbconvert trying to rewrite its folder hierarchy mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ + # TODO(b/380921973): Ensure only matplotlib 3.7.5 files are present. + rm -r /opt/conda/lib/python3.10/site-packages/matplotlib-3.9.2.dist-info/ && \ mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \ # Stop Matplotlib printing junk to the console on first load sed -i "s/^.*Matplotlib is building the font cache using fc-list.*$/# Warning removed by Kaggle/g" /opt/conda/lib/python3.10/site-packages/matplotlib/font_manager.py && \ diff --git a/tests/test_kagglehub.py b/tests/test_kagglehub.py index 37b11248..f2c3e2a6 100644 --- a/tests/test_kagglehub.py +++ b/tests/test_kagglehub.py @@ -8,8 +8,10 @@ class TestKagglehub(unittest.TestCase): def test_login(self): with self.assertLogs('kagglehub', level='INFO') as l: with mock.patch("builtins.input") as mock_input: - mock_input.side_effect = ["lastplacelarry", "some-key"] - # Disabling credentials validation since network access is disabled in unittest. - kagglehub.login(validate_credentials=False) + with mock.patch("getpass.getpass") as mock_getpass: + mock_input.side_effect = ["lastplacelarry"] + mock_getpass.return_value = "some-key" - self.assertIn("credentials set", l.output[0]) + kagglehub.login(validate_credentials=False) + + self.assertIn("credentials set", l.output[0]) From 03c832e6abf7ae017f4d3eef542658681e3cfa0a Mon Sep 17 00:00:00 2001 From: Michael Aaron Date: Tue, 26 Nov 2024 09:30:45 -0700 Subject: [PATCH 14/67] Upgrade Ipywidgets to 8.1.5 (#1440) I think this will break qgrid fwiw, unsure how often that is used. 
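
One way to surface what the bump actually breaks (a sketch using
`importlib.metadata` plus the third-party `packaging` helpers; qgrid's
old ipywidgets pin is the expected casualty):

# Sketch: report installed packages whose declared ipywidgets requirement
# no longer matches after the upgrade to 8.1.5.
import importlib.metadata as md
from packaging.requirements import Requirement

installed = md.version("ipywidgets")
for dist in md.distributions():
    for req_str in dist.requires or []:
        try:
            req = Requirement(req_str)
        except Exception:
            continue  # skip malformed metadata
        if req.name.lower() == "ipywidgets" and req.marker is None:
            if not req.specifier.contains(installed, prereleases=True):
                print(dist.metadata["Name"], "declares", req_str, "; installed:", installed)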
---------

Co-authored-by: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Co-authored-by: Jonathan Calderon Chavez
---
 Dockerfile.tmpl     |  4 +---
 tests/test_qgrid.py | 16 ----------------
 2 files changed, 1 insertion(+), 19 deletions(-)
 delete mode 100644 tests/test_qgrid.py

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index ad6842ea..f3193876 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -374,8 +374,7 @@ RUN pip install bleach \
     ipykernel \
     ipython \
     ipython-genutils \
-    # Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376
-    ipywidgets==7.7.1 \
+    ipywidgets==8.1.5 \
     isoweek \
     jedi \
     jsonschema \
@@ -464,7 +463,6 @@ RUN pip install wandb \
     rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \
     pip install "numpy==1.26.4" && \
     pip install pytorch-ignite \
-    qgrid \
     bqplot \
     earthengine-api \
     transformers \
diff --git a/tests/test_qgrid.py b/tests/test_qgrid.py
deleted file mode 100644
index e97ef2a1..00000000
--- a/tests/test_qgrid.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import unittest
-
-import numpy as np
-import pandas as pd
-
-from qgrid import QgridWidget
-
-
-class TestQgrid(unittest.TestCase):
-    def test_nans(self):
-        df = pd.DataFrame([(pd.Timestamp('2017-02-02'), np.nan),
-                           (4, 2),
-                           ('foo', 'bar')])
-        view = QgridWidget(df=df)
-
-        self.assertIsNotNone(view.get_changed_df())

From 66bac486a2c52eabeeca83dd8540cc7af43e08e6 Mon Sep 17 00:00:00 2001
From: Dustin H
Date: Wed, 27 Nov 2024 17:05:29 -0500
Subject: [PATCH 15/67] Use Colab as a base image. (#1444)

This makes a number of major changes:
- Colab is the base image
- uv is the main package install tool
- leveraging requirements.txt instead of many separate installs
- stop building and installing tensorflow/torch/lightgbm/jax since
  those are managed by the Colab base image now

In order to decide what packages to explicitly install I (see the
sketch below):
- looked at what packages are in the Colab base image
- looked at what packages were in the Kaggle image
- looked at what packages were explicitly mentioned in Kaggle Dockerfile

This may still take a few iterations to get all the right parts in the
image, but this should hopefully make the image much more manageable.
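
That triage can be approximated with a small script run inside the base
image (a sketch; it ignores version pins and environment markers, does
only rough name normalization, and assumes plain requirement lines in
kaggle_requirements.txt):

# Sketch: names requested in kaggle_requirements.txt that the Colab base
# image does not already provide.
import importlib.metadata as md
import re

base = {d.metadata["Name"].lower() for d in md.distributions()}
with open("kaggle_requirements.txt") as f:
    wanted = {
        re.split(r"[<>=!~;\[\s]", line.strip(), maxsplit=1)[0].lower()
        for line in f
        if line.strip() and not line.startswith(("#", "git+"))
    }
print(sorted(wanted - base))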
http://b/365782129 --- Dockerfile.tmpl | 594 +++++----------------------------------- Jenkinsfile | 60 ---- clean-layer.sh | 4 +- config.txt | 11 +- kaggle_requirements.txt | 139 ++++++++++ test | 8 +- tests/test_cuml.py | 1 + tests/test_fastai.py | 7 +- tests/test_lightgbm.py | 2 + 9 files changed, 228 insertions(+), 598 deletions(-) create mode 100644 kaggle_requirements.txt diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index f3193876..d6bc3f9b 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -1,66 +1,58 @@ -ARG BASE_IMAGE_REPO \ - BASE_IMAGE_TAG \ - CPU_BASE_IMAGE_NAME \ - GPU_BASE_IMAGE_NAME \ - LIGHTGBM_VERSION \ - TORCH_VERSION \ - TORCHAUDIO_VERSION \ - TORCHVISION_VERSION \ - JAX_VERSION +FROM us-docker.pkg.dev/colab-images/public/runtime:latest -{{ if eq .Accelerator "gpu" }} -FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl -FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl -FROM gcr.io/kaggle-images/python-jaxlib-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${JAX_VERSION} AS jaxlib_whl -FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} -{{ else }} -FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} -{{ end }} +ADD kaggle_requirements.txt /kaggle_requirements.txt + +# Freeze existing requirements from base image for critical packages: +RUN pip freeze | grep -E 'tensorflow|keras|torch|jax|lightgbm' > /colab_requirements.txt + +# Merge requirements files: +RUN cat /colab_requirements.txt >> /requirements.txt +RUN cat /kaggle_requirements.txt >> /requirements.txt + +# TODO: GPU requirements.txt +# TODO: merge them better (override matching ones). -# Ensures shared libraries installed with conda can be found by the dynamic link loader. -ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" \ - LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" +# Install uv & Kaggle packages +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt +ENV PATH="~/.local/bin:${PATH}" +# Install manual packages: +# b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. +RUN uv pip uninstall --system google-cloud-bigquery-storage + +# NOTE(herbison): uv fails to install this for some reason +RUN pip install git+https://github.com/Kaggle/learntools + +# We install an incompatible pair of libs (shapely<, libpysal==4.9.2) so we can't put this one in the requirements.txt +RUN uv pip install --system "libpysal==4.9.2" + +# Adding non-package dependencies: + +ADD clean-layer.sh /tmp/clean-layer.sh +ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl +ADD patches/template_conf.json /opt/kaggle/conf.json + +# /opt/conda/lib/python3.10/site-packages +ARG PACKAGE_PATH=/usr/local/lib/python3.10/dist-packages + +# Install GPU-specific non-pip packages. {{ if eq .Accelerator "gpu" }} ARG CUDA_MAJOR_VERSION \ CUDA_MINOR_VERSION ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \ CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION} + # Make sure we are on the right version of CUDA RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION -# NVIDIA binaries from the host are mounted to /opt/bin. -ENV PATH=/opt/bin:${PATH} \ - # Add CUDA stubs to LD_LIBRARY_PATH to support building the GPU image on a CPU machine. 
- LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH" \ - LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs" -RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 -{{ end }} -# Keep these variables in sync if base image is updated. -ENV TENSORFLOW_VERSION=2.16.1 \ - # See https://github.com/tensorflow/io#tensorflow-version-compatibility - TENSORFLOW_IO_VERSION=0.37.0 - -# We need to redefine the ARG here to get the ARG value defined above the FROM instruction. -# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact -ARG LIGHTGBM_VERSION \ - TORCH_VERSION \ - TORCHAUDIO_VERSION \ - TORCHVISION_VERSION \ - JAX_VERSION - -# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0 -# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information -ENV KMP_WARNINGS=0 \ - # Also make the KMP logs noverbose. - # https://stackoverflow.com/questions/70250304/stop-tensorflow-from-printing-warning-message - KMP_SETTINGS=false \ - # Remove the pip as the root user warning. - PIP_ROOT_USER_ACTION=ignore +RUN uv pip install --system "pycuda" + +# Remove CUDA_VERSION from non-GPU image. +{{ else }} +ENV CUDA_VERSION="" +{{ end }} -ADD clean-layer.sh /tmp/clean-layer.sh -ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl -ADD patches/template_conf.json /opt/kaggle/conf.json # Update GPG key per documentation at https://cloud.google.com/compute/docs/troubleshooting/known-issues RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - @@ -78,141 +70,18 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list & apt-get install -y graphviz && pip install graphviz && \ /tmp/clean-layer.sh -# b/128333086: Set PROJ_DATA to points to the proj4 cartographic library. -ENV PROJ_DATA=/opt/conda/share/proj - -# Install micromamba, setup channels, and replace conda with micromamba -ENV MAMBA_ROOT_PREFIX=/opt/conda -RUN curl -L "https://micro.mamba.pm/install.sh" -o /tmp/micromamba-install.sh \ - && bash /tmp/micromamba-install.sh \ - && rm /tmp/micromamba-install.sh \ - && mv ~/.local/bin/micromamba /usr/bin/micromamba \ - && (!(which conda) || cp /usr/bin/micromamba $(which conda)) \ - && micromamba config append channels nvidia \ - && micromamba config append channels rapidsai \ - && micromamba config append channels conda-forge \ - && micromamba config set channel_priority flexible \ - && python -m nb_conda_kernels.install --disable - -# Install conda packages not available on pip. -# When using pip in a conda environment, conda commands should be ran first and then -# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ -RUN micromamba install -y mkl cartopy imagemagick pyproj "shapely<2" && \ - rm -rf /opt/conda/lib/python3.10/site-packages/pyproj/proj_dir/ && \ - /tmp/clean-layer.sh - -# Install spacy -# b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. -# b/341938540: unistall grpc-cpp to allow >=v24.4 cudf and cuml to be installed. 
-{{ if eq .Accelerator "gpu" }} -RUN pip uninstall -y pyarrow && \ - micromamba install -vvvy spacy "cudf>=24.4" "cuml>=24.4" cupy cuda-version=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install spacy && \ - /tmp/clean-layer.sh -{{ end}} - -# Install PyTorch -# b/356397043: magma-cuda121 is the latest version -{{ if eq .Accelerator "gpu" }} -COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ -# b/356397043: We are currently using cuda 12.3, -# but magma-cuda121 is the latest compatible version -RUN micromamba install -y -c pytorch magma-cuda121 && \ - pip install /tmp/torch/*.whl && \ - sudo apt -y install libsox-dev && \ - rm -rf /tmp/torch && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install \ - torch==$TORCH_VERSION+cpu \ - torchvision==$TORCHVISION_VERSION+cpu \ - torchaudio==$TORCHAUDIO_VERSION+cpu \ - --index-url https://download.pytorch.org/whl/cpu && \ - /tmp/clean-layer.sh -{{ end }} - -# Install LightGBM -{{ if eq .Accelerator "gpu" }} -COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/ -# Install OpenCL (required by LightGBM GPU version) -RUN apt-get install -y ocl-icd-libopencl1 clinfo && \ - mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ - pip install /tmp/lightgbm/*.whl && \ - rm -rf /tmp/lightgbm && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install lightgbm==$LIGHTGBM_VERSION && \ - /tmp/clean-layer.sh -{{ end }} - -# Install JAX -{{ if eq .Accelerator "gpu" }} -COPY --from=jaxlib_whl /tmp/whl/*.whl /tmp/jax/ -# b/319722433#comment9: Use pip wheels once versions matches our CUDA version. -RUN pip install /tmp/jax/*.whl jax==$JAX_VERSION && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install jax[cpu] && \ - /tmp/clean-layer.sh -{{ end }} - - -# Install GPU specific packages -{{ if eq .Accelerator "gpu" }} -# Install GPU-only packages -# No specific package for nnabla-ext-cuda 12.x minor versions. -RUN export PATH=/usr/local/cuda/bin:$PATH && \ - export CUDA_ROOT=/usr/local/cuda && \ - pip install pycuda \ - # TODO(379932879): pip resolver fails when not specified. 
- pynvrtc==9.2 \ - pynvml && \ - /tmp/clean-layer.sh -{{ end }} - -RUN apt-get update && \ - apt-get install -y default-jre && \ - /tmp/clean-layer.sh - -RUN pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && /tmp/clean-layer.sh - -RUN pip install \ - "tensorflow==${TENSORFLOW_VERSION}" \ - "tensorflow-io==${TENSORFLOW_IO_VERSION}" \ - tensorflow-probability \ - tensorflow_decision_forests \ - tensorflow-text \ - "tensorflow_hub>=0.16.0" \ - tf-keras \ - "keras>3" \ - keras-cv \ - keras-nlp && \ - /tmp/clean-layer.sh - ADD patches/keras_internal.py \ patches/keras_internal_test.py \ - /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/ + $PACKAGE_PATH/tensorflow_decision_forests/keras/ -# b/350573866: xgboost v2.1.0 breaks learntools RUN apt-get install -y libfreetype6-dev && \ - apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \ - rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \ - pip install "numpy==1.26.4" && \ - pip install gensim \ - textblob \ - wordcloud \ - "xgboost==2.0.3" \ - pydot \ - hep_ml && \ - # NLTK Project datasets - mkdir -p /usr/share/nltk_data && \ + apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing + +# NLTK Project datasets +RUN mkdir -p /usr/share/nltk_data && \ # NLTK Downloader no longer continues smoothly after an error, so we explicitly list # the corpuses that work - # "yes | ..." answers yes to the retry prompt in case of an error. See b/133762095. - yes | python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ + python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ basque_grammars biocreative_ppi bllip_wsj_no_aux \ book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \ comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \ @@ -225,276 +94,13 @@ RUN apt-get install -y libfreetype6-dev && \ sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \ state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ - vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \ - pip install scikit-image && \ - pip install opencv-contrib-python opencv-python && \ - /tmp/clean-layer.sh - -RUN pip install cython \ - fasttext \ - opencv-contrib-python \ - opencv-python \ - "scipy<1.14.0" \ - # Scikit-learn accelerated library for x86 - "scikit-learn-intelex>=2023.0.1" \ - # HDF5 support - h5py \ - # PUDB, for local debugging convenience - pudb \ - imbalanced-learn \ - # Profiling and other utilities - line_profiler \ - bokeh \ - numba \ - datashader \ - # b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'" - "libpysal==4.9.2" \ - # b/276344496: Install specific version of boto3, because 1.26.103 is broken. 
- "boto3==1.26.100" \ - Boruta && \ - # Pandoc is a dependency of deap - apt-get install -y pandoc && \ - /tmp/clean-layer.sh + vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe RUN apt-get install -y git-lfs && \ - # vtk with dependencies + # vtk dependencies apt-get install -y libgl1-mesa-glx && \ - pip install vtk && \ - # xvfbwrapper with dependencies + # xvfbwrapper dependencies apt-get install -y xvfb && \ - pip install xvfbwrapper && \ - /tmp/clean-layer.sh - -RUN pip install mpld3 \ - gpxpy \ - arrow \ - nilearn \ - nibabel \ - imgaug \ - preprocessing \ - path.py && \ - pip install deap \ - # b/302136621: Fix eli5 import for learntools, newer version require scikit-learn > 1.3 - "tpot==0.12.1" \ - scikit-optimize \ - haversine \ - toolz cytoolz \ - plotly \ - hyperopt \ - langid \ - # Useful data exploration libraries (for missing data and generating reports) - missingno \ - pandas-profiling \ - bayesian-optimization \ - matplotlib-venn \ - pyldavis \ - mlxtend \ - altair \ - ImageHash \ - ecos \ - CVXcanon \ - pymc3 \ - tifffile \ - geojson \ - pydicom \ - wavio \ - SimpleITK \ - squarify \ - fuzzywuzzy \ - python-louvain \ - pyexcel-ods \ - sklearn-pandas \ - prophet \ - holidays \ - holoviews \ - scikit-multilearn \ - leven \ - catboost \ - folium \ - scikit-plot \ - fury dipy \ - plotnine \ - scikit-surprise \ - pymongo \ - eli5 \ - kaggle \ - kagglehub \ - google-generativeai \ - pytest && \ - /tmp/clean-layer.sh - - # Add google PAIR-code Facets -RUN cd /opt/ && git clone https://github.com/PAIR-code/facets && cd facets/ && jupyter nbextension install facets-dist/ --user && \ - export PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ && \ - pip install librosa \ - sentencepiece \ - cufflinks \ - lime \ - memory_profiler && \ - /tmp/clean-layer.sh - -RUN pip install annoy \ - category_encoders && \ - # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. - pip uninstall -y google-cloud-bigquery-storage && \ - # google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 - # After launch this should be installed from pip - pip install git+https://github.com/googleapis/python-aiplatform.git@mb-release \ - google-cloud-automl==1.0.1 \ - google-api-core==1.33.2 \ - google-cloud-bigquery \ - google-cloud-storage && \ - # Split these installations to avoid `pip._vendor.resolvelib.resolvers.ResolutionTooDeep: 200000` - # b/315753846: Unpin translate package. - pip install google-cloud-translate==3.12.1 \ - google-cloud-language==2.* \ - google-cloud-videointelligence==2.* \ - google-cloud-vision==2.* \ - protobuf==3.20.3 \ - # Pandas data reader - pandas-datareader \ - emoji \ - # Add Japanese morphological analysis engine - janome \ - # yellowbrick machine learning visualization library - yellowbrick \ - mlcrate && \ - /tmp/clean-layer.sh - -# b/273059949: The pre-installed nbconvert is slow on html conversions and has to be force-uninstalled. 
-# b/274619697: learntools also requires a specific nbconvert right now -RUN rm -rf /opt/conda/lib/python3.10/site-packages/{nbconvert,nbclient,mistune,platformdirs}* - -RUN pip install bleach \ - certifi \ - cycler \ - decorator \ - entrypoints \ - html5lib \ - ipykernel \ - ipython \ - ipython-genutils \ - ipywidgets==8.1.5 \ - isoweek \ - jedi \ - jsonschema \ - jupyter-client \ - jupyter-console \ - jupyter-core \ - jupyterlab-lsp \ - MarkupSafe \ - mistune \ - nbformat \ - notebook \ - "nbconvert==6.4.5" \ - papermill \ - python-lsp-server[all] \ - olefile \ - kornia \ - pandas_summary \ - pandocfilters \ - pexpect \ - pickleshare \ - Pillow && \ - # Install openslide and its python binding - apt-get install -y openslide-tools && \ - pip install openslide-python \ - ptyprocess \ - Pygments \ - pyparsing \ - pytz \ - PyYAML \ - pyzmq \ - qtconsole \ - six \ - terminado \ - tornado \ - tqdm \ - traitlets \ - wcwidth \ - webencodings \ - widgetsnbextension \ - # Require pyarrow newer than https://github.com/advisories/GHSA-5wvp-7f3h-6wmm - {{ if eq .Accelerator "gpu" }} pyarrow {{ else }} "pyarrow>=14.0.1" {{ end }} - -RUN python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \ - apt-get update && apt-get install -y ffmpeg && \ - /tmp/clean-layer.sh - - ########### - # - # NEW CONTRIBUTORS: - # Please add new pip/apt installs in this block. Don't forget a "&& \" at the end - # of all non-final lines. Thanks! - # - ########### - -RUN rm /opt/conda/lib/python3.10/site-packages/google*/direct_url.json && \ - rm /opt/conda/lib/python3.10/site-packages/google*/REQUESTED -# dlib has a libmkl incompatibility: -# test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8. -# Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2. -# nnabla breaks protobuf compatibiilty: -RUN pip install wandb \ - pyemd \ - pympler \ - featuretools \ - #-e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper \ - git+https://github.com/Kaggle/learntools \ - ray \ - gym \ - pyarabic \ - pandasql \ - # b/302136621: Fix eli5 import for learntools - scikit-learn==1.2.2 \ - # b/329869023 shap 0.45.0 breaks learntools - shap==0.44.1 \ - cesium \ - rgf_python \ - jieba \ - tsfresh \ - optuna \ - plotly_express \ - albumentations \ - Rtree \ - accelerate && \ - apt-get -y install libspatialindex-dev && \ - # b/370860329: newer versions are not capable with current tensorflow - rm -rf /opt/conda/lib/python3.10/site-packages/numpy* && \ - pip install "numpy==1.26.4" && \ - pip install pytorch-ignite \ - bqplot \ - earthengine-api \ - transformers \ - datasets \ - s3fs \ - gcsfs \ - kaggle-environments \ - # geopandas > v0.14.4 breaks learn tools - geopandas==v0.14.4 \ - "shapely<2" \ - pydub \ - pydegensac \ - torchmetrics \ - pytorch-lightning \ - sympy \ - # flask is used by agents in the simulation competitions. - flask \ - # pycrypto is used by competitions team. 
- pycryptodome \ - nbdev \ - easyocr \ - onnx \ - tables \ - openpyxl \ - timm \ - torchinfo && \ - pip install git+https://github.com/facebookresearch/segment-anything.git && \ - # b/370860329: newer versions are not capable with current tensorflow - pip install --no-dependencies fastai fastdownload && \ - # b/343971718: remove duplicate aiohttp installs, and reinstall it - rm -rf /opt/conda/lib/python3.10/site-packages/aiohttp* && \ - micromamba install --force-reinstall -y aiohttp && \ /tmp/clean-layer.sh # Download base easyocr models. @@ -512,12 +118,7 @@ RUN mkdir -p /root/.EasyOCR/model && \ /tmp/clean-layer.sh # Tesseract and some associated utility packages -RUN apt-get install tesseract-ocr -y && \ - pip install pytesseract \ - wand \ - pdf2image \ - PyPDF && \ - /tmp/clean-layer.sh +RUN apt-get install tesseract-ocr -y ENV TESSERACT_PATH=/usr/bin/tesseract \ # For Facets @@ -525,39 +126,12 @@ ENV TESSERACT_PATH=/usr/bin/tesseract \ # For Theano with MKL MKL_THREADING_LAYER=GNU -# b/308525631: Pin Matplotlib until seaborn can be upgraded -# to >0.13.0 (now it's stuck by a package conflict with ydata-profiling 4.5.1). -RUN JAXVER=$(pip freeze | grep -e "^jax==") && \ - pip install --upgrade \ - "matplotlib==3.7.5" \ - # ipympl adds interactive widget support for matplotlib - ipympl==0.7.0 \ - "seaborn==0.12.2" \ - pyupset \ - python-dateutil dask dask-expr igraph \ - pyyaml joblib geopy mne pyshp \ - pandas \ - polars \ - flax \ - "${JAXVER}" && \ - /tmp/clean-layer.sh - # Temporary fixes and patches -# Temporary patch for Dask getting downgraded, which breaks Keras -RUN pip install --upgrade dask && \ - # Stop jupyter nbconvert trying to rewrite its folder hierarchy - mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ - # TODO(b/380921973): Ensure only matplotlib 3.7.5 files are present. - rm -r /opt/conda/lib/python3.10/site-packages/matplotlib-3.9.2.dist-info/ && \ +# Stop jupyter nbconvert trying to rewrite its folder hierarchy +RUN mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \ - # Stop Matplotlib printing junk to the console on first load - sed -i "s/^.*Matplotlib is building the font cache using fc-list.*$/# Warning removed by Kaggle/g" /opt/conda/lib/python3.10/site-packages/matplotlib/font_manager.py && \ # Make matplotlib output in Jupyter notebooks display correctly mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ - # Temporary patch for broken libpixman 0.38 in conda-forge, symlink to system libpixman 0.34 untile conda package gets updated to 0.38.5 or higher. 
- ln -sf /usr/lib/x86_64-linux-gnu/libpixman-1.so.0.34.0 /opt/conda/lib/libpixman-1.so.0.38.0 && \ - # b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) - pip install --force-reinstall --no-deps jupyter_server==2.12.5 && \ /tmp/clean-layer.sh # Fix to import bq_helper library without downgrading setuptools @@ -566,50 +140,35 @@ RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/ mv ~/src/BigQuery_Helper/bq_helper.py ~/src/BigQuery_Helper/bq_helper/__init__.py && \ mv ~/src/BigQuery_Helper/test_helper.py ~/src/BigQuery_Helper/bq_helper/ && \ sed -i 's/)/packages=["bq_helper"])/g' ~/src/BigQuery_Helper/setup.py && \ - pip install -e ~/src/BigQuery_Helper && \ + uv pip install --system -e ~/src/BigQuery_Helper && \ /tmp/clean-layer.sh + +# install imagemagick for wand +# https://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-debian-ubuntu +RUN apt-get install libmagickwand-dev + +# Override default imagemagick policies +ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml + +# Add Kaggle module resolver +ADD patches/kaggle_module_resolver.py $PACKAGE_PATH/tensorflow_hub/kaggle_module_resolver.py +RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' $PACKAGE_PATH/tensorflow_hub/config.py && \ + sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' $PACKAGE_PATH/tensorflow_hub/config.py + # Add BigQuery client proxy settings -ENV PYTHONUSERBASE "/root/.local" +ENV PYTHONUSERBASE="/root/.local" ADD patches/kaggle_gcp.py \ patches/kaggle_secrets.py \ patches/kaggle_session.py \ patches/kaggle_web_client.py \ patches/kaggle_datasets.py \ patches/log.py \ - patches/sitecustomize.py \ - /root/.local/lib/python3.10/site-packages/ + $PACKAGE_PATH/ -# Override default imagemagick policies -ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml - -# Add Kaggle module resolver -ADD patches/kaggle_module_resolver.py /opt/conda/lib/python3.10/site-packages/tensorflow_hub/kaggle_module_resolver.py -RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \ - sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \ - # Disable preloaded jupyter modules (they add to startup, and break when they are missing) - sed -i /bq_stats/d /etc/ipython/ipython_kernel_config.py && \ - sed -i /beatrix/d /etc/ipython/ipython_kernel_config.py && \ - sed -i /bigquery/d /etc/ipython/ipython_kernel_config.py && \ - sed -i /sql/d /etc/ipython/ipython_kernel_config.py - -# Force only one libcusolver -{{ if eq .Accelerator "gpu" }} -RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 -{{ else }} -RUN ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 -{{ end }} - -# b/270147159: conda ships with a version of libtinfo which is missing version info causing warnings, replace it with a good version. 
-RUN rm /opt/conda/lib/libtinfo.so.6 && ln -s /usr/lib/x86_64-linux-gnu/libtinfo.so.6 /opt/conda/lib/libtinfo.so.6 && \ - # b/276358430: fix Jupyter lsp freezing up the jupyter server - pip install "jupyter-lsp==1.5.1" - -# Set backend for matplotlib -ENV MPLBACKEND="agg" \ - # Set LC_ALL - # https://github.com/explosion/spaCy/issues/12872#issuecomment-1661847588 - LC_ALL="POSIX" +# Figure out why this is in a different place? +# Found by doing a export PYTHONVERBOSE=1 and then running python and checking for where it looked for it. +ADD patches/sitecustomize.py /usr/lib/python3.10/sitecustomize.py ARG GIT_COMMIT=unknown \ BUILD_DATE=unknown @@ -620,10 +179,6 @@ LABEL git-commit=$GIT_COMMIT \ ENV GIT_COMMIT=${GIT_COMMIT} \ BUILD_DATE=${BUILD_DATE} -LABEL tensorflow-version=$TENSORFLOW_VERSION \ - # Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. - kaggle-lang=python - # Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date @@ -633,3 +188,4 @@ ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" \ # Add the CUDA home. CUDA_HOME=/usr/local/cuda {{ end }} +ENTRYPOINT ["/usr/bin/env"] diff --git a/Jenkinsfile b/Jenkinsfile index 93f4753d..4980b956 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -21,66 +21,6 @@ pipeline { } stages { - stage('Pre-build Packages from Source') { - parallel { - stage('torch') { - options { - timeout(time: 300, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - source config.txt - cd packages/ - ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ - --package torch \ - --version $TORCH_VERSION \ - --build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \ - --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \ - --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ - --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ - --push - ''' - } - } - stage('lightgbm') { - options { - timeout(time: 10, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - source config.txt - cd packages/ - ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ - --package lightgbm \ - --version $LIGHTGBM_VERSION \ - --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ - --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ - --push - ''' - } - } - stage('jaxlib') { - options { - timeout(time: 300, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - source config.txt - cd packages/ - ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ - --package jaxlib \ - --version $JAX_VERSION \ - --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ - --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ - --push - ''' - } - } - } - } stage('Build/Test/Diff') { parallel { stage('CPU') { diff --git a/clean-layer.sh b/clean-layer.sh index d1a048fc..467e1cac 100755 --- a/clean-layer.sh +++ b/clean-layer.sh @@ -19,6 +19,4 @@ apt-get clean # Ensures the current working directory won't be deleted cd /usr/local/src/ # Delete source files used for building binaries -rm -rf /usr/local/src/* -# Delete conda downloaded tarballs -conda clean -y --tarballs +rm -rf /usr/local/src/* \ No newline at end of file diff --git a/config.txt b/config.txt index e95a1af1..c0a7711c 100644 --- a/config.txt +++ b/config.txt @@ -1,11 +1,2 @@ -BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m122 
-CPU_BASE_IMAGE_NAME=tf2-cpu.2-16.py310 -GPU_BASE_IMAGE_NAME=tf2-gpu.2-16.py310 -LIGHTGBM_VERSION=4.2.0 -TORCH_VERSION=2.4.0 -TORCHAUDIO_VERSION=2.4.0 -TORCHVISION_VERSION=0.19.0 -JAX_VERSION=0.4.26 CUDA_MAJOR_VERSION=12 -CUDA_MINOR_VERSION=3 +CUDA_MINOR_VERSION=2 diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt new file mode 100644 index 00000000..6400d8a1 --- /dev/null +++ b/kaggle_requirements.txt @@ -0,0 +1,139 @@ +altair>=5.4.0 +Babel +Boruta +Cartopy +ImageHash +Janome +PyArabic +PyUpSet +Pympler +Rtree +shapely<2 +SimpleITK +TPOT +Theano +Wand +annoy +arrow +bayesian-optimization +boto3 +catboost +category-encoders +cesium +comm +cytoolz +dask-expr +datasets +datashader +deap +dipy +docker +easyocr +eli5 +emoji +fasttext +featuretools +fiona +fury +fuzzywuzzy +geojson +# geopandas > v0.14.4 breaks learn tools +geopandas==v0.14.4 +google-cloud-aiplatform +# google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 +google-cloud-automl==1.0.1 +# b/315753846: Unpin translate package. +google-cloud-translate==3.12.1 +google-cloud-videointelligence +google-cloud-vision +gpxpy +h2o +haversine +hep-ml +igraph +ipympl +ipywidgets==8.1.5 +isoweek +jedi +# b/276358430: fix Jupyter lsp freezing up the jupyter server +jupyter-lsp==1.5.1 +# b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) +jupyter_server==2.12.5 +jupyterlab +jupyterlab-lsp +kaggle-environments +kagglehub>=0.3.4 +# Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data(): +# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a +keras<3.6 +keras-cv +keras-nlp +keras-tuner +kornia +langid +leven +# b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'" +libpysal<=4.9.2 +lime +line_profiler +mamba +mlcrate +mne +mpld3 +nbdev +nilearn +olefile +onnx +openslide-bin +openslide-python +optuna +pandas-profiling +pandasql +papermill +path +path.py +pdf2image +plotly-express +preprocessing +pudb +pyLDAvis +pycryptodome +pydegensac +pydicom +pydub +pyemd +pyexcel-ods +pymc3 +pymongo +pypdf +pytesseract +python-lsp-server +pytorch-ignite +pytorch-lightning +qgrid +qtconsole +ray +rgf-python +s3fs +scikit-learn-intelex +scikit-multilearn +scikit-optimize +scikit-plot +scikit-surprise +git+https://github.com/facebookresearch/segment-anything.git +shap +squarify +tensorflow-cloud +tensorflow-io +tensorflow-text +tensorflow_decision_forests +timm +torchinfo +torchmetrics +tsfresh +vtk +wandb +wavio +xgboost==2.0.3 +xvfbwrapper +ydata-profiling diff --git a/test b/test index ef1ffe3e..c2748e81 100755 --- a/test +++ b/test @@ -3,7 +3,7 @@ set -e IMAGE_TAG='kaggle/python-build' IMAGE_TAG_OVERRIDE='' -ADDITONAL_OPTS='' +ADDITONAL_OPTS='--runtime runc ' # Use the CPU runtime by default PATTERN='test*.py' usage() { @@ -69,8 +69,6 @@ readonly ADDITONAL_OPTS readonly PATTERN set -x -docker run --rm --net=none -v /tmp/python-build:/tmp/python-build "$IMAGE_TAG" rm -rf /tmp/python-build/* -docker rm jupyter_test || true mkdir -p /tmp/python-build/tmp mkdir -p /tmp/python-build/devshm mkdir -p /tmp/python-build/working @@ -97,6 +95,9 @@ fi # Note about `--hostname localhost` (b/158137436) # hostname defaults to the container name which fails DNS name # resolution with --net=none (required to keep tests hermetic). See details in bug. 
+# +# Note about CLOUDSDK_CONFIG=/tmp/.config/gcloud +# We use the /tmp dir since the filesystem is --read-only and we need writable space for gcloud configs. docker run --rm -t --read-only --net=none \ -e HOME=/tmp -e KAGGLE_DATA_PROXY_TOKEN=test-key \ -e KAGGLE_USER_SECRETS_TOKEN_KEY=test-secrets-key \ @@ -105,6 +106,7 @@ docker run --rm -t --read-only --net=none \ -e KAGGLE_DATA_PROXY_PROJECT=test \ -e TF_FORCE_GPU_ALLOW_GROWTH=true \ -e XLA_PYTHON_CLIENT_PREALLOCATE=false \ + -e CLOUDSDK_CONFIG=/tmp/.config/gcloud \ --hostname localhost \ --shm-size=2g \ -v $PWD:/input:ro -v /tmp/python-build/working:/working \ diff --git a/tests/test_cuml.py b/tests/test_cuml.py index bbb7f0c6..695e47ca 100644 --- a/tests/test_cuml.py +++ b/tests/test_cuml.py @@ -6,6 +6,7 @@ class TestCuml(unittest.TestCase): @gpu_test @p100_exempt # b/342143152: cuML(>=24.4v) is inompatible with p100 GPUs. + @unittest.skip("b/381287748 cuML is not installed in Colab.") def test_pca_fit_transform(self): import unittest import numpy as np diff --git a/tests/test_fastai.py b/tests/test_fastai.py index 0de1f82f..49bce0ac 100644 --- a/tests/test_fastai.py +++ b/tests/test_fastai.py @@ -27,8 +27,9 @@ def test_tabular(self): "/input/tests/data/train.csv", cont_names=["pixel"+str(i) for i in range(784)], y_names='label', - procs=[FillMissing, Categorify, Normalize]) + procs=[FillMissing, Categorify, Normalize]) learn = tabular_learner(dls, layers=[200, 100]) - learn.fit_one_cycle(n_epoch=1) + with learn.no_bar(): + learn.fit_one_cycle(n_epoch=1) - self.assertGreater(learn.smooth_loss, 0) + self.assertGreater(learn.smooth_loss, 0) diff --git a/tests/test_lightgbm.py b/tests/test_lightgbm.py index bcdbb1a6..7001a0a7 100644 --- a/tests/test_lightgbm.py +++ b/tests/test_lightgbm.py @@ -34,7 +34,9 @@ def test_cpu(self): self.assertEqual(1, gbm.best_iteration) + # TODO(b/381256047): Colab needs to install GPU-enabled lightgbm. @gpu_test + @unittest.skip("Skipping this test until b/381256047 is resolved.") def test_gpu(self): lgb_train, lgb_eval = self.load_datasets() From 4e6bec87f5d71764cad94b99bde0f90929a4af62 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 2 Dec 2024 14:40:02 -0500 Subject: [PATCH 16/67] Re-pin learntools related packages. (#1448) These were unpinned during the base image migration but caused learntools to fail. http://b/365782129 --- Dockerfile.tmpl | 7 ++++--- kaggle_requirements.txt | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index d6bc3f9b..20769a74 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -21,10 +21,11 @@ ENV PATH="~/.local/bin:${PATH}" # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. RUN uv pip uninstall --system google-cloud-bigquery-storage -# NOTE(herbison): uv fails to install this for some reason -RUN pip install git+https://github.com/Kaggle/learntools +# uv cannot install this in requirements.txt without --no-build-isolation +# to avoid affecting the larger build, we'll post-install it. 
+RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/learntools" -# We install an incompatible pair of libs (shapely<, libpysal==4.9.2) so we can't put this one in the requirements.txt +# b/328788268 We install an incompatible pair of libs (shapely<2, libpysal==4.9.2) so we can't put this one in the requirements.txt RUN uv pip install --system "libpysal==4.9.2" # Adding non-package dependencies: diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 6400d8a1..30f819c5 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -1,4 +1,5 @@ -altair>=5.4.0 +# Please keep this in alphabetical order +Altair>=5.4.0 Babel Boruta Cartopy @@ -10,7 +11,8 @@ Pympler Rtree shapely<2 SimpleITK -TPOT +# b/302136621: Fix eli5 import for learntools, newer version require scikit-learn > 1.3 +TPOT==0.12.1 Theano Wand annoy @@ -80,6 +82,8 @@ mamba mlcrate mne mpld3 +# b/274619697: learntools requires a specific nbconvert right now +nbconvert==6.4.5 nbdev nilearn olefile @@ -115,13 +119,19 @@ qtconsole ray rgf-python s3fs -scikit-learn-intelex + # b/302136621: Fix eli5 import for learntools +scikit-learn==1.2.2 +# Scikit-learn accelerated library for x86 +scikit-learn-intelex>=2023.0.1 scikit-multilearn scikit-optimize scikit-plot scikit-surprise +# Also pinning seaborn for learntools +seaborn==0.12.2 git+https://github.com/facebookresearch/segment-anything.git -shap +# b/329869023 shap 0.45.0 breaks learntools +shap==0.44.1 squarify tensorflow-cloud tensorflow-io @@ -134,6 +144,7 @@ tsfresh vtk wandb wavio +# b/350573866: xgboost v2.1.0 breaks learntools xgboost==2.0.3 xvfbwrapper ydata-profiling From 77a3516e860a61491915626619427378c2fd27bd Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 3 Dec 2024 09:46:25 -0500 Subject: [PATCH 17/67] Include $PWD in PYTHONPATH http://b/365782129 --- Dockerfile.tmpl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 20769a74..23697c8d 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -122,8 +122,8 @@ RUN mkdir -p /root/.EasyOCR/model && \ RUN apt-get install tesseract-ocr -y ENV TESSERACT_PATH=/usr/bin/tesseract \ - # For Facets - PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ \ + # For Facets, we also include an empty path to include $PWD. + PYTHONPATH=:$PYTHONPATH:/opt/facets/facets_overview/python/ \ # For Theano with MKL MKL_THREADING_LAYER=GNU From 79353b5c213d280128f5cc2cf2a92bc8388e2d28 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 4 Dec 2024 12:45:59 -0500 Subject: [PATCH 18/67] b/381256047 lightgbm gpu install (#1451) --- Dockerfile.tmpl | 20 +++++++++++++++++++- Jenkinsfile | 23 +++++++++++++++++++++++ config.txt | 3 +++ packages/build_package | 8 ++------ tests/test_lightgbm.py | 2 -- 5 files changed, 47 insertions(+), 9 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 23697c8d..77f198a0 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -1,4 +1,11 @@ -FROM us-docker.pkg.dev/colab-images/public/runtime:latest +ARG BASE_IMAGE \ + BASE_IMAGE_TAG \ + LIGHTGBM_VERSION + +{{ if eq .Accelerator "gpu" }} +FROM gcr.io/kaggle-images/python-lightgbm-whl:${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl +{{ end }} +FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} ADD kaggle_requirements.txt /kaggle_requirements.txt @@ -49,6 +56,17 @@ RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MIN RUN uv pip install --system "pycuda" +# b/381256047 Remove once installed in Colabs base image. 
+# Install LightGBM +COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/ +# Install OpenCL (required by LightGBM GPU version) +RUN apt-get install -y ocl-icd-libopencl1 clinfo && \ + mkdir -p /etc/OpenCL/vendors && \ + echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ + uv pip install --system /tmp/lightgbm/*.whl && \ + rm -rf /tmp/lightgbm && \ + /tmp/clean-layer.sh + # Remove CUDA_VERSION from non-GPU image. {{ else }} ENV CUDA_VERSION="" diff --git a/Jenkinsfile b/Jenkinsfile index 4980b956..92332a02 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -21,6 +21,29 @@ pipeline { } stages { + stage('Pre-build Packages from Source') { + stages { + stage('lightgbm') { + options { + timeout(time: 10, unit: 'MINUTES') + } + steps { + sh '''#!/bin/bash + set -exo pipefail + source config.txt + cd packages/ + ./build_package --base-image $BASE_IMAGE:$BASE_IMAGE_TAG \ + --package lightgbm \ + --version $LIGHTGBM_VERSION \ + --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ + --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ + --push + ''' + } + } + } + } + stage('Build/Test/Diff') { parallel { stage('CPU') { diff --git a/config.txt b/config.txt index c0a7711c..b799aadb 100644 --- a/config.txt +++ b/config.txt @@ -1,2 +1,5 @@ +BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime +BASE_IMAGE_TAG=release-colab_20240920-060127_RC00 +LIGHTGBM_VERSION=4.5.0 CUDA_MAJOR_VERSION=12 CUDA_MINOR_VERSION=2 diff --git a/packages/build_package b/packages/build_package index 1e6a7f94..e0af53e2 100755 --- a/packages/build_package +++ b/packages/build_package @@ -115,12 +115,8 @@ if [[ -z "$DOCKERFILE" ]]; then exit 1 fi -# Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80` -TAG=${BASE_IMAGE/gcr.io\/deeplearning-platform-release\//} -# Keep only `python:v108` in `gcr.io/kaggle-images/python:v108` -TAG=${TAG/gcr.io\/kaggle-images\//} -# Replace the `:` in `tf2-gpu.2-6:m80` by `-` -TAG=${TAG/:/-} +# Keep only `release-colab_20240920-060127_RC00` in `us-docker.pkg.dev/colab-images/public/runtime:release-colab_20240920-060127_RC00` +TAG=$(echo $BASE_IMAGE | cut -d ':' -f 2) # Append the package version TAG=$TAG-$PACKAGE_VERSION # Add the gcr repo. diff --git a/tests/test_lightgbm.py b/tests/test_lightgbm.py index 7001a0a7..bcdbb1a6 100644 --- a/tests/test_lightgbm.py +++ b/tests/test_lightgbm.py @@ -34,9 +34,7 @@ def test_cpu(self): self.assertEqual(1, gbm.best_iteration) - # TODO(b/381256047): Colab needs to install GPU-enabled lightgbm. @gpu_test - @unittest.skip("Skipping this test until b/381256047 is resolved.") def test_gpu(self): lgb_train, lgb_eval = self.load_datasets() From adffc5d565c0b122a7d5d60c6cf1e1acad360f95 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 4 Dec 2024 16:46:16 -0500 Subject: [PATCH 19/67] Build tpu image with uv (#1452) This should significantly improve build times for this image. 
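
The envsubst expansion the new Dockerfile performs can be reproduced for
local inspection with a short sketch (the file path and seeded version
values are illustrative, mirroring tpu/config.txt):

# Sketch: expand ${VAR} placeholders in tpu/requirements.txt, roughly the
# way `envsubst < /kaggle_requirements.txt > /requirements.txt` does.
import os
import re

os.environ.setdefault("TORCH_VERSION", "2.4.0")  # illustrative value
os.environ.setdefault("JAX_VERSION", "0.4.23")   # illustrative value

with open("tpu/requirements.txt") as f:
    template = f.read()
expanded = re.sub(r"\$\{(\w+)\}", lambda m: os.environ.get(m.group(1), m.group(0)), template)
print(expanded)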
http://b/350047073 --- tpu/Dockerfile | 20 ++++++++++---------- tpu/requirements.txt | 45 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 10 deletions(-) create mode 100644 tpu/requirements.txt diff --git a/tpu/Dockerfile b/tpu/Dockerfile index b94619da..6a45fa60 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -56,16 +56,16 @@ RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y # Packages needed by the Notebook editor -# Additional useful packages should be added here - -RUN pip install tensorflow_hub https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TF_LINUX_WHEEL_VERSION}.whl tensorflow-probability tensorflow-io \ - torch~=${TORCH_VERSION} https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}+libtpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl torchvision==${TORCHVISION_VERSION} torchaudio==${TORCHAUDIO_VERSION} \ - jax[tpu]==${JAX_VERSION} -f https://storage.googleapis.com/jax-releases/libtpu_releases.html trax flax optax git+https://github.com/deepmind/dm-haiku jraph distrax \ - papermill jupyterlab python-lsp-server[all] "jupyter-lsp==1.5.1" \ - pandas matplotlib opencv-python-headless librosa accelerate diffusers scikit-learn transformers \ - seaborn timm albumentations einops pyarrow fastparquet opencv-python \ - "keras>3" keras-cv keras-nlp \ - kagglehub && \ +# Additional useful packages should be added in the requirements.txt + +# Bring in the requirements.txt and replace variables in it: +RUN apt-get install -y gettext +ADD tpu/requirements.txt /kaggle_requirements.txt +RUN envsubst < /kaggle_requirements.txt > /requirements.txt + +# Install uv and then install the requirements: +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt --prerelease=allow --find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html && \ /tmp/clean-layer.sh # Tensorflow libtpu: diff --git a/tpu/requirements.txt b/tpu/requirements.txt new file mode 100644 index 00000000..99db64a7 --- /dev/null +++ b/tpu/requirements.txt @@ -0,0 +1,45 @@ +# Tensorflow packages +https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TF_LINUX_WHEEL_VERSION}.whl +tensorflow_hub +tensorflow-io +tensorflow-probability +# Torch packages +torch~=${TORCH_VERSION} +https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}+libtpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +torchaudio==${TORCHAUDIO_VERSION} +torchvision==${TORCHVISION_VERSION} +# Jax packages +jax[tpu]==${JAX_VERSION} +distrax +flax +git+https://github.com/deepmind/dm-haiku +jraph +optax +trax +# Jupyter packages +jupyter-lsp==1.5.1 +jupyterlab +papermill +python-lsp-server[all] +# Keras Packages +keras>3 +keras-cv +keras-nlp +# Kaggle Packages +kagglehub +# Other useful packages, add more here +accelerate +albumentations +diffusers +einops +fastparquet +librosa +matplotlib +opencv-python +opencv-python-headless +pandas +pyarrow +scikit-learn +seaborn +timm +transformers From c9bf1ddb10d24904e09ba0360912b2d35b52456f Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 4 Dec 2024 23:43:11 -0500 Subject: [PATCH 20/67] Split 
build and test parallel steps (#1453) This speeds up e2e pipeline time since all three builds take around 10-11m now and tests take about 5m, making those two phases faster. --- Jenkinsfile | 109 ++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 92332a02..97d9b4bb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -61,22 +61,6 @@ pipeline { ''' } } - stage('Test CPU Image') { - options { - timeout(time: 15, unit: 'MINUTES') - } - steps { - retry(2) { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} - ''' - } - } - } stage('Diff CPU image') { steps { sh '''#!/bin/bash @@ -113,44 +97,6 @@ pipeline { ''' } } - stage('Test GPU Image') { - stages { - stage('Test on P100') { - agent { label 'ephemeral-linux-gpu' } - options { - timeout(time: 40, unit: 'MINUTES') - } - steps { - retry(2) { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' - } - } - } - stage('Test on T4x2') { - agent { label 'ephemeral-linux-gpu-t4x2' } - options { - timeout(time: 60, unit: 'MINUTES') - } - steps { - retry(2) { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' - } - } - } - } - } stage('Diff GPU Image') { steps { sh '''#!/bin/bash @@ -193,6 +139,61 @@ pipeline { } } + stage('Test') { + parallel { + stage('Test CPU Image') { + options { + timeout(time: 15, unit: 'MINUTES') + } + steps { + retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} + ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} + ''' + } + } + } + stage('Test on P100') { + agent { label 'ephemeral-linux-gpu' } + options { + timeout(time: 40, unit: 'MINUTES') + } + steps { + retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + } + stage('Test on T4x2') { + agent { label 'ephemeral-linux-gpu-t4x2' } + options { + timeout(time: 60, unit: 'MINUTES') + } + steps { + retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + } + } + } + stage('Label CPU/GPU Staging Images') { steps { sh '''#!/bin/bash From 1e8d192cab8a5ee81a47774e329ca284c9513f41 Mon Sep 17 00:00:00 2001 From: psbang Date: Mon, 16 Dec 2024 12:23:46 -0500 Subject: [PATCH 21/67] Add patch for google_genai (#1454) --- patches/sitecustomize.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/patches/sitecustomize.py b/patches/sitecustomize.py index ea47698b..e8afb361 100644 --- a/patches/sitecustomize.py +++ b/patches/sitecustomize.py @@ -117,3 +117,30 @@ def new_configure(*args, **kwargs): module.configure = new_configure module.configure() # generativeai can use GOOGLE_API_KEY env variable, so make sure we have the other configs set + +@wrapt.when_imported('google.genai') +def post_genai_import_logic(module): + if os.getenv('KAGGLE_DISABLE_GOOGLE_GENERATIVE_AI_INTEGRATION'): + return + + if not 
(os.getenv('KAGGLE_DATA_PROXY_TOKEN') and + os.getenv('KAGGLE_USER_SECRETS_TOKEN') and + os.getenv('KAGGLE_DATA_PROXY_URL')): + return + @wrapt.patch_function_wrapper(module, 'Client.__init__') + def init_wrapper(wrapped, instance, args, kwargs): + # Don't want to forward requests that are to Vertex AI, debug mode, or have their own http_options specified + # Thus, if the client constructor contains any params other than api_key, we don't set up forwarding + if any(value is not None for key, value in kwargs.items() if key != 'api_key'): + return wrapped(*args, **kwargs) + + default_metadata = { + "x-kaggle-proxy-data": os.environ['KAGGLE_DATA_PROXY_TOKEN'], + 'x-kaggle-authorization': f"Bearer {os.environ['KAGGLE_USER_SECRETS_TOKEN']}" + } + http_options = { + 'base_url': os.getenv('KAGGLE_DATA_PROXY_URL') + '/palmapi/', + 'headers': default_metadata + } + kwargs['http_options'] = http_options + return wrapped(*args, **kwargs) From 18a67b0b430aaaa5bdd60bc30f4e86b6a9b5e8d5 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Mon, 16 Dec 2024 13:11:57 -0500 Subject: [PATCH 22/67] Update instructions to point to kaggle_requirements.txt --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 387dcf89..315e7db2 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ If you the first step above doesn't work for your use case, [open an issue](http ## Opening a pull request -1. Edit the [Dockerfile](Dockerfile.tmpl). +1. Edit [kaggle_requirements.txt](kaggle_requirements.txt). 1. Follow the instructions below to build a new image. 1. Add tests for your new package. See this [example](https://github.com/Kaggle/docker-python/blob/main/tests/test_fastai.py). 1. Follow the instructions below to test the new image. From 2cb0a25b24d90807b216aa521d25c63d28c6f3bb Mon Sep 17 00:00:00 2001 From: psbang Date: Mon, 16 Dec 2024 15:35:02 -0500 Subject: [PATCH 23/67] Add torchtune to docker image packages (#1456) --- kaggle_requirements.txt | 1 + tests/test_torchtune.py | 9 +++++++++ 2 files changed, 10 insertions(+) create mode 100644 tests/test_torchtune.py diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 30f819c5..8379d0f2 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -140,6 +140,7 @@ tensorflow_decision_forests timm torchinfo torchmetrics +torchtune tsfresh vtk wandb diff --git a/tests/test_torchtune.py b/tests/test_torchtune.py new file mode 100644 index 00000000..b2090ecc --- /dev/null +++ b/tests/test_torchtune.py @@ -0,0 +1,9 @@ +import unittest + +import subprocess + +class TestTorchtune(unittest.TestCase): + def test_help(self): + ret_code = subprocess.run(["tune", "--help"]) + self.assertEqual(0, ret_code.returncode) + self.assertIsNone(ret_code.stderr) \ No newline at end of file From c6e40876bedf778616e032d25cf54a0e2b8fb654 Mon Sep 17 00:00:00 2001 From: psbang Date: Mon, 16 Dec 2024 15:51:30 -0500 Subject: [PATCH 24/67] Add google-genai to docker image (#1455) --- kaggle_requirements.txt | 1 + tests/test_google_genai_patch.py | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 tests/test_google_genai_patch.py diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 8379d0f2..a73ff406 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -48,6 +48,7 @@ google-cloud-automl==1.0.1 google-cloud-translate==3.12.1 google-cloud-videointelligence google-cloud-vision +google-genai gpxpy h2o haversine diff --git a/tests/test_google_genai_patch.py 
b/tests/test_google_genai_patch.py new file mode 100644 index 00000000..9d225763 --- /dev/null +++ b/tests/test_google_genai_patch.py @@ -0,0 +1,55 @@ +import json +import unittest +import threading + +from test.support.os_helper import EnvironmentVarGuard +from urllib.parse import urlparse + +from http.server import BaseHTTPRequestHandler, HTTPServer + +class HTTPHandler(BaseHTTPRequestHandler): + called = False + path = None + headers = {} + + def do_HEAD(self): + self.send_response(200) + + def do_POST(self): + HTTPHandler.path = self.path + HTTPHandler.headers = self.headers + HTTPHandler.called = True + self.send_response(200) + self.send_header("Content-type", "application/json") + self.end_headers() + +class TestGoogleGenAiPatch(unittest.TestCase): + endpoint = "http://127.0.0.1/" + + def test_proxy_enabled(self): + env = EnvironmentVarGuard() + secrets_token = "secrets_token" + proxy_token = "proxy_token" + env.set("KAGGLE_USER_SECRETS_TOKEN", secrets_token) + env.set("KAGGLE_DATA_PROXY_TOKEN", proxy_token) + env.set("KAGGLE_DATA_PROXY_URL", self.endpoint) + server_address = urlparse(self.endpoint) + with env: + with HTTPServer((server_address.hostname, server_address.port), HTTPHandler) as httpd: + threading.Thread(target=httpd.serve_forever).start() + from google import genai + api_key = "NotARealAPIKey" + client = genai.Client(api_key = api_key) + try: + client.models.generate_content( + model="gemini-2.0-flash-exp", + contents="What's the largest planet in our solar system?" + ) + except: + pass + httpd.shutdown() + self.assertTrue(HTTPHandler.called) + self.assertIn("/palmapi", HTTPHandler.path) + self.assertEqual(proxy_token, HTTPHandler.headers["x-kaggle-proxy-data"]) + self.assertEqual("Bearer {}".format(secrets_token), HTTPHandler.headers["x-kaggle-authorization"]) + self.assertEqual(api_key, HTTPHandler.headers["x-goog-api-key"]) From 2a33c9f40ef67089a0f2eafa277ff94a5ff85834 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 17 Dec 2024 12:13:36 -0500 Subject: [PATCH 25/67] Fix LD_LIBRARY_PATH This was overwriting it to be empty which was breaking GPU function. --- Dockerfile.tmpl | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 77f198a0..c12a2976 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -202,8 +202,6 @@ ENV GIT_COMMIT=${GIT_COMMIT} \ RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date {{ if eq .Accelerator "gpu" }} -# Remove the CUDA stubs. -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" \ # Add the CUDA home. CUDA_HOME=/usr/local/cuda {{ end }} From 00192f2d02aeb5f9afa6356e6a9f561e2a622e31 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 17 Dec 2024 12:18:58 -0500 Subject: [PATCH 26/67] Fix CUDA_HOME env var --- Dockerfile.tmpl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index c12a2976..4d1cbc2f 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -202,7 +202,7 @@ ENV GIT_COMMIT=${GIT_COMMIT} \ RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date {{ if eq .Accelerator "gpu" }} - # Add the CUDA home. - CUDA_HOME=/usr/local/cuda +# Add the CUDA home. 
+ENV CUDA_HOME=/usr/local/cuda {{ end }} ENTRYPOINT ["/usr/bin/env"] From e4c8674c32deee5d870dc8ddff847450b9b89bdd Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 17 Dec 2024 15:07:49 -0500 Subject: [PATCH 27/67] newer fastcore needed to match nbdev http://b/384745227 --- kaggle_requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index a73ff406..feab427f 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -33,6 +33,7 @@ docker easyocr eli5 emoji +fastcore>=1.7.20 fasttext featuretools fiona From 02fabfde6934e1067ef24408139b4954a9eb9e81 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 18 Dec 2024 10:03:57 -0500 Subject: [PATCH 28/67] Fix old version of hf datasets (#1457) Was failing tests with "Loading a dataset cached in a LocalFileSystem is not supported" --- kaggle_requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index feab427f..fdf659ab 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -25,7 +25,9 @@ cesium comm cytoolz dask-expr -datasets +# Older versions of datasets fail with "Loading a dataset cached in a LocalFileSystem is not supported" +# https://stackoverflow.com/questions/77433096/notimplementederror-loading-a-dataset-cached-in-a-localfilesystem-is-not-suppor +datasets>=2.14.6 datashader deap dipy From 54527323533b2b1755647f1ee88cfd467ee7374e Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 18 Dec 2024 22:37:06 -0500 Subject: [PATCH 29/67] Explicitly add notebook pkg to tpu vm http://b/384994695 --- tpu/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tpu/requirements.txt b/tpu/requirements.txt index 99db64a7..2b76f040 100644 --- a/tpu/requirements.txt +++ b/tpu/requirements.txt @@ -19,6 +19,7 @@ trax # Jupyter packages jupyter-lsp==1.5.1 jupyterlab +notebook papermill python-lsp-server[all] # Keras Packages From 9de11420a3ad9267fe52c2bef95478926f0521da Mon Sep 17 00:00:00 2001 From: Dustin H Date: Fri, 20 Dec 2024 12:48:18 -0500 Subject: [PATCH 30/67] Install cuml-cu12 (#1459) http://b/381287748 --- kaggle_requirements.txt | 2 ++ tests/test_cuml.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index fdf659ab..1db55ced 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -1,4 +1,5 @@ # Please keep this in alphabetical order +--extra-index-url https://pypi.nvidia.com Altair>=5.4.0 Babel Boruta @@ -23,6 +24,7 @@ catboost category-encoders cesium comm +cuml-cu12 cytoolz dask-expr # Older versions of datasets fail with "Loading a dataset cached in a LocalFileSystem is not supported" diff --git a/tests/test_cuml.py b/tests/test_cuml.py index 695e47ca..bbb7f0c6 100644 --- a/tests/test_cuml.py +++ b/tests/test_cuml.py @@ -6,7 +6,6 @@ class TestCuml(unittest.TestCase): @gpu_test @p100_exempt # b/342143152: cuML(>=24.4v) is inompatible with p100 GPUs. 
- @unittest.skip("b/381287748 cuML is not installed in Colab.") def test_pca_fit_transform(self): import unittest import numpy as np From 3cba8096dd4524cf902034639631b3d2087c82e8 Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Fri, 3 Jan 2025 15:02:42 +0000 Subject: [PATCH 31/67] bump tpu jax/torch versions, add tpu-info --- tpu/Dockerfile | 2 +- tpu/config.txt | 9 ++++----- tpu/requirements.txt | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tpu/Dockerfile b/tpu/Dockerfile index 6a45fa60..ce3154db 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -11,7 +11,6 @@ ARG TORCH_LINUX_WHEEL_VERSION ARG TORCH_VERSION ARG TENSORFLOW_VERSION ARG TF_LIBTPU_VERSION -ARG JAX_VERSION ARG TORCHVISION_VERSION ARG TORCHAUDIO_VERSION @@ -67,6 +66,7 @@ RUN envsubst < /kaggle_requirements.txt > /requirements.txt RUN curl -LsSf https://astral.sh/uv/install.sh | sh RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt --prerelease=allow --find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html && \ /tmp/clean-layer.sh +ENV PATH="${HOME}/.local/bin:${PATH}" # Tensorflow libtpu: RUN curl --output /usr/local/lib/python3.10/site-packages/libtpu/libtpu.so https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/${TF_LIBTPU_VERSION}/libtpu.so diff --git a/tpu/config.txt b/tpu/config.txt index 4ce1c196..357c8147 100644 --- a/tpu/config.txt +++ b/tpu/config.txt @@ -6,12 +6,11 @@ PYTHON_VERSION_PATH=python3.10 TENSORFLOW_VERSION=2.16.1 TF_LIBTPU_VERSION=1.10.1 TF_LINUX_WHEEL_VERSION=manylinux_2_17_x86_64.manylinux2014_x86_64 -JAX_VERSION=0.4.23 -# gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep -v -E ".*rc[0-9].*" +# gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep torch_xla | grep -v -E ".*rc[0-9].*" | sed 's/.*torch_xla-\(.*\)+libtpu.*/\1/' | sort -rV # Supports nightly -TORCH_VERSION=2.4.0 +TORCH_VERSION=2.5.0 # https://github.com/pytorch/audio supports nightly -TORCHAUDIO_VERSION=2.4.0 +TORCHAUDIO_VERSION=2.5.0 # https://github.com/pytorch/vision supports nightly -TORCHVISION_VERSION=0.19.0 +TORCHVISION_VERSION=0.20.0 TORCH_LINUX_WHEEL_VERSION=manylinux_2_28_x86_64 diff --git a/tpu/requirements.txt b/tpu/requirements.txt index 2b76f040..4776f3fe 100644 --- a/tpu/requirements.txt +++ b/tpu/requirements.txt @@ -1,3 +1,5 @@ +# TPU Utils +tpu-info # Tensorflow packages https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TF_LINUX_WHEEL_VERSION}.whl tensorflow_hub @@ -9,7 +11,7 @@ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TOR torchaudio==${TORCHAUDIO_VERSION} torchvision==${TORCHVISION_VERSION} # Jax packages -jax[tpu]==${JAX_VERSION} +jax[tpu]>=0.4.34 distrax flax git+https://github.com/deepmind/dm-haiku From 1ddb233d2c85c0a28356c658886d907f99880131 Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Fri, 3 Jan 2025 16:27:44 +0000 Subject: [PATCH 32/67] upgrade tpu tf to 2.18.0 --- tpu/Dockerfile | 10 ---------- tpu/config.txt | 4 ---- tpu/requirements.txt | 4 ++-- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/tpu/Dockerfile b/tpu/Dockerfile index ce3154db..eb39a9d6 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -6,11 +6,8 @@ FROM $BASE_IMAGE # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact ARG PYTHON_WHEEL_VERSION ARG PYTHON_VERSION_PATH -ARG 
TF_LINUX_WHEEL_VERSION ARG TORCH_LINUX_WHEEL_VERSION ARG TORCH_VERSION -ARG TENSORFLOW_VERSION -ARG TF_LIBTPU_VERSION ARG TORCHVISION_VERSION ARG TORCHAUDIO_VERSION @@ -68,17 +65,11 @@ RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requ /tmp/clean-layer.sh ENV PATH="${HOME}/.local/bin:${PATH}" -# Tensorflow libtpu: -RUN curl --output /usr/local/lib/python3.10/site-packages/libtpu/libtpu.so https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/${TF_LIBTPU_VERSION}/libtpu.so - # Kaggle Model Hub patches: ADD patches/kaggle_module_resolver.py /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/kaggle_module_resolver.py RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py RUN sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py -# Monkey-patch the default TPU to the local (TPU VM). -RUN sed -i 's/tpu=None,/tpu="local",/' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py - # Set these env vars so that they don't produce errs calling the metadata server to load them: ENV TPU_ACCELERATOR_TYPE=v3-8 ENV TPU_PROCESS_ADDRESSES=local @@ -92,7 +83,6 @@ LABEL build-date=$BUILD_DATE ENV GIT_COMMIT=${GIT_COMMIT} ENV BUILD_DATE=${BUILD_DATE} -LABEL tensorflow-version=$TENSORFLOW_VERSION LABEL kaggle-lang=python # Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. diff --git a/tpu/config.txt b/tpu/config.txt index 357c8147..6dc04ed4 100644 --- a/tpu/config.txt +++ b/tpu/config.txt @@ -1,10 +1,6 @@ BASE_IMAGE=python:3.10 PYTHON_WHEEL_VERSION=cp310 PYTHON_VERSION_PATH=python3.10 -# gsutil ls gs://cloud-tpu-tpuvm-artifacts/tensorflow -# https://cloud.google.com/tpu/docs/supported-tpu-configurations#libtpu_versions -TENSORFLOW_VERSION=2.16.1 -TF_LIBTPU_VERSION=1.10.1 TF_LINUX_WHEEL_VERSION=manylinux_2_17_x86_64.manylinux2014_x86_64 # gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep torch_xla | grep -v -E ".*rc[0-9].*" | sed 's/.*torch_xla-\(.*\)+libtpu.*/\1/' | sort -rV # Supports nightly diff --git a/tpu/requirements.txt b/tpu/requirements.txt index 4776f3fe..4aacad18 100644 --- a/tpu/requirements.txt +++ b/tpu/requirements.txt @@ -1,7 +1,8 @@ # TPU Utils tpu-info # Tensorflow packages -https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TF_LINUX_WHEEL_VERSION}.whl +tensorflow-tpu>=2.18.0 +-f https://storage.googleapis.com/libtpu-tf-releases/index.html tensorflow_hub tensorflow-io tensorflow-probability @@ -36,7 +37,6 @@ albumentations diffusers einops fastparquet -librosa matplotlib opencv-python opencv-python-headless From 083bc20f00eda74a422ab91b9c18de7a80806d07 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Fri, 3 Jan 2025 14:54:47 -0500 Subject: [PATCH 33/67] Update Colab base image to release-colab_20241217-060132_RC00 (#1458) http://b/385145217 --- Dockerfile.tmpl | 10 +++++++++- config.txt | 2 +- kaggle_requirements.txt | 4 +++- tests/test_numpy.py | 17 +++++++++++++---- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 
4d1cbc2f..bf8b3c03 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -32,8 +32,16 @@ RUN uv pip uninstall --system google-cloud-bigquery-storage # to avoid affecting the larger build, we'll post-install it. RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/learntools" +# b/385161357 Latest Colab uses tf 2.17.1, but tf decision forests only has a version for 2.17.0. +# Instead, we'll install tfdf with its deps and hope that 2.17.0 compat tfdf works with tf 2.17.1. +RUN uv pip install --system --no-deps tensorflow-decision-forests==1.10.0 wurlitzer==3.1.1 ydf==0.9.0 + +# b/385145217 Latest Colab lacks mkl numpy, install it. +RUN uv pip install --system --force-reinstall -i https://pypi.anaconda.org/intel/simple numpy + # b/328788268 We install an incompatible pair of libs (shapely<2, libpysal==4.9.2) so we can't put this one in the requirements.txt -RUN uv pip install --system "libpysal==4.9.2" +# newer daal4py requires tbb>=2022, but libpysal is downgrading it for some reason +RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2" # Adding non-package dependencies: diff --git a/config.txt b/config.txt index b799aadb..1da5b25b 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime -BASE_IMAGE_TAG=release-colab_20240920-060127_RC00 +BASE_IMAGE_TAG=release-colab_20241217-060132_RC00 LIGHTGBM_VERSION=4.5.0 CUDA_MAJOR_VERSION=12 CUDA_MINOR_VERSION=2 diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 1db55ced..e2c2475e 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -85,6 +85,7 @@ libpysal<=4.9.2 lime line_profiler mamba +matplotlib<3.8 mlcrate mne mpld3 @@ -142,7 +143,8 @@ squarify tensorflow-cloud tensorflow-io tensorflow-text -tensorflow_decision_forests +# b/385161357: tf 2.17.1 does not have matching tensorflow_decision_forests release +# tensorflow_decision_forests timm torchinfo torchmetrics diff --git a/tests/test_numpy.py b/tests/test_numpy.py index 071c3d30..948455ea 100644 --- a/tests/test_numpy.py +++ b/tests/test_numpy.py @@ -3,9 +3,10 @@ from distutils.version import StrictVersion import numpy as np -from numpy.distutils.system_info import get_info +import io +from contextlib import redirect_stdout -class TestNumpy(unittest.TestCase): +class TestNumpy(unittest.TestCase): def test_version(self): # b/370860329: newer versions are not capable with current tensorflow self.assertEqual(StrictVersion(np.__version__), StrictVersion("1.26.4")) @@ -18,5 +19,13 @@ def test_array(self): # Numpy must be linked to the MKL. (Occasionally, a third-party package will muck up the installation # and numpy will be reinstalled with an OpenBLAS backing.) def test_mkl(self): - # This will throw an exception if the MKL is not linked correctly or return an empty dict. - self.assertTrue(get_info("blas_mkl")) + try: + from numpy.distutils.system_info import get_info + # This will throw an exception if the MKL is not linked correctly or return an empty dict. 
+ self.assertTrue(get_info("blas_mkl")) + except: + # Fallback to check if mkl is present via show_config() + config_out = io.StringIO() + with redirect_stdout(config_out): + np.show_config() + self.assertIn("mkl_rt", config_out.getvalue()) From 55c18210abfa1f3e2b75e9ffd25950ebfbbb6694 Mon Sep 17 00:00:00 2001 From: goeffthomas Date: Mon, 6 Jan 2025 09:25:24 -0800 Subject: [PATCH 34/67] Bump `kagglehub` and add optional deps (#1460) Per @jplotts, I'll wait to merge this until after the holidays http://b/379756505 --- kaggle_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index e2c2475e..9d78d8ad 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -70,7 +70,7 @@ jupyter_server==2.12.5 jupyterlab jupyterlab-lsp kaggle-environments -kagglehub>=0.3.4 +kagglehub[pandas-datasets,hf-datasets]>=0.3.6 # Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data(): # See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a keras<3.6 From ffa7d0d6752e2fca55ea39fbe27bac2f4659f685 Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Mon, 13 Jan 2025 18:31:05 +0000 Subject: [PATCH 35/67] fixing tf 2.18 install --- tpu/Dockerfile | 5 +++-- tpu/config.txt | 2 +- tpu/requirements.txt | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tpu/Dockerfile b/tpu/Dockerfile index eb39a9d6..b1f81bbd 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -6,6 +6,7 @@ FROM $BASE_IMAGE # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact ARG PYTHON_WHEEL_VERSION ARG PYTHON_VERSION_PATH +ARG TENSORFLOW_VERSION ARG TORCH_LINUX_WHEEL_VERSION ARG TORCH_VERSION ARG TORCHVISION_VERSION @@ -61,9 +62,9 @@ RUN envsubst < /kaggle_requirements.txt > /requirements.txt # Install uv and then install the requirements: RUN curl -LsSf https://astral.sh/uv/install.sh | sh -RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt --prerelease=allow --find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html && \ +RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt --prerelease=allow --force-reinstall && \ /tmp/clean-layer.sh -ENV PATH="${HOME}/.local/bin:${PATH}" +ENV PATH="~/.local/bin:${PATH}" # Kaggle Model Hub patches: ADD patches/kaggle_module_resolver.py /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/kaggle_module_resolver.py diff --git a/tpu/config.txt b/tpu/config.txt index 6dc04ed4..0bd198ac 100644 --- a/tpu/config.txt +++ b/tpu/config.txt @@ -1,7 +1,7 @@ BASE_IMAGE=python:3.10 PYTHON_WHEEL_VERSION=cp310 PYTHON_VERSION_PATH=python3.10 -TF_LINUX_WHEEL_VERSION=manylinux_2_17_x86_64.manylinux2014_x86_64 +TENSORFLOW_VERSION=2.18.0 # gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep torch_xla | grep -v -E ".*rc[0-9].*" | sed 's/.*torch_xla-\(.*\)+libtpu.*/\1/' | sort -rV # Supports nightly TORCH_VERSION=2.5.0 diff --git a/tpu/requirements.txt b/tpu/requirements.txt index 4aacad18..ec6fb273 100644 --- a/tpu/requirements.txt +++ b/tpu/requirements.txt @@ -1,8 +1,8 @@ # TPU Utils tpu-info # Tensorflow packages -tensorflow-tpu>=2.18.0 --f https://storage.googleapis.com/libtpu-tf-releases/index.html +tensorflow-tpu>=${TENSORFLOW_VERSION} +--find-links https://storage.googleapis.com/libtpu-tf-releases/index.html tensorflow_hub tensorflow-io tensorflow-probability @@ -13,6 +13,7 @@ 
torchaudio==${TORCHAUDIO_VERSION} torchvision==${TORCHVISION_VERSION} # Jax packages jax[tpu]>=0.4.34 +--find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html distrax flax git+https://github.com/deepmind/dm-haiku From c170f0a204bcbe6cfcc52ef86a303784fbac97ad Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Mon, 13 Jan 2025 21:40:16 +0000 Subject: [PATCH 36/67] use nvidia runtime for tests --- test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test b/test index c2748e81..574b49e3 100755 --- a/test +++ b/test @@ -28,7 +28,7 @@ while :; do ;; -g|--gpu) IMAGE_TAG='kaggle/python-gpu-build' - ADDITONAL_OPTS='-v /tmp/empty_dir:/usr/local/cuda/lib64/stubs:ro' + ADDITONAL_OPTS='--runtime nvidia -v /tmp/empty_dir:/usr/local/cuda/lib64/stubs:ro' ;; -i|--image) if [[ -z $2 ]]; then From e47c7efeac1abae100d0a9d1fe2ba994d09dbc3e Mon Sep 17 00:00:00 2001 From: Dustin H Date: Tue, 14 Jan 2025 09:49:09 -0500 Subject: [PATCH 37/67] Prefer building the TPU VM on a CPU agent Hopefully this forces it to build on a newer node. --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index 97d9b4bb..ea2cb9be 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -110,6 +110,7 @@ pipeline { } } stage('TPU VM') { + agent { label 'ephemeral-linux' } stages { stage('Build TPU VM Image') { options { From 6050d0dac696d4adb8af048bc4de994b7d747f19 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Fri, 17 Jan 2025 15:42:31 -0500 Subject: [PATCH 38/67] Try to make tf 2.18 on tpu vm install reliably (#1469) I'm seeing it randomly pick the CUDA enabled version for some reason. Trying a force reinstall while maintaining existing packages to see if that helps. --- tpu/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tpu/Dockerfile b/tpu/Dockerfile index b1f81bbd..923ad2d5 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -66,6 +66,12 @@ RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requ /tmp/clean-layer.sh ENV PATH="~/.local/bin:${PATH}" +# Try to force tensorflow to reliably install without breaking other installed deps +RUN export PATH="${HOME}/.local/bin:${PATH}" && \ + uv pip freeze --system > /tmp/constraints.txt && \ + uv pip install --system -c /tmp/constraints.txt tensorflow-tpu -f https://storage.googleapis.com/libtpu-tf-releases/index.html --force-reinstall && \ + rm /tmp/constraints.txt + # Kaggle Model Hub patches: ADD patches/kaggle_module_resolver.py /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/kaggle_module_resolver.py RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py From 738fade33bb4de2fc22fe2af8bf2748abc0075a5 Mon Sep 17 00:00:00 2001 From: Johnny Chavez <64660690+calderjo@users.noreply.github.com> Date: Tue, 4 Feb 2025 16:01:54 -0800 Subject: [PATCH 39/67] Fix kagglehub (#1471) UV requires a separate flag for allowing prerelease packages to be installed. kagglehub install fails due to this, since sigstore (a dependency) needs a prerelease package. let's install it separate to allow prerelease installs. 
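A minimal sketch of the failure mode described above (commands
illustrative, output paraphrased rather than captured from a real run):

    uv pip install --system kagglehub
    # resolution fails: a sigstore dependency is only published as a pre-release
    uv pip install --system --prerelease=allow kagglehub
    # succeeds: the resolver may now select pre-release versions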
https://b.corp.google.com/issues/394382016
---
 Dockerfile.tmpl         | 3 +++
 kaggle_requirements.txt | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index bf8b3c03..1d146441 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -28,6 +28,9 @@ ENV PATH="~/.local/bin:${PATH}"
 # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data.
 RUN uv pip uninstall --system google-cloud-bigquery-storage
 
+# b/394382016: sigstore (dependency of kagglehub) requires a prerelease packages, installing separate.
+RUN uv pip install --system --force-reinstall --prerelease=allow kagglehub[pandas-datasets,hf-datasets]>=0.3.7
+
 # uv cannot install this in requirements.txt without --no-build-isolation
 # to avoid affecting the larger build, we'll post-install it.
 RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/learntools"
diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt
index 9d78d8ad..6bb69e62 100644
--- a/kaggle_requirements.txt
+++ b/kaggle_requirements.txt
@@ -70,7 +70,6 @@ jupyter_server==2.12.5
 jupyterlab
 jupyterlab-lsp
 kaggle-environments
-kagglehub[pandas-datasets,hf-datasets]>=0.3.6
 # Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data():
 # See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a
 keras<3.6

From a0906f3cd6ccec655800f0598bc64337675d10b4 Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Tue, 18 Feb 2025 14:25:51 -0800
Subject: [PATCH 40/67] Add kagglehub optional signing features (#1473)

ditto title
---
 Dockerfile.tmpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index 1d146441..ce72ab9e 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -29,7 +29,7 @@ ENV PATH="~/.local/bin:${PATH}"
 RUN uv pip uninstall --system google-cloud-bigquery-storage
 
 # b/394382016: sigstore (dependency of kagglehub) requires a prerelease packages, installing separate.
-RUN uv pip install --system --force-reinstall --prerelease=allow kagglehub[pandas-datasets,hf-datasets]>=0.3.7
+RUN uv pip install --system --force-reinstall --prerelease=allow kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.9

From f2cdc779f33d5626352a533ea04fbfa6a832b774 Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Fri, 4 Apr 2025 12:50:29 -0700
Subject: [PATCH 41/67] Update Colab Base image to colab_20250219-060225_RC01 (#1475)

We are upgrading the base image to the latest release image from Colab:
colab_20250219-060225_RC01, which includes the following upgrades:

TF 2.18
Python 3.11
CUDA 12.5

This PR includes a handful of fixes to resolve conflicts related to these
upgrades, notably issues pertaining to torch and cudnn. We also bumped the
lightgbm version, and included a fix for the tune CLI package conflict.
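To make the tune CLI conflict concrete (a sketch; both packages register a
`tune` console script, so the last one installed owns the entry point):

    uv pip install --system ray torchtune
    tune --help   # answers as whichever package wrote the script last

Force-reinstalling torchtune with --no-deps, as the diff below does,
ensures torchtune's CLI wins without disturbing ray's other files.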
--- Dockerfile.tmpl | 31 +++++++++++++++++++++++-------- config.txt | 6 +++--- kaggle_requirements.txt | 19 +++---------------- tests/test_torchtune.py | 9 +++++---- 4 files changed, 34 insertions(+), 31 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index ce72ab9e..4b3dfee5 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -31,29 +31,41 @@ RUN uv pip uninstall --system google-cloud-bigquery-storage # b/394382016: sigstore (dependency of kagglehub) requires a prerelease packages, installing separate. RUN uv pip install --system --force-reinstall --prerelease=allow kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.9 +# b/408284143: google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 + +# b/408284435: Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data() +# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a +# Colab base is on Keras 3.8, we have to install the package separately +RUN uv pip install --system google-cloud-automl==1.0.1 google-cloud-aiplatform google-cloud-translate==3.12.1 \ + google-cloud-videointelligence google-cloud-vision google-genai "keras<3.6" + # uv cannot install this in requirements.txt without --no-build-isolation # to avoid affecting the larger build, we'll post-install it. RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/learntools" -# b/385161357 Latest Colab uses tf 2.17.1, but tf decision forests only has a version for 2.17.0. -# Instead, we'll install tfdf with its deps and hope that 2.17.0 compat tfdf works with tf 2.17.1. -RUN uv pip install --system --no-deps tensorflow-decision-forests==1.10.0 wurlitzer==3.1.1 ydf==0.9.0 +# b/408281617: Torch is adamant that it can not install cudnn 9.3.x, only 9.1.x, but Tensorflow can only support 9.3.x. +# This conflict causes a number of package downgrades, which are handled in this command +RUN uv pip install --system --force-reinstall --extra-index-url https://pypi.nvidia.com pynvjitlink-cu12 cuml-cu12==25.2.1 \ + nvidia-cudnn-cu12==9.3.0.75 scipy tsfresh +RUN uv pip install --system --force-reinstall pynvjitlink-cu12==0.5.2 # b/385145217 Latest Colab lacks mkl numpy, install it. RUN uv pip install --system --force-reinstall -i https://pypi.anaconda.org/intel/simple numpy -# b/328788268 We install an incompatible pair of libs (shapely<2, libpysal==4.9.2) so we can't put this one in the requirements.txt # newer daal4py requires tbb>=2022, but libpysal is downgrading it for some reason RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2" +# b/404590350: Ray and torchtune have conflicting tune cli, we will prioritize torchtune. +RUN uv pip install --system --force-reinstall --no-deps torchtune + # Adding non-package dependencies: ADD clean-layer.sh /tmp/clean-layer.sh ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl ADD patches/template_conf.json /opt/kaggle/conf.json -# /opt/conda/lib/python3.10/site-packages -ARG PACKAGE_PATH=/usr/local/lib/python3.10/dist-packages +# /opt/conda/lib/python3.11/site-packages +ARG PACKAGE_PATH=/usr/local/lib/python3.11/dist-packages # Install GPU-specific non-pip packages. 
{{ if eq .Accelerator "gpu" }} @@ -108,6 +120,9 @@ RUN apt-get install -y libfreetype6-dev && \ apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing # NLTK Project datasets +# b/408298750: We currently reinstall the package, because we get the following error: +# `AttributeError: module 'inspect' has no attribute 'formatargspec'. Did you mean: 'formatargvalues'?` +RUN uv pip install --system --force-reinstall "nltk>=3.9.1" RUN mkdir -p /usr/share/nltk_data && \ # NLTK Downloader no longer continues smoothly after an error, so we explicitly list # the corpuses that work @@ -120,7 +135,7 @@ RUN mkdir -p /usr/share/nltk_data && \ masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \ mte_teip5 names nps_chat omw opinion_lexicon paradigms \ pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \ - pros_cons ptb punkt qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ + pros_cons ptb punkt punkt_tab qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \ state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ @@ -198,7 +213,7 @@ ADD patches/kaggle_gcp.py \ # Figure out why this is in a different place? # Found by doing a export PYTHONVERBOSE=1 and then running python and checking for where it looked for it. -ADD patches/sitecustomize.py /usr/lib/python3.10/sitecustomize.py +ADD patches/sitecustomize.py /usr/lib/python3.11/sitecustomize.py ARG GIT_COMMIT=unknown \ BUILD_DATE=unknown diff --git a/config.txt b/config.txt index 1da5b25b..b9cff861 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime -BASE_IMAGE_TAG=release-colab_20241217-060132_RC00 -LIGHTGBM_VERSION=4.5.0 +BASE_IMAGE_TAG=release-colab_20250219-060225_RC01 +LIGHTGBM_VERSION=4.6.0 CUDA_MAJOR_VERSION=12 -CUDA_MINOR_VERSION=2 +CUDA_MINOR_VERSION=5 diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 6bb69e62..3bc1dfd1 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -1,5 +1,4 @@ # Please keep this in alphabetical order ---extra-index-url https://pypi.nvidia.com Altair>=5.4.0 Babel Boruta @@ -24,7 +23,6 @@ catboost category-encoders cesium comm -cuml-cu12 cytoolz dask-expr # Older versions of datasets fail with "Loading a dataset cached in a LocalFileSystem is not supported" @@ -46,14 +44,6 @@ fuzzywuzzy geojson # geopandas > v0.14.4 breaks learn tools geopandas==v0.14.4 -google-cloud-aiplatform -# google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 -google-cloud-automl==1.0.1 -# b/315753846: Unpin translate package. -google-cloud-translate==3.12.1 -google-cloud-videointelligence -google-cloud-vision -google-genai gpxpy h2o haversine @@ -70,15 +60,11 @@ jupyter_server==2.12.5 jupyterlab jupyterlab-lsp kaggle-environments -# Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data(): -# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a -keras<3.6 keras-cv keras-nlp keras-tuner kornia langid -leven # b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. 
Did you mean: 'geometry'" libpysal<=4.9.2 lime @@ -142,12 +128,13 @@ squarify tensorflow-cloud tensorflow-io tensorflow-text -# b/385161357: tf 2.17.1 does not have matching tensorflow_decision_forests release -# tensorflow_decision_forests +tensorflow_decision_forests timm +torchao torchinfo torchmetrics torchtune +triton tsfresh vtk wandb diff --git a/tests/test_torchtune.py b/tests/test_torchtune.py index b2090ecc..aab17442 100644 --- a/tests/test_torchtune.py +++ b/tests/test_torchtune.py @@ -1,9 +1,10 @@ import unittest - import subprocess class TestTorchtune(unittest.TestCase): def test_help(self): - ret_code = subprocess.run(["tune", "--help"]) - self.assertEqual(0, ret_code.returncode) - self.assertIsNone(ret_code.stderr) \ No newline at end of file + result = subprocess.run(["tune", "--help"], stdout=subprocess.PIPE) + + self.assertEqual(0, result.returncode) + self.assertIsNone(result.stderr) + self.assertIn("Download a model from the Hugging Face Hub or Kaggle Model Hub.", result.stdout.decode("utf-8")) From af1ee04eceb85969d8329decedc7f0b751a4b728 Mon Sep 17 00:00:00 2001 From: Johnny Chavez <64660690+calderjo@users.noreply.github.com> Date: Mon, 7 Apr 2025 12:18:39 -0700 Subject: [PATCH 42/67] Adjust pins to fix learn tools tests errors (#1477) Torch and Cuml have compatibility issues, installing them outside of requirements files removes some of our pins. Let's add them back in when installing to ensure that pins are in place. --- Dockerfile.tmpl | 8 +++++--- kaggle_requirements.txt | 4 ---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 4b3dfee5..ae3f4e94 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -45,9 +45,11 @@ RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/ # b/408281617: Torch is adamant that it can not install cudnn 9.3.x, only 9.1.x, but Tensorflow can only support 9.3.x. # This conflict causes a number of package downgrades, which are handled in this command -RUN uv pip install --system --force-reinstall --extra-index-url https://pypi.nvidia.com pynvjitlink-cu12 cuml-cu12==25.2.1 \ - nvidia-cudnn-cu12==9.3.0.75 scipy tsfresh -RUN uv pip install --system --force-reinstall pynvjitlink-cu12==0.5.2 +# b/302136621: Fix eli5 import for learntools +RUN uv pip install --system --force-reinstall --extra-index-url https://pypi.nvidia.com "cuml-cu12==25.2.1" \ + "nvidia-cudnn-cu12==9.3.0.75" scipy tsfresh scikit-learn==1.2.2 category-encoders eli5 + +RUN uv pip install --system --force-reinstall "pynvjitlink-cu12==0.5.2" # b/385145217 Latest Colab lacks mkl numpy, install it. 
 RUN uv pip install --system --force-reinstall -i https://pypi.anaconda.org/intel/simple numpy

diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt
index 3bc1dfd1..159a02e8 100644
--- a/kaggle_requirements.txt
+++ b/kaggle_requirements.txt
@@ -20,7 +20,6 @@ arrow
 bayesian-optimization
 boto3
 catboost
-category-encoders
 cesium
 comm
 cytoolz
@@ -33,7 +32,6 @@ deap
 dipy
 docker
 easyocr
-eli5
 emoji
 fastcore>=1.7.20
 fasttext
@@ -111,8 +109,6 @@ qtconsole
 ray
 rgf-python
 s3fs
- # b/302136621: Fix eli5 import for learntools
-scikit-learn==1.2.2
 # Scikit-learn accelerated library for x86
 scikit-learn-intelex>=2023.0.1
 scikit-multilearn

From c6e40876bedf778616e032d25cf54a0e2b8fb654 Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Mon, 7 Apr 2025 14:05:52 -0700
Subject: [PATCH 43/67] Ensure transformers is up to date (#1479)

We fixed the tests to adjust for deprecated methods, and added a line to
ensure we get the latest transformers package.
---
 kaggle_requirements.txt    | 1 +
 tests/test_transformers.py | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt
index 159a02e8..002b07d2 100644
--- a/kaggle_requirements.txt
+++ b/kaggle_requirements.txt
@@ -130,6 +130,7 @@ torchao
 torchinfo
 torchmetrics
 torchtune
+transformers>=4.51.0
 triton
 tsfresh
 vtk
diff --git a/tests/test_transformers.py b/tests/test_transformers.py
index a81714cc..910eab30 100644
--- a/tests/test_transformers.py
+++ b/tests/test_transformers.py
@@ -1,7 +1,7 @@
 import unittest
 
 import torch
-from transformers import AdamW
+import torch.optim as optim
 
 import transformers.pipelines # verify this import works
 
@@ -10,13 +10,12 @@ def assertListAlmostEqual(self, list1, list2, tol):
         self.assertEqual(len(list1), len(list2))
         for a, b in zip(list1, list2):
             self.assertAlmostEqual(a, b, delta=tol)
-
     def test_adam_w(self):
         w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
         target = torch.tensor([0.4, 0.2, -0.5])
         criterion = torch.nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
-        optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0)
+        optimizer = optim.AdamW(params=[w], lr=2e-1, weight_decay=0.0)
         for _ in range(100):
             loss = criterion(w, target)
             loss.backward()

From 5545f99df08377c5e58197353a1b4d745149267f Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Mon, 7 Apr 2025 17:18:26 -0700
Subject: [PATCH 44/67] TPU upgrades to tf and torch, clean up dead code (#1480)

Since the latest Colab image is now Python 3.11, we should try to bump
this as well. Also removes comments and code that are out of date.
---
 tpu/Dockerfile       | 23 -----------------------
 tpu/config.txt       |  6 +++---
 tpu/requirements.txt |  4 ++--
 3 files changed, 5 insertions(+), 28 deletions(-)

diff --git a/tpu/Dockerfile b/tpu/Dockerfile
index 923ad2d5..ce68baf1 100644
--- a/tpu/Dockerfile
+++ b/tpu/Dockerfile
@@ -25,36 +25,13 @@ ADD patches/kaggle_session.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packa
 ADD patches/kaggle_web_client.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/kaggle_web_client.py
 ADD patches/kaggle_datasets.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/kaggle_datasets.py
 
-# Disable GCP integrations for now
-# ADD patches/kaggle_gcp.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/kaggle_gcp.py
-
-# Disable logging to file (why do we need this?)
-# ADD patches/log.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/log.py - -# sitecustomize adds significant latency to ipython kernel startup and should only be added if needed -# ADD patches/sitecustomize.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/sitecustomize.py - # Prereqs # This is needed for cv2 (opencv-python): # https://stackoverflow.com/questions/55313610/importerror-libgl-so-1-cannot-open-shared-object-file-no-such-file-or-directo RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y # Install all the packages together for maximum compatibility. - -# Install Tensorflow. - -# Install Pytorch & related packages -# https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version -# The URL doesn't include patch version. i.e. must use 1.11 instead of 1.11.0 -# We need to keep the numpy version the same as the installed tf one but compatible with other installs. - -# Install JAX & related packages -# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm - -# Packages needed by the Notebook editor - # Additional useful packages should be added in the requirements.txt - # Bring in the requirements.txt and replace variables in it: RUN apt-get install -y gettext ADD tpu/requirements.txt /kaggle_requirements.txt diff --git a/tpu/config.txt b/tpu/config.txt index 0bd198ac..645d2faf 100644 --- a/tpu/config.txt +++ b/tpu/config.txt @@ -4,9 +4,9 @@ PYTHON_VERSION_PATH=python3.10 TENSORFLOW_VERSION=2.18.0 # gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep torch_xla | grep -v -E ".*rc[0-9].*" | sed 's/.*torch_xla-\(.*\)+libtpu.*/\1/' | sort -rV # Supports nightly -TORCH_VERSION=2.5.0 +TORCH_VERSION=2.6.0 # https://github.com/pytorch/audio supports nightly -TORCHAUDIO_VERSION=2.5.0 +TORCHAUDIO_VERSION=2.6.0 # https://github.com/pytorch/vision supports nightly -TORCHVISION_VERSION=0.20.0 +TORCHVISION_VERSION=0.21.0 TORCH_LINUX_WHEEL_VERSION=manylinux_2_28_x86_64 diff --git a/tpu/requirements.txt b/tpu/requirements.txt index ec6fb273..ba3a1bdf 100644 --- a/tpu/requirements.txt +++ b/tpu/requirements.txt @@ -1,13 +1,13 @@ # TPU Utils tpu-info # Tensorflow packages -tensorflow-tpu>=${TENSORFLOW_VERSION} +tensorflow-tpu==${TENSORFLOW_VERSION} --find-links https://storage.googleapis.com/libtpu-tf-releases/index.html tensorflow_hub tensorflow-io tensorflow-probability # Torch packages -torch~=${TORCH_VERSION} +torch==${TORCH_VERSION} https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}+libtpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl torchaudio==${TORCHAUDIO_VERSION} torchvision==${TORCHVISION_VERSION} From f48df1d9dc484870ad98f3b7e2de9b2e6eef3173 Mon Sep 17 00:00:00 2001 From: Johnny Chavez <64660690+calderjo@users.noreply.github.com> Date: Tue, 8 Apr 2025 13:34:47 -0700 Subject: [PATCH 45/67] Update kaggle package (#1481) Let's make sure that we install the latest version of kaggle in our image. cc: @stevemessick --- kaggle_requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 002b07d2..845abb55 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -57,6 +57,9 @@ jupyter-lsp==1.5.1 jupyter_server==2.12.5 jupyterlab jupyterlab-lsp +# b/409363708: Ensure we have the update version, we can consider removing it once +# Colab base image is updated more frequently. 
+kaggle>=1.7.4.2
 kaggle-environments
 keras-cv
 keras-nlp

From 3cdabdfe2d93025e48e3cb8f00664f137951da36 Mon Sep 17 00:00:00 2001
From: jaesong-colab
Date: Tue, 6 May 2025 14:28:45 -0700
Subject: [PATCH 46/67] Remove command to set cuda path and install uv (#1485)

Remove the command that installs uv, as it is already installed in the
Colab base image. Also remove the command that updates the CUDA path, as
the Colab base image sets the CUDA version.
---
 Dockerfile.tmpl | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index ae3f4e94..144672ad 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -19,10 +19,8 @@ RUN cat /kaggle_requirements.txt >> /requirements.txt
 # TODO: GPU requirements.txt
 # TODO: merge them better (override matching ones).
 
-# Install uv & Kaggle packages
-RUN curl -LsSf https://astral.sh/uv/install.sh | sh
-RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt
-ENV PATH="~/.local/bin:${PATH}"
+# Install Kaggle packages
+RUN uv pip install --system -r /requirements.txt
 
 # Install manual packages:
 # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data.
@@ -71,14 +69,6 @@ ARG PACKAGE_PATH=/usr/local/lib/python3.11/dist-packages
 
 # Install GPU-specific non-pip packages.
 {{ if eq .Accelerator "gpu" }}
-ARG CUDA_MAJOR_VERSION \
-    CUDA_MINOR_VERSION
-ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
-    CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
-
-# Make sure we are on the right version of CUDA
-RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
-
 RUN uv pip install --system "pycuda"

From 8a20862e6c11454ecb8f328fb4a78f6b11a2b41c Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Wed, 7 May 2025 17:05:35 -0700
Subject: [PATCH 47/67] Upgrade Base Image: colab_20250404-060113_RC00 (#1484)

This particular image had issues with UV installs; however, the linked
issue highlights a solution, which will be included in the next image:
https://github.com/googlecolab/colabtools/issues/5237

The base image also removes Gensim due to its SciPy upgrade to 1.14.1.
We included a fix to install both, since Gensim is a popular package
(200 users per day).
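A quick sanity check for the Gensim fix (a sketch; meant to be run inside
the built image):

    uv pip install --system --force-reinstall --no-deps gensim
    python -c "import scipy, gensim; print(scipy.__version__, gensim.__version__)"

Installing with --no-deps keeps gensim from downgrading the SciPy version
pinned elsewhere in the image.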
Updated mocks for GCS related tests, latest version causes issues Adding a few packages back into requirements.txt that were remove due to fixes that have been since resolved --- Dockerfile.tmpl | 29 +++++++++++++-------------- config.txt | 2 +- kaggle_requirements.txt | 14 +++++++++++++- tests/test_automl.py | 4 +++- tests/test_datashader.py | 42 ---------------------------------------- tests/test_gcs.py | 4 +++- tests/test_keras.py | 9 +++++---- 7 files changed, 38 insertions(+), 66 deletions(-) delete mode 100644 tests/test_datashader.py diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 144672ad..d4a4e1c8 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -7,6 +7,10 @@ FROM gcr.io/kaggle-images/python-lightgbm-whl:${BASE_IMAGE_TAG}-${LIGHTGBM_VERSI {{ end }} FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} +#b/415358342: UV reports missing requirements files https://github.com/googlecolab/colabtools/issues/5237 +ENV UV_CONSTRAINT= \ + UV_BUILD_CONSTRAINT= + ADD kaggle_requirements.txt /kaggle_requirements.txt # Freeze existing requirements from base image for critical packages: @@ -27,15 +31,10 @@ RUN uv pip install --system -r /requirements.txt RUN uv pip uninstall --system google-cloud-bigquery-storage # b/394382016: sigstore (dependency of kagglehub) requires a prerelease packages, installing separate. -RUN uv pip install --system --force-reinstall --prerelease=allow kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.9 - -# b/408284143: google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 - -# b/408284435: Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data() -# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a -# Colab base is on Keras 3.8, we have to install the package separately -RUN uv pip install --system google-cloud-automl==1.0.1 google-cloud-aiplatform google-cloud-translate==3.12.1 \ - google-cloud-videointelligence google-cloud-vision google-genai "keras<3.6" +# b/408284143: google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1, +# installed outside of kaggle_requirements.txt due to requiring an incompatibile version of protobuf. +RUN uv pip install --system --force-reinstall --prerelease=allow kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.12 \ + google-cloud-automl==1.0.1 # uv cannot install this in requirements.txt without --no-build-isolation # to avoid affecting the larger build, we'll post-install it. @@ -43,10 +42,8 @@ RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/ # b/408281617: Torch is adamant that it can not install cudnn 9.3.x, only 9.1.x, but Tensorflow can only support 9.3.x. # This conflict causes a number of package downgrades, which are handled in this command -# b/302136621: Fix eli5 import for learntools RUN uv pip install --system --force-reinstall --extra-index-url https://pypi.nvidia.com "cuml-cu12==25.2.1" \ - "nvidia-cudnn-cu12==9.3.0.75" scipy tsfresh scikit-learn==1.2.2 category-encoders eli5 - + "nvidia-cudnn-cu12==9.3.0.75" RUN uv pip install --system --force-reinstall "pynvjitlink-cu12==0.5.2" # b/385145217 Latest Colab lacks mkl numpy, install it. @@ -56,10 +53,10 @@ RUN uv pip install --system --force-reinstall -i https://pypi.anaconda.org/intel RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2" # b/404590350: Ray and torchtune have conflicting tune cli, we will prioritize torchtune. 
-RUN uv pip install --system --force-reinstall --no-deps torchtune +# b/415358158: Gensim removed from Colab image to upgrade scipy +RUN uv pip install --system --force-reinstall --no-deps torchtune gensim # Adding non-package dependencies: - ADD clean-layer.sh /tmp/clean-layer.sh ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl ADD patches/template_conf.json /opt/kaggle/conf.json @@ -171,13 +168,13 @@ RUN mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ /tmp/clean-layer.sh -# Fix to import bq_helper library without downgrading setuptools +# Fix to import bq_helper library without downgrading setuptools and upgrading protobuf RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/src/BigQuery_Helper && \ mkdir -p ~/src/BigQuery_Helper/bq_helper && \ mv ~/src/BigQuery_Helper/bq_helper.py ~/src/BigQuery_Helper/bq_helper/__init__.py && \ mv ~/src/BigQuery_Helper/test_helper.py ~/src/BigQuery_Helper/bq_helper/ && \ sed -i 's/)/packages=["bq_helper"])/g' ~/src/BigQuery_Helper/setup.py && \ - uv pip install --system -e ~/src/BigQuery_Helper && \ + uv pip install --system -e ~/src/BigQuery_Helper "protobuf<3.21"&& \ /tmp/clean-layer.sh diff --git a/config.txt b/config.txt index b9cff861..cfe8026a 100644 --- a/config.txt +++ b/config.txt @@ -1,5 +1,5 @@ BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime -BASE_IMAGE_TAG=release-colab_20250219-060225_RC01 +BASE_IMAGE_TAG=release-colab_20250404-060113_RC00 LIGHTGBM_VERSION=4.6.0 CUDA_MAJOR_VERSION=12 CUDA_MINOR_VERSION=5 diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 845abb55..22b26470 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -20,6 +20,7 @@ arrow bayesian-optimization boto3 catboost +category-encoders cesium comm cytoolz @@ -27,11 +28,12 @@ dask-expr # Older versions of datasets fail with "Loading a dataset cached in a LocalFileSystem is not supported" # https://stackoverflow.com/questions/77433096/notimplementederror-loading-a-dataset-cached-in-a-localfilesystem-is-not-suppor datasets>=2.14.6 -datashader deap dipy docker easyocr +# b/302136621: Fix eli5 import for learntools +eli5 emoji fastcore>=1.7.20 fasttext @@ -42,6 +44,13 @@ fuzzywuzzy geojson # geopandas > v0.14.4 breaks learn tools geopandas==v0.14.4 +gensim +google-cloud-aiplatform +# b/315753846: Unpin translate package. 
+google-cloud-translate==3.12.1 +google-cloud-videointelligence +google-cloud-vision +google-genai gpxpy h2o haversine @@ -112,12 +121,15 @@ qtconsole ray rgf-python s3fs +scikit-learn==1.2.2 # Scikit-learn accelerated library for x86 scikit-learn-intelex>=2023.0.1 scikit-multilearn scikit-optimize scikit-plot scikit-surprise +# b/415358158: Gensim removed from Colab image to upgrade scipy to 1.14.1 +scipy==1.15.1 # Also pinning seaborn for learntools seaborn==0.12.2 git+https://github.com/facebookresearch/segment-anything.git diff --git a/tests/test_automl.py b/tests/test_automl.py index 63c34c69..9a048b14 100644 --- a/tests/test_automl.py +++ b/tests/test_automl.py @@ -8,7 +8,9 @@ def _make_credentials(): import google.auth.credentials - return Mock(spec=google.auth.credentials.Credentials) + credentials = Mock(spec=google.auth.credentials.Credentials) + credentials.universe_domain = 'googleapis.com' + return credentials class TestAutoMl(unittest.TestCase): diff --git a/tests/test_datashader.py b/tests/test_datashader.py deleted file mode 100644 index ad3afe15..00000000 --- a/tests/test_datashader.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest - -from common import p100_exempt - -class TestDatashader(unittest.TestCase): - - @p100_exempt # b/342143152: Uses cuDF(>=24.4v), which is no longer capitble with p100 GPUs. - def test_pipeline(self): - # based on https://github.com/pyviz/datashader/blob/master/datashader/tests/test_pipeline.py - import numpy as np - import pandas as pd - import datashader as ds - import datashader.transfer_functions as tf - - df = pd.DataFrame({ - 'x': np.array(([0.] * 10 + [1] * 10)), - 'y': np.array(([0.] * 5 + [1] * 5 + [0] * 5 + [1] * 5)), - 'f64': np.arange(20, dtype='f8') - }) - df.f64.iloc[2] = np.nan - - cvs = ds.Canvas(plot_width=2, plot_height=2, x_range=(0, 1), y_range=(0, 1)) - - pipeline = ds.Pipeline(df, ds.Point('x', 'y')) - img = pipeline((0, 1), (0, 1), 2, 2) - agg = cvs.points(df, 'x', 'y', ds.count()) - self.assertTrue(img.equals(tf.shade(agg))) - - color_fn = lambda agg: tf.shade(agg, 'pink', 'red') - pipeline.color_fn = color_fn - img = pipeline((0, 1), (0, 1), 2, 2) - self.assertTrue(img.equals(color_fn(agg))) - - transform_fn = lambda agg: agg + 1 - pipeline.transform_fn = transform_fn - img = pipeline((0, 1), (0, 1), 2, 2) - self.assertTrue(img.equals(color_fn(transform_fn(agg)))) - - pipeline = ds.Pipeline(df, ds.Point('x', 'y'), ds.sum('f64')) - img = pipeline((0, 1), (0, 1), 2, 2) - agg = cvs.points(df, 'x', 'y', ds.sum('f64')) - self.assertTrue(img.equals(tf.shade(agg))) diff --git a/tests/test_gcs.py b/tests/test_gcs.py index eb15ea5f..94da58c9 100644 --- a/tests/test_gcs.py +++ b/tests/test_gcs.py @@ -8,7 +8,9 @@ def _make_credentials(): import google.auth.credentials - return Mock(spec=google.auth.credentials.Credentials) + credentials = Mock(spec=google.auth.credentials.Credentials) + credentials.universe_domain = 'googleapis.com' + return credentials class TestStorage(unittest.TestCase): diff --git a/tests/test_keras.py b/tests/test_keras.py index 22cb6f9f..5dc4610d 100644 --- a/tests/test_keras.py +++ b/tests/test_keras.py @@ -9,10 +9,11 @@ class TestKeras(unittest.TestCase): def test_train(self): - # Load the data and split it between train and test sets - (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data( - path='/input/tests/data/mnist.npz' - ) + path = '/input/tests/data/mnist.npz' + with np.load(path) as f: + x_train, y_train = f['x_train'], f['y_train'] + x_test, y_test = f['x_test'], f['y_test'] + # 
Scale images to the [0, 1] range
     x_train = x_train.astype("float32") / 255

From ddcafd25480e308fa699e0ed09f5f90a31286a4e Mon Sep 17 00:00:00 2001
From: jaesong-colab
Date: Thu, 8 May 2025 09:58:54 -0700
Subject: [PATCH 48/67] Remove setting CUDA_VERSION to empty string and
 deprecated apt-key (#1486)

Remove setting CUDA_VERSION to an empty string for CPU images.
Remove getting the deprecated apt-key.gpg key file from
https://packages.cloud.google.com/
---
 Dockerfile.tmpl   | 8 --------
 tests/common.py   | 5 ++++-
 tests/test_jax.py | 4 ++--
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index d4a4e1c8..9ca776a9 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -78,17 +78,9 @@ RUN apt-get install -y ocl-icd-libopencl1 clinfo && \
     uv pip install --system /tmp/lightgbm/*.whl && \
     rm -rf /tmp/lightgbm && \
     /tmp/clean-layer.sh
-
-# Remove CUDA_VERSION from non-GPU image.
-{{ else }}
-ENV CUDA_VERSION=""
 {{ end }}
 
-# Update GPG key per documentation at https://cloud.google.com/compute/docs/troubleshooting/known-issues
-RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
-RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
-
 # Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections,
 # as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346
 RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \
diff --git a/tests/common.py b/tests/common.py
index 30a7bb0f..469033dd 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -11,7 +11,10 @@ def getAcceleratorName():
     except FileNotFoundError:
         return("nvidia-smi not found.")
 
-gpu_test = unittest.skipIf(len(os.environ.get('CUDA_VERSION', '')) == 0, 'Not running GPU tests')
+def isGPU():
+    return os.path.isfile('/proc/driver/nvidia/version')
+
+gpu_test = unittest.skipIf(not isGPU(), 'Not running GPU tests')
 # b/342143152 P100s are slowly being unsupported in new release of popular ml tools such as RAPIDS.
 p100_exempt = unittest.skipIf(getAcceleratorName() == "Tesla P100-PCIE-16GB", 'Not running p100 exempt tests')
 tpu_test = unittest.skipIf(len(os.environ.get('ISTPUVM', '')) == 0, 'Not running TPU tests')
diff --git a/tests/test_jax.py b/tests/test_jax.py
index b5e0898e..f8eca3bb 100644
--- a/tests/test_jax.py
+++ b/tests/test_jax.py
@@ -6,7 +6,7 @@ import jax
 import jax.numpy as np
 
-from common import gpu_test
+from common import gpu_test, isGPU
 
 from jax import grad, jit
 
@@ -21,5 +21,5 @@ def test_grad(self):
         self.assertEqual(0.4199743, ag)
 
     def test_backend(self):
-        expected_backend = 'cpu' if len(os.environ.get('CUDA_VERSION', '')) == 0 else 'gpu'
+        expected_backend = 'cpu' if not isGPU() else 'gpu'
         self.assertEqual(expected_backend, jax.default_backend())

From 987863d34740b65715c71cdebc5a038609a03dff Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Wed, 25 Jun 2025 17:26:35 +0000
Subject: [PATCH 49/67] Clean up lightgbm install and fix build (#1488)

Bumping up the base image version to the latest.

Good news: Colab installs lightgbm on both CPU and GPU. Bad news: the way
they install the package seems to cause issues. I assume they install
lightgbm in a similar manner to us: build the wheel and install it from a
tmp folder.
UV pip doesn't like this and errors out at the requirements-install step:

error: Distribution not found at: file:///tmp/lightgbm/LightGBM/dist/lightgbm-4.5.0-py3-none-linux_x86_64.whl

Removing our pin on the package fixes the build. I could also introduce a
pin for lightgbm, but this could be problematic, as we'll continuously
have to update the pinning.
---
 Dockerfile.tmpl              | 23 ++---------------------
 Jenkinsfile                  | 23 -----------------------
 config.txt                   |  3 +--
 packages/lightgbm.Dockerfile | 30 ------------------------------
 4 files changed, 3 insertions(+), 76 deletions(-)
 delete mode 100644 packages/lightgbm.Dockerfile

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index 9ca776a9..1dc4c0fa 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -1,20 +1,12 @@
 ARG BASE_IMAGE \
-    BASE_IMAGE_TAG \
-    LIGHTGBM_VERSION
+    BASE_IMAGE_TAG
 
-{{ if eq .Accelerator "gpu" }}
-FROM gcr.io/kaggle-images/python-lightgbm-whl:${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
-{{ end }}
 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG}
 
-#b/415358342: UV reports missing requirements files https://github.com/googlecolab/colabtools/issues/5237
-ENV UV_CONSTRAINT= \
-    UV_BUILD_CONSTRAINT=
-
 ADD kaggle_requirements.txt /kaggle_requirements.txt
 
 # Freeze existing requirements from base image for critical packages:
-RUN pip freeze | grep -E 'tensorflow|keras|torch|jax|lightgbm' > /colab_requirements.txt
+RUN pip freeze | grep -E 'tensorflow|keras|torch|jax' > /colab_requirements.txt
 
 # Merge requirements files:
 RUN cat /colab_requirements.txt >> /requirements.txt
@@ -67,17 +59,6 @@ ARG PACKAGE_PATH=/usr/local/lib/python3.11/dist-packages
 
 # Install GPU-specific non-pip packages.
 {{ if eq .Accelerator "gpu" }}
 RUN uv pip install --system "pycuda"
-
-# b/381256047 Remove once installed in Colabs base image.
-# Install LightGBM
-COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/
-# Install OpenCL (required by LightGBM GPU version)
-RUN apt-get install -y ocl-icd-libopencl1 clinfo && \
-    mkdir -p /etc/OpenCL/vendors && \
-    echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
-    uv pip install --system /tmp/lightgbm/*.whl && \
-    rm -rf /tmp/lightgbm && \
-    /tmp/clean-layer.sh
 {{ end }}
 
diff --git a/Jenkinsfile b/Jenkinsfile
index ea2cb9be..906e0464 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -21,29 +21,6 @@ pipeline {
   }
 
   stages {
-    stage('Pre-build Packages from Source') {
-      stages {
-        stage('lightgbm') {
-          options {
-            timeout(time: 10, unit: 'MINUTES')
-          }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
-              source config.txt
-              cd packages/
-              ./build_package --base-image $BASE_IMAGE:$BASE_IMAGE_TAG \
-                --package lightgbm \
-                --version $LIGHTGBM_VERSION \
-                --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
-                --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
-                --push
-            '''
-          }
-        }
-      }
-    }
-
     stage('Build/Test/Diff') {
       parallel {
        stage('CPU') {
diff --git a/config.txt b/config.txt
index cfe8026a..d3829bcf 100644
--- a/config.txt
+++ b/config.txt
@@ -1,5 +1,4 @@
 BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime
-BASE_IMAGE_TAG=release-colab_20250404-060113_RC00
-LIGHTGBM_VERSION=4.6.0
+BASE_IMAGE_TAG=release-colab_20250603-060055_RC00
 CUDA_MAJOR_VERSION=12
 CUDA_MINOR_VERSION=5
diff --git a/packages/lightgbm.Dockerfile b/packages/lightgbm.Dockerfile
deleted file mode 100644
index 376eaaef..00000000
--- a/packages/lightgbm.Dockerfile
+++ /dev/null
@@ -1,30 +0,0 @@
-ARG BASE_IMAGE
-
-FROM ${BASE_IMAGE} AS builder
-
-ARG PACKAGE_VERSION
-ARG CUDA_MAJOR_VERSION
-ARG CUDA_MINOR_VERSION
-
-# Make sure we are on the right version of CUDA
-RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
-
-# Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
-RUN apt-get update && \
-    apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev clinfo nvidia-opencl-dev opencl-headers
-
-RUN cd /usr/local/src && \
-    git clone --recursive https://github.com/microsoft/LightGBM && \
-    cd LightGBM && \
-    git checkout tags/v$PACKAGE_VERSION && \
-    ./build-python.sh bdist_wheel --gpu --opencl-library=/usr/local/cuda/lib64/libOpenCL.so --opencl-include-dir=/usr/local/cuda/include/
-
-# Using multi-stage builds to ensure the output image is very small
-# See: https://docs.docker.com/develop/develop-images/multistage-build/
-FROM alpine:latest
-
-RUN mkdir -p /tmp/whl/
-COPY --from=builder /usr/local/src/LightGBM/dist/*.whl /tmp/whl
-
-# Print out the built .whl file.
-RUN ls -lh /tmp/whl/
\ No newline at end of file

From f6db354afe8a5395a98f35113814f0eaaa7c3fbe Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Fri, 27 Jun 2025 00:45:37 +0000
Subject: [PATCH 50/67] Fix torchtune, keras, tensorflow tests (#1489)

Looks like torchtune changed the output of the --help command, which
caused issues with our smoke tests. Keras, along with other packages, had
issues caused by the cudnn downgrades forced by torch requirements; we
pinned the relevant packages and updated the tests.
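For reference, the kind of spot check these fixes rely on (hypothetical
commands, not part of the image build):

```
# Confirm the cudnn/cublas pins survived the forced reinstall before
# running the Keras/TensorFlow tests.
uv pip list --system | grep -E 'nvidia-(cudnn|cublas|cusolver)-cu12'
python -c 'import torch; print("cudnn:", torch.backends.cudnn.version())'
```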
--- Dockerfile.tmpl | 7 +++++-- kaggle_requirements.txt | 5 ++--- tests/test_torchtune.py | 12 +++++++++--- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 1dc4c0fa..8f698612 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -35,7 +35,10 @@ RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/ # b/408281617: Torch is adamant that it can not install cudnn 9.3.x, only 9.1.x, but Tensorflow can only support 9.3.x. # This conflict causes a number of package downgrades, which are handled in this command RUN uv pip install --system --force-reinstall --extra-index-url https://pypi.nvidia.com "cuml-cu12==25.2.1" \ - "nvidia-cudnn-cu12==9.3.0.75" + "nvidia-cudnn-cu12==9.3.0.75" "nvidia-cublas-cu12==12.5.3.2" "nvidia-cusolver-cu12==11.6.3.83" \ + "nvidia-cuda-cupti-cu12==12.5.82" "nvidia-cuda-nvrtc-cu12==12.5.82" "nvidia-cuda-runtime-cu12==12.5.82" \ + "nvidia-cufft-cu12==11.2.3.61" "nvidia-curand-cu12==10.3.6.82" "nvidia-cusparse-cu12==12.5.1.3" \ + "nvidia-nvjitlink-cu12==12.5.82" RUN uv pip install --system --force-reinstall "pynvjitlink-cu12==0.5.2" # b/385145217 Latest Colab lacks mkl numpy, install it. @@ -46,7 +49,7 @@ RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2" # b/404590350: Ray and torchtune have conflicting tune cli, we will prioritize torchtune. # b/415358158: Gensim removed from Colab image to upgrade scipy -RUN uv pip install --system --force-reinstall --no-deps torchtune gensim +RUN uv pip install --system --force-reinstall --no-deps torchtune gensim "scipy<=1.15.3" # Adding non-package dependencies: ADD clean-layer.sh /tmp/clean-layer.sh diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 22b26470..cc43b8c2 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -121,6 +121,7 @@ qtconsole ray rgf-python s3fs +# b/302136621: Fix eli5 import for learntools scikit-learn==1.2.2 # Scikit-learn accelerated library for x86 scikit-learn-intelex>=2023.0.1 @@ -128,12 +129,10 @@ scikit-multilearn scikit-optimize scikit-plot scikit-surprise -# b/415358158: Gensim removed from Colab image to upgrade scipy to 1.14.1 -scipy==1.15.1 # Also pinning seaborn for learntools seaborn==0.12.2 git+https://github.com/facebookresearch/segment-anything.git -# b/329869023 shap 0.45.0 breaks learntools +# b/329869023: shap 0.45.0 breaks learntools shap==0.44.1 squarify tensorflow-cloud diff --git a/tests/test_torchtune.py b/tests/test_torchtune.py index aab17442..c4a702fd 100644 --- a/tests/test_torchtune.py +++ b/tests/test_torchtune.py @@ -3,8 +3,14 @@ class TestTorchtune(unittest.TestCase): def test_help(self): - result = subprocess.run(["tune", "--help"], stdout=subprocess.PIPE) + result = subprocess.run( + ["tune", "--help"], + capture_output=True, + text=True + ) self.assertEqual(0, result.returncode) - self.assertIsNone(result.stderr) - self.assertIn("Download a model from the Hugging Face Hub or Kaggle Model Hub.", result.stdout.decode("utf-8")) + self.assertIn( + "Download a model from the Hugging Face Hub or Kaggle", + result.stdout + ) From 0eb38ee5b80e4003101469c19d6da1f4353d5960 Mon Sep 17 00:00:00 2001 From: Johnny Chavez <64660690+calderjo@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:38:01 -0700 Subject: [PATCH 51/67] Bump Colab Base Image - July (#1490) let's bump the image, this was released a couple days ago. 
While we were planning a release with the prior base image, let's save us
a release and get ourselves to the latest.
---
 config.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.txt b/config.txt
index d3829bcf..61395f5e 100644
--- a/config.txt
+++ b/config.txt
@@ -1,4 +1,4 @@
 BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime
-BASE_IMAGE_TAG=release-colab_20250603-060055_RC00
+BASE_IMAGE_TAG=release-colab_20250626-060053_RC00
 CUDA_MAJOR_VERSION=12
 CUDA_MINOR_VERSION=5

From 4d494f144d2915ad83b198a526c4623d3e087fce Mon Sep 17 00:00:00 2001
From: Eric Johnson <65414824+metrizable@users.noreply.github.com>
Date: Wed, 17 Sep 2025 20:42:58 -0700
Subject: [PATCH 52/67] Upgrade TPU image to Python 3.11 (#1493)

Python 3.10 is entering its [last year of
support](https://devguide.python.org/versions/) before end-of-life and
many packages, including [NumPy](https://devguide.python.org/versions/),
have dropped support for it altogether.

Included in this change:

* Upgrade the TPU docker image to derive from `python:3.11`.
* Upgrade `tensorflow` to 2.20.0.
* Upgrade `jax` to >=0.5.2. For a compatible dep closure, this installs
  `jax` 0.7.2 re: the `tensorflow-tpu` dep on `libtpu`.
* Upgrade `torch` (and ecosystem) to 2.8.0. Of note, there is no wheel
  with a `+libtpu` label.
* Remove unneeded environment variable.

Tested: Locally, by invoking `./tpu/build`:

> 9jyLVT6hAPjKaCh

Also invoked other back-end testing.
---
 tpu/Dockerfile       |  1 -
 tpu/config.txt       | 14 +++++++-------
 tpu/requirements.txt |  4 ++--
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/tpu/Dockerfile b/tpu/Dockerfile
index ce68baf1..fd0c0684 100644
--- a/tpu/Dockerfile
+++ b/tpu/Dockerfile
@@ -55,7 +55,6 @@ RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tens
 RUN sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py
 
 # Set these env vars so that they don't produce errs calling the metadata server to load them:
-ENV TPU_ACCELERATOR_TYPE=v3-8
 ENV TPU_PROCESS_ADDRESSES=local
 
 # Metadata
diff --git a/tpu/config.txt b/tpu/config.txt
index 645d2faf..b399a592 100644
--- a/tpu/config.txt
+++ b/tpu/config.txt
@@ -1,12 +1,12 @@
-BASE_IMAGE=python:3.10
-PYTHON_WHEEL_VERSION=cp310
-PYTHON_VERSION_PATH=python3.10
-TENSORFLOW_VERSION=2.18.0
+BASE_IMAGE=python:3.11
+PYTHON_WHEEL_VERSION=cp311
+PYTHON_VERSION_PATH=python3.11
+TENSORFLOW_VERSION=2.20.0
 # gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep torch_xla | grep -v -E ".*rc[0-9].*" | sed 's/.*torch_xla-\(.*\)+libtpu.*/\1/' | sort -rV
 # Supports nightly
-TORCH_VERSION=2.6.0
+TORCH_VERSION=2.8.0
 # https://github.com/pytorch/audio supports nightly
-TORCHAUDIO_VERSION=2.6.0
+TORCHAUDIO_VERSION=2.8.0
 # https://github.com/pytorch/vision supports nightly
-TORCHVISION_VERSION=0.21.0
+TORCHVISION_VERSION=0.23.0
 TORCH_LINUX_WHEEL_VERSION=manylinux_2_28_x86_64
diff --git a/tpu/requirements.txt b/tpu/requirements.txt
index ba3a1bdf..c4b0cc58 100644
--- a/tpu/requirements.txt
+++ b/tpu/requirements.txt
@@ -8,11 +8,11 @@ tensorflow-io
 tensorflow-probability
 # Torch packages
 torch==${TORCH_VERSION}
-https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}+libtpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl
+https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl torchaudio==${TORCHAUDIO_VERSION} torchvision==${TORCHVISION_VERSION} # Jax packages -jax[tpu]>=0.4.34 +jax[tpu]>=0.5.2 --find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html distrax flax From da65765599c2f8d6d65a0ceb665270d3035adf8d Mon Sep 17 00:00:00 2001 From: Dustin H Date: Thu, 18 Sep 2025 00:34:16 -0400 Subject: [PATCH 53/67] Add tunix to tpu (#1494) --- tpu/requirements.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tpu/requirements.txt b/tpu/requirements.txt index c4b0cc58..f31a2e51 100644 --- a/tpu/requirements.txt +++ b/tpu/requirements.txt @@ -6,6 +6,7 @@ tensorflow-tpu==${TENSORFLOW_VERSION} tensorflow_hub tensorflow-io tensorflow-probability +tensorflow_datasets # Torch packages torch==${TORCH_VERSION} https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl @@ -20,6 +21,10 @@ git+https://github.com/deepmind/dm-haiku jraph optax trax +# Tunix GRPO +git+https://github.com/google/tunix +git+https://github.com/google/qwix +grain # Jupyter packages jupyter-lsp==1.5.1 jupyterlab @@ -38,6 +43,7 @@ albumentations diffusers einops fastparquet +ipywidgets matplotlib opencv-python opencv-python-headless From d019ef74b14fa4281074e1e8709f077e8fbc78c7 Mon Sep 17 00:00:00 2001 From: Eric Johnson <65414824+metrizable@users.noreply.github.com> Date: Thu, 18 Sep 2025 13:24:17 -0700 Subject: [PATCH 54/67] Include the default PyPI for missing libucx-cu12 package version. (#1495) The CPU/GPU container image builds are currently broken due to dependency resolution failures: 9MgPMDp9qN8qCqf It appears, this version was removed from the Nvidia index since the last build. 6NFm9uDQDA5efcV We ensure that a compatible package version `libucx-cu12==1.18.1` is available by including the default PyPI and specifying an appropriate [index strategy](https://docs.astral.sh/uv/concepts/indexes/#searching-across-multiple-indexes) to `uv`. In the near future, we may want to consider upgrading `cuml-cu12` and ecosystem to 25.6 or later. --- Dockerfile.tmpl | 6 +++-- kaggle_requirements.txt | 9 +++++-- tests/test_fastai.py | 59 +++++++++++++++++++++-------------------- 3 files changed, 41 insertions(+), 33 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 8f698612..1b63837d 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -34,7 +34,9 @@ RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/ # b/408281617: Torch is adamant that it can not install cudnn 9.3.x, only 9.1.x, but Tensorflow can only support 9.3.x. 
# This conflict causes a number of package downgrades, which are handled in this command -RUN uv pip install --system --force-reinstall --extra-index-url https://pypi.nvidia.com "cuml-cu12==25.2.1" \ +RUN uv pip install \ + --index-url https://pypi.nvidia.com --extra-index-url https://pypi.org/simple/ --index-strategy unsafe-first-match \ + --system --force-reinstall "cuml-cu12==25.2.1" \ "nvidia-cudnn-cu12==9.3.0.75" "nvidia-cublas-cu12==12.5.3.2" "nvidia-cusolver-cu12==11.6.3.83" \ "nvidia-cuda-cupti-cu12==12.5.82" "nvidia-cuda-nvrtc-cu12==12.5.82" "nvidia-cuda-runtime-cu12==12.5.82" \ "nvidia-cufft-cu12==11.2.3.61" "nvidia-curand-cu12==10.3.6.82" "nvidia-cusparse-cu12==12.5.1.3" \ @@ -171,7 +173,7 @@ ENV PYTHONUSERBASE="/root/.local" ADD patches/kaggle_gcp.py \ patches/kaggle_secrets.py \ patches/kaggle_session.py \ - patches/kaggle_web_client.py \ + patches/kaggle_web_client.py \ patches/kaggle_datasets.py \ patches/log.py \ $PACKAGE_PATH/ diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index cc43b8c2..03c489b4 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -35,7 +35,10 @@ easyocr # b/302136621: Fix eli5 import for learntools eli5 emoji -fastcore>=1.7.20 +fastcore +# b/445960030: Requires a newer version of fastai than the currently used base image. +# Remove when relying on a newer base image. +fastai>=2.8.4 fasttext featuretools fiona @@ -89,7 +92,9 @@ nbconvert==6.4.5 nbdev nilearn olefile -onnx +# b/445960030: Broken in 1.19.0. See https://github.com/onnx/onnx/issues/7249. +# Fixed with https://github.com/onnx/onnx/pull/7254. Upgrade when version with fix is published. +onnx==1.18.0 openslide-bin openslide-python optuna diff --git a/tests/test_fastai.py b/tests/test_fastai.py index 49bce0ac..33a436a5 100644 --- a/tests/test_fastai.py +++ b/tests/test_fastai.py @@ -1,35 +1,36 @@ import unittest import fastai - from fastai.tabular.all import * + class TestFastAI(unittest.TestCase): - # Basic import - def test_basic(self): - import fastai - import fastcore - import fastprogress - import fastdownload - - def test_has_version(self): - self.assertGreater(len(fastai.__version__), 2) - - # based on https://github.com/fastai/fastai/blob/master/tests/test_torch_core.py#L17 - def test_torch_tensor(self): - a = tensor([1, 2, 3]) - b = torch.tensor([1, 2, 3]) - - self.assertTrue(torch.all(a == b)) - - def test_tabular(self): - dls = TabularDataLoaders.from_csv( - "/input/tests/data/train.csv", - cont_names=["pixel"+str(i) for i in range(784)], - y_names='label', - procs=[FillMissing, Categorify, Normalize]) - learn = tabular_learner(dls, layers=[200, 100]) - with learn.no_bar(): - learn.fit_one_cycle(n_epoch=1) - - self.assertGreater(learn.smooth_loss, 0) + # Basic import + def test_basic(self): + import fastai + import fastcore + import fastprogress + import fastdownload + + def test_has_version(self): + self.assertGreater(len(fastai.__version__), 2) + + # based on https://github.com/fastai/fastai/blob/master/tests/test_torch_core.py#L17 + def test_torch_tensor(self): + a = tensor([1, 2, 3]) + b = torch.tensor([1, 2, 3]) + + self.assertTrue(torch.all(a == b)) + + def test_tabular(self): + dls = TabularDataLoaders.from_csv( + "/input/tests/data/train.csv", + cont_names=["pixel" + str(i) for i in range(784)], + y_names="label", + procs=[FillMissing, Categorify, Normalize], + ) + learn = tabular_learner(dls, layers=[200, 100]) + with learn.no_bar(): + learn.fit_one_cycle(n_epoch=1) + + self.assertGreater(learn.smooth_loss, 0) From 
506c34b7805b1f545c544daf75fd5feb4c3c10ac Mon Sep 17 00:00:00 2001
From: Dustin H
Date: Mon, 22 Sep 2025 10:24:06 -0400
Subject: [PATCH 55/67] Python 3.12 bump (#1496)

---
 tpu/config.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tpu/config.txt b/tpu/config.txt
index b399a592..ab933ba7 100644
--- a/tpu/config.txt
+++ b/tpu/config.txt
@@ -1,6 +1,6 @@
-BASE_IMAGE=python:3.11
-PYTHON_WHEEL_VERSION=cp311
-PYTHON_VERSION_PATH=python3.11
+BASE_IMAGE=python:3.12
+PYTHON_WHEEL_VERSION=cp312
+PYTHON_VERSION_PATH=python3.12
 TENSORFLOW_VERSION=2.20.0
 # gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep torch_xla | grep -v -E ".*rc[0-9].*" | sed 's/.*torch_xla-\(.*\)+libtpu.*/\1/' | sort -rV
 # Supports nightly

From acb8bccefc274f76aa51effdeacf69e2e1aa2dd7 Mon Sep 17 00:00:00 2001
From: Johnny Chavez <64660690+calderjo@users.noreply.github.com>
Date: Thu, 25 Sep 2025 09:56:21 -0700
Subject: [PATCH 56/67] Upgrade base image to colab_20250725-060057_RC00,
 install google-adk (#1498)

A couple of things here: we upgraded our base to the latest image, which
is still
- on py3.11
- using keras 3.8.x and tf 2.18.x

We encountered some issues with upgrading to py3.12, so we'll punt that
for later.

In addition, we will install google-adk and pyngrok; we will add a more
involved test in a later PR. We also diffed the new base image and
removed packages that are now installed in their image.

@jplotts please lmk if pyngrok is needed for b/443054743, we can always
remove this at a later point
---
 Dockerfile.tmpl         |  4 ++--
 config.txt              |  2 +-
 kaggle_requirements.txt | 13 ++++---------
 3 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index 1b63837d..7561a700 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -25,8 +25,8 @@ RUN uv pip uninstall --system google-cloud-bigquery-storage
 
 # b/394382016: sigstore (dependency of kagglehub) requires a prerelease packages, installing separate.
 # b/408284143: google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1,
 # installed outside of kaggle_requirements.txt due to requiring an incompatibile version of protobuf.
-RUN uv pip install --system --force-reinstall --prerelease=allow kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.12 \
-    google-cloud-automl==1.0.1
+RUN uv pip install --system --force-reinstall --prerelease=allow "kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.12" \
+    "google-cloud-automl==1.0.1"
 
 # uv cannot install this in requirements.txt without --no-build-isolation
 # to avoid affecting the larger build, we'll post-install it.
diff --git a/config.txt b/config.txt index 61395f5e..af541652 100644 --- a/config.txt +++ b/config.txt @@ -1,4 +1,4 @@ BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime -BASE_IMAGE_TAG=release-colab_20250626-060053_RC00 +BASE_IMAGE_TAG=release-colab_20250725-060057_RC00 CUDA_MAJOR_VERSION=12 CUDA_MINOR_VERSION=5 diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 03c489b4..c711869e 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -1,6 +1,4 @@ # Please keep this in alphabetical order -Altair>=5.4.0 -Babel Boruta Cartopy ImageHash @@ -24,7 +22,6 @@ category-encoders cesium comm cytoolz -dask-expr # Older versions of datasets fail with "Loading a dataset cached in a LocalFileSystem is not supported" # https://stackoverflow.com/questions/77433096/notimplementederror-loading-a-dataset-cached-in-a-localfilesystem-is-not-suppor datasets>=2.14.6 @@ -48,6 +45,8 @@ geojson # geopandas > v0.14.4 breaks learn tools geopandas==v0.14.4 gensim +# b/443054743 +google-adk google-cloud-aiplatform # b/315753846: Unpin translate package. google-cloud-translate==3.12.1 @@ -111,7 +110,6 @@ pyLDAvis pycryptodome pydegensac pydicom -pydub pyemd pyexcel-ods pymc3 @@ -144,18 +142,15 @@ tensorflow-cloud tensorflow-io tensorflow-text tensorflow_decision_forests -timm -torchao torchinfo torchmetrics torchtune transformers>=4.51.0 -triton -tsfresh vtk -wandb wavio # b/350573866: xgboost v2.1.0 breaks learntools xgboost==2.0.3 xvfbwrapper ydata-profiling +# b/443054743: pinned as newer versions requires protobuf > 3.20.3 +ydf==0.9.0 From 3e031bac8b3c6d484215bdd6de6e33c0c51cd4fb Mon Sep 17 00:00:00 2001 From: Eric Johnson <65414824+metrizable@users.noreply.github.com> Date: Fri, 26 Sep 2025 11:18:38 -0700 Subject: [PATCH 57/67] Fix libtpu version for torch and do not pre-install tensorflow-tpu on TPU. (#1499) We install a libtpu version compatible with both jax 0.7.2 and torch 2.8.0. Why? tunix latest -> flax 0.12 -> jax 0.7.2 -> libtpu 0.0.23, and that libtpu version causes pjrt api errors for torch 2.8.0: ``` pjrt_c_api_helpers.cc:258] Unexpected error status Unexpected PJRT_Plugin_Attributes_Args size: expe cted 32, got 24. The plugin is likely built with a later version than the framework. This plugin is built with PJRT API version 0.75. ``` * https://github.com/pytorch/xla/blob/d517649bdef6ab0519c30c704bde8779c8216502/setup.py#L111 * https://github.com/jax-ml/jax/blob/3489529b38d1f11d1e5caf4540775aadd5f2cdda/setup.py#L26 Of particular note, we no longer pre-install `tensorflow-tpu` as the newer libtpu causes issues finding the TPUs ``` external/local_xla/xla/stream_executor/tpu/tpu_platform_interface.cc:78] No TPU platform found. Platform manager status: OK ``` We also update how we install Python packages via `uv` for consistency and reproducibility. From a `requirements.in` file, we first generate a consistent dependency closure via `uv pip compile`, and then `uv pip install` the packages from the generated `requirements.txt`. 
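In outline, the new flow reduces to two steps (a simplified sketch; the
real invocation in tpu/Dockerfile below adds the --find-links, prerelease,
and emit flags):

```
# Resolve one consistent closure from the inputs, then install exactly that.
uv pip compile /requirements.in --output-file /requirements.txt
uv pip install --system -r /requirements.txt
```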
--- tpu/Dockerfile | 33 ++++++++++++++++++----- tpu/{requirements.txt => requirements.in} | 7 +++-- 2 files changed, 29 insertions(+), 11 deletions(-) rename tpu/{requirements.txt => requirements.in} (82%) diff --git a/tpu/Dockerfile b/tpu/Dockerfile index fd0c0684..343443ae 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -34,20 +34,39 @@ RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y # Additional useful packages should be added in the requirements.txt # Bring in the requirements.txt and replace variables in it: RUN apt-get install -y gettext -ADD tpu/requirements.txt /kaggle_requirements.txt -RUN envsubst < /kaggle_requirements.txt > /requirements.txt +ADD tpu/requirements.in /kaggle_requirements.in +RUN envsubst < /kaggle_requirements.in > /requirements.in # Install uv and then install the requirements: RUN curl -LsSf https://astral.sh/uv/install.sh | sh -RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt --prerelease=allow --force-reinstall && \ +RUN export PATH="${HOME}/.local/bin:${PATH}" && \ + uv pip compile --system --prerelease=allow \ + --verbose \ + --upgrade \ + --find-links=https://storage.googleapis.com/jax-releases/libtpu_releases.html \ + --find-links=https://storage.googleapis.com/libtpu-releases/index.html \ + --find-links=https://storage.googleapis.com/libtpu-wheels/index.html \ + --find-links=https://download.pytorch.org/whl/torch_stable.html \ + --emit-find-links \ + --no-emit-package pip \ + --no-emit-package setuptools \ + --output-file /requirements.txt \ + /requirements.in && \ + uv pip install --system --prerelease=allow --force-reinstall \ + -r /requirements.txt && \ + uv cache clean && \ /tmp/clean-layer.sh ENV PATH="~/.local/bin:${PATH}" -# Try to force tensorflow to reliably install without breaking other installed deps +# We install a libtpu version compatible with both jax 0.7.2 and torch 2.8.0. +# Why? tunix latest -> flax 0.12 -> jax 0.7.2 -> libtpu 0.0.23. However, that +# libtpu causes pjrt api errors for torch 2.8.0. screenshot/5heUtdyaJ4MmR3D +# https://github.com/pytorch/xla/blob/d517649bdef6ab0519c30c704bde8779c8216502/setup.py#L111 +# https://github.com/jax-ml/jax/blob/3489529b38d1f11d1e5caf4540775aadd5f2cdda/setup.py#L26 RUN export PATH="${HOME}/.local/bin:${PATH}" && \ - uv pip freeze --system > /tmp/constraints.txt && \ - uv pip install --system -c /tmp/constraints.txt tensorflow-tpu -f https://storage.googleapis.com/libtpu-tf-releases/index.html --force-reinstall && \ - rm /tmp/constraints.txt + uv pip install --system --force-reinstall libtpu==0.0.17 && \ + uv cache clean && \ + /tmp/clean-layer.sh # Kaggle Model Hub patches: ADD patches/kaggle_module_resolver.py /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/kaggle_module_resolver.py diff --git a/tpu/requirements.txt b/tpu/requirements.in similarity index 82% rename from tpu/requirements.txt rename to tpu/requirements.in index f31a2e51..3991c7d3 100644 --- a/tpu/requirements.txt +++ b/tpu/requirements.in @@ -1,8 +1,8 @@ # TPU Utils tpu-info # Tensorflow packages -tensorflow-tpu==${TENSORFLOW_VERSION} ---find-links https://storage.googleapis.com/libtpu-tf-releases/index.html +# TODO: b/447621961 - re-enable tensorflow-tpu when a compatible libtpu can be found. 
+tensorflow-cpu==${TENSORFLOW_VERSION} tensorflow_hub tensorflow-io tensorflow-probability @@ -13,8 +13,7 @@ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TOR torchaudio==${TORCHAUDIO_VERSION} torchvision==${TORCHVISION_VERSION} # Jax packages -jax[tpu]>=0.5.2 ---find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html +jax[tpu] distrax flax git+https://github.com/deepmind/dm-haiku From 0898ca48ac0b34bf2966e77585e6c41ab5d7bb7a Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Wed, 1 Oct 2025 20:51:36 +0000 Subject: [PATCH 58/67] Switch to newer libtpu/tunix and cpu-based torch for tpu. http://b/436838265 --- tpu/Dockerfile | 10 ---------- tpu/requirements.in | 6 +++--- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/tpu/Dockerfile b/tpu/Dockerfile index 343443ae..fe2d065d 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -58,16 +58,6 @@ RUN export PATH="${HOME}/.local/bin:${PATH}" && \ /tmp/clean-layer.sh ENV PATH="~/.local/bin:${PATH}" -# We install a libtpu version compatible with both jax 0.7.2 and torch 2.8.0. -# Why? tunix latest -> flax 0.12 -> jax 0.7.2 -> libtpu 0.0.23. However, that -# libtpu causes pjrt api errors for torch 2.8.0. screenshot/5heUtdyaJ4MmR3D -# https://github.com/pytorch/xla/blob/d517649bdef6ab0519c30c704bde8779c8216502/setup.py#L111 -# https://github.com/jax-ml/jax/blob/3489529b38d1f11d1e5caf4540775aadd5f2cdda/setup.py#L26 -RUN export PATH="${HOME}/.local/bin:${PATH}" && \ - uv pip install --system --force-reinstall libtpu==0.0.17 && \ - uv cache clean && \ - /tmp/clean-layer.sh - # Kaggle Model Hub patches: ADD patches/kaggle_module_resolver.py /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/kaggle_module_resolver.py RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py diff --git a/tpu/requirements.in b/tpu/requirements.in index 3991c7d3..1fceeebb 100644 --- a/tpu/requirements.in +++ b/tpu/requirements.in @@ -8,10 +8,10 @@ tensorflow-io tensorflow-probability tensorflow_datasets # Torch packages -torch==${TORCH_VERSION} +https://download.pytorch.org/whl/cpu/torch-${TORCH_VERSION}%2Bcpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +https://download.pytorch.org/whl/cpu/torchaudio-${TORCHAUDIO_VERSION}%2Bcpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +https://download.pytorch.org/whl/cpu/torchvision-${TORCHVISION_VERSION}%2Bcpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl -torchaudio==${TORCHAUDIO_VERSION} -torchvision==${TORCHVISION_VERSION} # Jax packages jax[tpu] distrax From 221ec49fe771668afc2026f4e0ae9a85b817a777 Mon Sep 17 00:00:00 2001 From: Dustin Herbison Date: Wed, 1 Oct 2025 21:54:01 +0000 Subject: [PATCH 59/67] Re-add the libtpu pin to make torch and jax work together again... 
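For reference, the dual-stack check this pin is meant to keep green (a
hypothetical smoke test, assuming it runs on a TPU VM):

```
# Both frameworks should initialize against the same pinned libtpu.
python -c 'import jax; print(jax.devices())'
python -c 'import torch_xla.core.xla_model as xm; print(xm.xla_device())'
```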
http://b/436838265 --- tpu/Dockerfile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tpu/Dockerfile b/tpu/Dockerfile index fe2d065d..343443ae 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -58,6 +58,16 @@ RUN export PATH="${HOME}/.local/bin:${PATH}" && \ /tmp/clean-layer.sh ENV PATH="~/.local/bin:${PATH}" +# We install a libtpu version compatible with both jax 0.7.2 and torch 2.8.0. +# Why? tunix latest -> flax 0.12 -> jax 0.7.2 -> libtpu 0.0.23. However, that +# libtpu causes pjrt api errors for torch 2.8.0. screenshot/5heUtdyaJ4MmR3D +# https://github.com/pytorch/xla/blob/d517649bdef6ab0519c30c704bde8779c8216502/setup.py#L111 +# https://github.com/jax-ml/jax/blob/3489529b38d1f11d1e5caf4540775aadd5f2cdda/setup.py#L26 +RUN export PATH="${HOME}/.local/bin:${PATH}" && \ + uv pip install --system --force-reinstall libtpu==0.0.17 && \ + uv cache clean && \ + /tmp/clean-layer.sh + # Kaggle Model Hub patches: ADD patches/kaggle_module_resolver.py /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/kaggle_module_resolver.py RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py From f972e95de3e7eeec59f77b5be435bbed8ebab3a7 Mon Sep 17 00:00:00 2001 From: Sohier Dane Date: Wed, 29 Oct 2025 11:19:33 -0700 Subject: [PATCH 60/67] Update Dockerfile.tmpl (#1508) Drop support for BQ Helper, which is obsolete now that the official BigQuery library is more feature complete. [Context](https://chat.kaggle.net/kaggle/pl/egqzrknaz7dcfydjfrn1xiwame). --- Dockerfile.tmpl | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 7561a700..b0d0d111 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -146,16 +146,6 @@ RUN mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ /tmp/clean-layer.sh -# Fix to import bq_helper library without downgrading setuptools and upgrading protobuf -RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/src/BigQuery_Helper && \ - mkdir -p ~/src/BigQuery_Helper/bq_helper && \ - mv ~/src/BigQuery_Helper/bq_helper.py ~/src/BigQuery_Helper/bq_helper/__init__.py && \ - mv ~/src/BigQuery_Helper/test_helper.py ~/src/BigQuery_Helper/bq_helper/ && \ - sed -i 's/)/packages=["bq_helper"])/g' ~/src/BigQuery_Helper/setup.py && \ - uv pip install --system -e ~/src/BigQuery_Helper "protobuf<3.21"&& \ - /tmp/clean-layer.sh - - # install imagemagick for wand # https://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-debian-ubuntu RUN apt-get install libmagickwand-dev From dd0d0b39f079eca46f304fa480dc0c307f2a13e1 Mon Sep 17 00:00:00 2001 From: Jim Plotts Date: Wed, 29 Oct 2025 14:30:09 -0400 Subject: [PATCH 61/67] Changes to support agents (#1507) Removes automl implementation, which requires old protobuf (3.20.3). This enables it to be upgraded to a modern version, which is required by `google-adk[a2a]`. Also removes old unused `BigQuery_Helper`. Adds ADK `eval` and `a2a` as dependencies. 
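For reference, a quick way to confirm the protobuf unblock in the built
image (a hypothetical spot check, not part of this change):

```
# google-adk[a2a] needs a modern protobuf; the old automl pin held it at 3.20.x.
python -c 'import google.protobuf; print(google.protobuf.__version__)'
```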
http://b/455550872 --- Dockerfile.tmpl | 5 +---- kaggle_requirements.txt | 7 +++---- patches/kaggle_gcp.py | 32 -------------------------------- 3 files changed, 4 insertions(+), 40 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index b0d0d111..320d1dea 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -23,10 +23,7 @@ RUN uv pip install --system -r /requirements.txt RUN uv pip uninstall --system google-cloud-bigquery-storage # b/394382016: sigstore (dependency of kagglehub) requires a prerelease packages, installing separate. -# b/408284143: google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1, -# installed outside of kaggle_requirements.txt due to requiring an incompatibile version of protobuf. -RUN uv pip install --system --force-reinstall --prerelease=allow "kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.12" \ - "google-cloud-automl==1.0.1" +RUN uv pip install --system --force-reinstall --prerelease=allow "kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.12" # uv cannot install this in requirements.txt without --no-build-isolation # to avoid affecting the larger build, we'll post-install it. diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index c711869e..9bb07412 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -45,8 +45,8 @@ geojson # geopandas > v0.14.4 breaks learn tools geopandas==v0.14.4 gensim -# b/443054743 -google-adk +# b/443054743,b/455550872 +google-adk[a2a,eval] google-cloud-aiplatform # b/315753846: Unpin translate package. google-cloud-translate==3.12.1 @@ -152,5 +152,4 @@ wavio xgboost==2.0.3 xvfbwrapper ydata-profiling -# b/443054743: pinned as newer versions requires protobuf > 3.20.3 -ydf==0.9.0 +ydf diff --git a/patches/kaggle_gcp.py b/patches/kaggle_gcp.py index 2c8b64cc..64a4611f 100644 --- a/patches/kaggle_gcp.py +++ b/patches/kaggle_gcp.py @@ -253,37 +253,6 @@ def init_gcs(): KaggleKernelCredentials(target=GcpTarget.GCS)) return storage -def init_automl(): - from google.cloud import automl, automl_v1beta1 - if not is_user_secrets_token_set(): - return - - from kaggle_gcp import get_integrations - if not get_integrations().has_cloudai(): - return - - from kaggle_secrets import GcpTarget - from kaggle_gcp import KaggleKernelCredentials - kaggle_kernel_credentials = KaggleKernelCredentials(target=GcpTarget.CLOUDAI) - - # Patch the 2 GA clients: AutoMlClient and PreditionServiceClient - monkeypatch_client(automl.AutoMlClient, kaggle_kernel_credentials) - monkeypatch_client(automl.PredictionServiceClient, kaggle_kernel_credentials) - - # The AutoML client library exposes 3 different client classes (AutoMlClient, - # TablesClient, PredictionServiceClient), so patch each of them. - # The same KaggleKernelCredentials are passed to all of them. - # The GcsClient class is only used internally by TablesClient. - - # The beta version of the clients that are now GA are included here for now. - # They are deprecated and will be removed by 1 May 2020. - monkeypatch_client(automl_v1beta1.AutoMlClient, kaggle_kernel_credentials) - monkeypatch_client(automl_v1beta1.PredictionServiceClient, kaggle_kernel_credentials) - - # The TablesClient is still in beta, so this will not be deprecated until - # the TablesClient is GA. 
- monkeypatch_client(automl_v1beta1.TablesClient, kaggle_kernel_credentials) - def init_translation_v2(): from google.cloud import translate_v2 if not is_user_secrets_token_set(): @@ -379,7 +348,6 @@ def init_vision(): def init(): init_bigquery() init_gcs() - init_automl() init_translation_v2() init_translation_v3() init_natural_language() From 0d57d8ddae4d4a695d4866d4cade6d9d2078bd52 Mon Sep 17 00:00:00 2001 From: Dustin H Date: Wed, 29 Oct 2025 16:14:21 -0400 Subject: [PATCH 62/67] Add jupyter_server_proxy to requirements --- kaggle_requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index 9bb07412..c91dc073 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -66,6 +66,7 @@ jedi jupyter-lsp==1.5.1 # b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) jupyter_server==2.12.5 +jupyter_server_proxy jupyterlab jupyterlab-lsp # b/409363708: Ensure we have the update version, we can consider removing it once From 523ec52937102fd6375273179f0d87f25a83760f Mon Sep 17 00:00:00 2001 From: Johnny Chavez <64660690+calderjo@users.noreply.github.com> Date: Wed, 29 Oct 2025 14:35:45 -0700 Subject: [PATCH 63/67] Pin huggingface-hub (#1510) pytorch-lighting and transformer are throwing errors due to huggingface-hub requirements. let's pin huggingface-hub for now --- Dockerfile.tmpl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 320d1dea..ba164474 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -48,7 +48,8 @@ RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2" # b/404590350: Ray and torchtune have conflicting tune cli, we will prioritize torchtune. # b/415358158: Gensim removed from Colab image to upgrade scipy -RUN uv pip install --system --force-reinstall --no-deps torchtune gensim "scipy<=1.15.3" +# b/456239669: remove huggingface-hub pin when pytorch-lighting and transformer are compatible +RUN uv pip install --system --force-reinstall --no-deps torchtune gensim "scipy<=1.15.3" "huggingface-hub==0.36.0" # Adding non-package dependencies: ADD clean-layer.sh /tmp/clean-layer.sh From 20bbb5c1e080dcc5c4790d55d348e1f2474ff077 Mon Sep 17 00:00:00 2001 From: Jim Plotts Date: Wed, 29 Oct 2025 18:56:47 -0400 Subject: [PATCH 64/67] Clean up missed references to AutoML (#1509) --- patches/sitecustomize.py | 1 - tests/test_automl.py | 139 --------------------------------------- 2 files changed, 140 deletions(-) delete mode 100644 tests/test_automl.py diff --git a/patches/sitecustomize.py b/patches/sitecustomize.py index e8afb361..4caa5d28 100644 --- a/patches/sitecustomize.py +++ b/patches/sitecustomize.py @@ -56,7 +56,6 @@ def create_module(self, spec): _LOADERS = { 'google.cloud.bigquery': kaggle_gcp.init_bigquery, 'google.cloud.storage': kaggle_gcp.init_gcs, - 'google.cloud.automl_v1beta1': kaggle_gcp.init_automl, 'google.cloud.translate': kaggle_gcp.init_translation_v3, 'google.cloud.translate_v2': kaggle_gcp.init_translation_v2, 'google.cloud.translate_v3': kaggle_gcp.init_translation_v3, diff --git a/tests/test_automl.py b/tests/test_automl.py deleted file mode 100644 index 9a048b14..00000000 --- a/tests/test_automl.py +++ /dev/null @@ -1,139 +0,0 @@ -import unittest - -from unittest.mock import Mock, patch - -from kaggle_gcp import KaggleKernelCredentials, init_automl -from test.support.os_helper import EnvironmentVarGuard -from google.cloud import storage, automl_v1beta1, automl - -def _make_credentials(): - import 
google.auth.credentials - credentials = Mock(spec=google.auth.credentials.Credentials) - credentials.universe_domain = 'googleapis.com' - return credentials - -class TestAutoMl(unittest.TestCase): - - class FakeClient: - def __init__(self, credentials=None, client_info=None, **kwargs): - self.credentials = credentials - - class FakeConnection(): - def __init__(self, user_agent): - self.user_agent = user_agent - if (client_info is not None): - self._connection = FakeConnection(client_info.user_agent) - - @patch("google.cloud.automl.AutoMlClient", new=FakeClient) - def test_user_provided_credentials(self): - credentials = _make_credentials() - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - client = automl.AutoMlClient(credentials=credentials) - self.assertNotIsInstance(client.credentials, KaggleKernelCredentials) - self.assertIsNotNone(client.credentials) - - def test_tables_gcs_client(self): - # The GcsClient can't currently be monkeypatched for default - # credentials because it requires a project which can't be set. - # Verify that creating an automl_v1beta1.GcsClient given an actual - # storage.Client sets the client properly. - gcs_client = storage.Client(project="xyz", credentials=_make_credentials()) - tables_gcs_client = automl_v1beta1.GcsClient(client=gcs_client) - self.assertIs(tables_gcs_client.client, gcs_client) - - @patch("google.cloud.automl_v1beta1.gapic.auto_ml_client.AutoMlClient", new=FakeClient) - def test_tables_client_credentials(self): - credentials = _make_credentials() - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - tables_client = automl_v1beta1.TablesClient(credentials=credentials) - self.assertEqual(tables_client.auto_ml_client.credentials, credentials) - - @patch("google.cloud.automl.AutoMlClient", new=FakeClient) - def test_default_credentials_automl_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - automl_client = automl.AutoMlClient() - self.assertIsNotNone(automl_client.credentials) - self.assertIsInstance(automl_client.credentials, KaggleKernelCredentials) - self.assertTrue(automl_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl_v1beta1.AutoMlClient", new=FakeClient) - def test_default_credentials_automl_v1beta1_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - automl_client = automl_v1beta1.AutoMlClient() - self.assertIsNotNone(automl_client.credentials) - self.assertIsInstance(automl_client.credentials, KaggleKernelCredentials) - self.assertTrue(automl_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl_v1beta1.TablesClient", new=FakeClient) - def test_default_credentials_tables_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - tables_client = automl_v1beta1.TablesClient() - self.assertIsNotNone(tables_client.credentials) - self.assertIsInstance(tables_client.credentials, KaggleKernelCredentials) - 
self.assertTrue(tables_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl.PredictionServiceClient", new=FakeClient) - def test_default_credentials_prediction_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - prediction_client = automl.PredictionServiceClient() - self.assertIsNotNone(prediction_client.credentials) - self.assertIsInstance(prediction_client.credentials, KaggleKernelCredentials) - self.assertTrue(prediction_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl_v1beta1.PredictionServiceClient", new=FakeClient) - def test_default_credentials_prediction_v1beta1_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - prediction_client = automl_v1beta1.PredictionServiceClient() - self.assertIsNotNone(prediction_client.credentials) - self.assertIsInstance(prediction_client.credentials, KaggleKernelCredentials) - self.assertTrue(prediction_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - def test_monkeypatching_idempotent(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - client1 = automl.AutoMlClient.__init__ - init_automl() - client2 = automl.AutoMlClient.__init__ - self.assertEqual(client1, client2) - - @patch("google.cloud.automl_v1beta1.PredictionServiceClient", new=FakeClient) - def test_legacy_AUTOML_variable_v1beta1_client(self): - """ - Tests previous KAGGLE_KERNEL_INTEGRATIONS="AUTOML" environment setting - """ - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'AUTOML') - with env: - prediction_client = automl_v1beta1.PredictionServiceClient() - self.assertIsNotNone(prediction_client.credentials) - self.assertIsInstance(prediction_client.credentials, KaggleKernelCredentials) - self.assertTrue(prediction_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) \ No newline at end of file From 0cb222723ba71fa79c9cf0694297586f53840f49 Mon Sep 17 00:00:00 2001 From: Johnny Chavez <64660690+calderjo@users.noreply.github.com> Date: Wed, 29 Oct 2025 16:14:48 -0700 Subject: [PATCH 65/67] Remove even more references to AutoMl and Bq (#1511) --- patches/sitecustomize.py | 1 - tests/test_imports.py | 1 - 2 files changed, 2 deletions(-) diff --git a/patches/sitecustomize.py b/patches/sitecustomize.py index 4caa5d28..b8ae0692 100644 --- a/patches/sitecustomize.py +++ b/patches/sitecustomize.py @@ -13,7 +13,6 @@ class GcpModuleFinder(importlib.abc.MetaPathFinder): _MODULES = [ 'google.cloud.bigquery', 'google.cloud.storage', - 'google.cloud.automl_v1beta1', 'google.cloud.translate', 'google.cloud.translate_v2', 'google.cloud.translate_v3', diff --git a/tests/test_imports.py b/tests/test_imports.py index b22ebe7a..6c429516 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -3,6 +3,5 @@ class TestImport(unittest.TestCase): # Basic import tests for packages without any. 
def test_basic(self): - import bq_helper import tensorflow_datasets import segment_anything From 708186076f37107ff322fa2b31ca8e34e6bdacea Mon Sep 17 00:00:00 2001 From: Johnny Chavez <64660690+calderjo@users.noreply.github.com> Date: Fri, 31 Oct 2025 02:35:32 -0700 Subject: [PATCH 66/67] move cloud-translate (#1512) --- Dockerfile.tmpl | 3 ++- kaggle_requirements.txt | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index ba164474..20acac5c 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -49,7 +49,8 @@ RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2" # b/404590350: Ray and torchtune have conflicting tune cli, we will prioritize torchtune. # b/415358158: Gensim removed from Colab image to upgrade scipy # b/456239669: remove huggingface-hub pin when pytorch-lighting and transformer are compatible -RUN uv pip install --system --force-reinstall --no-deps torchtune gensim "scipy<=1.15.3" "huggingface-hub==0.36.0" +# b/315753846: Unpin translate package, currently conflicts with adk 1.17.0 +RUN uv pip install --system --force-reinstall --no-deps torchtune gensim "scipy<=1.15.3" "huggingface-hub==0.36.0" "google-cloud-translate==3.12.1" # Adding non-package dependencies: ADD clean-layer.sh /tmp/clean-layer.sh diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt index c91dc073..503d37f9 100644 --- a/kaggle_requirements.txt +++ b/kaggle_requirements.txt @@ -48,8 +48,6 @@ gensim # b/443054743,b/455550872 google-adk[a2a,eval] google-cloud-aiplatform -# b/315753846: Unpin translate package. -google-cloud-translate==3.12.1 google-cloud-videointelligence google-cloud-vision google-genai From e756e929cb98b6c8ef6e016c0c0f917536c4c871 Mon Sep 17 00:00:00 2001 From: Jim Plotts Date: Mon, 3 Nov 2025 12:12:52 -0500 Subject: [PATCH 67/67] Remove custom logging. (#1514) This is very old and was probably mostly useful while the GCP integrations were in development. However, it interferes with the user's logging in a way that's peculiar to Kaggle, so let's remove it to make Kaggle more similar to other platforms. http://b/455836683 --- Dockerfile.tmpl | 1 - patches/kaggle_gcp.py | 33 +++++----- patches/log.py | 133 --------------------------------------- patches/sitecustomize.py | 3 +- 4 files changed, 17 insertions(+), 153 deletions(-) delete mode 100644 patches/log.py diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index 20acac5c..ba382531 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -164,7 +164,6 @@ ADD patches/kaggle_gcp.py \ patches/kaggle_session.py \ patches/kaggle_web_client.py \ patches/kaggle_datasets.py \ - patches/log.py \ $PACKAGE_PATH/ # Figure out why this is in a different place? 
diff --git a/patches/kaggle_gcp.py b/patches/kaggle_gcp.py index 64a4611f..4cb98858 100644 --- a/patches/kaggle_gcp.py +++ b/patches/kaggle_gcp.py @@ -1,5 +1,6 @@ import os import inspect +import logging from google.auth import credentials, environment_vars from google.auth.exceptions import RefreshError from google.api_core.gapic_v1.client_info import ClientInfo @@ -8,8 +9,6 @@ from google.cloud.bigquery._http import Connection from kaggle_secrets import GcpTarget, UserSecretsClient -from log import Log - KAGGLE_GCP_CLIENT_USER_AGENT="kaggle-gcp-client/1.0" def get_integrations(): @@ -22,7 +21,7 @@ def get_integrations(): target = GcpTarget[integration.upper()] kernel_integrations.add_integration(target) except KeyError as e: - Log.error(f"Unknown integration target: {integration.upper()}") + logging.debug(f"Unknown integration target: {integration.upper()}") return kernel_integrations @@ -66,14 +65,14 @@ def refresh(self, request): elif self.target == GcpTarget.CLOUDAI: self.token, self.expiry = client._get_cloudai_access_token() except ConnectionError as e: - Log.error(f"Connection error trying to refresh access token: {e}") + logging.error(f"Connection error trying to refresh access token: {e}") print("There was a connection error trying to fetch the access token. " f"Please ensure internet is on in order to use the {self.target.service} Integration.") raise RefreshError('Unable to refresh access token due to connection error.') from e except Exception as e: - Log.error(f"Error trying to refresh access token: {e}") + logging.error(f"Error trying to refresh access token: {e}") if (not get_integrations().has_integration(self.target)): - Log.error(f"No {self.target.service} integration found.") + logging.error(f"No {self.target.service} integration found.") print( f"Please ensure you have selected a {self.target.service} account in the Notebook Add-ons menu.") raise RefreshError('Unable to refresh access token.') from e @@ -102,7 +101,7 @@ def api_request(self, *args, **kwargs): msg = ("Permission denied using Kaggle's public BigQuery integration. " "Did you mean to select a BigQuery account in the Notebook Add-ons menu?") print(msg) - Log.info(msg) + logging.info(msg) raise e @@ -156,23 +155,23 @@ def monkeypatch_bq(bq_client, *args, **kwargs): # Remove these two lines once this is resolved: # https://github.com/googleapis/google-cloud-python/issues/8108 if explicit_project_id: - Log.info(f"Explicit project set to {explicit_project_id}") + logging.info(f"Explicit project set to {explicit_project_id}") kwargs['project'] = explicit_project_id if explicit_project_id is None and specified_credentials is None and not has_bigquery: msg = "Using Kaggle's public dataset BigQuery integration." 
- Log.info(msg) + logging.info(msg) print(msg) return PublicBigqueryClient(*args, **kwargs) else: if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") kwargs['credentials'] = KaggleKernelCredentials() if (not has_bigquery): - Log.info("No bigquery integration found, creating client anyways.") + logging.info("No bigquery integration found, creating client anyways.") print('Please ensure you have selected a BigQuery ' 'account in the Notebook Add-ons menu.') if explicit_project_id is None: - Log.info("No project specified while using the unmodified client.") + logging.info("No project specified while using the unmodified client.") print('Please ensure you specify a project id when creating the client' ' in order to use your BigQuery account.') kwargs['client_info'] = set_kaggle_user_agent(kwargs.get('client_info')) @@ -196,20 +195,20 @@ def monkeypatch_aiplatform_init(aiplatform_klass, kaggle_kernel_credentials): def patched_init(*args, **kwargs): specified_credentials = kwargs.get('credentials') if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") kwargs['credentials'] = kaggle_kernel_credentials return aiplatform_init(*args, **kwargs) if (not has_been_monkeypatched(aiplatform_klass.init)): aiplatform_klass.init = patched_init - Log.info("aiplatform.init patched") + logging.info("aiplatform.init patched") def monkeypatch_client(client_klass, kaggle_kernel_credentials): client_init = client_klass.__init__ def patched_init(self, *args, **kwargs): specified_credentials = kwargs.get('credentials') if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") # Some GCP services demand the billing and target project must be the same. # To avoid using default service account based credential as caller credential # user need to provide ClientOptions with quota_project_id: @@ -227,7 +226,7 @@ def patched_init(self, *args, **kwargs): if (not has_been_monkeypatched(client_klass.__init__)): client_klass.__init__ = patched_init - Log.info(f"Client patched: {client_klass}") + logging.info(f"Client patched: {client_klass}") def set_kaggle_user_agent(client_info: ClientInfo): # Add kaggle client user agent in order to attribute usage. @@ -360,4 +359,4 @@ def init(): # google.cloud.* and kaggle_gcp. By calling init here, we guarantee # that regardless of the original import that caused google.cloud.* to be # loaded, the monkeypatching will be done. -init() +init() \ No newline at end of file diff --git a/patches/log.py b/patches/log.py deleted file mode 100644 index 59a07c8c..00000000 --- a/patches/log.py +++ /dev/null @@ -1,133 +0,0 @@ -import io -import logging -import os - -import google.auth - - -_LOG_TO_FILE_ENV = os.getenv("KAGGLE_LOG_TO_FILE") - - -class _LogFormatter(logging.Formatter): - """A logging formatter which truncates long messages.""" - - _MAX_LOG_LENGTH = 10000 # Be generous, not to truncate long backtraces. 
- - def format(self, record): - msg = super(_LogFormatter, self).format(record) - return msg[:_LogFormatter._MAX_LOG_LENGTH] if msg else msg - -# TODO(vimota): Clean this up once we're using python 3.8 and can use -# (https://github.com/python/cpython/commit/dde9fdbe453925279ac3d2a6a72102f6f9ef247c) -# Right now, making the logging module display the intended frame's information -# when the logging calls (info, warn, ...) are wrapped (as is the case in our -# Log class) involves fragile logic. -class _Logger(logging.Logger): - - # This is a copy of logging.Logger.findCaller with the filename ignore - # set expanded to include the current filename (".../log.py"). - # Copyright 2001-2015 by Vinay Sajip. All Rights Reserved. - # License: https://github.com/python/cpython/blob/ce9e62544571e7ade7186697d5dd065fb4c5243f/LICENSE - def findCaller(self, stack_info=False, stacklevel=1): - f = logging.currentframe() - f = f.f_back - rv = "(unknown file)", 0, "(unknown function)", None - while hasattr(f, "f_code"): - co = f.f_code - filename = os.path.normcase(co.co_filename) - if filename in _ignore_srcfiles: - f = f.f_back - continue - sinfo = None - if stack_info: - sio = io.StringIO() - sio.write('Stack (most recent call last):\n') - traceback.print_stack(f, file=sio) - sinfo = sio.getvalue() - if sinfo[-1] == '\n': - sinfo = sinfo[:-1] - sio.close() - rv = (co.co_filename, f.f_lineno, co.co_name, sinfo) - break - return rv - - -_srcfile = os.path.normcase(_Logger.findCaller.__code__.co_filename) -_ignore_srcfiles = (_srcfile, logging._srcfile) - -class Log: - """ Helper aggregate for all things related to logging activity. """ - - _GLOBAL_LOG = logging.getLogger("") - _initialized = False - - # These are convenience helpers. For performance, consider saving Log.get_logger() and using that - @staticmethod - def critical(msg, *args, **kwargs): - Log._GLOBAL_LOG.critical(msg, *args, **kwargs) - - @staticmethod - def fatal(msg, *args, **kwargs): - Log._GLOBAL_LOG.fatal(msg, *args, **kwargs) - - @staticmethod - def exception(msg, *args, **kwargs): - Log._GLOBAL_LOG.exception(msg, *args, **kwargs) - - @staticmethod - def error(msg, *args, **kwargs): - Log._GLOBAL_LOG.error(msg, *args, **kwargs) - - @staticmethod - def warn(msg, *args, **kwargs): - Log._GLOBAL_LOG.warn(msg, *args, **kwargs) - - @staticmethod - def warning(msg, *args, **kwargs): - Log._GLOBAL_LOG.warning(msg, *args, **kwargs) - - @staticmethod - def debug(msg, *args, **kwargs): - Log._GLOBAL_LOG.debug(msg, *args, **kwargs) - - @staticmethod - def info(msg, *args, **kwargs): - Log._GLOBAL_LOG.info(msg, *args, **kwargs) - - @staticmethod - def set_level(loglevel): - if isinstance(loglevel, int): - Log._GLOBAL_LOG.setLevel(loglevel) - return - elif isinstance(loglevel, str): - # idea from https://docs.python.org/3.5/howto/logging.html#logging-to-a-file - numeric_level = getattr(logging, loglevel.upper(), None) - if isinstance(numeric_level, int): - Log._GLOBAL_LOG.setLevel(numeric_level) - return - - raise ValueError('Invalid log level: %s' % loglevel) - - @staticmethod - def _static_init(): - if Log._initialized: - return - - logging.setLoggerClass(_Logger) - # The root logger's type is unfortunately (and surprisingly) not affected by - # `setLoggerClass`. Monkey patch it instead. TODO(vimota): Remove this, see the TODO - # associated with _Logger. 
- logging.RootLogger.findCaller = _Logger.findCaller - log_to_file = _LOG_TO_FILE_ENV.lower() in ("yes", "true", "t", "1") if _LOG_TO_FILE_ENV is not None else True - if log_to_file: - handler = logging.FileHandler(filename='/tmp/kaggle.log', mode='w') - else: - handler = logging.StreamHandler() - - # ".1s" is for the first letter: http://stackoverflow.com/a/27453084/1869. - format_string = "%(asctime)s %(levelname).1s %(process)d %(filename)s:%(lineno)d] %(message)s" - handler.setFormatter(_LogFormatter(format_string)) - logging.basicConfig(level=logging.INFO, handlers=[handler]) - Log._initialized = True - -Log._static_init() diff --git a/patches/sitecustomize.py b/patches/sitecustomize.py index b8ae0692..1bb8a1b6 100644 --- a/patches/sitecustomize.py +++ b/patches/sitecustomize.py @@ -1,7 +1,6 @@ +import logging import os -from log import Log - import sys import importlib.abc import importlib
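
For context on the sitecustomize.py hook whose imports are trimmed above:
GcpModuleFinder is a MetaPathFinder that watches for the first import of the
modules named in _MODULES so the credential monkeypatching can run right
after the module loads. A rough, self-contained sketch of that mechanism,
using only the stdlib — here json stands in for a watched module, and the
print stands in for the kaggle_gcp.init() call made by the real hook:

    import importlib.abc
    import importlib.util
    import sys

    _WATCHED = {"json"}  # stand-in; the real _MODULES list names google.cloud.* packages

    class _PatchingLoader(importlib.abc.Loader):
        """Runs the real loader, then fires a post-import hook."""
        def __init__(self, wrapped):
            self._wrapped = wrapped

        def create_module(self, spec):
            return self._wrapped.create_module(spec)

        def exec_module(self, module):
            self._wrapped.exec_module(module)
            # The real hook calls kaggle_gcp.init() here to monkeypatch clients.
            print(f"post-import hook ran for {module.__name__}")

    class PatchingFinder(importlib.abc.MetaPathFinder):
        def find_spec(self, fullname, path, target=None):
            if fullname not in _WATCHED:
                return None
            # Step aside temporarily so find_spec() resolves the real module.
            sys.meta_path.remove(self)
            try:
                spec = importlib.util.find_spec(fullname)
            finally:
                sys.meta_path.insert(0, self)
            if spec is not None and spec.loader is not None:
                spec.loader = _PatchingLoader(spec.loader)
            return spec

    sys.meta_path.insert(0, PatchingFinder())
    import json  # triggers the hook on first load (no-op if already imported)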