diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
index f79a7e1e..ba382531 100644
--- a/Dockerfile.tmpl
+++ b/Dockerfile.tmpl
@@ -1,84 +1,70 @@
-ARG BASE_IMAGE_REPO
-ARG BASE_IMAGE_TAG
-ARG CPU_BASE_IMAGE_NAME
-ARG GPU_BASE_IMAGE_NAME
-ARG LIGHTGBM_VERSION
-ARG TORCH_VERSION
-ARG TORCHAUDIO_VERSION
-ARG TORCHTEXT_VERSION
-ARG TORCHVISION_VERSION
-ARG JAX_VERSION
+ARG BASE_IMAGE \
+    BASE_IMAGE_TAG

-{{ if eq .Accelerator "gpu" }}
-FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
-FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
-FROM gcr.io/kaggle-images/python-jaxlib-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${JAX_VERSION} AS jaxlib_whl
-FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
-{{ else }}
-FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
-{{ end }}
+FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG}

-# Ensures shared libraries installed with conda can be found by the dynamic link loader.
-ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
-ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
+ADD kaggle_requirements.txt /kaggle_requirements.txt

-{{ if eq .Accelerator "gpu" }}
-ARG CUDA_MAJOR_VERSION
-ARG CUDA_MINOR_VERSION
-ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
-ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
-# Make sure we are on the right version of CUDA
-RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
-# NVIDIA binaries from the host are mounted to /opt/bin.
-ENV PATH=/opt/bin:${PATH}
-# Add CUDA stubs to LD_LIBRARY_PATH to support building the GPU image on a CPU machine.
-ENV LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH"
-ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs"
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
-{{ end }}
+# Freeze existing requirements from the base image for critical packages:
+RUN pip freeze | grep -E 'tensorflow|keras|torch|jax' > /colab_requirements.txt
+
+# Merge requirements files:
+RUN cat /colab_requirements.txt >> /requirements.txt
+RUN cat /kaggle_requirements.txt >> /requirements.txt
+
+# TODO: GPU requirements.txt
+# TODO: merge them better (override matching ones).
+
+# Install Kaggle packages
+RUN uv pip install --system -r /requirements.txt
+
+# Install manual packages:
+# b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data.
+RUN uv pip uninstall --system google-cloud-bigquery-storage
+
+# b/394382016: sigstore (a dependency of kagglehub) requires prerelease packages, so install it separately.
+RUN uv pip install --system --force-reinstall --prerelease=allow "kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.12"
+
+# uv cannot install this from requirements.txt without --no-build-isolation;
+# to avoid affecting the larger build, we post-install it.
+RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/learntools"

-# Keep these variables in sync if base image is updated.
-ENV TENSORFLOW_VERSION=2.15.0
-# See https://github.com/tensorflow/io#tensorflow-version-compatibility
-ENV TENSORFLOW_IO_VERSION=0.35.0
-
-# We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
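Note on the merge step above: the two `cat` commands only concatenate the frozen Colab pins with `kaggle_requirements.txt`, and the TODO acknowledges that matching entries should eventually override one another. A minimal sketch of what that override could look like, assuming plain `name==version` style pins and no environment markers (the `merge_requirements.py` helper name is hypothetical and not part of this change):

# merge_requirements.py (hypothetical helper): later files win on name collisions.
import re
import sys

def req_name(line: str) -> str:
    # Normalize "Package[extra]==1.2.3" -> "package"; good enough for plain pins.
    return re.split(r"[\[<>=!~ ]", line.strip(), maxsplit=1)[0].lower().replace("_", "-")

def merge(paths):
    merged = {}
    for path in paths:  # later paths (e.g. kaggle_requirements.txt) override earlier ones
        for line in open(path):
            line = line.strip()
            if line and not line.startswith("#"):
                merged[req_name(line)] = line
    return merged.values()

if __name__ == "__main__":
    print("\n".join(merge(sys.argv[1:])))

Such a helper could replace the two `cat` lines with something like `python merge_requirements.py /colab_requirements.txt /kaggle_requirements.txt > /requirements.txt`, so the Kaggle pins win when both files mention the same package.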
-# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
-ARG LIGHTGBM_VERSION
-ARG TORCH_VERSION
-ARG TORCHAUDIO_VERSION
-ARG TORCHTEXT_VERSION
-ARG TORCHVISION_VERSION
-ARG JAX_VERSION
-
-# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0
-# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
-ENV KMP_WARNINGS=0
-# Also make the KMP logs noverbose.
-# https://stackoverflow.com/questions/70250304/stop-tensorflow-from-printing-warning-message
-ENV KMP_SETTINGS=false
-
-# Remove the pip as the root user warning.
-ENV PIP_ROOT_USER_ACTION=ignore
+# b/408281617: Torch is adamant that it cannot install cudnn 9.3.x, only 9.1.x, but TensorFlow can only support 9.3.x.
+# This conflict causes a number of package downgrades, which are handled in this command.
+RUN uv pip install \
+    --index-url https://pypi.nvidia.com --extra-index-url https://pypi.org/simple/ --index-strategy unsafe-first-match \
+    --system --force-reinstall "cuml-cu12==25.2.1" \
+    "nvidia-cudnn-cu12==9.3.0.75" "nvidia-cublas-cu12==12.5.3.2" "nvidia-cusolver-cu12==11.6.3.83" \
+    "nvidia-cuda-cupti-cu12==12.5.82" "nvidia-cuda-nvrtc-cu12==12.5.82" "nvidia-cuda-runtime-cu12==12.5.82" \
+    "nvidia-cufft-cu12==11.2.3.61" "nvidia-curand-cu12==10.3.6.82" "nvidia-cusparse-cu12==12.5.1.3" \
+    "nvidia-nvjitlink-cu12==12.5.82"
+RUN uv pip install --system --force-reinstall "pynvjitlink-cu12==0.5.2"
+# b/385145217: Latest Colab lacks MKL numpy; install it.
+RUN uv pip install --system --force-reinstall -i https://pypi.anaconda.org/intel/simple numpy
+
+# Newer daal4py requires tbb>=2022, but libpysal downgrades it for some reason.
+RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2"
+
+# b/404590350: Ray and torchtune have conflicting `tune` CLIs; we prioritize torchtune.
+# b/415358158: Gensim removed from the Colab image to upgrade scipy.
+# b/456239669: Remove the huggingface-hub pin when pytorch-lightning and transformers are compatible.
+# b/315753846: Unpin the translate package; it currently conflicts with adk 1.17.0.
+RUN uv pip install --system --force-reinstall --no-deps torchtune gensim "scipy<=1.15.3" "huggingface-hub==0.36.0" "google-cloud-translate==3.12.1"
+
+# Adding non-package dependencies:

ADD clean-layer.sh /tmp/clean-layer.sh
-ADD patches/keras_patch.sh /tmp/keras_patch.sh
ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
ADD patches/template_conf.json /opt/kaggle/conf.json

-# b/276344496: Install specific version of boto3, because 1.26.103 is broken.
-RUN pip install boto3==1.26.100 && \
-    /tmp/clean-layer.sh
+# /opt/conda/lib/python3.11/site-packages
+ARG PACKAGE_PATH=/usr/local/lib/python3.11/dist-packages

+# Install GPU-specific non-pip packages.
{{ if eq .Accelerator "gpu" }}
-# b/200968891 Keeps horovod once torch is upgraded.
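Because `--force-reinstall` across mixed indexes can pull related packages up or down, it can help to read the versions that actually landed in the image back out with the standard library. A small sketch (package names taken from the pins above; run inside the built image):

# Print the installed versions of the CUDA-related wheels pinned above.
from importlib import metadata

pins = [
    "cuml-cu12",
    "nvidia-cudnn-cu12",
    "nvidia-cublas-cu12",
    "nvidia-cusolver-cu12",
    "nvidia-nvjitlink-cu12",
    "pynvjitlink-cu12",
]

for name in pins:
    try:
        print(f"{name}=={metadata.version(name)}")
    except metadata.PackageNotFoundError:
        print(f"{name} is not installed")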
-RUN pip uninstall -y horovod && \ - /tmp/clean-layer.sh +RUN uv pip install --system "pycuda" {{ end }} -# Update GPG key per documentation at https://cloud.google.com/compute/docs/troubleshooting/known-issues -RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - -RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - # Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections, # as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346 @@ -92,159 +78,21 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list & apt-get install -y graphviz && pip install graphviz && \ /tmp/clean-layer.sh -# b/128333086: Set PROJ_LIB to points to the proj4 cartographic library. -ENV PROJ_LIB=/opt/conda/share/proj - -# Install conda packages not available on pip. -# When using pip in a conda environment, conda commands should be ran first and then -# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/ -RUN conda config --add channels nvidia && \ - conda config --add channels rapidsai && \ - conda config --set solver libmamba && \ - # b/299991198 remove curl/libcurl install once DLVM base image includes version >= 7.86 - conda install -c conda-forge mamba curl libcurl && \ - # Base image channel order: conda-forge (highest priority), defaults. - # End state: rapidsai (highest priority), nvidia, conda-forge, defaults. - mamba install -y mkl cartopy imagemagick pyproj "shapely<2" && \ - /tmp/clean-layer.sh - -# Install spacy -# b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version. -# b/341938540: unistall grpc-cpp to allow >=v24.4 cudf and cuml to be installed. -{{ if eq .Accelerator "gpu" }} -RUN pip uninstall -y pyarrow && \ - mamba remove -y --force grpc-cpp && \ - mamba install -y -c conda-forge spacy cudf>=24.4 cuml>=24.4 cupy cuda-version=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install spacy && \ - /tmp/clean-layer.sh -{{ end}} - -# Install PyTorch -{{ if eq .Accelerator "gpu" }} -COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/ -RUN mamba install -y -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \ - pip install /tmp/torch/*.whl && \ - # b/255757999 openmp (libomp.so) is an dependency of libtorchtext and libtorchaudio but - mamba install -y openmp && \ - rm -rf /tmp/torch && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install \ - torch==$TORCH_VERSION+cpu \ - torchvision==$TORCHVISION_VERSION+cpu \ - torchaudio==$TORCHAUDIO_VERSION+cpu \ - torchtext==$TORCHTEXT_VERSION \ - -f https://download.pytorch.org/whl/torch_stable.html && \ - /tmp/clean-layer.sh -{{ end }} - -# Install LightGBM -{{ if eq .Accelerator "gpu" }} -COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/ -# Install OpenCL (required by LightGBM GPU version) -RUN apt-get install -y ocl-icd-libopencl1 clinfo && \ - mkdir -p /etc/OpenCL/vendors && \ - echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \ - pip install /tmp/lightgbm/*.whl && \ - rm -rf /tmp/lightgbm && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install lightgbm==$LIGHTGBM_VERSION && \ - /tmp/clean-layer.sh -{{ end }} - -# Install JAX -{{ if eq .Accelerator "gpu" }} -COPY --from=jaxlib_whl /tmp/whl/*.whl /tmp/jax/ -# b/319722433#comment9: Use pip wheels once versions matches our CUDA version. 
-RUN pip install /tmp/jax/*.whl jax==$JAX_VERSION && \ - /tmp/clean-layer.sh -{{ else }} -RUN pip install jax[cpu] && \ - /tmp/clean-layer.sh -{{ end }} - - -# Install GPU specific packages -{{ if eq .Accelerator "gpu" }} -# Install GPU-only packages -# No specific package for nnabla-ext-cuda 12.x minor versions. -RUN export PATH=/usr/local/cuda/bin:$PATH && \ - export CUDA_ROOT=/usr/local/cuda && \ - pip install pycuda \ - pynvrtc \ - pynvml && \ - /tmp/clean-layer.sh -{{ end }} - -# (b/308525631) Pin Matplotlib until seaborn can be upgraded -# to >0.13.0 (now it's stuck by a package conflict with ydata-profiling 4.5.1). -RUN JAXVER=$(pip freeze | grep -e "^jax==") && \ - pip install --upgrade \ - "matplotlib<3.8.0" \ - seaborn \ - python-dateutil dask dask-expr igraph \ - pyyaml joblib geopy mne pyshp \ - pandas \ - polars \ - flax \ - "${JAXVER}" && \ - /tmp/clean-layer.sh - -RUN apt-get update && \ - apt-get install -y default-jre && \ - /tmp/clean-layer.sh - -RUN pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && /tmp/clean-layer.sh - -# b/318672158 Use simply tensorflow-probability once > 0.23.0 is released. -RUN pip install \ - "tensorflow==${TENSORFLOW_VERSION}" \ - "tensorflow-io==${TENSORFLOW_IO_VERSION}" \ - git+https://github.com/tensorflow/probability.git@fbc5ebe9b1d343113fb917010096cfd88b32eecf \ - tensorflow_text \ - "tensorflow_hub>=0.16.0" \ - # b/331799280 remove once other packages over to dm-tre - optree \ - tf-keras && \ - /tmp/clean-layer.sh - -# b/318672158 Use simply tensorflow_decision_forests on next release, expected with tf 2.16 -RUN pip install tensorflow_decision_forests==1.8.1 --no-deps && \ - /tmp/clean-layer.sh - -RUN chmod +x /tmp/keras_patch.sh && \ - /tmp/keras_patch.sh +ADD patches/keras_internal.py \ + patches/keras_internal_test.py \ + $PACKAGE_PATH/tensorflow_decision_forests/keras/ -ADD patches/keras_internal.py /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_internal.py -ADD patches/keras_internal_test.py /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_internal_test.py - -# Remove "--no-deps" flag and "namex" package once Keras 3.* is included in our base image. -# We ignore dependencies since tf2.15 and Keras 3.* should work despite pip saying it won't. -# Currently, keras tries to install a nightly version of tf 2.16: https://github.com/keras-team/keras/blob/fe2f54aa5bc42fb23a96449cf90434ab9bb6a2cd/requirements.txt#L2 -RUN pip install --no-deps "keras>3" keras-cv keras-nlp namex && \ - /tmp/clean-layer.sh - -# b/328788268 libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'" -RUN pip install pysal "libpysal==4.9.2" - -# b/350573866 xgboost v2.1.0 breaks learntools RUN apt-get install -y libfreetype6-dev && \ - apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing && \ - pip install gensim \ - textblob \ - wordcloud \ - "xgboost==2.0.3" \ - pydot \ - hep_ml && \ - # NLTK Project datasets - mkdir -p /usr/share/nltk_data && \ + apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing + +# NLTK Project datasets +# b/408298750: We currently reinstall the package, because we get the following error: +# `AttributeError: module 'inspect' has no attribute 'formatargspec'. 
Did you mean: 'formatargvalues'?` +RUN uv pip install --system --force-reinstall "nltk>=3.9.1" +RUN mkdir -p /usr/share/nltk_data && \ # NLTK Downloader no longer continues smoothly after an error, so we explicitly list # the corpuses that work - # "yes | ..." answers yes to the retry prompt in case of an error. See b/133762095. - yes | python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ + python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \ basque_grammars biocreative_ppi bllip_wsj_no_aux \ book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \ comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \ @@ -253,326 +101,17 @@ RUN apt-get install -y libfreetype6-dev && \ masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \ mte_teip5 names nps_chat omw opinion_lexicon paradigms \ pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \ - pros_cons ptb punkt qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ + pros_cons ptb punkt punkt_tab qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \ sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \ state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \ twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \ - vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe && \ - # Stop-words - pip install stop-words \ - scikit-image && \ - /tmp/clean-layer.sh - -RUN pip install opencv-contrib-python opencv-python && \ - /tmp/clean-layer.sh - -# Pin scipy until we update JAX b/335003097 -RUN pip install "scipy==1.12.0" \ - # Scikit-learn accelerated library for x86 - "scikit-learn-intelex>=2023.0.1" \ - # HDF5 support - h5py \ - # PUDB, for local debugging convenience - pudb \ - imbalanced-learn \ - # Profiling and other utilities - line_profiler \ - bokeh \ - numba \ - datashader \ - # Boruta (python implementation) - Boruta && \ - # Pandoc is a dependency of deap - apt-get install -y pandoc && \ - pip install essentia + vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe RUN apt-get install -y git-lfs && \ - /tmp/clean-layer.sh - -# vtk with dependencies -RUN apt-get install -y libgl1-mesa-glx && \ - pip install vtk && \ - # xvfbwrapper with dependencies + # vtk dependencies + apt-get install -y libgl1-mesa-glx && \ + # xvfbwrapper dependencies apt-get install -y xvfb && \ - pip install xvfbwrapper && \ - /tmp/clean-layer.sh - -RUN rm -rf /opt/conda/lib/python3.10/site-packages/Shapely-1.8.5.post1.dist-info/ - -RUN pip install mpld3 \ - gpxpy \ - arrow \ - nilearn \ - nibabel \ - imgaug \ - preprocessing \ - path.py \ - Geohash && \ - pip install deap \ - # b/302136621 Fix eli5 import for learntools, newer version require scikit-learn > 1.3 - "tpot==0.12.1" \ - scikit-optimize \ - haversine \ - toolz cytoolz \ - plotly \ - hyperopt \ - fitter \ - langid \ - # Useful data exploration libraries (for missing data and generating reports) - missingno \ - pandas-profiling \ - s2sphere \ - bayesian-optimization \ - matplotlib-venn \ - pyldavis \ - mlxtend \ - altair \ - ImageHash \ - ecos \ - CVXcanon \ - pymc3 \ - imagecodecs \ - tifffile \ - spectral \ - descartes \ - geojson \ - pydicom \ - wavio \ - SimpleITK \ - hmmlearn \ - gplearn \ - squarify \ - fuzzywuzzy \ - python-louvain \ - pyexcel-ods \ - 
sklearn-pandas \ - stemming \ - # b/266272046 prophet 1.1.2 breaks the test - prophet==1.1.1 \ - # b/283847935 holidays >0.24 is broken - "holidays==0.24" \ - holoviews \ - geoviews \ - hypertools \ - mlens \ - scikit-multilearn \ - cleverhans \ - leven \ - catboost \ - folium \ - scikit-plot \ - fury dipy \ - plotnine \ - scikit-surprise \ - pymongo \ - geoplot \ - eli5 \ - kaggle \ - kagglehub \ - google-generativeai \ - pytest && \ - /tmp/clean-layer.sh - -RUN rm -rf /opt/conda/lib/python3.10/site-packages/numpy-1.23.5.dist-info* - # Add google PAIR-code Facets -RUN cd /opt/ && git clone https://github.com/PAIR-code/facets && cd facets/ && jupyter nbextension install facets-dist/ --user && \ - export PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ && \ - pip install kmodes --no-dependencies && \ - pip install librosa \ - polyglot \ - sentencepiece \ - cufflinks \ - lime \ - memory_profiler && \ - /tmp/clean-layer.sh - -RUN pip install cython \ - fasttext && \ - apt-get install -y libhunspell-dev && pip install hunspell -RUN pip install annoy \ - category_encoders && \ - # b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data. - pip uninstall -y google-cloud-bigquery-storage && \ - # google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1 - # After launch this should be installed from pip - pip install git+https://github.com/googleapis/python-aiplatform.git@mb-release \ - google-cloud-automl==1.0.1 \ - google-api-core==1.33.2 \ - google-cloud-bigquery \ - google-cloud-storage && \ - # Split these installations to avoid `pip._vendor.resolvelib.resolvers.ResolutionTooDeep: 200000` - # TODO(b/315753846) Unpin translate package. - pip install google-cloud-translate==3.12.1 \ - google-cloud-language==2.* \ - google-cloud-videointelligence==2.* \ - google-cloud-vision==2.* \ - protobuf==3.20.3 \ - ortools \ - scattertext \ - # Pandas data reader - pandas-datareader \ - wordsegment \ - emoji \ - # Add Japanese morphological analysis engine - janome \ - wfdb \ - vecstack \ - # yellowbrick machine learning visualization library - yellowbrick \ - mlcrate && \ - /tmp/clean-layer.sh - -# b/273059949 The pre-installed nbconvert is slow on html conversions and has to be force-uninstalled. -# b/274619697 learntools also requires a specific nbconvert right now -RUN rm -rf /opt/conda/lib/python3.10/site-packages/{nbconvert,nbclient,mistune,platformdirs}* - -# Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376 -# allennlp \ -RUN pip install bleach \ - certifi \ - cycler \ - decorator \ - entrypoints \ - html5lib \ - ipykernel \ - ipython \ - ipython-genutils \ - ipywidgets==7.7.1 \ - isoweek \ - jedi \ - jsonschema \ - jupyter-client \ - jupyter-console \ - jupyter-core \ - jupyterlab-lsp \ - MarkupSafe \ - mistune \ - nbformat \ - notebook \ - "nbconvert==6.4.5" \ - papermill \ - python-lsp-server[all] \ - olefile \ - kornia \ - pandas_summary \ - pandocfilters \ - pexpect \ - pickleshare \ - # TODO(b/290035631) unpin when EasyOCR did a release. 
- Pillow==9.5.0 && \ - # Install openslide and its python binding - apt-get install -y openslide-tools && \ - pip install openslide-python \ - ptyprocess \ - Pygments \ - pyparsing \ - pytz \ - PyYAML \ - pyzmq \ - qtconsole \ - six \ - terminado \ - tornado \ - tqdm \ - traitlets \ - wcwidth \ - webencodings \ - widgetsnbextension \ - # Require pyarrow newer than https://github.com/advisories/GHSA-5wvp-7f3h-6wmm - {{ if eq .Accelerator "gpu" }} pyarrow {{ else }} "pyarrow>=14.0.1" {{ end }} \ - feather-format \ - fastai - -RUN python -m spacy download en_core_web_sm && python -m spacy download en_core_web_lg && \ - apt-get update && apt-get install -y ffmpeg && \ - /tmp/clean-layer.sh - - ########### - # - # NEW CONTRIBUTORS: - # Please add new pip/apt installs in this block. Don't forget a "&& \" at the end - # of all non-final lines. Thanks! - # - ########### - -RUN rm /opt/conda/lib/python3.10/site-packages/google*/direct_url.json -RUN rm /opt/conda/lib/python3.10/site-packages/google*/REQUESTED - -# dlib has a libmkl incompatibility: -# test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8. -# Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2. -# nnabla breaks protobuf compatibiilty: -RUN pip install flashtext \ - wandb \ - # b/214080882 blake3 0.3.0 is not compatible with vaex. - blake3==0.2.1 \ - vaex \ - pyemd \ - pyupset \ - pympler \ - featuretools \ - #-e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper \ - git+https://github.com/Kaggle/learntools \ - ray \ - gym \ - pyarabic \ - pandasql \ - # b/302136621 Fix eli5 import for learntools - scikit-learn==1.2.2 \ - hpsklearn \ - kmapper \ - # b/329869023 shap 0.45.0 breaks learntools - shap==0.44.1 \ - cesium \ - rgf_python \ - jieba \ - # ggplot is broken and main repo does not merge and release https://github.com/yhat/ggpy/pull/668 - https://github.com/hbasria/ggpy/archive/0.11.5.zip \ - tsfresh \ - pykalman \ - optuna \ - plotly_express \ - albumentations \ - accelerate \ - # b/290207097 switch back to the pip catalyst package when bug fixed - # https://github.com/catalyst-team/catalyst/issues/1440 - git+https://github.com/Philmod/catalyst.git@fix-fp16#egg=catalyst \ - osmnx && \ - apt-get -y install libspatialindex-dev - -RUN pip install pytorch-ignite \ - qgrid \ - bqplot \ - earthengine-api \ - transformers \ - datasets \ - s3fs \ - gcsfs \ - kaggle-environments \ - geopandas \ - "shapely<2" \ - vowpalwabbit \ - pydub \ - pydegensac \ - torchmetrics \ - pytorch-lightning \ - sympy \ - # flask is used by agents in the simulation competitions. - flask \ - # pycrypto is used by competitions team. - pycryptodome \ - easyocr \ - # ipympl adds interactive widget support for matplotlib - ipympl==0.7.0 \ - onnx \ - tables \ - openpyxl \ - timm \ - torchinfo && \ - pip install git+https://github.com/facebookresearch/segment-anything.git && \ - # b/343971718: remove duplicate aiohttp installs, and reinstall it - rm -rf /opt/conda/lib/python3.10/site-packages/aiohttp* && \ - mamba install --force-reinstall -y aiohttp && \ /tmp/clean-layer.sh # Download base easyocr models. 
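The EasyOCR model pre-download that follows exists so OCR works in notebooks without network access. A hedged usage sketch, assuming the documented `easyocr.Reader` arguments and the model directory used below (the input image path is a placeholder):

# Minimal sketch: use the pre-downloaded detection/recognition models without
# hitting the network; model_storage_directory matches the path baked into the image.
import easyocr

reader = easyocr.Reader(
    ["en"],
    model_storage_directory="/root/.EasyOCR/model",
    download_enabled=False,  # models were fetched at image build time
)
print(reader.readtext("/kaggle/input/some-image.png", detail=0))  # placeholder path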
@@ -590,119 +129,61 @@ RUN mkdir -p /root/.EasyOCR/model && \ /tmp/clean-layer.sh # Tesseract and some associated utility packages -RUN apt-get install tesseract-ocr -y && \ - pip install pytesseract \ - wand \ - pdf2image \ - PyPDF && \ - /tmp/clean-layer.sh -ENV TESSERACT_PATH=/usr/bin/tesseract +RUN apt-get install tesseract-ocr -y -# For Facets -ENV PYTHONPATH=$PYTHONPATH:/opt/facets/facets_overview/python/ -# For Theano with MKL -ENV MKL_THREADING_LAYER=GNU +ENV TESSERACT_PATH=/usr/bin/tesseract \ + # For Facets, we also include an empty path to include $PWD. + PYTHONPATH=:$PYTHONPATH:/opt/facets/facets_overview/python/ \ + # For Theano with MKL + MKL_THREADING_LAYER=GNU # Temporary fixes and patches -# Temporary patch for Dask getting downgraded, which breaks Keras -RUN pip install --upgrade dask && \ - # Stop jupyter nbconvert trying to rewrite its folder hierarchy - mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ +# Stop jupyter nbconvert trying to rewrite its folder hierarchy +RUN mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \ mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \ - # Stop Matplotlib printing junk to the console on first load - sed -i "s/^.*Matplotlib is building the font cache using fc-list.*$/# Warning removed by Kaggle/g" /opt/conda/lib/python3.10/site-packages/matplotlib/font_manager.py && \ # Make matplotlib output in Jupyter notebooks display correctly mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \ - # Temporary patch for broken libpixman 0.38 in conda-forge, symlink to system libpixman 0.34 untile conda package gets updated to 0.38.5 or higher. 
- ln -sf /usr/lib/x86_64-linux-gnu/libpixman-1.so.0.34.0 /opt/conda/lib/libpixman-1.so.0.38.0 && \ - # pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) - pip install --force-reinstall --no-deps jupyter_server==2.12.5 && \ /tmp/clean-layer.sh -# Fix to import bq_helper library without downgrading setuptools -RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/src/BigQuery_Helper && \ - mkdir -p ~/src/BigQuery_Helper/bq_helper && \ - mv ~/src/BigQuery_Helper/bq_helper.py ~/src/BigQuery_Helper/bq_helper/__init__.py && \ - mv ~/src/BigQuery_Helper/test_helper.py ~/src/BigQuery_Helper/bq_helper/ && \ - sed -i 's/)/packages=["bq_helper"])/g' ~/src/BigQuery_Helper/setup.py && \ - pip install -e ~/src/BigQuery_Helper && \ - /tmp/clean-layer.sh +# install imagemagick for wand +# https://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-debian-ubuntu +RUN apt-get install libmagickwand-dev -# Add BigQuery client proxy settings -ENV PYTHONUSERBASE "/root/.local" -ADD patches/kaggle_gcp.py /root/.local/lib/python3.10/site-packages/kaggle_gcp.py -ADD patches/kaggle_secrets.py /root/.local/lib/python3.10/site-packages/kaggle_secrets.py -ADD patches/kaggle_session.py /root/.local/lib/python3.10/site-packages/kaggle_session.py -ADD patches/kaggle_web_client.py /root/.local/lib/python3.10/site-packages/kaggle_web_client.py -ADD patches/kaggle_datasets.py /root/.local/lib/python3.10/site-packages/kaggle_datasets.py -ADD patches/log.py /root/.local/lib/python3.10/site-packages/log.py -ADD patches/sitecustomize.py /root/.local/lib/python3.10/site-packages/sitecustomize.py # Override default imagemagick policies ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml # Add Kaggle module resolver -ADD patches/kaggle_module_resolver.py /opt/conda/lib/python3.10/site-packages/tensorflow_hub/kaggle_module_resolver.py -RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py && \ - sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /opt/conda/lib/python3.10/site-packages/tensorflow_hub/config.py - -# TensorBoard Jupyter extension. Should be replaced with TensorBoard's provided magic once we have -# worker tunneling support in place. -# b/139212522 re-enable TensorBoard once solution for slowdown is implemented. 
-# ENV JUPYTER_CONFIG_DIR "/root/.jupyter/" -# RUN pip install jupyter_tensorboard && \ -# jupyter serverextension enable jupyter_tensorboard && \ -# jupyter tensorboard enable -# ADD patches/tensorboard/notebook.py /opt/conda/lib/python3.10/site-packages/tensorboard/notebook.py - -# Disable unnecessary jupyter extensions -#RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \ -# jupyter-serverextension disable nb_conda --py --sys-prefix && \ -# python -m nb_conda_kernels.install --disable - -# Disable preloaded jupyter modules (they add to startup, and break when they are missing) -RUN sed -i /bq_stats/d /etc/ipython/ipython_kernel_config.py && \ - sed -i /beatrix/d /etc/ipython/ipython_kernel_config.py && \ - sed -i /bigquery/d /etc/ipython/ipython_kernel_config.py && \ - sed -i /sql/d /etc/ipython/ipython_kernel_config.py - -# Force only one libcusolver -{{ if eq .Accelerator "gpu" }} -RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 -{{ else }} -RUN ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11 -{{ end }} - -# b/270147159 conda ships with a version of libtinfo which is missing version info causing warnings, replace it with a good version. -RUN rm /opt/conda/lib/libtinfo.so.6 && ln -s /usr/lib/x86_64-linux-gnu/libtinfo.so.6 /opt/conda/lib/libtinfo.so.6 +ADD patches/kaggle_module_resolver.py $PACKAGE_PATH/tensorflow_hub/kaggle_module_resolver.py +RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' $PACKAGE_PATH/tensorflow_hub/config.py && \ + sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' $PACKAGE_PATH/tensorflow_hub/config.py -# b/276358430 fix Jupyter lsp freezing up the jupyter server -RUN pip install "jupyter-lsp==1.5.1" - -# Set backend for matplotlib -ENV MPLBACKEND "agg" +# Add BigQuery client proxy settings +ENV PYTHONUSERBASE="/root/.local" +ADD patches/kaggle_gcp.py \ + patches/kaggle_secrets.py \ + patches/kaggle_session.py \ + patches/kaggle_web_client.py \ + patches/kaggle_datasets.py \ + $PACKAGE_PATH/ -# Set LC_ALL -# https://github.com/explosion/spaCy/issues/12872#issuecomment-1661847588 -ENV LC_ALL "POSIX" +# Figure out why this is in a different place? +# Found by doing a export PYTHONVERBOSE=1 and then running python and checking for where it looked for it. +ADD patches/sitecustomize.py /usr/lib/python3.11/sitecustomize.py -ARG GIT_COMMIT=unknown -ARG BUILD_DATE=unknown +ARG GIT_COMMIT=unknown \ + BUILD_DATE=unknown -LABEL git-commit=$GIT_COMMIT -LABEL build-date=$BUILD_DATE -ENV GIT_COMMIT=${GIT_COMMIT} -ENV BUILD_DATE=${BUILD_DATE} +LABEL git-commit=$GIT_COMMIT \ + build-date=$BUILD_DATE -LABEL tensorflow-version=$TENSORFLOW_VERSION -# Used in the Jenkins `Docker GPU Build` step to restrict the images being pruned. -LABEL kaggle-lang=python +ENV GIT_COMMIT=${GIT_COMMIT} \ + BUILD_DATE=${BUILD_DATE} # Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date {{ if eq .Accelerator "gpu" }} -# Remove the CUDA stubs. -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH_NO_STUBS" # Add the CUDA home. 
ENV CUDA_HOME=/usr/local/cuda {{ end }} +ENTRYPOINT ["/usr/bin/env"] diff --git a/Jenkinsfile b/Jenkinsfile index 5137c675..906e0464 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -21,67 +21,6 @@ pipeline { } stages { - stage('Pre-build Packages from Source') { - parallel { - stage('torch') { - options { - timeout(time: 300, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - source config.txt - cd packages/ - ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ - --package torch \ - --version $TORCH_VERSION \ - --build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \ - --build-arg TORCHTEXT_VERSION=$TORCHTEXT_VERSION \ - --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \ - --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ - --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ - --push - ''' - } - } - stage('lightgbm') { - options { - timeout(time: 10, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - source config.txt - cd packages/ - ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ - --package lightgbm \ - --version $LIGHTGBM_VERSION \ - --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ - --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ - --push - ''' - } - } - stage('jaxlib') { - options { - timeout(time: 300, unit: 'MINUTES') - } - steps { - sh '''#!/bin/bash - set -exo pipefail - source config.txt - cd packages/ - ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \ - --package jaxlib \ - --version $JAX_VERSION \ - --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \ - --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \ - --push - ''' - } - } - } - } stage('Build/Test/Diff') { parallel { stage('CPU') { @@ -99,22 +38,6 @@ pipeline { ''' } } - stage('Test CPU Image') { - options { - timeout(time: 15, unit: 'MINUTES') - } - steps { - retry(2) { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} - ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} - ''' - } - } - } stage('Diff CPU image') { steps { sh '''#!/bin/bash @@ -151,44 +74,6 @@ pipeline { ''' } } - stage('Test GPU Image') { - stages { - stage('Test on P100') { - agent { label 'ephemeral-linux-gpu' } - options { - timeout(time: 40, unit: 'MINUTES') - } - steps { - retry(2) { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' - } - } - } - stage('Test on T4x2') { - agent { label 'ephemeral-linux-gpu-t4x2' } - options { - timeout(time: 60, unit: 'MINUTES') - } - steps { - retry(2) { - sh '''#!/bin/bash - set -exo pipefail - - date - docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} - ''' - } - } - } - } - } stage('Diff GPU Image') { steps { sh '''#!/bin/bash @@ -202,6 +87,7 @@ pipeline { } } stage('TPU VM') { + agent { label 'ephemeral-linux' } stages { stage('Build TPU VM Image') { options { @@ -231,6 +117,61 @@ pipeline { } } + stage('Test') { + parallel { + stage('Test CPU Image') { + options { + timeout(time: 15, unit: 'MINUTES') + } + steps { + retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-images/python:${PRETEST_TAG} + ./test --image gcr.io/kaggle-images/python:${PRETEST_TAG} + ''' + } + } + } + stage('Test on P100') { + agent { label 'ephemeral-linux-gpu' 
} + options { + timeout(time: 40, unit: 'MINUTES') + } + steps { + retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + } + stage('Test on T4x2') { + agent { label 'ephemeral-linux-gpu-t4x2' } + options { + timeout(time: 60, unit: 'MINUTES') + } + steps { + retry(2) { + sh '''#!/bin/bash + set -exo pipefail + + date + docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG} + ''' + } + } + } + } + } + stage('Label CPU/GPU Staging Images') { steps { sh '''#!/bin/bash diff --git a/README.md b/README.md index 387dcf89..315e7db2 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ If you the first step above doesn't work for your use case, [open an issue](http ## Opening a pull request -1. Edit the [Dockerfile](Dockerfile.tmpl). +1. Edit [kaggle_requirements.txt](kaggle_requirements.txt). 1. Follow the instructions below to build a new image. 1. Add tests for your new package. See this [example](https://github.com/Kaggle/docker-python/blob/main/tests/test_fastai.py). 1. Follow the instructions below to test the new image. diff --git a/clean-layer.sh b/clean-layer.sh index d1a048fc..467e1cac 100755 --- a/clean-layer.sh +++ b/clean-layer.sh @@ -19,6 +19,4 @@ apt-get clean # Ensures the current working directory won't be deleted cd /usr/local/src/ # Delete source files used for building binaries -rm -rf /usr/local/src/* -# Delete conda downloaded tarballs -conda clean -y --tarballs +rm -rf /usr/local/src/* \ No newline at end of file diff --git a/config.txt b/config.txt index 6afee191..af541652 100644 --- a/config.txt +++ b/config.txt @@ -1,12 +1,4 @@ -BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release -BASE_IMAGE_TAG=m114 -CPU_BASE_IMAGE_NAME=tf2-cpu.2-15.py310 -GPU_BASE_IMAGE_NAME=tf2-gpu.2-15.py310 -LIGHTGBM_VERSION=4.2.0 -TORCH_VERSION=2.1.2 -TORCHAUDIO_VERSION=2.1.2 -TORCHTEXT_VERSION=0.16.2 -TORCHVISION_VERSION=0.16.2 -JAX_VERSION=0.4.26 +BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime +BASE_IMAGE_TAG=release-colab_20250725-060057_RC00 CUDA_MAJOR_VERSION=12 -CUDA_MINOR_VERSION=1 +CUDA_MINOR_VERSION=5 diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt new file mode 100644 index 00000000..503d37f9 --- /dev/null +++ b/kaggle_requirements.txt @@ -0,0 +1,154 @@ +# Please keep this in alphabetical order +Boruta +Cartopy +ImageHash +Janome +PyArabic +PyUpSet +Pympler +Rtree +shapely<2 +SimpleITK +# b/302136621: Fix eli5 import for learntools, newer version require scikit-learn > 1.3 +TPOT==0.12.1 +Theano +Wand +annoy +arrow +bayesian-optimization +boto3 +catboost +category-encoders +cesium +comm +cytoolz +# Older versions of datasets fail with "Loading a dataset cached in a LocalFileSystem is not supported" +# https://stackoverflow.com/questions/77433096/notimplementederror-loading-a-dataset-cached-in-a-localfilesystem-is-not-suppor +datasets>=2.14.6 +deap +dipy +docker +easyocr +# b/302136621: Fix eli5 import for learntools +eli5 +emoji +fastcore +# b/445960030: Requires a newer version of fastai than the currently used base image. +# Remove when relying on a newer base image. 
+fastai>=2.8.4 +fasttext +featuretools +fiona +fury +fuzzywuzzy +geojson +# geopandas > v0.14.4 breaks learn tools +geopandas==v0.14.4 +gensim +# b/443054743,b/455550872 +google-adk[a2a,eval] +google-cloud-aiplatform +google-cloud-videointelligence +google-cloud-vision +google-genai +gpxpy +h2o +haversine +hep-ml +igraph +ipympl +ipywidgets==8.1.5 +isoweek +jedi +# b/276358430: fix Jupyter lsp freezing up the jupyter server +jupyter-lsp==1.5.1 +# b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354) +jupyter_server==2.12.5 +jupyter_server_proxy +jupyterlab +jupyterlab-lsp +# b/409363708: Ensure we have the update version, we can consider removing it once +# Colab base image is updated more frequently. +kaggle>=1.7.4.2 +kaggle-environments +keras-cv +keras-nlp +keras-tuner +kornia +langid +# b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'" +libpysal<=4.9.2 +lime +line_profiler +mamba +matplotlib<3.8 +mlcrate +mne +mpld3 +# b/274619697: learntools requires a specific nbconvert right now +nbconvert==6.4.5 +nbdev +nilearn +olefile +# b/445960030: Broken in 1.19.0. See https://github.com/onnx/onnx/issues/7249. +# Fixed with https://github.com/onnx/onnx/pull/7254. Upgrade when version with fix is published. +onnx==1.18.0 +openslide-bin +openslide-python +optuna +pandas-profiling +pandasql +papermill +path +path.py +pdf2image +plotly-express +preprocessing +pudb +pyLDAvis +pycryptodome +pydegensac +pydicom +pyemd +pyexcel-ods +pymc3 +pymongo +pypdf +pytesseract +python-lsp-server +pytorch-ignite +pytorch-lightning +qgrid +qtconsole +ray +rgf-python +s3fs +# b/302136621: Fix eli5 import for learntools +scikit-learn==1.2.2 +# Scikit-learn accelerated library for x86 +scikit-learn-intelex>=2023.0.1 +scikit-multilearn +scikit-optimize +scikit-plot +scikit-surprise +# Also pinning seaborn for learntools +seaborn==0.12.2 +git+https://github.com/facebookresearch/segment-anything.git +# b/329869023: shap 0.45.0 breaks learntools +shap==0.44.1 +squarify +tensorflow-cloud +tensorflow-io +tensorflow-text +tensorflow_decision_forests +torchinfo +torchmetrics +torchtune +transformers>=4.51.0 +vtk +wavio +# b/350573866: xgboost v2.1.0 breaks learntools +xgboost==2.0.3 +xvfbwrapper +ydata-profiling +ydf diff --git a/packages/build_package b/packages/build_package index 1e6a7f94..e0af53e2 100755 --- a/packages/build_package +++ b/packages/build_package @@ -115,12 +115,8 @@ if [[ -z "$DOCKERFILE" ]]; then exit 1 fi -# Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80` -TAG=${BASE_IMAGE/gcr.io\/deeplearning-platform-release\//} -# Keep only `python:v108` in `gcr.io/kaggle-images/python:v108` -TAG=${TAG/gcr.io\/kaggle-images\//} -# Replace the `:` in `tf2-gpu.2-6:m80` by `-` -TAG=${TAG/:/-} +# Keep only `release-colab_20240920-060127_RC00` in `us-docker.pkg.dev/colab-images/public/runtime:release-colab_20240920-060127_RC00` +TAG=$(echo $BASE_IMAGE | cut -d ':' -f 2) # Append the package version TAG=$TAG-$PACKAGE_VERSION # Add the gcr repo. 
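Since the new kaggle_requirements.txt asks contributors to keep entries in alphabetical order, a small checker along the following lines could be run before opening a pull request (a hypothetical helper, not part of this change; it ignores comments and blank lines):

# check_requirements_order.py (hypothetical): warn when kaggle_requirements.txt
# entries are not in alphabetical order, ignoring comments and blank lines.
import sys

def main(path="kaggle_requirements.txt"):
    entries = [
        line.strip()
        for line in open(path)
        if line.strip() and not line.strip().startswith("#")
    ]
    ok = True
    for prev, cur in zip(entries, entries[1:]):
        if prev.lower() > cur.lower():
            print(f"out of order: {cur!r} should come before {prev!r}")
            ok = False
    return 0 if ok else 1

if __name__ == "__main__":
    sys.exit(main(*sys.argv[1:]))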
diff --git a/packages/jaxlib.Dockerfile b/packages/jaxlib.Dockerfile index cc4e5fe9..ed73991c 100644 --- a/packages/jaxlib.Dockerfile +++ b/packages/jaxlib.Dockerfile @@ -15,8 +15,10 @@ ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib" ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" # Instructions: https://jax.readthedocs.io/en/latest/developer.html#building-jaxlib-from-source -RUN apt-get update && \ - apt-get install -y g++ python python3-dev +RUN sudo ln -s /usr/bin/python3 /usr/bin/python + +RUN apt-get update && \ + apt-get install -y g++ python3 python3-dev RUN pip install numpy wheel build diff --git a/packages/lightgbm.Dockerfile b/packages/lightgbm.Dockerfile deleted file mode 100644 index 376eaaef..00000000 --- a/packages/lightgbm.Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -ARG BASE_IMAGE - -FROM ${BASE_IMAGE} AS builder - -ARG PACKAGE_VERSION -ARG CUDA_MAJOR_VERSION -ARG CUDA_MINOR_VERSION - -# Make sure we are on the right version of CUDA -RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION - -# Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm -RUN apt-get update && \ - apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev clinfo nvidia-opencl-dev opencl-headers - -RUN cd /usr/local/src && \ - git clone --recursive https://github.com/microsoft/LightGBM && \ - cd LightGBM && \ - git checkout tags/v$PACKAGE_VERSION && \ - ./build-python.sh bdist_wheel --gpu --opencl-library=/usr/local/cuda/lib64/libOpenCL.so --opencl-include-dir=/usr/local/cuda/include/ - -# Using multi-stage builds to ensure the output image is very small -# See: https://docs.docker.com/develop/develop-images/multistage-build/ -FROM alpine:latest - -RUN mkdir -p /tmp/whl/ -COPY --from=builder /usr/local/src/LightGBM/dist/*.whl /tmp/whl - -# Print out the built .whl file. -RUN ls -lh /tmp/whl/ \ No newline at end of file diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile index f9579675..68c1eff3 100644 --- a/packages/torch.Dockerfile +++ b/packages/torch.Dockerfile @@ -4,7 +4,6 @@ FROM ${BASE_IMAGE} AS builder ARG PACKAGE_VERSION ARG TORCHAUDIO_VERSION -ARG TORCHTEXT_VERSION ARG TORCHVISION_VERSION ARG CUDA_MAJOR_VERSION ARG CUDA_MINOR_VERSION @@ -20,7 +19,7 @@ RUN conda install -c conda-forge mamba # Build instructions: https://github.com/pytorch/pytorch#from-source RUN mamba install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses -RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} +RUN mamba install -c pytorch magma-cuda121 # By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash. # This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000 @@ -63,18 +62,6 @@ RUN sudo apt-get update && \ RUN sed -i 's/set(envs/set(envs\n "LIBS=-ltinfo"/' /usr/local/src/audio/third_party/sox/CMakeLists.txt RUN cd /usr/local/src/audio && python setup.py bdist_wheel -# Build torchtext -# Instructions: https://github.com/pytorch/text#building-from-source -# See comment above for PYTORCH_BUILD_VERSION. 
-ENV BUILD_VERSION=$TORCHTEXT_VERSION -RUN cd /usr/local/src && \ - git clone https://github.com/pytorch/text && \ - cd text && \ - git checkout tags/v$TORCHTEXT_VERSION && \ - git submodule sync && \ - git submodule update --init --recursive --jobs 1 && \ - python setup.py bdist_wheel - # Build torchvision. # Instructions: https://github.com/pytorch/vision/tree/main#installation # See comment above for PYTORCH_BUILD_VERSION. @@ -93,7 +80,6 @@ FROM alpine:latest RUN mkdir -p /tmp/whl/ COPY --from=builder /usr/local/src/pytorch/dist/*.whl /tmp/whl COPY --from=builder /usr/local/src/audio/dist/*.whl /tmp/whl -COPY --from=builder /usr/local/src/text/dist/*.whl /tmp/whl COPY --from=builder /usr/local/src/vision/dist/*.whl /tmp/whl # Print out the built .whl file. diff --git a/patches/kaggle_gcp.py b/patches/kaggle_gcp.py index 2c8b64cc..4cb98858 100644 --- a/patches/kaggle_gcp.py +++ b/patches/kaggle_gcp.py @@ -1,5 +1,6 @@ import os import inspect +import logging from google.auth import credentials, environment_vars from google.auth.exceptions import RefreshError from google.api_core.gapic_v1.client_info import ClientInfo @@ -8,8 +9,6 @@ from google.cloud.bigquery._http import Connection from kaggle_secrets import GcpTarget, UserSecretsClient -from log import Log - KAGGLE_GCP_CLIENT_USER_AGENT="kaggle-gcp-client/1.0" def get_integrations(): @@ -22,7 +21,7 @@ def get_integrations(): target = GcpTarget[integration.upper()] kernel_integrations.add_integration(target) except KeyError as e: - Log.error(f"Unknown integration target: {integration.upper()}") + logging.debug(f"Unknown integration target: {integration.upper()}") return kernel_integrations @@ -66,14 +65,14 @@ def refresh(self, request): elif self.target == GcpTarget.CLOUDAI: self.token, self.expiry = client._get_cloudai_access_token() except ConnectionError as e: - Log.error(f"Connection error trying to refresh access token: {e}") + logging.error(f"Connection error trying to refresh access token: {e}") print("There was a connection error trying to fetch the access token. " f"Please ensure internet is on in order to use the {self.target.service} Integration.") raise RefreshError('Unable to refresh access token due to connection error.') from e except Exception as e: - Log.error(f"Error trying to refresh access token: {e}") + logging.error(f"Error trying to refresh access token: {e}") if (not get_integrations().has_integration(self.target)): - Log.error(f"No {self.target.service} integration found.") + logging.error(f"No {self.target.service} integration found.") print( f"Please ensure you have selected a {self.target.service} account in the Notebook Add-ons menu.") raise RefreshError('Unable to refresh access token.') from e @@ -102,7 +101,7 @@ def api_request(self, *args, **kwargs): msg = ("Permission denied using Kaggle's public BigQuery integration. " "Did you mean to select a BigQuery account in the Notebook Add-ons menu?") print(msg) - Log.info(msg) + logging.info(msg) raise e @@ -156,23 +155,23 @@ def monkeypatch_bq(bq_client, *args, **kwargs): # Remove these two lines once this is resolved: # https://github.com/googleapis/google-cloud-python/issues/8108 if explicit_project_id: - Log.info(f"Explicit project set to {explicit_project_id}") + logging.info(f"Explicit project set to {explicit_project_id}") kwargs['project'] = explicit_project_id if explicit_project_id is None and specified_credentials is None and not has_bigquery: msg = "Using Kaggle's public dataset BigQuery integration." 
- Log.info(msg) + logging.info(msg) print(msg) return PublicBigqueryClient(*args, **kwargs) else: if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") kwargs['credentials'] = KaggleKernelCredentials() if (not has_bigquery): - Log.info("No bigquery integration found, creating client anyways.") + logging.info("No bigquery integration found, creating client anyways.") print('Please ensure you have selected a BigQuery ' 'account in the Notebook Add-ons menu.') if explicit_project_id is None: - Log.info("No project specified while using the unmodified client.") + logging.info("No project specified while using the unmodified client.") print('Please ensure you specify a project id when creating the client' ' in order to use your BigQuery account.') kwargs['client_info'] = set_kaggle_user_agent(kwargs.get('client_info')) @@ -196,20 +195,20 @@ def monkeypatch_aiplatform_init(aiplatform_klass, kaggle_kernel_credentials): def patched_init(*args, **kwargs): specified_credentials = kwargs.get('credentials') if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") kwargs['credentials'] = kaggle_kernel_credentials return aiplatform_init(*args, **kwargs) if (not has_been_monkeypatched(aiplatform_klass.init)): aiplatform_klass.init = patched_init - Log.info("aiplatform.init patched") + logging.info("aiplatform.init patched") def monkeypatch_client(client_klass, kaggle_kernel_credentials): client_init = client_klass.__init__ def patched_init(self, *args, **kwargs): specified_credentials = kwargs.get('credentials') if specified_credentials is None: - Log.info("No credentials specified, using KaggleKernelCredentials.") + logging.info("No credentials specified, using KaggleKernelCredentials.") # Some GCP services demand the billing and target project must be the same. # To avoid using default service account based credential as caller credential # user need to provide ClientOptions with quota_project_id: @@ -227,7 +226,7 @@ def patched_init(self, *args, **kwargs): if (not has_been_monkeypatched(client_klass.__init__)): client_klass.__init__ = patched_init - Log.info(f"Client patched: {client_klass}") + logging.info(f"Client patched: {client_klass}") def set_kaggle_user_agent(client_info: ClientInfo): # Add kaggle client user agent in order to attribute usage. @@ -253,37 +252,6 @@ def init_gcs(): KaggleKernelCredentials(target=GcpTarget.GCS)) return storage -def init_automl(): - from google.cloud import automl, automl_v1beta1 - if not is_user_secrets_token_set(): - return - - from kaggle_gcp import get_integrations - if not get_integrations().has_cloudai(): - return - - from kaggle_secrets import GcpTarget - from kaggle_gcp import KaggleKernelCredentials - kaggle_kernel_credentials = KaggleKernelCredentials(target=GcpTarget.CLOUDAI) - - # Patch the 2 GA clients: AutoMlClient and PreditionServiceClient - monkeypatch_client(automl.AutoMlClient, kaggle_kernel_credentials) - monkeypatch_client(automl.PredictionServiceClient, kaggle_kernel_credentials) - - # The AutoML client library exposes 3 different client classes (AutoMlClient, - # TablesClient, PredictionServiceClient), so patch each of them. - # The same KaggleKernelCredentials are passed to all of them. - # The GcsClient class is only used internally by TablesClient. 
- - # The beta version of the clients that are now GA are included here for now. - # They are deprecated and will be removed by 1 May 2020. - monkeypatch_client(automl_v1beta1.AutoMlClient, kaggle_kernel_credentials) - monkeypatch_client(automl_v1beta1.PredictionServiceClient, kaggle_kernel_credentials) - - # The TablesClient is still in beta, so this will not be deprecated until - # the TablesClient is GA. - monkeypatch_client(automl_v1beta1.TablesClient, kaggle_kernel_credentials) - def init_translation_v2(): from google.cloud import translate_v2 if not is_user_secrets_token_set(): @@ -379,7 +347,6 @@ def init_vision(): def init(): init_bigquery() init_gcs() - init_automl() init_translation_v2() init_translation_v3() init_natural_language() @@ -392,4 +359,4 @@ def init(): # google.cloud.* and kaggle_gcp. By calling init here, we guarantee # that regardless of the original import that caused google.cloud.* to be # loaded, the monkeypatching will be done. -init() +init() \ No newline at end of file diff --git a/patches/keras_patch.sh b/patches/keras_patch.sh deleted file mode 100644 index 9f219026..00000000 --- a/patches/keras_patch.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# The following "sed" are to patch the current version of tf-df with -# a fix for keras 3. In essence, replaces the use of package name "tf.keras" with -# "tf_keras" - -sed -i "/import tensorflow_decision_forests as tfdf/a import tf_keras" /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/__init__.py && \ -sed -i -e "/import tensorflow as tf/a import tf_keras" \ - -e "/from yggdrasil_decision_forests.utils.distribute.implementations.grpc/a from tensorflow_decision_forests.keras import keras_internal" \ - -e '/try:/{:a;N;/backend = tf.keras.backend/!ba;d}'\ - /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/core.py && \ -sed -i -e "s/from typing import Optional, List, Dict, Any, Union, NamedTuple/from typing import Any, Dict, List, NamedTuple, Optional, Union/g" \ - -e "/import tensorflow as tf/a from tensorflow_decision_forests.keras import keras_internal" \ - -e "/import tensorflow as tf/a import tf_keras" \ - -e '/layers = tf.keras.layers/{:a;N;/backend = tf.keras.backend/!ba;d}' \ - /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/core_inference.py && \ -find /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests -type f -exec sed -i \ - -e "s/get_data_handler/keras_internal.get_data_handler/g" \ - -e 's/"models.Functional"/keras_internal.Functional/g' \ - -e "s/tf.keras.utils.unpack_x_y_sample_weight/keras_internal.unpack_x_y_sample_weight/g" \ - -e "s/tf.keras.utils.experimental/keras_internal/g" \ - {} \; && \ -sed -i -e "/import tensorflow as tf/a import tf_keras" \ - -e "/from tensorflow_decision_forests.keras import core/a from tensorflow_decision_forests.keras import keras_internal" \ - -e '/layers = tf.keras.layers/{:a;N;/callbacks = tf.keras.callbacks/!ba;d}' \ - /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_test.py && \ -find /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras -type f -exec sed -i \ - -e "s/ layers.Input/ tf_keras.layers.Input/g" \ - -e "s/layers.minimum/tf_keras.layers.minimum/g" \ - -e "s/layers.Concatenate/tf_keras.layers.Concatenate/g" \ - -e "s/layers.Dense/tf_keras.layers.Dense/g" \ - -e "s/layers.experimental.preprocessing./tf_keras.layers./g" \ - -e "s/layers.DenseFeatures/keras_internal.layers.DenseFeatures/g" \ - -e 
"s/models.Model/tf_keras.models.Model/g" {} \; && \ -sed -i "s/ models.load_model/ tf_keras.models.load_model/g" /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/keras_test.py && \ -sed -i "/import tensorflow as tf/a import tf_keras" /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/test_runner.py && \ -sed -i "/import tensorflow as tf/a import tf_keras" /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/wrappers.py && \ -sed -i -e "/import tensorflow as tf/a import tf_keras" \ - -e "s/optimizer=optimizers.Adam()/optimizer=tf_keras.optimizers.Adam()/g" \ - /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests/keras/wrappers_pre_generated.py && \ -find /opt/conda/lib/python3.10/site-packages/tensorflow_decision_forests -type f -exec sed -i "s/tf.keras./tf_keras./g" {} \; diff --git a/patches/log.py b/patches/log.py deleted file mode 100644 index 59a07c8c..00000000 --- a/patches/log.py +++ /dev/null @@ -1,133 +0,0 @@ -import io -import logging -import os - -import google.auth - - -_LOG_TO_FILE_ENV = os.getenv("KAGGLE_LOG_TO_FILE") - - -class _LogFormatter(logging.Formatter): - """A logging formatter which truncates long messages.""" - - _MAX_LOG_LENGTH = 10000 # Be generous, not to truncate long backtraces. - - def format(self, record): - msg = super(_LogFormatter, self).format(record) - return msg[:_LogFormatter._MAX_LOG_LENGTH] if msg else msg - -# TODO(vimota): Clean this up once we're using python 3.8 and can use -# (https://github.com/python/cpython/commit/dde9fdbe453925279ac3d2a6a72102f6f9ef247c) -# Right now, making the logging module display the intended frame's information -# when the logging calls (info, warn, ...) are wrapped (as is the case in our -# Log class) involves fragile logic. -class _Logger(logging.Logger): - - # This is a copy of logging.Logger.findCaller with the filename ignore - # set expanded to include the current filename (".../log.py"). - # Copyright 2001-2015 by Vinay Sajip. All Rights Reserved. - # License: https://github.com/python/cpython/blob/ce9e62544571e7ade7186697d5dd065fb4c5243f/LICENSE - def findCaller(self, stack_info=False, stacklevel=1): - f = logging.currentframe() - f = f.f_back - rv = "(unknown file)", 0, "(unknown function)", None - while hasattr(f, "f_code"): - co = f.f_code - filename = os.path.normcase(co.co_filename) - if filename in _ignore_srcfiles: - f = f.f_back - continue - sinfo = None - if stack_info: - sio = io.StringIO() - sio.write('Stack (most recent call last):\n') - traceback.print_stack(f, file=sio) - sinfo = sio.getvalue() - if sinfo[-1] == '\n': - sinfo = sinfo[:-1] - sio.close() - rv = (co.co_filename, f.f_lineno, co.co_name, sinfo) - break - return rv - - -_srcfile = os.path.normcase(_Logger.findCaller.__code__.co_filename) -_ignore_srcfiles = (_srcfile, logging._srcfile) - -class Log: - """ Helper aggregate for all things related to logging activity. """ - - _GLOBAL_LOG = logging.getLogger("") - _initialized = False - - # These are convenience helpers. 
For performance, consider saving Log.get_logger() and using that - @staticmethod - def critical(msg, *args, **kwargs): - Log._GLOBAL_LOG.critical(msg, *args, **kwargs) - - @staticmethod - def fatal(msg, *args, **kwargs): - Log._GLOBAL_LOG.fatal(msg, *args, **kwargs) - - @staticmethod - def exception(msg, *args, **kwargs): - Log._GLOBAL_LOG.exception(msg, *args, **kwargs) - - @staticmethod - def error(msg, *args, **kwargs): - Log._GLOBAL_LOG.error(msg, *args, **kwargs) - - @staticmethod - def warn(msg, *args, **kwargs): - Log._GLOBAL_LOG.warn(msg, *args, **kwargs) - - @staticmethod - def warning(msg, *args, **kwargs): - Log._GLOBAL_LOG.warning(msg, *args, **kwargs) - - @staticmethod - def debug(msg, *args, **kwargs): - Log._GLOBAL_LOG.debug(msg, *args, **kwargs) - - @staticmethod - def info(msg, *args, **kwargs): - Log._GLOBAL_LOG.info(msg, *args, **kwargs) - - @staticmethod - def set_level(loglevel): - if isinstance(loglevel, int): - Log._GLOBAL_LOG.setLevel(loglevel) - return - elif isinstance(loglevel, str): - # idea from https://docs.python.org/3.5/howto/logging.html#logging-to-a-file - numeric_level = getattr(logging, loglevel.upper(), None) - if isinstance(numeric_level, int): - Log._GLOBAL_LOG.setLevel(numeric_level) - return - - raise ValueError('Invalid log level: %s' % loglevel) - - @staticmethod - def _static_init(): - if Log._initialized: - return - - logging.setLoggerClass(_Logger) - # The root logger's type is unfortunately (and surprisingly) not affected by - # `setLoggerClass`. Monkey patch it instead. TODO(vimota): Remove this, see the TODO - # associated with _Logger. - logging.RootLogger.findCaller = _Logger.findCaller - log_to_file = _LOG_TO_FILE_ENV.lower() in ("yes", "true", "t", "1") if _LOG_TO_FILE_ENV is not None else True - if log_to_file: - handler = logging.FileHandler(filename='/tmp/kaggle.log', mode='w') - else: - handler = logging.StreamHandler() - - # ".1s" is for the first letter: http://stackoverflow.com/a/27453084/1869. 
- format_string = "%(asctime)s %(levelname).1s %(process)d %(filename)s:%(lineno)d] %(message)s" - handler.setFormatter(_LogFormatter(format_string)) - logging.basicConfig(level=logging.INFO, handlers=[handler]) - Log._initialized = True - -Log._static_init() diff --git a/patches/sitecustomize.py b/patches/sitecustomize.py index ea47698b..1bb8a1b6 100644 --- a/patches/sitecustomize.py +++ b/patches/sitecustomize.py @@ -1,7 +1,6 @@ +import logging import os -from log import Log - import sys import importlib.abc import importlib @@ -13,7 +12,6 @@ class GcpModuleFinder(importlib.abc.MetaPathFinder): _MODULES = [ 'google.cloud.bigquery', 'google.cloud.storage', - 'google.cloud.automl_v1beta1', 'google.cloud.translate', 'google.cloud.translate_v2', 'google.cloud.translate_v3', @@ -56,7 +54,6 @@ def create_module(self, spec): _LOADERS = { 'google.cloud.bigquery': kaggle_gcp.init_bigquery, 'google.cloud.storage': kaggle_gcp.init_gcs, - 'google.cloud.automl_v1beta1': kaggle_gcp.init_automl, 'google.cloud.translate': kaggle_gcp.init_translation_v3, 'google.cloud.translate_v2': kaggle_gcp.init_translation_v2, 'google.cloud.translate_v3': kaggle_gcp.init_translation_v3, @@ -117,3 +114,30 @@ def new_configure(*args, **kwargs): module.configure = new_configure module.configure() # generativeai can use GOOGLE_API_KEY env variable, so make sure we have the other configs set + +@wrapt.when_imported('google.genai') +def post_genai_import_logic(module): + if os.getenv('KAGGLE_DISABLE_GOOGLE_GENERATIVE_AI_INTEGRATION'): + return + + if not (os.getenv('KAGGLE_DATA_PROXY_TOKEN') and + os.getenv('KAGGLE_USER_SECRETS_TOKEN') and + os.getenv('KAGGLE_DATA_PROXY_URL')): + return + @wrapt.patch_function_wrapper(module, 'Client.__init__') + def init_wrapper(wrapped, instance, args, kwargs): + # Don't want to forward requests that are to Vertex AI, debug mode, or have their own http_options specified + # Thus, if the client constructor contains any params other than api_key, we don't set up forwarding + if any(value is not None for key, value in kwargs.items() if key != 'api_key'): + return wrapped(*args, **kwargs) + + default_metadata = { + "x-kaggle-proxy-data": os.environ['KAGGLE_DATA_PROXY_TOKEN'], + 'x-kaggle-authorization': f"Bearer {os.environ['KAGGLE_USER_SECRETS_TOKEN']}" + } + http_options = { + 'base_url': os.getenv('KAGGLE_DATA_PROXY_URL') + '/palmapi/', + 'headers': default_metadata + } + kwargs['http_options'] = http_options + return wrapped(*args, **kwargs) diff --git a/test b/test index ef1ffe3e..574b49e3 100755 --- a/test +++ b/test @@ -3,7 +3,7 @@ set -e IMAGE_TAG='kaggle/python-build' IMAGE_TAG_OVERRIDE='' -ADDITONAL_OPTS='' +ADDITONAL_OPTS='--runtime runc ' # Use the CPU runtime by default PATTERN='test*.py' usage() { @@ -28,7 +28,7 @@ while :; do ;; -g|--gpu) IMAGE_TAG='kaggle/python-gpu-build' - ADDITONAL_OPTS='-v /tmp/empty_dir:/usr/local/cuda/lib64/stubs:ro' + ADDITONAL_OPTS='--runtime nvidia -v /tmp/empty_dir:/usr/local/cuda/lib64/stubs:ro' ;; -i|--image) if [[ -z $2 ]]; then @@ -69,8 +69,6 @@ readonly ADDITONAL_OPTS readonly PATTERN set -x -docker run --rm --net=none -v /tmp/python-build:/tmp/python-build "$IMAGE_TAG" rm -rf /tmp/python-build/* -docker rm jupyter_test || true mkdir -p /tmp/python-build/tmp mkdir -p /tmp/python-build/devshm mkdir -p /tmp/python-build/working @@ -97,6 +95,9 @@ fi # Note about `--hostname localhost` (b/158137436) # hostname defaults to the container name which fails DNS name # resolution with --net=none (required to keep tests hermetic). See details in bug. 
+# +# Note about CLOUDSDK_CONFIG=/tmp/.config/gcloud +# We use the /tmp dir since the filesystem is --read-only and we need writable space for gcloud configs. docker run --rm -t --read-only --net=none \ -e HOME=/tmp -e KAGGLE_DATA_PROXY_TOKEN=test-key \ -e KAGGLE_USER_SECRETS_TOKEN_KEY=test-secrets-key \ @@ -105,6 +106,7 @@ docker run --rm -t --read-only --net=none \ -e KAGGLE_DATA_PROXY_PROJECT=test \ -e TF_FORCE_GPU_ALLOW_GROWTH=true \ -e XLA_PYTHON_CLIENT_PREALLOCATE=false \ + -e CLOUDSDK_CONFIG=/tmp/.config/gcloud \ --hostname localhost \ --shm-size=2g \ -v $PWD:/input:ro -v /tmp/python-build/working:/working \ diff --git a/tests/common.py b/tests/common.py index 30a7bb0f..469033dd 100644 --- a/tests/common.py +++ b/tests/common.py @@ -11,7 +11,10 @@ def getAcceleratorName(): except FileNotFoundError: return("nvidia-smi not found.") -gpu_test = unittest.skipIf(len(os.environ.get('CUDA_VERSION', '')) == 0, 'Not running GPU tests') +def isGPU(): + return os.path.isfile('/proc/driver/nvidia/version') + +gpu_test = unittest.skipIf(not isGPU(), 'Not running GPU tests') # b/342143152 P100s are slowly being unsupported in new release of popular ml tools such as RAPIDS. p100_exempt = unittest.skipIf(getAcceleratorName() == "Tesla P100-PCIE-16GB", 'Not running p100 exempt tests') tpu_test = unittest.skipIf(len(os.environ.get('ISTPUVM', '')) == 0, 'Not running TPU tests') diff --git a/tests/test_automl.py b/tests/test_automl.py deleted file mode 100644 index 63c34c69..00000000 --- a/tests/test_automl.py +++ /dev/null @@ -1,137 +0,0 @@ -import unittest - -from unittest.mock import Mock, patch - -from kaggle_gcp import KaggleKernelCredentials, init_automl -from test.support.os_helper import EnvironmentVarGuard -from google.cloud import storage, automl_v1beta1, automl - -def _make_credentials(): - import google.auth.credentials - return Mock(spec=google.auth.credentials.Credentials) - -class TestAutoMl(unittest.TestCase): - - class FakeClient: - def __init__(self, credentials=None, client_info=None, **kwargs): - self.credentials = credentials - - class FakeConnection(): - def __init__(self, user_agent): - self.user_agent = user_agent - if (client_info is not None): - self._connection = FakeConnection(client_info.user_agent) - - @patch("google.cloud.automl.AutoMlClient", new=FakeClient) - def test_user_provided_credentials(self): - credentials = _make_credentials() - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - client = automl.AutoMlClient(credentials=credentials) - self.assertNotIsInstance(client.credentials, KaggleKernelCredentials) - self.assertIsNotNone(client.credentials) - - def test_tables_gcs_client(self): - # The GcsClient can't currently be monkeypatched for default - # credentials because it requires a project which can't be set. - # Verify that creating an automl_v1beta1.GcsClient given an actual - # storage.Client sets the client properly. 
- gcs_client = storage.Client(project="xyz", credentials=_make_credentials()) - tables_gcs_client = automl_v1beta1.GcsClient(client=gcs_client) - self.assertIs(tables_gcs_client.client, gcs_client) - - @patch("google.cloud.automl_v1beta1.gapic.auto_ml_client.AutoMlClient", new=FakeClient) - def test_tables_client_credentials(self): - credentials = _make_credentials() - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - tables_client = automl_v1beta1.TablesClient(credentials=credentials) - self.assertEqual(tables_client.auto_ml_client.credentials, credentials) - - @patch("google.cloud.automl.AutoMlClient", new=FakeClient) - def test_default_credentials_automl_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - automl_client = automl.AutoMlClient() - self.assertIsNotNone(automl_client.credentials) - self.assertIsInstance(automl_client.credentials, KaggleKernelCredentials) - self.assertTrue(automl_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl_v1beta1.AutoMlClient", new=FakeClient) - def test_default_credentials_automl_v1beta1_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - automl_client = automl_v1beta1.AutoMlClient() - self.assertIsNotNone(automl_client.credentials) - self.assertIsInstance(automl_client.credentials, KaggleKernelCredentials) - self.assertTrue(automl_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl_v1beta1.TablesClient", new=FakeClient) - def test_default_credentials_tables_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - init_automl() - tables_client = automl_v1beta1.TablesClient() - self.assertIsNotNone(tables_client.credentials) - self.assertIsInstance(tables_client.credentials, KaggleKernelCredentials) - self.assertTrue(tables_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl.PredictionServiceClient", new=FakeClient) - def test_default_credentials_prediction_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - prediction_client = automl.PredictionServiceClient() - self.assertIsNotNone(prediction_client.credentials) - self.assertIsInstance(prediction_client.credentials, KaggleKernelCredentials) - self.assertTrue(prediction_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - @patch("google.cloud.automl_v1beta1.PredictionServiceClient", new=FakeClient) - def test_default_credentials_prediction_v1beta1_client(self): - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - prediction_client = automl_v1beta1.PredictionServiceClient() - self.assertIsNotNone(prediction_client.credentials) - self.assertIsInstance(prediction_client.credentials, KaggleKernelCredentials) - self.assertTrue(prediction_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) - - def test_monkeypatching_idempotent(self): - env = EnvironmentVarGuard() - 
env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'CLOUDAI') - with env: - client1 = automl.AutoMlClient.__init__ - init_automl() - client2 = automl.AutoMlClient.__init__ - self.assertEqual(client1, client2) - - @patch("google.cloud.automl_v1beta1.PredictionServiceClient", new=FakeClient) - def test_legacy_AUTOML_variable_v1beta1_client(self): - """ - Tests previous KAGGLE_KERNEL_INTEGRATIONS="AUTOML" environment setting - """ - env = EnvironmentVarGuard() - env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar') - env.set('KAGGLE_KERNEL_INTEGRATIONS', 'AUTOML') - with env: - prediction_client = automl_v1beta1.PredictionServiceClient() - self.assertIsNotNone(prediction_client.credentials) - self.assertIsInstance(prediction_client.credentials, KaggleKernelCredentials) - self.assertTrue(prediction_client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) \ No newline at end of file diff --git a/tests/test_catalyst.py b/tests/test_catalyst.py deleted file mode 100644 index 3b9c97d4..00000000 --- a/tests/test_catalyst.py +++ /dev/null @@ -1,158 +0,0 @@ -import unittest -import collections -import json -import numpy as np - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchvision -import torchvision.transforms as transforms - -import catalyst -from catalyst.dl import SupervisedRunner, CheckpointCallback -from catalyst import utils - - -def _to_categorical(y, num_classes=None, dtype='float32'): - """ - Taken from - github.com/keras-team/keras/blob/master/keras/utils/np_utils.py - Converts a class vector (integers) to binary class matrix. - E.g. for use with categorical_crossentropy. - # Arguments - y: class vector to be converted into a matrix - (integers from 0 to num_classes). - num_classes: total number of classes. - dtype: The data type expected by the input, as a string - (`float32`, `float64`, `int32`...) - # Returns - A binary matrix representation of the input. The classes axis - is placed last. - # Example - ```python - # Consider an array of 5 labels out of a set of 3 classes {0, 1, 2}: - > labels - array([0, 2, 1, 2, 0]) - # `to_categorical` converts this into a matrix with as many - # columns as there are classes. The number of rows - # stays the same. 
- > to_categorical(labels) - array([[ 1., 0., 0.], - [ 0., 0., 1.], - [ 0., 1., 0.], - [ 0., 0., 1.], - [ 1., 0., 0.]], dtype=float32) - ``` - """ - - y = np.array(y, dtype='int') - input_shape = y.shape - if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: - input_shape = tuple(input_shape[:-1]) - y = y.ravel() - if not num_classes: - num_classes = np.max(y) + 1 - n = y.shape[0] - categorical = np.zeros((n, num_classes), dtype=dtype) - categorical[np.arange(n), y] = 1 - output_shape = input_shape + (num_classes,) - categorical = np.reshape(categorical, output_shape) - return categorical - - -class Net(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(1, 20, 5, 1) - self.conv2 = nn.Conv2d(20, 50, 5, 1) - self.fc1 = nn.Linear(4 * 4 * 50, 500) - self.fc2 = nn.Linear(500, 10) - - def forward(self, x): - x = F.relu(self.conv1(x)) - x = F.max_pool2d(x, 2, 2) - x = F.relu(self.conv2(x)) - x = F.max_pool2d(x, 2, 2) - x = x.view(-1, 4 * 4 * 50) - x = F.relu(self.fc1(x)) - x = self.fc2(x) - return x - - -class TestCatalyst(unittest.TestCase): - - def test_version(self): - self.assertIsNotNone(catalyst.__version__) - - def test_mnist(self): - utils.set_global_seed(42) - x_train = np.random.random((100, 1, 28, 28)).astype(np.float32) - y_train = _to_categorical( - np.random.randint(10, size=(100, 1)), - num_classes=10 - ).astype(np.float32) - x_valid = np.random.random((20, 1, 28, 28)).astype(np.float32) - y_valid = _to_categorical( - np.random.randint(10, size=(20, 1)), - num_classes=10 - ).astype(np.float32) - - x_train, y_train, x_valid, y_valid = \ - list(map(torch.tensor, [x_train, y_train, x_valid, y_valid])) - - bs = 32 - num_workers = 4 - data_transform = transforms.ToTensor() - - loaders = collections.OrderedDict() - - trainset = torch.utils.data.TensorDataset(x_train, y_train) - trainloader = torch.utils.data.DataLoader( - trainset, batch_size=bs, - shuffle=True, num_workers=num_workers) - - validset = torch.utils.data.TensorDataset(x_valid, y_valid) - validloader = torch.utils.data.DataLoader( - validset, batch_size=bs, - shuffle=False, num_workers=num_workers) - - loaders["train"] = trainloader - loaders["valid"] = validloader - - # experiment setup - num_epochs = 3 - logdir = "./logs" - - # model, criterion, optimizer - model = Net() - criterion = nn.BCEWithLogitsLoss() - optimizer = torch.optim.Adam(model.parameters()) - - # model runner - runner = SupervisedRunner() - - # model training - runner.train( - model=model, - criterion=criterion, - optimizer=optimizer, - loaders=loaders, - logdir=logdir, - num_epochs=num_epochs, - verbose=False, - callbacks=[CheckpointCallback( - logdir, - topk=3, - save_best=True, - loader_key="valid", - metric_key="loss", - minimize=True)] - ) - - with open('./logs/model.storage.json') as f: - metrics = json.load(f) - storage = metrics['storage'] - self.assertEqual(3, len(storage)) - self.assertTrue(storage[0]['metric'] < storage[2]['metric']) - self.assertTrue(storage[0]['metric']< 0.35) diff --git a/tests/test_datashader.py b/tests/test_datashader.py deleted file mode 100644 index ad3afe15..00000000 --- a/tests/test_datashader.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest - -from common import p100_exempt - -class TestDatashader(unittest.TestCase): - - @p100_exempt # b/342143152: Uses cuDF(>=24.4v), which is no longer capitble with p100 GPUs. 
- def test_pipeline(self): - # based on https://github.com/pyviz/datashader/blob/master/datashader/tests/test_pipeline.py - import numpy as np - import pandas as pd - import datashader as ds - import datashader.transfer_functions as tf - - df = pd.DataFrame({ - 'x': np.array(([0.] * 10 + [1] * 10)), - 'y': np.array(([0.] * 5 + [1] * 5 + [0] * 5 + [1] * 5)), - 'f64': np.arange(20, dtype='f8') - }) - df.f64.iloc[2] = np.nan - - cvs = ds.Canvas(plot_width=2, plot_height=2, x_range=(0, 1), y_range=(0, 1)) - - pipeline = ds.Pipeline(df, ds.Point('x', 'y')) - img = pipeline((0, 1), (0, 1), 2, 2) - agg = cvs.points(df, 'x', 'y', ds.count()) - self.assertTrue(img.equals(tf.shade(agg))) - - color_fn = lambda agg: tf.shade(agg, 'pink', 'red') - pipeline.color_fn = color_fn - img = pipeline((0, 1), (0, 1), 2, 2) - self.assertTrue(img.equals(color_fn(agg))) - - transform_fn = lambda agg: agg + 1 - pipeline.transform_fn = transform_fn - img = pipeline((0, 1), (0, 1), 2, 2) - self.assertTrue(img.equals(color_fn(transform_fn(agg)))) - - pipeline = ds.Pipeline(df, ds.Point('x', 'y'), ds.sum('f64')) - img = pipeline((0, 1), (0, 1), 2, 2) - agg = cvs.points(df, 'x', 'y', ds.sum('f64')) - self.assertTrue(img.equals(tf.shade(agg))) diff --git a/tests/test_essentia.py b/tests/test_essentia.py deleted file mode 100644 index 749b9466..00000000 --- a/tests/test_essentia.py +++ /dev/null @@ -1,7 +0,0 @@ -import unittest - -from essentia.standard import Windowing - -class TestEssentia(unittest.TestCase): - def test_windowing(self): - Windowing(type = 'hann') diff --git a/tests/test_fastai.py b/tests/test_fastai.py index edfd402e..33a436a5 100644 --- a/tests/test_fastai.py +++ b/tests/test_fastai.py @@ -1,27 +1,36 @@ import unittest import fastai - from fastai.tabular.all import * + class TestFastAI(unittest.TestCase): - def test_has_version(self): - self.assertGreater(len(fastai.__version__), 2) - - # based on https://github.com/fastai/fastai/blob/master/tests/test_torch_core.py#L17 - def test_torch_tensor(self): - a = tensor([1, 2, 3]) - b = torch.tensor([1, 2, 3]) - - self.assertTrue(torch.all(a == b)) - - def test_tabular(self): - dls = TabularDataLoaders.from_csv( - "/input/tests/data/train.csv", - cont_names=["pixel"+str(i) for i in range(784)], - y_names='label', - procs=[FillMissing, Categorify, Normalize]) - learn = tabular_learner(dls, layers=[200, 100]) - learn.fit_one_cycle(n_epoch=1) - - self.assertGreater(learn.smooth_loss, 0) + # Basic import + def test_basic(self): + import fastai + import fastcore + import fastprogress + import fastdownload + + def test_has_version(self): + self.assertGreater(len(fastai.__version__), 2) + + # based on https://github.com/fastai/fastai/blob/master/tests/test_torch_core.py#L17 + def test_torch_tensor(self): + a = tensor([1, 2, 3]) + b = torch.tensor([1, 2, 3]) + + self.assertTrue(torch.all(a == b)) + + def test_tabular(self): + dls = TabularDataLoaders.from_csv( + "/input/tests/data/train.csv", + cont_names=["pixel" + str(i) for i in range(784)], + y_names="label", + procs=[FillMissing, Categorify, Normalize], + ) + learn = tabular_learner(dls, layers=[200, 100]) + with learn.no_bar(): + learn.fit_one_cycle(n_epoch=1) + + self.assertGreater(learn.smooth_loss, 0) diff --git a/tests/test_gcs.py b/tests/test_gcs.py index eb15ea5f..94da58c9 100644 --- a/tests/test_gcs.py +++ b/tests/test_gcs.py @@ -8,7 +8,9 @@ def _make_credentials(): import google.auth.credentials - return Mock(spec=google.auth.credentials.Credentials) + credentials = 
Mock(spec=google.auth.credentials.Credentials) + credentials.universe_domain = 'googleapis.com' + return credentials class TestStorage(unittest.TestCase): diff --git a/tests/test_geopandas.py b/tests/test_geopandas.py index e2bb4583..4c0106b2 100644 --- a/tests/test_geopandas.py +++ b/tests/test_geopandas.py @@ -13,4 +13,4 @@ def test_spatial_join(self): countries = world[['geometry', 'name']] countries = countries.rename(columns={'name':'country'}) cities_with_country = geopandas.sjoin(cities, countries, how="inner", op='intersects') - self.assertTrue(cities_with_country.size > 1) \ No newline at end of file + self.assertTrue(cities_with_country.size > 1) diff --git a/tests/test_geoviews.py b/tests/test_geoviews.py deleted file mode 100644 index 2636cc6f..00000000 --- a/tests/test_geoviews.py +++ /dev/null @@ -1,17 +0,0 @@ -import unittest - -from common import p100_exempt - -class TestGeoviews(unittest.TestCase): - - @p100_exempt # b/342143152: Uses cuDF(>=24.4v), which is no longer capitble with p100 GPUs. - - def test_viz(self): - import geoviews.feature as gf - import holoviews as hv - from cartopy import crs - - hv.extension('matplotlib') - (gf.ocean + gf.land + gf.ocean * gf.land * gf.coastline * gf.borders).options( - 'Feature', projection=crs.Geostationary(), global_extent=True - ).cols(3) diff --git a/tests/test_ggplot.py b/tests/test_ggplot.py deleted file mode 100644 index 30aec29f..00000000 --- a/tests/test_ggplot.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest -import os.path - -from ggplot import * - -class TestGgplot(unittest.TestCase): - - def test_plot(self): - p = ggplot(aes(x='mpg'), data=mtcars) + geom_histogram() - p.save("myplot.png") - - self.assertTrue(os.path.isfile("myplot.png")) diff --git a/tests/test_google_genai_patch.py b/tests/test_google_genai_patch.py new file mode 100644 index 00000000..9d225763 --- /dev/null +++ b/tests/test_google_genai_patch.py @@ -0,0 +1,55 @@ +import json +import unittest +import threading + +from test.support.os_helper import EnvironmentVarGuard +from urllib.parse import urlparse + +from http.server import BaseHTTPRequestHandler, HTTPServer + +class HTTPHandler(BaseHTTPRequestHandler): + called = False + path = None + headers = {} + + def do_HEAD(self): + self.send_response(200) + + def do_POST(self): + HTTPHandler.path = self.path + HTTPHandler.headers = self.headers + HTTPHandler.called = True + self.send_response(200) + self.send_header("Content-type", "application/json") + self.end_headers() + +class TestGoogleGenAiPatch(unittest.TestCase): + endpoint = "http://127.0.0.1:80" + + def test_proxy_enabled(self): + env = EnvironmentVarGuard() + secrets_token = "secrets_token" + proxy_token = "proxy_token" + env.set("KAGGLE_USER_SECRETS_TOKEN", secrets_token) + env.set("KAGGLE_DATA_PROXY_TOKEN", proxy_token) + env.set("KAGGLE_DATA_PROXY_URL", self.endpoint) + server_address = urlparse(self.endpoint) + with env: + with HTTPServer((server_address.hostname, server_address.port), HTTPHandler) as httpd: + threading.Thread(target=httpd.serve_forever).start() + from google import genai + api_key = "NotARealAPIKey" + client = genai.Client(api_key = api_key) + try: + client.models.generate_content( + model="gemini-2.0-flash-exp", + contents="What's the largest planet in our solar system?"
+ ) + except: + pass + httpd.shutdown() + self.assertTrue(HTTPHandler.called) + self.assertIn("/palmapi", HTTPHandler.path) + self.assertEqual(proxy_token, HTTPHandler.headers["x-kaggle-proxy-data"]) + self.assertEqual("Bearer {}".format(secrets_token), HTTPHandler.headers["x-kaggle-authorization"]) + self.assertEqual(api_key, HTTPHandler.headers["x-goog-api-key"]) diff --git a/tests/test_imports.py b/tests/test_imports.py index 4977ff9c..6c429516 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -3,7 +3,5 @@ class TestImport(unittest.TestCase): # Basic import tests for packages without any. def test_basic(self): - import bq_helper - import cleverhans import tensorflow_datasets import segment_anything diff --git a/tests/test_jax.py b/tests/test_jax.py index b5e0898e..f8eca3bb 100644 --- a/tests/test_jax.py +++ b/tests/test_jax.py @@ -6,7 +6,7 @@ import jax import jax.numpy as np -from common import gpu_test +from common import gpu_test, isGPU from jax import grad, jit @@ -21,5 +21,5 @@ def test_grad(self): self.assertEqual(0.4199743, ag) def test_backend(self): - expected_backend = 'cpu' if len(os.environ.get('CUDA_VERSION', '')) == 0 else 'gpu' + expected_backend = 'cpu' if not isGPU() else 'gpu' self.assertEqual(expected_backend, jax.default_backend()) diff --git a/tests/test_kagglehub.py b/tests/test_kagglehub.py index 37b11248..f2c3e2a6 100644 --- a/tests/test_kagglehub.py +++ b/tests/test_kagglehub.py @@ -8,8 +8,10 @@ class TestKagglehub(unittest.TestCase): def test_login(self): with self.assertLogs('kagglehub', level='INFO') as l: with mock.patch("builtins.input") as mock_input: - mock_input.side_effect = ["lastplacelarry", "some-key"] - # Disabling credentials validation since network access is disabled in unittest. - kagglehub.login(validate_credentials=False) + with mock.patch("getpass.getpass") as mock_getpass: + mock_input.side_effect = ["lastplacelarry"] + mock_getpass.return_value = "some-key" - self.assertIn("credentials set", l.output[0]) + kagglehub.login(validate_credentials=False) + + self.assertIn("credentials set", l.output[0]) diff --git a/tests/test_keras.py b/tests/test_keras.py index 22cb6f9f..5dc4610d 100644 --- a/tests/test_keras.py +++ b/tests/test_keras.py @@ -9,10 +9,11 @@ class TestKeras(unittest.TestCase): def test_train(self): - # Load the data and split it between train and test sets - (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data( - path='/input/tests/data/mnist.npz' - ) + path = '/input/tests/data/mnist.npz' + with np.load(path) as f: + x_train, y_train = f['x_train'], f['y_train'] + x_test, y_test = f['x_test'], f['y_test'] + # Scale images to the [0, 1] range x_train = x_train.astype("float32") / 255 diff --git a/tests/test_kmapper.py b/tests/test_kmapper.py deleted file mode 100644 index c75deea3..00000000 --- a/tests/test_kmapper.py +++ /dev/null @@ -1,7 +0,0 @@ -import unittest - -import kmapper as km - -class TestKMapper(unittest.TestCase): - def test_init(self): - km.KeplerMapper() diff --git a/tests/test_matplotlib.py b/tests/test_matplotlib.py index 1cbc939a..c04f3f23 100644 --- a/tests/test_matplotlib.py +++ b/tests/test_matplotlib.py @@ -1,10 +1,17 @@ import unittest import os.path +from distutils.version import StrictVersion + +import matplotlib import matplotlib.pyplot as plt import numpy as np class TestMatplotlib(unittest.TestCase): + def test_version(self): + # b/308525631: newer versions of Matplotlib cause learntools to fail + self.assertLess(StrictVersion(matplotlib.__version__), 
StrictVersion("3.8.0")) + def test_plot(self): plt.plot(np.linspace(0,1,50), np.random.rand(50)) plt.savefig("plot1.png") diff --git a/tests/test_nbdev.py b/tests/test_nbdev.py new file mode 100644 index 00000000..d5c6b484 --- /dev/null +++ b/tests/test_nbdev.py @@ -0,0 +1,8 @@ +import unittest + +import nbdev + +class TestNbdev(unittest.TestCase): + def test(self): + self.assertGreater(len(nbdev.__version__), 0) + diff --git a/tests/test_numpy.py b/tests/test_numpy.py index 18f74b8c..948455ea 100644 --- a/tests/test_numpy.py +++ b/tests/test_numpy.py @@ -1,9 +1,16 @@ import unittest +from distutils.version import StrictVersion + import numpy as np -from numpy.distutils.system_info import get_info +import io +from contextlib import redirect_stdout + +class TestNumpy(unittest.TestCase): + def test_version(self): + # b/370860329: newer versions are not compatible with current tensorflow + self.assertEqual(StrictVersion(np.__version__), StrictVersion("1.26.4")) -class TestNumpy(unittest.TestCase): def test_array(self): array = np.array([1, 3]) @@ -12,5 +19,13 @@ def test_array(self): # Numpy must be linked to the MKL. (Occasionally, a third-party package will muck up the installation # and numpy will be reinstalled with an OpenBLAS backing.) def test_mkl(self): - # This will throw an exception if the MKL is not linked correctly or return an empty dict. - self.assertTrue(get_info("blas_mkl")) + try: + from numpy.distutils.system_info import get_info + # This will throw an exception if the MKL is not linked correctly or return an empty dict. + self.assertTrue(get_info("blas_mkl")) + except: + # Fallback to check if mkl is present via show_config() + config_out = io.StringIO() + with redirect_stdout(config_out): + np.show_config() + self.assertIn("mkl_rt", config_out.getvalue()) diff --git a/tests/test_polars.py b/tests/test_polars.py index c81a0b80..8526bd29 100644 --- a/tests/test_polars.py +++ b/tests/test_polars.py @@ -2,9 +2,14 @@ import polars as pl -class TestPolars(unittest.TestCase): +class TestPolars(unittest.TestCase): def test_read_csv(self): - data = pl.read_csv("/input/tests/data/train.csv") + data = pl.read_csv('/input/tests/data/train.csv') self.assertEqual(100, len(data)) + def test_plot(self): + # This relies on the hvplot package + data = pl.read_csv('/input/tests/data/train.csv') + data.plot.line() + diff --git a/tests/test_pykalman.py b/tests/test_pykalman.py deleted file mode 100644 index 26d86003..00000000 --- a/tests/test_pykalman.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -import numpy as np -from pykalman import KalmanFilter -from pykalman import UnscentedKalmanFilter -from pykalman.sqrt import CholeskyKalmanFilter, AdditiveUnscentedKalmanFilter - -class TestPyKalman(unittest.TestCase): - def test_kalman_filter(self): - kf = KalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - measurements = np.asarray([[1,0], [0,0], [0,1]]) # 3 observations - kf = kf.em(measurements, n_iter=5) - (filtered_state_means, filtered_state_covariances) = kf.filter(measurements) - (smoothed_state_means, smoothed_state_covariances) = kf.smooth(measurements) - return filtered_state_means - - def test_kalman_missing(self): - kf = KalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - measurements = np.asarray([[1,0], [0,0], [0,1]]) # 3 observations - measurements = np.ma.asarray(measurements) - measurements[1] = np.ma.masked - kf = kf.em(measurements, n_iter=5) - (filtered_state_means, 
filtered_state_covariances) = kf.filter(measurements) - (smoothed_state_means, smoothed_state_covariances) = kf.smooth(measurements) - return filtered_state_means - - def test_unscented_kalman(self): - ukf = UnscentedKalmanFilter(lambda x, w: x + np.sin(w), lambda x, v: x + v, transition_covariance=0.1) - (filtered_state_means, filtered_state_covariances) = ukf.filter([0, 1, 2]) - (smoothed_state_means, smoothed_state_covariances) = ukf.smooth([0, 1, 2]) - return filtered_state_means - - def test_online_update(self): - kf = KalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - measurements = np.asarray([[1,0], [0,0], [0,1]]) # 3 observations - measurements = np.ma.asarray(measurements) - measurements[1] = np.ma.masked # measurement at timestep 1 is unobserved - kf = kf.em(measurements, n_iter=5) - (filtered_state_means, filtered_state_covariances) = kf.filter(measurements) - for t in range(1, 3): - filtered_state_means[t], filtered_state_covariances[t] = \ - kf.filter_update(filtered_state_means[t-1], filtered_state_covariances[t-1], measurements[t]) - return filtered_state_means - - def test_robust_sqrt(self): - kf = CholeskyKalmanFilter(transition_matrices = [[1, 1], [0, 1]], observation_matrices = [[0.1, 0.5], [-0.3, 0.0]]) - ukf = AdditiveUnscentedKalmanFilter(lambda x, w: x + np.sin(w), lambda x, v: x + v, observation_covariance=0.1) - diff --git a/tests/test_qgrid.py b/tests/test_qgrid.py deleted file mode 100644 index e97ef2a1..00000000 --- a/tests/test_qgrid.py +++ /dev/null @@ -1,16 +0,0 @@ -import unittest - -import numpy as np -import pandas as pd - -from qgrid import QgridWidget - - -class TestQgrid(unittest.TestCase): - def test_nans(self): - df = pd.DataFrame([(pd.Timestamp('2017-02-02'), np.nan), - (4, 2), - ('foo', 'bar')]) - view = QgridWidget(df=df) - - self.assertIsNotNone(view.get_changed_df()) diff --git a/tests/test_torchtext.py b/tests/test_torchtext.py deleted file mode 100644 index f9fbf76f..00000000 --- a/tests/test_torchtext.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest - -from torchtext.data.metrics import bleu_score - - -class TestTorchtext(unittest.TestCase): - def test_bleu_score(self): - candidate = [['I', 'love', 'Kaggle', 'Notebooks']] - refs = [[['Completely', 'Different']]] - - self.assertEqual(0, bleu_score(candidate, refs)) - diff --git a/tests/test_torchtune.py b/tests/test_torchtune.py new file mode 100644 index 00000000..c4a702fd --- /dev/null +++ b/tests/test_torchtune.py @@ -0,0 +1,16 @@ +import unittest +import subprocess + +class TestTorchtune(unittest.TestCase): + def test_help(self): + result = subprocess.run( + ["tune", "--help"], + capture_output=True, + text=True + ) + + self.assertEqual(0, result.returncode) + self.assertIn( + "Download a model from the Hugging Face Hub or Kaggle", + result.stdout + ) diff --git a/tests/test_transformers.py b/tests/test_transformers.py index a81714cc..910eab30 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -1,7 +1,7 @@ import unittest import torch -from transformers import AdamW +import torch.optim as optim import transformers.pipelines # verify this import works @@ -10,13 +10,12 @@ def assertListAlmostEqual(self, list1, list2, tol): self.assertEqual(len(list1), len(list2)) for a, b in zip(list1, list2): self.assertAlmostEqual(a, b, delta=tol) - def test_adam_w(self): w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) target = torch.tensor([0.4, 0.2, -0.5]) criterion = torch.nn.MSELoss() # No warmup, constant 
schedule, no gradient clipping - optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0) + optimizer = optim.AdamW(params=[w], lr=2e-1, weight_decay=0.0) for _ in range(100): loss = criterion(w, target) loss.backward() diff --git a/tests/test_vaex.py b/tests/test_vaex.py deleted file mode 100644 index b64061b0..00000000 --- a/tests/test_vaex.py +++ /dev/null @@ -1,10 +0,0 @@ -import unittest - -import vaex - -class TestVaex(unittest.TestCase): - def test_read_csv(self): - df = vaex.read_csv("/input/tests/data/train.csv") - - self.assertEqual((100, 785), df.shape) - self.assertEqual(10, df['label'].nunique()) \ No newline at end of file diff --git a/tests/test_vowpalwabbit.py b/tests/test_vowpalwabbit.py deleted file mode 100644 index 839aed05..00000000 --- a/tests/test_vowpalwabbit.py +++ /dev/null @@ -1,10 +0,0 @@ -import unittest - -from vowpalwabbit import pyvw - -class TestVowpalwabbit(unittest.TestCase): - def test_basic(self): - vw = pyvw.vw(quiet=True) - ex = vw.example('1 | a b c') - vw.learn(ex) - self.assertGreater(vw.predict(ex), 0) diff --git a/tpu/Dockerfile b/tpu/Dockerfile index ed9040a8..343443ae 100644 --- a/tpu/Dockerfile +++ b/tpu/Dockerfile @@ -6,14 +6,10 @@ FROM $BASE_IMAGE # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact ARG PYTHON_WHEEL_VERSION ARG PYTHON_VERSION_PATH -ARG TF_LINUX_WHEEL_VERSION +ARG TENSORFLOW_VERSION ARG TORCH_LINUX_WHEEL_VERSION ARG TORCH_VERSION -ARG TENSORFLOW_VERSION -ARG TF_LIBTPU_VERSION -ARG JAX_VERSION ARG TORCHVISION_VERSION -ARG TORCHTEXT_VERSION ARG TORCHAUDIO_VERSION ENV ISTPUVM=1 @@ -29,59 +25,55 @@ ADD patches/kaggle_session.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packa ADD patches/kaggle_web_client.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/kaggle_web_client.py ADD patches/kaggle_datasets.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/kaggle_datasets.py -# Disable GCP integrations for now -# ADD patches/kaggle_gcp.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/kaggle_gcp.py - -# Disable logging to file (why do we need this?) -# ADD patches/log.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/log.py - -# sitecustomize adds significant latency to ipython kernel startup and should only be added if needed -# ADD patches/sitecustomize.py /root/.local/lib/${PYTHON_VERSION_PATH}/site-packages/sitecustomize.py - # Prereqs # This is needed for cv2 (opencv-python): # https://stackoverflow.com/questions/55313610/importerror-libgl-so-1-cannot-open-shared-object-file-no-such-file-or-directo RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y # Install all the packages together for maximum compatibility. - -# Install Tensorflow. - -# Install Pytorch & related packages -# https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version -# The URL doesn't include patch version. i.e. must use 1.11 instead of 1.11.0 -# We need to keep the numpy version the same as the installed tf one but compatible with other installs. 
- -# Install JAX & related packages -# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm - -# Packages needed by the Notebook editor - -# Additional useful packages should be added here - -RUN pip install tensorflow_hub https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TF_LINUX_WHEEL_VERSION}.whl tensorflow-probability tensorflow-io \ - torch~=${TORCH_VERSION} https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}+libtpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl torchvision==${TORCHVISION_VERSION} torchtext==${TORCHTEXT_VERSION} torchaudio==${TORCHAUDIO_VERSION} \ - jax[tpu]==${JAX_VERSION} -f https://storage.googleapis.com/jax-releases/libtpu_releases.html trax flax optax git+https://github.com/deepmind/dm-haiku jraph distrax \ - papermill jupyterlab python-lsp-server[all] "jupyter-lsp==1.5.1" \ - pandas matplotlib opencv-python-headless librosa accelerate diffusers scikit-learn transformers \ - seaborn timm albumentations einops pyarrow fastparquet opencv-python \ - "keras>3" keras-cv keras-nlp \ - kagglehub && \ +# Additional useful packages should be added in the requirements.txt +# Bring in the requirements.txt and replace variables in it: +RUN apt-get install -y gettext +ADD tpu/requirements.in /kaggle_requirements.in +RUN envsubst < /kaggle_requirements.in > /requirements.in + +# Install uv and then install the requirements: +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +RUN export PATH="${HOME}/.local/bin:${PATH}" && \ + uv pip compile --system --prerelease=allow \ + --verbose \ + --upgrade \ + --find-links=https://storage.googleapis.com/jax-releases/libtpu_releases.html \ + --find-links=https://storage.googleapis.com/libtpu-releases/index.html \ + --find-links=https://storage.googleapis.com/libtpu-wheels/index.html \ + --find-links=https://download.pytorch.org/whl/torch_stable.html \ + --emit-find-links \ + --no-emit-package pip \ + --no-emit-package setuptools \ + --output-file /requirements.txt \ + /requirements.in && \ + uv pip install --system --prerelease=allow --force-reinstall \ + -r /requirements.txt && \ + uv cache clean && \ + /tmp/clean-layer.sh +ENV PATH="~/.local/bin:${PATH}" + +# We install a libtpu version compatible with both jax 0.7.2 and torch 2.8.0. +# Why? tunix latest -> flax 0.12 -> jax 0.7.2 -> libtpu 0.0.23. However, that +# libtpu causes pjrt api errors for torch 2.8.0. 
screenshot/5heUtdyaJ4MmR3D +# https://github.com/pytorch/xla/blob/d517649bdef6ab0519c30c704bde8779c8216502/setup.py#L111 +# https://github.com/jax-ml/jax/blob/3489529b38d1f11d1e5caf4540775aadd5f2cdda/setup.py#L26 +RUN export PATH="${HOME}/.local/bin:${PATH}" && \ + uv pip install --system --force-reinstall libtpu==0.0.17 && \ + uv cache clean && \ /tmp/clean-layer.sh - -# Tensorflow libtpu: -RUN curl --output /usr/local/lib/python3.10/site-packages/libtpu/libtpu.so https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/${TF_LIBTPU_VERSION}/libtpu.so # Kaggle Model Hub patches: ADD patches/kaggle_module_resolver.py /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/kaggle_module_resolver.py RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py RUN sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow_hub/config.py -# Monkey-patch the default TPU to the local (TPU VM). -RUN sed -i 's/tpu=None,/tpu="local",/' /usr/local/lib/${PYTHON_VERSION_PATH}/site-packages/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py - # Set these env vars so that they don't produce errs calling the metadata server to load them: -ENV TPU_ACCELERATOR_TYPE=v3-8 ENV TPU_PROCESS_ADDRESSES=local # Metadata @@ -93,7 +85,6 @@ LABEL build-date=$BUILD_DATE ENV GIT_COMMIT=${GIT_COMMIT} ENV BUILD_DATE=${BUILD_DATE} -LABEL tensorflow-version=$TENSORFLOW_VERSION LABEL kaggle-lang=python # Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`. diff --git a/tpu/config.txt b/tpu/config.txt index b495597c..ab933ba7 100644 --- a/tpu/config.txt +++ b/tpu/config.txt @@ -1,19 +1,12 @@ -BASE_IMAGE=python:3.10 -PYTHON_WHEEL_VERSION=cp310 -PYTHON_VERSION_PATH=python3.10 -# gsutil ls gs://cloud-tpu-tpuvm-artifacts/tensorflow -# https://cloud.google.com/tpu/docs/supported-tpu-configurations#libtpu_versions -TENSORFLOW_VERSION=2.16.1 -TF_LIBTPU_VERSION=1.10.1 -TF_LINUX_WHEEL_VERSION=manylinux_2_17_x86_64.manylinux2014_x86_64 -JAX_VERSION=0.4.23 -# gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep -v -E ".*rc[0-9].*" +BASE_IMAGE=python:3.12 +PYTHON_WHEEL_VERSION=cp312 +PYTHON_VERSION_PATH=python3.12 +TENSORFLOW_VERSION=2.20.0 +# gsutil ls gs://pytorch-xla-releases/wheels/tpuvm/* | grep libtpu | grep torch_xla | grep -v -E ".*rc[0-9].*" | sed 's/.*torch_xla-\(.*\)+libtpu.*/\1/' | sort -rV # Supports nightly -TORCH_VERSION=2.3.0 +TORCH_VERSION=2.8.0 # https://github.com/pytorch/audio supports nightly -TORCHAUDIO_VERSION=2.3.0 -# https://github.com/pytorch/text supports main -TORCHTEXT_VERSION=0.18.0 +TORCHAUDIO_VERSION=2.8.0 # https://github.com/pytorch/vision supports nightly -TORCHVISION_VERSION=0.18.0 +TORCHVISION_VERSION=0.23.0 TORCH_LINUX_WHEEL_VERSION=manylinux_2_28_x86_64 diff --git a/tpu/requirements.in b/tpu/requirements.in new file mode 100644 index 00000000..1fceeebb --- /dev/null +++ b/tpu/requirements.in @@ -0,0 +1,54 @@ +# TPU Utils +tpu-info +# Tensorflow packages +# TODO: b/447621961 - re-enable tensorflow-tpu when a compatible libtpu can be found. 
+tensorflow-cpu==${TENSORFLOW_VERSION} +tensorflow_hub +tensorflow-io +tensorflow-probability +tensorflow_datasets +# Torch packages +https://download.pytorch.org/whl/cpu/torch-${TORCH_VERSION}%2Bcpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +https://download.pytorch.org/whl/cpu/torchaudio-${TORCHAUDIO_VERSION}%2Bcpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +https://download.pytorch.org/whl/cpu/torchvision-${TORCHVISION_VERSION}%2Bcpu-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}-${PYTHON_WHEEL_VERSION}-${PYTHON_WHEEL_VERSION}-${TORCH_LINUX_WHEEL_VERSION}.whl +# Jax packages +jax[tpu] +distrax +flax +git+https://github.com/deepmind/dm-haiku +jraph +optax +trax +# Tunix GRPO +git+https://github.com/google/tunix +git+https://github.com/google/qwix +grain +# Jupyter packages +jupyter-lsp==1.5.1 +jupyterlab +notebook +papermill +python-lsp-server[all] +# Keras Packages +keras>3 +keras-cv +keras-nlp +# Kaggle Packages +kagglehub +# Other useful packages, add more here +accelerate +albumentations +diffusers +einops +fastparquet +ipywidgets +matplotlib +opencv-python +opencv-python-headless +pandas +pyarrow +scikit-learn +seaborn +timm +transformers
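Note on the google.genai integration added to patches/sitecustomize.py earlier in this diff: the import hook only reroutes a genai.Client through the Kaggle data proxy when the constructor receives nothing but an api_key; any other non-None argument (Vertex AI mode, debug settings, custom http_options) leaves the client untouched. The short Python sketch below restates that decision rule outside the diff; should_proxy is a hypothetical helper introduced here for illustration only, not a function defined in the patch.

# Hedged sketch, not part of the diff: mirrors the kwargs check performed by the
# google.genai Client.__init__ wrapper in patches/sitecustomize.py.
def should_proxy(kwargs: dict) -> bool:
    """True only when genai.Client() was called with nothing but an api_key."""
    return not any(value is not None for key, value in kwargs.items() if key != "api_key")

# Expected behaviour under that rule:
assert should_proxy({"api_key": "k"})                           # plain API-key client: proxied
assert not should_proxy({"api_key": "k", "vertexai": True})     # Vertex AI client: left alone
assert not should_proxy({"api_key": "k", "http_options": {}})   # custom http_options: left alone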