diff --git a/Deployment/Linear_regression/requirements.txt b/Deployment/Linear_regression/requirements.txt index 042b208..8cfc955 100644 --- a/Deployment/Linear_regression/requirements.txt +++ b/Deployment/Linear_regression/requirements.txt @@ -1,4 +1,4 @@ -numpy==1.16.3 +numpy==1.22.0 pandas==0.24.2 requests==2.22.0 scikit-learn==0.21.2 diff --git a/Deployment/rnn_app/requirements.txt b/Deployment/rnn_app/requirements.txt index 4fb6913..cee79e4 100644 --- a/Deployment/rnn_app/requirements.txt +++ b/Deployment/rnn_app/requirements.txt @@ -1,4 +1,4 @@ -numpy==1.16.3 +numpy==1.22.0 pandas==0.24.2 requests==2.22.0 diff --git a/README.md b/README.md index bc4c537..077c9d7 100644 --- a/README.md +++ b/README.md @@ -148,4 +148,11 @@ See my articles on Medium on this topic. ### Unit testing ML code with Pytest Check the files and detailed instructions in the [Pytest](https://github.com/tirthajyoti/Machine-Learning-with-Python/tree/master/Pytest) directory to understand how one should write unit testing code/module for machine learning models +--- + +### Memory and timing profiling + +Profiling data science code and ML models for memory footprint and computing time is a critical but often overlooked area. 
Here are a couple of notebooks showing the ideas: +* [Memory profiling using Scalene](https://github.com/tirthajyoti/Machine-Learning-with-Python/tree/master/Memory-profiling/Scalene) +* [Time-profiling data science code](https://github.com/tirthajyoti/Machine-Learning-with-Python/blob/master/Time-profiling/cProfile.ipynb) diff --git a/Time-profiling/8.14 - profile-DS-workflow.png b/Time-profiling/8.14 - profile-DS-workflow.png new file mode 100644 index 0000000..e17e393 Binary files /dev/null and b/Time-profiling/8.14 - profile-DS-workflow.png differ diff --git a/Time-profiling/Readme.md b/Time-profiling/Readme.md new file mode 100644 index 0000000..6d6d34d --- /dev/null +++ b/Time-profiling/Readme.md @@ -0,0 +1 @@ +## Time-profiling ML code diff --git a/Time-profiling/cProfile.ipynb b/Time-profiling/cProfile.ipynb new file mode 100644 index 0000000..a65359a --- /dev/null +++ b/Time-profiling/cProfile.ipynb @@ -0,0 +1,513 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc118b68-6776-4504-ace0-0e076616fde6", + "metadata": {}, + "source": [ + "# Time-profiling Data Science code using `cProfile`\n", + "\n", + "## Dr. 
Tirthajyoti Sarkar\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "98192c2c-f90d-4a7b-b3b2-ae23870d6f4e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 3 function calls in 0.064 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.064 0.064 0.064 0.064 :1()\n", + " 1 0.000 0.000 0.064 0.064 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import cProfile\n", + "\n", + "SIZE = 10_000_000\n", + "a = np.arange(SIZE)\n", + "b = np.random.normal(size=SIZE)\n", + "\n", + "cProfile.run('a+b')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9500a64c-9d19-4e05-97cf-231ab5f1842e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SIZE = 10_000_000\n", + "a = np.arange(SIZE)\n", + "b = np.random.normal(size=SIZE)\n", + "a+b\n" + ] + } + ], + "source": [ + "code = \"\"\"SIZE = 10_000_000\n", + "a = np.arange(SIZE)\n", + "b = np.random.normal(size=SIZE)\n", + "a+b\"\"\"\n", + "\n", + "print(code)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6d810ec6-245d-4082-8f64-d5c16b162936", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 5 function calls in 0.488 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.078 0.078 0.488 0.488 :1()\n", + " 1 0.000 0.000 0.488 0.488 {built-in method builtins.exec}\n", + " 1 0.028 0.028 0.028 0.028 {built-in method numpy.arange}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.381 0.381 0.381 0.381 {method 'normal' of 
'numpy.random.mtrand.RandomState' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run(code)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8a5bf74f-fa5b-4997-a864-82a6d9503216", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 6 function calls in 0.531 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.055 0.055 0.510 0.510 1735574101.py:1(add)\n", + " 1 0.021 0.021 0.531 0.531 :1()\n", + " 1 0.000 0.000 0.531 0.531 {built-in method builtins.exec}\n", + " 1 0.057 0.057 0.057 0.057 {built-in method numpy.arange}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.397 0.397 0.397 0.397 {method 'normal' of 'numpy.random.mtrand.RandomState' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "def add():\n", + " SIZE = 10_000_000\n", + " a = np.arange(SIZE)\n", + " b = np.random.normal(size=SIZE)\n", + " c=a+b\n", + "\n", + "cProfile.run('add()')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f899c334-2d1f-41ef-aceb-71348e5a6818", + "metadata": {}, + "outputs": [], + "source": [ + "def add(size):\n", + " a = np.arange(size)\n", + " b = np.random.normal(size=size)\n", + " c=a+b" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cdf8d43a-f240-4de6-9a92-6dee6c1cd98b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 6 function calls in 0.500 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.054 0.054 0.478 0.478 1565836920.py:1(add)\n", + " 1 0.021 0.021 0.500 0.500 :1()\n", + " 1 0.000 0.000 0.500 0.500 {built-in method builtins.exec}\n", + " 1 0.030 0.030 0.030 0.030 {built-in method numpy.arange}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of 
'_lsprof.Profiler' objects}\n", + " 1 0.394 0.394 0.394 0.394 {method 'normal' of 'numpy.random.mtrand.RandomState' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "SIZE = 10_000_000\n", + "cProfile.run('add(SIZE)')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8d038a76-4394-472b-9949-be9ef936dca5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 6 function calls in 1.034 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.114 0.114 0.987 0.987 1565836920.py:1(add)\n", + " 1 0.047 0.047 1.034 1.034 :1()\n", + " 1 0.000 0.000 1.034 1.034 {built-in method builtins.exec}\n", + " 1 0.082 0.082 0.082 0.082 {built-in method numpy.arange}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.791 0.791 0.791 0.791 {method 'normal' of 'numpy.random.mtrand.RandomState' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "SIZE = 20_000_000\n", + "cProfile.run('add(SIZE)')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f14c476d-cc19-4ce5-b74e-abf211eabc7a", + "metadata": {}, + "outputs": [], + "source": [ + "def ops(a,b):\n", + " x1 = a+b\n", + " x2 = a-b\n", + " x3 = a*b\n", + " x4 = a/b" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "771a1452-2d4a-476e-8775-3479beac3700", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 4 function calls in 0.287 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.251 0.251 0.251 0.251 3200973052.py:1(ops)\n", + " 1 0.036 0.036 0.286 0.286 :1()\n", + " 1 0.000 0.000 0.287 0.287 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + 
"cProfile.run('ops(a,b)')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5af6602e-a66f-4bf0-9d22-bc5b71e8c4d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total function calls: 48\n", + "Total time (seconds): 1.1839559\n" + ] + } + ], + "source": [ + "import cProfile, pstats\n", + "\n", + "profiler = cProfile.Profile()\n", + "# Enable profiler\n", + "profiler.enable()\n", + "# Function execution\n", + "add(SIZE)\n", + "# Disable profiler\n", + "profiler.disable()\n", + "# pstats\n", + "stats = pstats.Stats(profiler)\n", + "# Print the total time and function calls\n", + "print(\"Total function calls:\", stats.total_calls)\n", + "print(\"Total time (seconds):\", stats.total_tt)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "91f2d625-37bf-41f6-9c92-ae52bf608d02", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 48 function calls in 1.184 seconds\n", + "\n", + " Random listing order was used\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\contextlib.py:86(__init__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\contextlib.py:112(__enter__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\contextlib.py:121(__exit__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\contextlib.py:242(helper)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\traitlets\\traitlets.py:535(get)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\traitlets\\traitlets.py:566(__get__)\n", + " 2 0.000 0.000 0.000 0.000 
c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\utils\\ipstruct.py:125(__getattr__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\codeop.py:142(__call__)\n", + " 4 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\compilerop.py:166(extra_flags)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\interactiveshell.py:1286(user_global_ns)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3354(compare)\n", + " 2 0.000 0.000 1.184 0.592 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3416(run_code)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\hooks.py:103(__call__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\hooks.py:168(pre_run_code_hook)\n", + " 1 0.000 0.000 0.000 0.000 C:\\Users\\Tirtha\\AppData\\Local\\Temp/ipykernel_12356/3775033682.py:9()\n", + " 1 0.044 0.044 1.184 1.184 C:\\Users\\Tirtha\\AppData\\Local\\Temp/ipykernel_12356/3775033682.py:7()\n", + " 1 0.114 0.114 1.140 1.140 C:\\Users\\Tirtha\\AppData\\Local\\Temp/ipykernel_12356/1565836920.py:1(add)\n", + " 1 0.081 0.081 0.081 0.081 {built-in method numpy.arange}\n", + " 1 0.945 0.945 0.945 0.945 {method 'normal' of 'numpy.random.mtrand.RandomState' objects}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.compile}\n", + " 2 0.000 0.000 1.184 0.592 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 4 0.000 0.000 0.000 0.000 {built-in method 
builtins.next}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats = pstats.Stats(profiler)\n", + "stats.print_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "06fa3a62-5fef-4d9e-8c6a-d9e309d80e16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pstats.Stats" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(stats)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cefecaa0-9933-49ae-b0dc-3f869e83fa89", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.1839559" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats.total_tt" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "fe15e37e-d329-4168-a479-60c219126e16", + "metadata": {}, + "outputs": [], + "source": [ + "stats.fcn_list" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "15c966eb-e094-4e5f-8e54-1a2c3923f6c7", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "560a8a4d-84fc-43c2-a1c8-79113dcc73ac", + "metadata": {}, + "outputs": [], + "source": [ + "size = [int(i*1e6) for i in range(5,26,5)]\n", + "total_tt = []\n", + "for s in size:\n", + " profiler = cProfile.Profile()\n", + " profiler.enable()\n", + " add(s)\n", + " profiler.disable()\n", + " stats = pstats.Stats(profiler)\n", + " total_tt.append(round(stats.total_tt,3)) " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "264f4589-e40b-4c2d-bb08-9116dc4eb2e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.274, 0.464, 0.706, 0.94, 
1.187]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total_tt" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "695c9306-bcc6-45ea-8939-28e8e127eec3", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(6,3),dpi=120)\n", + "plt.bar(x=[str(i)+'-million' for i in range(5,26,5)],\n", + " height=total_tt, \n", + " edgecolor='k',\n", + " color=\"#2c75b0\")\n", + "plt.xlabel(\"Array size\", fontsize=16)\n", + "plt.ylabel(\"Time taken (seconds)\",fontsize=16)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "9b1e4af3-385a-420a-be85-449119210f93", + "metadata": {}, + "source": [ + "## Data science workflow profiling\n", + "\n", + "While measuring the execution time of these small standalone functions serves as a basic demonstration of the usage of these profilers, the real utility is realized when they are used in a large-scale data science workflow. Such a workflow has a variety of modules and functions and we can set up profiling for all of them if necessary. The output may be logged into a database or even be fed into a monitoring system that will track the performance of the modules over time and take action if needed (e.g., a function performed poorly and took too much time in a certain run or for certain input data).\n", + "\n", + "![ds-workflow](https://raw.githubusercontent.com/tirthajyoti/Machine-Learning-with-Python/master/Time-profiling/8.14%20-%20profile-DS-workflow.png)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}