diff --git a/Deployment/Linear_regression/requirements.txt b/Deployment/Linear_regression/requirements.txt index 042b208..8cfc955 100644 --- a/Deployment/Linear_regression/requirements.txt +++ b/Deployment/Linear_regression/requirements.txt @@ -1,4 +1,4 @@ -numpy==1.16.3 +numpy==1.22.0 pandas==0.24.2 requests==2.22.0 scikit-learn==0.21.2 diff --git a/Deployment/rnn_app/requirements.txt b/Deployment/rnn_app/requirements.txt index 4fb6913..cee79e4 100644 --- a/Deployment/rnn_app/requirements.txt +++ b/Deployment/rnn_app/requirements.txt @@ -1,4 +1,4 @@ -numpy==1.16.3 +numpy==1.22.0 pandas==0.24.2 requests==2.22.0 diff --git a/Images/Height_Weight_file_picture.PNG b/Images/Height_Weight_file_picture.PNG new file mode 100644 index 0000000..ca1623a Binary files /dev/null and b/Images/Height_Weight_file_picture.PNG differ diff --git a/Images/ML-DS-cycle-1.png b/Images/ML-DS-cycle-1.png new file mode 100644 index 0000000..40b6f21 Binary files /dev/null and b/Images/ML-DS-cycle-1.png differ diff --git a/Images/pandas-site.PNG b/Images/pandas-site.PNG new file mode 100644 index 0000000..09eecfe Binary files /dev/null and b/Images/pandas-site.PNG differ diff --git a/Memory-profiling/Readme.md b/Memory-profiling/Readme.md new file mode 100644 index 0000000..0c7f946 --- /dev/null +++ b/Memory-profiling/Readme.md @@ -0,0 +1 @@ +## Memory-profiling machine learning code diff --git a/Memory-profiling/Scalene/Readme.md b/Memory-profiling/Scalene/Readme.md new file mode 100644 index 0000000..e80142f --- /dev/null +++ b/Memory-profiling/Scalene/Readme.md @@ -0,0 +1,29 @@ +## [Scalene](https://github.com/plasma-umass/scalene) + +### Install +`pip install scalene` + +### Run (on CLI) +`$ scalene ` + +### Example output + +If you run the `mlp.py` + +`$ scalene mlp.py` + +You may see something like, + +![scalene-1](https://raw.githubusercontent.com/tirthajyoti/Machine-Learning-with-Python/master/Memory-profiling/Scalene/scalene-1.PNG) + +### Features +Here are some of the cool features 
of Scalene. Most of them are self-explanatory and can be gauged from the screenshot above, + +- **Lines or functions**: Reports information both for entire functions and for every independent code line +- **Threads**: It supports Python threads. +- **Multiprocessing**: supports use of the multiprocessing library +- **Python vs. C time**: Scalene breaks out time spent in Python vs. native code (e.g., libraries) +- **System time**: It distinguishes system time (e.g., sleeping or performing I/O operations) +- **GPU**: It also can report the time spent on an NVIDIA GPU (if present) +- **Copy volume**: It reports MBs of data being copied per second  +- **Detects leaks**: Scalene can automatically pinpoint lines responsible for likely memory leaks! diff --git a/Memory-profiling/Scalene/linearmodel.py b/Memory-profiling/Scalene/linearmodel.py new file mode 100644 index 0000000..b4b3e23 --- /dev/null +++ b/Memory-profiling/Scalene/linearmodel.py @@ -0,0 +1,52 @@ +import pandas as pd +import pickle +import numpy as np +from sklearn.linear_model import LinearRegression +from sklearn.datasets import make_regression + +NUM_FEATURES = 10 +NUM_SAMPLES = 1000 + +# Make data +def make_data(): + X,y = make_regression(n_samples=NUM_SAMPLES,n_features=NUM_FEATURES, + n_informative=NUM_FEATURES,noise=0.5) + data = pd.DataFrame(X,columns=['X'+str(i) for i in range(1,NUM_FEATURES+1)],dtype=np.float16) + data['y']=np.array(y,dtype=np.float16) + return data + +# Test/Train +def test_train(data): + X_train,y_train = data.iloc[:int(NUM_SAMPLES/2)].drop(['y'],axis=1),data.iloc[:int(NUM_SAMPLES/2)]['y'] + X_test,y_test = data.iloc[int(NUM_SAMPLES/2):].drop(['y'],axis=1),data.iloc[int(NUM_SAMPLES/2):]['y'] + return (X_train,y_train,X_test,y_test) + +# Fitting +def fitting(X_train,y_train): + lm = LinearRegression(n_jobs=1) + lm.fit(X_train,y_train) + del X_train + del y_train + return lm + +# Saving model +def save(lm): + with open('LinearModel.sav',mode='wb') as f: + pickle.dump(lm,f) + +def 
model_run(model,testfile): + """ + Loads and runs a sklearn linear model + """ + lm = pickle.load(open(model, 'rb')) + X_test = pd.read_csv(testfile) + _= lm.predict(X_test) + return None + +if __name__ == '__main__': + data = make_data() + X_train,y_train,X_test,y_test = test_train(data) + #X_test.to_csv("Test.csv",index=False) + lm = fitting(X_train,y_train) + save(lm) + model_run('LinearModel.sav','Test.csv') \ No newline at end of file diff --git a/Memory-profiling/Scalene/mlp.py b/Memory-profiling/Scalene/mlp.py new file mode 100644 index 0000000..9b2d534 --- /dev/null +++ b/Memory-profiling/Scalene/mlp.py @@ -0,0 +1,52 @@ +import pandas as pd +import pickle +import numpy as np +from sklearn.neural_network import MLPRegressor +from sklearn.datasets import make_regression + +NUM_FEATURES = 10 +NUM_SAMPLES = 1000 + +# Make data +def make_data(): + X,y = make_regression(n_samples=NUM_SAMPLES,n_features=NUM_FEATURES, + n_informative=NUM_FEATURES,noise=0.5) + data = pd.DataFrame(X,columns=['X'+str(i) for i in range(1,NUM_FEATURES+1)],dtype=np.float16) + data['y']=np.array(y,dtype=np.float16) + return data + +# Test/Train +def test_train(data): + X_train,y_train = data.iloc[:int(NUM_SAMPLES/2)].drop(['y'],axis=1),data.iloc[:int(NUM_SAMPLES/2)]['y'] + X_test,y_test = data.iloc[int(NUM_SAMPLES/2):].drop(['y'],axis=1),data.iloc[int(NUM_SAMPLES/2):]['y'] + return (X_train,y_train,X_test,y_test) + +# Fitting +def fitting(X_train,y_train): + mlp = MLPRegressor(max_iter=50) + mlp.fit(X_train,y_train) + del X_train + del y_train + return mlp + +# Saving model +def save(mlp): + with open('MultiLayerPerceptron.sav',mode='wb') as f: + pickle.dump(mlp,f) + +def model_run(model,testfile): + """ + Loads and runs a sklearn MLP regressor model + """ + mlp = pickle.load(open(model, 'rb')) + X_test = pd.read_csv(testfile) + _= mlp.predict(X_test) + return None + +if __name__ == '__main__': + data = make_data() + X_train,y_train,X_test,y_test = test_train(data) + 
X_test.to_csv("Test.csv",index=False) + mlp = fitting(X_train,y_train) + save(mlp) + model_run('MultiLayerPerceptron.sav','Test.csv') \ No newline at end of file diff --git a/Memory-profiling/Scalene/scalene-1.PNG b/Memory-profiling/Scalene/scalene-1.PNG new file mode 100644 index 0000000..6324e41 Binary files /dev/null and b/Memory-profiling/Scalene/scalene-1.PNG differ diff --git a/Pandas and Numpy/Pandas_iteration.ipynb b/Pandas and Numpy/Pandas_iteration.ipynb new file mode 100644 index 0000000..675712b --- /dev/null +++ b/Pandas and Numpy/Pandas_iteration.ipynb @@ -0,0 +1,1074 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d860a88f-a209-4502-a326-df204702cdcd", + "metadata": {}, + "source": [ + "# Iteration with Pandas\n", + "\n", + "## [Dr. Tirthajyoti Sarkar](https://www.linkedin.com/in/tirthajyoti-sarkar-2127aa7/), Fremont, CA\n", + "\n", + "As data scientists, all of us have been there.\n", + "\n", + "We are given a large Pandas DataFrame and asked to check some relationships between various fields in the columns - in a row-by-row fashion. It could be some logical operation or some sophisticated mathematical transformation on the raw data.\n", + "\n", + "Essentially, it is a **simple case of iterating over the rows of the DataFrame** and doing some processing at each iteration.\n", + "\n", + "However, it **may not be that simple in terms of choosing the most efficient method** of executing this type of task. You might already have a hunch that a simple for-loop is going to be quite inefficient for this iteration task. \n", + "\n", + "And, you will be absolutely right. So, what can we do to avoid such speed pitfall?\n", + "\n", + "This type of knowledge is critical to practice what we call [\"Productive Data Science\"](https://medium.com/productive-data-science/why-and-how-should-you-learn-productive-data-science-53377b473f37)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1c6f700f-9cf9-4793-872a-7e60c6c5d67b", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np, pandas as pd, matplotlib.pyplot as plt\n", + "from tqdm import tqdm\n", + "from time import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "03e83313-dddd-4335-965e-e1c742795156", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
095118170
16387759
27740463
340609264
45129340
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 95 11 81 70\n", + "1 63 87 75 9\n", + "2 77 40 4 63\n", + "3 40 60 92 64\n", + "4 5 12 93 40" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.random.seed(101)\n", + "df = pd.DataFrame(np.random.randint(0,100,size=(100000, 4)), \n", + " columns=list('ABCD'),dtype=np.int16)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cb40c948-489c-41ec-b454-b41dfb2689ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 100000 entries, 0 to 99999\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 A 100000 non-null int16\n", + " 1 B 100000 non-null int16\n", + " 2 C 100000 non-null int16\n", + " 3 D 100000 non-null int16\n", + "dtypes: int16(4)\n", + "memory usage: 781.4 KB\n" + ] + } + ], + "source": [ + "df.info(memory_usage=\"deep\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2b863004-8a6b-423a-81a8-e8dd77b4d2dd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
count100000.000000100000.000000100000.000000100000.000000
mean49.56264049.53382049.47714049.495560
std28.88510528.90781628.78959428.849037
min0.0000000.0000000.0000000.000000
25%25.00000024.00000025.00000024.000000
50%50.00000050.00000050.00000050.000000
75%75.00000075.00000074.00000074.000000
max99.00000099.00000099.00000099.000000
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "count 100000.000000 100000.000000 100000.000000 100000.000000\n", + "mean 49.562640 49.533820 49.477140 49.495560\n", + "std 28.885105 28.907816 28.789594 28.849037\n", + "min 0.000000 0.000000 0.000000 0.000000\n", + "25% 25.000000 24.000000 25.000000 24.000000\n", + "50% 50.000000 50.000000 50.000000 50.000000\n", + "75% 75.000000 75.000000 74.000000 74.000000\n", + "max 99.000000 99.000000 99.000000 99.000000" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "id": "2fcaa5a5-e4a7-4417-a00a-f71e49de79fe", + "metadata": {}, + "source": [ + "## Most inefficient _for_ loop" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "35667490-c0ce-49be-8005-5181d71d656f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:32.22 seconds\n", + "Count: 49769\n" + ] + } + ], + "source": [ + "count=0\n", + "t1=time()\n", + "for i in range(len(df)):\n", + " if df.iloc[i]['A']+df.iloc[i]['B'] > df.iloc[i]['C']+df.iloc[i]['D']:\n", + " count+=1\n", + "t2=time()\n", + "delt1 = round((t2-t1),2)\n", + "print(f\"Time:{delt1} seconds\")\n", + "print(\"Count:\",count)" + ] + }, + { + "cell_type": "markdown", + "id": "7bf463cd-e887-4257-ad58-8f3cda024101", + "metadata": {}, + "source": [ + "## Comparing `iterrows()` and `df.values` - 1" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "37ecedc8-fd98-4e6c-a0c3-14a5a2a7324e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:6.91 seconds\n", + "Count: 49769\n" + ] + } + ], + "source": [ + "count=0\n", + "t1=time()\n", + "for idx, row in df.iterrows():\n", + " if row['A']+row['B'] > (row['C']+row['D']):\n", + " count+=1\n", + "t2=time()\n", + "delt1 = round((t2-t1),2)\n", + "print(f\"Time:{delt1} seconds\")\n", + "print(\"Count:\",count)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cc1c3e1d-6d2b-4f1b-908e-6af8a0a2ff8c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:0.112 seconds\n", + "Count: 49769\n" + ] + } + ], + "source": [ + "count=0\n", + "t1=time()\n", + "for row in df.values:\n", + " if row[0]+row[1] > (row[2]+row[3]):\n", + " count+=1\n", + "t2=time()\n", + "delt2 = round((t2-t1),3)\n", + "print(f\"Time:{delt2} seconds\")\n", + "print(\"Count:\",count)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a09a19c8-df6c-4681-a40e-ba74006609c9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "df.values is 61.7 times faster\n" + ] + } + ], + "source": [ + "print(f\"df.values is {round(delt1/delt2,2)} times faster\")" + ] + }, + { + "cell_type": "markdown", + "id": "41979a9d-ee10-4fc4-a614-d75bd907bd27", + "metadata": {}, + "source": [ + "## Comparing `iterrows()` and `df.values` - 2" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6b5ccfa4-1a3d-462a-ac1a-14778ace0194", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:8.05 seconds\n", + "Count: 35886\n" + ] + } + ], + "source": [ + "count=0\n", + "t1=time()\n", + "for idx, row in df.iterrows():\n", + " if row['A']+row['B'] > 1.25*(row['C']+row['D']):\n", + " count+=1\n", + "t2=time()\n", + "delt1 = round((t2-t1),2)\n", + "print(f\"Time:{delt1} seconds\")\n", + "print(\"Count:\",count)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "34e52057-5ef4-4b14-997d-3ef3337d8ce7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:0.546 seconds\n", + "Count: 35886\n" + ] + } + ], + "source": [ + "count=0\n", + "t1=time()\n", + "for row in df.values:\n", + " if row[0]+row[1] > 1.25*(row[2]+row[3]):\n", + " count+=1\n", + "t2=time()\n", + "delt2 = 
round((t2-t1),3)\n", + "print(f\"Time:{delt2} seconds\")\n", + "print(\"Count:\",count)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d76203cf-a0a1-40db-97cc-591cabd210a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "df.values is 14.74 times faster\n" + ] + } + ], + "source": [ + "print(f\"df.values is {round(delt1/delt2,2)} times faster\")" + ] + }, + { + "cell_type": "markdown", + "id": "8f00b66c-e91c-4390-bb48-7edc9f785144", + "metadata": {}, + "source": [ + "## Comparing `iterrows()` and `df.values` - 3" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e619a3c6-712a-49a8-b47b-4eece6205e97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:8.76 seconds\n", + "Count: 9202\n" + ] + } + ], + "source": [ + "count=0\n", + "t1=time()\n", + "for idx, row in df.iterrows():\n", + " if np.log(1+row['A']+row['B']) > np.sqrt(0.5*(row['C']+row['D'])):\n", + " count+=1\n", + "t2=time()\n", + "delt1 = round((t2-t1),2)\n", + "print(f\"Time:{delt1} seconds\")\n", + "print(\"Count:\",count)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6163d0dc-4be7-4533-90a2-06d6a997fdb9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:0.962 seconds\n", + "Count: 9202\n" + ] + } + ], + "source": [ + "count=0\n", + "t1=time()\n", + "for row in df.values:\n", + " if np.log(1+row[0]+row[1]) > np.sqrt(0.5*(row[2]+row[3])):\n", + " count+=1\n", + "t2=time()\n", + "delt2 = round((t2-t1),3)\n", + "print(f\"Time:{delt2} seconds\")\n", + "print(\"Count:\",count)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8821b345-847d-4d28-ba1a-0d05b887d22f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "df.values is 9.11 times faster\n" + ] + } + ], + "source": [ + "print(f\"df.values is 
{round(delt1/delt2,2)} times faster\")" + ] + }, + { + "cell_type": "markdown", + "id": "3c11a370-bd6b-4862-a628-536dfab3c09b", + "metadata": {}, + "source": [ + "## Simple vectorized operation is fastest in this counting example" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "cfe80cb8-e4d1-4f51-aaad-e71b02c479e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:0.01 seconds\n", + "Count: 9202\n" + ] + } + ], + "source": [ + "t1 = time()\n", + "df['result'] = np.log(1+df['A']+df['B']) > np.sqrt(0.5*(df['C']+df['D']))\n", + "t2=time()\n", + "delt3 = round((t2-t1),3)\n", + "print(f\"Time:{delt3} seconds\")\n", + "print(\"Count:\",df['result'].sum())" + ] + }, + { + "cell_type": "markdown", + "id": "a0709632-41cc-4e9f-8482-7b9cff0dc480", + "metadata": {}, + "source": [ + "## String identifier" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "26d4fd9f-eaf6-4730-b6ba-ab353e677594", + "metadata": {}, + "outputs": [], + "source": [ + "def identifier():\n", + " \"\"\"\n", + " Generates random identifier string of 5 characters\n", + " \"\"\"\n", + " letters = list('CFJQZ')\n", + " numbers = list('123456789')\n", + " \n", + " random_id = ''\n", + " random_id+=np.random.choice(letters)\n", + " random_id+=np.random.choice(letters)\n", + " random_id+=np.random.choice(numbers)\n", + " random_id+=np.random.choice(numbers)\n", + " #random_id+=np.random.choice(numbers)\n", + " #random_id+=np.random.choice(numbers)\n", + " #random_id+=np.random.choice(letters)\n", + " random_id+=np.random.choice(letters)\n", + " \n", + " return random_id" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "02d1f2f0-87d2-4125-8a0b-83382463bcb0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JQ98C\n", + "CC24Z\n", + "ZQ25Q\n", + "QC54C\n", + "ZZ24Z\n", + "JF91Z\n", + "FQ89C\n", + "FQ15F\n", + "CZ59F\n", + "ZC59F\n" + ] + } + ], + 
"source": [ + "for i in range(10):\n", + " print(identifier())" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2c5e1160-91cc-49ed-a541-2bb424d79cad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDresult
095118170False
16387759False
27740463False
340609264False
45129340False
\n", + "
" + ], + "text/plain": [ + " A B C D result\n", + "0 95 11 81 70 False\n", + "1 63 87 75 9 False\n", + "2 77 40 4 63 False\n", + "3 40 60 92 64 False\n", + "4 5 12 93 40 False" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e6b372dd-9f5e-4028-9770-b171025a97a6", + "metadata": {}, + "outputs": [], + "source": [ + "id_lst=[]\n", + "for i in range(100000):\n", + " id_lst.append(identifier())\n", + "id_lst=np.array(id_lst)\n", + "\n", + "df.insert(0,'ID',id_lst)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0bf94057-e44c-4fba-ae12-7daff8c587a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDABCDresult
78891ZF36F19648567False
6152FC69F2122212False
99247FC76F3777832False
63451ZF94Q30951012True
40941QJ46F1449540False
\n", + "
" + ], + "text/plain": [ + " ID A B C D result\n", + "78891 ZF36F 19 64 85 67 False\n", + "6152 FC69F 21 2 22 12 False\n", + "99247 FC76F 3 77 78 32 False\n", + "63451 ZF94Q 30 95 10 12 True\n", + "40941 QJ46F 14 4 95 40 False" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sample(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "bbd50364-66b3-4703-99bf-a2e802772c04", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10125" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['ID'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "23834f67-2c42-42a0-b533-3fdae891da5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:6.597 seconds\n" + ] + } + ], + "source": [ + "ratio_dict={'ID':[],'Ratio':[]}\n", + "t1=time()\n", + "for _,row in df.iterrows():\n", + " if row['ID'][0:2] == 'ZZ' and row['ID'][-1]=='F':\n", + " ratio = (row['A']+row['B'])/(0.01+row['C']+row['D'])\n", + " ratio_dict['ID'].append(row[0])\n", + " ratio_dict['Ratio'].append(ratio)\n", + "t2=time()\n", + "delt4 = round((t2-t1),3)\n", + "print(f\"Time:{delt4} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "4901d6f8-646c-4bcb-aedc-7502e69c02f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time:0.056 seconds\n" + ] + } + ], + "source": [ + "ratio_dict={'ID':[],'Ratio':[]}\n", + "t1=time()\n", + "for row in df.values:\n", + " if row[0][0:2] == 'ZZ' and row[0][-1]=='F':\n", + " ratio = (row[1]+row[2])/(0.01+row[3]+row[4])\n", + " ratio_dict['ID'].append(row[0])\n", + " ratio_dict['Ratio'].append(ratio)\n", + "t2=time()\n", + "delt4 = round((t2-t1),3)\n", + "print(f\"Time:{delt4} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": 
"a40af042-c101-43b6-a3f2-c8648c072672", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDRatio
0ZZ43F0.349377
1ZZ79F1.285600
2ZZ57F1.090744
3ZZ17F0.862388
4ZZ56F1.134152
.........
822ZZ34F0.888724
823ZZ22F0.767811
824ZZ59F2.063838
825ZZ47F0.608283
826ZZ71F1.550388
\n", + "

827 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " ID Ratio\n", + "0 ZZ43F 0.349377\n", + "1 ZZ79F 1.285600\n", + "2 ZZ57F 1.090744\n", + "3 ZZ17F 0.862388\n", + "4 ZZ56F 1.134152\n", + ".. ... ...\n", + "822 ZZ34F 0.888724\n", + "823 ZZ22F 0.767811\n", + "824 ZZ59F 2.063838\n", + "825 ZZ47F 0.608283\n", + "826 ZZ71F 1.550388\n", + "\n", + "[827 rows x 2 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(ratio_dict)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Pandas and Numpy/Read_data_various_sources/Pandas CSV vs. PyArrow parquet reading speed.ipynb b/Pandas and Numpy/Read_data_various_sources/Pandas CSV vs. PyArrow parquet reading speed.ipynb index cdfdac1..f1e75eb 100644 --- a/Pandas and Numpy/Read_data_various_sources/Pandas CSV vs. PyArrow parquet reading speed.ipynb +++ b/Pandas and Numpy/Read_data_various_sources/Pandas CSV vs. PyArrow parquet reading speed.ipynb @@ -6,15 +6,14 @@ "metadata": {}, "source": [ "# Pandas vs. PyArrow file reading speed comparison\n", - "\n", - "## Dr. 
Tirthajyoti Sarkar, Fremont, CA, April 2021\n", + "## April, 2021\n", "\n", "---" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 66, "id": "812531a4-bb0e-49a2-9604-ec566c8359d9", "metadata": {}, "outputs": [], @@ -24,7 +23,10 @@ "import numpy as np\n", "import pandas as pd\n", "import pyarrow.parquet as pq\n", - "import os" + "import os\n", + "import sys\n", + "import matplotlib as mpl\n", + "mpl.rcParams['figure.dpi']=125" ] }, { @@ -129,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 76, "id": "f90897c2-c99f-4974-b52f-fa8f34b6f5d0", "metadata": {}, "outputs": [ @@ -171,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 77, "id": "9e0f4eff-c3f9-4088-8785-4db160116975", "metadata": {}, "outputs": [], @@ -182,15 +184,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 78, "id": "86dfb137-3143-4ac4-970f-2d50cafc0bfe", "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ - "
" + "
" ] }, "metadata": { @@ -227,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 79, "id": "hybrid-immigration", "metadata": {}, "outputs": [ @@ -235,7 +237,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken to read 2 columns of a 100 MB (53250 rows) CSV file with Pandas: 1.107 seconds\n" + "Time taken to read 2 columns of a 100 MB (53250 rows) CSV file with Pandas: 1.114 seconds\n" ] } ], @@ -256,12 +258,12 @@ "id": "1fe506be-0aa7-4ae8-9363-a56ff5ae0cb6", "metadata": {}, "source": [ - "#### The reading speed of the 100 MB CSV file with `pd.read_csv()` is about 0.6 seconds." + "#### The reading speed of the 100 MB CSV file with `pd.read_csv()` is about 1.114 seconds." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 80, "id": "suffering-stake", "metadata": {}, "outputs": [ @@ -329,7 +331,7 @@ "4 1.021031 0.295909" ] }, - "execution_count": 8, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -340,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 81, "id": "assumed-harvard", "metadata": {}, "outputs": [ @@ -348,7 +350,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken to read 2 columns of the identical 53250 rows zipped parquet file with PyArrow: 0.019 seconds\n" + "Time taken to read 2 columns of the identical 53250 rows zipped parquet file with PyArrow: 0.026 seconds\n" ] } ], @@ -369,12 +371,12 @@ "id": "0d8c9230-179e-45fe-be85-120d8286f5d8", "metadata": {}, "source": [ - "#### The reading speed of the same file (in the parquet gzipped version) with `pq.read_table()` is about 0.012 seconds!" + "#### The reading speed of the same file (in the parquet gzipped version) with `pq.read_table()` is about 0.026 seconds!" 
] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 82, "id": "joint-safety", "metadata": {}, "outputs": [ @@ -442,7 +444,7 @@ "4 1.021031 0.295909" ] }, - "execution_count": 10, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } @@ -473,7 +475,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 83, "id": "deluxe-absence", "metadata": {}, "outputs": [ @@ -596,6 +598,153 @@ "plt.ylabel(\"Ratio of Pandas/Arrow read time\", fontsize=14)\n", "plt.show()" ] + }, + { + "cell_type": "markdown", + "id": "a6b4ab83-6b96-4d36-aa4d-476d43130883", + "metadata": {}, + "source": [ + "## PyArrow (Parquet) reading time varies with sparsity in the file" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "4370323a-7d89-4fe5-8f49-179f1e2aa686", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NaN-filled file written with 2.28%\n", + "NaN-filled file written with 5.49%\n", + "NaN-filled file written with 11.49%\n", + "NaN-filled file written with 21.21%\n", + "NaN-filled file written with 34.45%\n", + "NaN-filled file written with 50.04%\n", + "NaN-filled file written with 65.54%\n", + "NaN-filled file written with 78.81%\n", + "NaN-filled file written with 88.5%\n", + "NaN-filled file written with 94.53%\n" + ] + } + ], + "source": [ + "pct_nan = []\n", + "for i in range(11,21):\n", + " a = np.random.normal(size=(int(5325*5), int(1e2)))\n", + " cutoff = -2+0.4*(i-11)\n", + " a = np.where(a < cutoff, np.nan, a)\n", + " p_nan = round(100*np.count_nonzero(np.isnan(a))/a.size,2)\n", + " pct_nan.append(p_nan)\n", + " df = pd.DataFrame(a, columns=[\"C\" + str(i) for i in range(100)])\n", + " fname = \"test\"+str(i)+\".csv\"\n", + " df.to_csv(fname)\n", + " print(f\"NaN-filled file written with {p_nan}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "132f13c7-583c-46cd-b1db-3a3fc38006b0", + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Created test11_parquet.zip\n", + "Created test12_parquet.zip\n", + "Created test13_parquet.zip\n", + "Created test14_parquet.zip\n", + "Created test15_parquet.zip\n", + "Created test16_parquet.zip\n", + "Created test17_parquet.zip\n", + "Created test18_parquet.zip\n", + "Created test19_parquet.zip\n", + "Created test20_parquet.zip\n" + ] + } + ], + "source": [ + "for i in range(11,21):\n", + " fname = \"test\"+str(i)+\".csv\"\n", + " parquet_name = \"test\"+str(i)+\"_parquet.zip\"\n", + " df = pd.read_csv(fname)\n", + " df.to_parquet(parquet_name,compression=\"gzip\")\n", + " print(f\"Created {parquet_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "b0f53bc3-5a73-496a-b574-b5535227c72c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done for file # 11\n", + "Done for file # 12\n", + "Done for file # 13\n", + "Done for file # 14\n", + "Done for file # 15\n", + "Done for file # 16\n", + "Done for file # 17\n", + "Done for file # 18\n", + "Done for file # 19\n", + "Done for file # 20\n" + ] + } + ], + "source": [ + "t_read_nan = []\n", + "m_read_nan = []\n", + "\n", + "for i in range(11,21):\n", + " parquet_name = \"test\"+str(i)+\"_parquet.zip\"\n", + " t1 = time.time()\n", + " df2 = pq.read_table(parquet_name)\n", + " t2 = time.time()\n", + " delta_t = round(1000*(t2 - t1), 3)\n", + " t_read_nan.append(delta_t)\n", + " m_read_nan.append()\n", + " print(f\"Done for file # {i}\")\n", + "t_read_nan=np.array(t_read_nan)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "531f5393-d837-48b0-bccb-dfbf3562301c", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(11, 5))\n", + "plt.plot(pct_nan,t_read_nan, \"bo--\", linewidth=2, markersize=8)\n", + "plt.grid(True)\n", + "plt.title(\"PyArrow (Parquet) reading time varies with sparsity in the file\",fontsize=16,)\n", + "#plt.xticks([10*i for i in range(1, 11)],fontsize=14)\n", + "plt.xlabel(\"Sparsity (% of NaN values)\", fontsize=14)\n", + "plt.ylabel(\"Read time (milliseconds)\", fontsize=14)\n", + "plt.ylim(int(t_read_nan.min()*0.9),int(t_read_nan.max()*1.1))\n", + "plt.show()" + ] } ], "metadata": { diff --git a/README.md b/README.md index c9c35cc..077c9d7 100644 --- a/README.md +++ b/README.md @@ -7,13 +7,15 @@ ### Dr. Tirthajyoti Sarkar, Fremont, California ([Please feel free to connect on LinkedIn here](https://www.linkedin.com/in/tirthajyoti-sarkar-2127aa7)) +![ml-ds](https://raw.githubusercontent.com/tirthajyoti/Machine-Learning-with-Python/master/Images/ML-DS-cycle-1.png) + --- ## Also check out these super-useful Repos that I curated -[Highly cited and useful papers related to machine learning, deep learning, AI, game theory, reinforcement learning](https://github.com/tirthajyoti/Papers-Literature-ML-DL-RL-AI) +- ### [Highly cited and useful papers related to machine learning, deep learning, AI, game theory, reinforcement learning](https://github.com/tirthajyoti/Papers-Literature-ML-DL-RL-AI) -[Carefully curated resource links for data science in one place](https://github.com/tirthajyoti/Data-science-best-resources) +- ### [Carefully curated resource links for data science in one place](https://github.com/tirthajyoti/Data-science-best-resources) ## Requirements * **Python 3.6+** @@ -146,4 +148,11 @@ See my articles on Medium on this topic. 
### Unit testing ML code with Pytest Check the files and detailed instructions in the [Pytest](https://github.com/tirthajyoti/Machine-Learning-with-Python/tree/master/Pytest) directory to understand how one should write unit testing code/module for machine learning models +--- + +### Memory and timing profiling + +Profiling data science code and ML models for memory footprint and computing time is a critical but often overlooked area. Here are a couple of Notebooks showing the ideas, +* [Memory profiling using Scalene](https://github.com/tirthajyoti/Machine-Learning-with-Python/tree/master/Memory-profiling/Scalene) +* [Time-profiling data science code](https://github.com/tirthajyoti/Machine-Learning-with-Python/blob/master/Time-profiling/cProfile.ipynb) diff --git a/Time-profiling/8.14 - profile-DS-workflow.png b/Time-profiling/8.14 - profile-DS-workflow.png new file mode 100644 index 0000000..e17e393 Binary files /dev/null and b/Time-profiling/8.14 - profile-DS-workflow.png differ diff --git a/Time-profiling/Readme.md b/Time-profiling/Readme.md new file mode 100644 index 0000000..6d6d34d --- /dev/null +++ b/Time-profiling/Readme.md @@ -0,0 +1 @@ +## Time-profiling ML code diff --git a/Time-profiling/cProfile.ipynb b/Time-profiling/cProfile.ipynb new file mode 100644 index 0000000..a65359a --- /dev/null +++ b/Time-profiling/cProfile.ipynb @@ -0,0 +1,513 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc118b68-6776-4504-ace0-0e076616fde6", + "metadata": {}, + "source": [ + "# Time-profiling Data Science code using `cProfile`\n", + "\n", + "## Dr. 
Tirthajyoti Sarkar\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "98192c2c-f90d-4a7b-b3b2-ae23870d6f4e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 3 function calls in 0.064 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.064 0.064 0.064 0.064 :1()\n", + " 1 0.000 0.000 0.064 0.064 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import cProfile\n", + "\n", + "SIZE = 10_000_000\n", + "a = np.arange(SIZE)\n", + "b = np.random.normal(size=SIZE)\n", + "\n", + "cProfile.run('a+b')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9500a64c-9d19-4e05-97cf-231ab5f1842e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SIZE = 10_000_000\n", + "a = np.arange(SIZE)\n", + "b = np.random.normal(size=SIZE)\n", + "a+b\n" + ] + } + ], + "source": [ + "code = \"\"\"SIZE = 10_000_000\n", + "a = np.arange(SIZE)\n", + "b = np.random.normal(size=SIZE)\n", + "a+b\"\"\"\n", + "\n", + "print(code)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6d810ec6-245d-4082-8f64-d5c16b162936", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 5 function calls in 0.488 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.078 0.078 0.488 0.488 :1()\n", + " 1 0.000 0.000 0.488 0.488 {built-in method builtins.exec}\n", + " 1 0.028 0.028 0.028 0.028 {built-in method numpy.arange}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.381 0.381 0.381 0.381 {method 'normal' of 
'numpy.random.mtrand.RandomState' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run(code)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8a5bf74f-fa5b-4997-a864-82a6d9503216", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 6 function calls in 0.531 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.055 0.055 0.510 0.510 1735574101.py:1(add)\n", + " 1 0.021 0.021 0.531 0.531 :1()\n", + " 1 0.000 0.000 0.531 0.531 {built-in method builtins.exec}\n", + " 1 0.057 0.057 0.057 0.057 {built-in method numpy.arange}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.397 0.397 0.397 0.397 {method 'normal' of 'numpy.random.mtrand.RandomState' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "def add():\n", + " SIZE = 10_000_000\n", + " a = np.arange(SIZE)\n", + " b = np.random.normal(size=SIZE)\n", + " c=a+b\n", + "\n", + "cProfile.run('add()')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f899c334-2d1f-41ef-aceb-71348e5a6818", + "metadata": {}, + "outputs": [], + "source": [ + "def add(size):\n", + " a = np.arange(size)\n", + " b = np.random.normal(size=size)\n", + " c=a+b" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cdf8d43a-f240-4de6-9a92-6dee6c1cd98b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 6 function calls in 0.500 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.054 0.054 0.478 0.478 1565836920.py:1(add)\n", + " 1 0.021 0.021 0.500 0.500 :1()\n", + " 1 0.000 0.000 0.500 0.500 {built-in method builtins.exec}\n", + " 1 0.030 0.030 0.030 0.030 {built-in method numpy.arange}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of 
'_lsprof.Profiler' objects}\n", + " 1 0.394 0.394 0.394 0.394 {method 'normal' of 'numpy.random.mtrand.RandomState' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "SIZE = 10_000_000\n", + "cProfile.run('add(SIZE)')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8d038a76-4394-472b-9949-be9ef936dca5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 6 function calls in 1.034 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.114 0.114 0.987 0.987 1565836920.py:1(add)\n", + " 1 0.047 0.047 1.034 1.034 :1()\n", + " 1 0.000 0.000 1.034 1.034 {built-in method builtins.exec}\n", + " 1 0.082 0.082 0.082 0.082 {built-in method numpy.arange}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.791 0.791 0.791 0.791 {method 'normal' of 'numpy.random.mtrand.RandomState' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "SIZE = 20_000_000\n", + "cProfile.run('add(SIZE)')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f14c476d-cc19-4ce5-b74e-abf211eabc7a", + "metadata": {}, + "outputs": [], + "source": [ + "def ops(a,b):\n", + " x1 = a+b\n", + " x2 = a-b\n", + " x3 = a*b\n", + " x4 = a/b" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "771a1452-2d4a-476e-8775-3479beac3700", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 4 function calls in 0.287 seconds\n", + "\n", + " Ordered by: standard name\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.251 0.251 0.251 0.251 3200973052.py:1(ops)\n", + " 1 0.036 0.036 0.286 0.286 :1()\n", + " 1 0.000 0.000 0.287 0.287 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + 
"cProfile.run('ops(a,b)')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5af6602e-a66f-4bf0-9d22-bc5b71e8c4d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total function calls: 48\n", + "Total time (seconds): 1.1839559\n" + ] + } + ], + "source": [ + "import cProfile, pstats\n", + "\n", + "profiler = cProfile.Profile()\n", + "# Enable profiler\n", + "profiler.enable()\n", + "# Function execution\n", + "add(SIZE)\n", + "# Disable profiler\n", + "profiler.disable()\n", + "# pstats\n", + "stats = pstats.Stats(profiler)\n", + "# Print the total time and function calls\n", + "print(\"Total function calls:\", stats.total_calls)\n", + "print(\"Total time (seconds):\", stats.total_tt)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "91f2d625-37bf-41f6-9c92-ae52bf608d02", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 48 function calls in 1.184 seconds\n", + "\n", + " Random listing order was used\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\contextlib.py:86(__init__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\contextlib.py:112(__enter__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\contextlib.py:121(__exit__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\contextlib.py:242(helper)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\traitlets\\traitlets.py:535(get)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\traitlets\\traitlets.py:566(__get__)\n", + " 2 0.000 0.000 0.000 0.000 
c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\utils\\ipstruct.py:125(__getattr__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\codeop.py:142(__call__)\n", + " 4 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\compilerop.py:166(extra_flags)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\interactiveshell.py:1286(user_global_ns)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3354(compare)\n", + " 2 0.000 0.000 1.184 0.592 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3416(run_code)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\hooks.py:103(__call__)\n", + " 2 0.000 0.000 0.000 0.000 c:\\users\\tirtha\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\IPython\\core\\hooks.py:168(pre_run_code_hook)\n", + " 1 0.000 0.000 0.000 0.000 C:\\Users\\Tirtha\\AppData\\Local\\Temp/ipykernel_12356/3775033682.py:9()\n", + " 1 0.044 0.044 1.184 1.184 C:\\Users\\Tirtha\\AppData\\Local\\Temp/ipykernel_12356/3775033682.py:7()\n", + " 1 0.114 0.114 1.140 1.140 C:\\Users\\Tirtha\\AppData\\Local\\Temp/ipykernel_12356/1565836920.py:1(add)\n", + " 1 0.081 0.081 0.081 0.081 {built-in method numpy.arange}\n", + " 1 0.945 0.945 0.945 0.945 {method 'normal' of 'numpy.random.mtrand.RandomState' objects}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.compile}\n", + " 2 0.000 0.000 1.184 0.592 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 4 0.000 0.000 0.000 0.000 {built-in method 
builtins.next}\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats = pstats.Stats(profiler)\n", + "stats.print_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "06fa3a62-5fef-4d9e-8c6a-d9e309d80e16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pstats.Stats" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(stats)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cefecaa0-9933-49ae-b0dc-3f869e83fa89", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.1839559" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats.total_tt" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "fe15e37e-d329-4168-a479-60c219126e16", + "metadata": {}, + "outputs": [], + "source": [ + "stats.fcn_list" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "15c966eb-e094-4e5f-8e54-1a2c3923f6c7", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "560a8a4d-84fc-43c2-a1c8-79113dcc73ac", + "metadata": {}, + "outputs": [], + "source": [ + "size = [int(i*1e6) for i in range(5,26,5)]\n", + "total_tt = []\n", + "for s in size:\n", + " profiler = cProfile.Profile()\n", + " profiler.enable()\n", + " add(s)\n", + " profiler.disable()\n", + " stats = pstats.Stats(profiler)\n", + " total_tt.append(round(stats.total_tt,3)) " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "264f4589-e40b-4c2d-bb08-9116dc4eb2e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.274, 0.464, 0.706, 0.94, 
1.187]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total_tt" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "695c9306-bcc6-45ea-8939-28e8e127eec3", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(6,3),dpi=120)\n", + "plt.bar(x=[str(i)+'-million' for i in range(5,26,5)],\n", + " height=total_tt, \n", + " edgecolor='k',\n", + " color=\"#2c75b0\")\n", + "plt.xlabel(\"Array size\", fontsize=16)\n", + "plt.ylabel(\"Time taken (seconds)\",fontsize=16)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "9b1e4af3-385a-420a-be85-449119210f93", + "metadata": {}, + "source": [ + "## Data science workflow profiling\n", + "\n", + "While measuring the execution time of these small standalone functions serves as a basic demonstration of the usage of these profilers, the real utility is realized when they are used in a large-scale data science workflow. Such a workflow has a variety of modules and functions and we can set up profiling for all of them if necessary. The output may be logged into a database or even be fed into a monitoring system that will track the performance of the modules over time and take action if needed (e.g., a function performed poorly and took too much time in a certain run or for certain input data).\n", + "\n", + "![ds-workflow](https://raw.githubusercontent.com/tirthajyoti/Machine-Learning-with-Python/master/Time-profiling/8.14%20-%20profile-DS-workflow.png)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Timing-decorator/Readme.md b/Timing-decorator/Readme.md new file mode 100644 index 0000000..8936534 --- /dev/null +++ b/Timing-decorator/Readme.md @@ -0,0 +1 @@ +## Timing decorator with ML 
estimators diff --git a/Timing-decorator/Timing-decorator-ML-optimization.ipynb b/Timing-decorator/Timing-decorator-ML-optimization.ipynb new file mode 100644 index 0000000..2d4bbc9 --- /dev/null +++ b/Timing-decorator/Timing-decorator-ML-optimization.ipynb @@ -0,0 +1,489 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 201, + "id": "5a0b11ef-4d23-4678-98de-2e2c5d6ac365", + "metadata": {}, + "outputs": [], + "source": [ + "from functools import wraps\n", + "from time import time,sleep\n", + "import numpy as np, matplotlib.pyplot as plt\n", + "from sklearn.linear_model import LogisticRegressionCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.datasets import make_classification,make_regression\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "id": "179c44d3-b908-4544-a506-c24170c9f642", + "metadata": {}, + "source": [ + "## Timing decorator with `functools.wraps`" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "id": "ed5eaae6-1f2b-490d-9a77-16dfca402aa0", + "metadata": {}, + "outputs": [], + "source": [ + "def timing(func):\n", + " @wraps(func)\n", + " def wrap(*args, **kw):\n", + " ts = time()\n", + " result = func(*args, **kw)\n", + " te = time()\n", + " tdelta = round(1000*(te-ts),3)\n", + " print (f\"Function '{func.__name__}' took {tdelta} milliseconds to run\")\n", + " return result\n", + " return wrap" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "39761a68-afed-428f-9f2f-58d933bc61ad", + "metadata": {}, + "outputs": [], + "source": [ + "@timing\n", + "def list_length(a):\n", + " if isinstance(a,list):\n", + " sleep(0.1)\n", + " s = len(a)\n", + " return s\n", + " else:\n", + " print(\"Argument is not a list\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "573c97d3-b24f-4590-a492-d34588777c23", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Function 'list_length' took 111.291 milliseconds to run\n" + ] + }, + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list_length([1,2,3])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "4e4096ba-226d-4f73-a801-eb85bfad14d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Argument is not a list\n", + "Function 'list_length' took 0.0 milliseconds to run\n" + ] + } + ], + "source": [ + "list_length(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "bb695ae1-b0bf-4389-aff0-526929dd216a", + "metadata": {}, + "outputs": [], + "source": [ + "def time_return(func):\n", + " @wraps(func)\n", + " def wrap(*args, **kw):\n", + " ts = time()\n", + " result = func(*args, **kw)\n", + " te = time()\n", + " tdelta = round(1000*(te-ts),3)\n", + " return tdelta\n", + " return wrap" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "7fbea76a-a77d-408a-a1e4-92ddb6ce1ced", + "metadata": {}, + "outputs": [], + "source": [ + "@time_return\n", + "def numpy_matmul(a,b):\n", + " return (np.matmul(a,b))" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "cdbacc07-0035-4223-85ef-480c296c26f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "16.48" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "SIZE = 1000\n", + "a = np.random.beta(1.0,2.0,size=(SIZE,SIZE))\n", + "b = np.random.beta(1.0,2.0,size=(SIZE,SIZE))\n", + "numpy_matmul(a,b)" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "id": "33a26139-375f-406d-92ff-f2fffc958b26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "111.301" + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "SIZE = 2000\n", + "a = 
np.random.beta(1.0,2.0,size=(SIZE,SIZE))\n", + "b = np.random.beta(1.0,2.0,size=(SIZE,SIZE))\n", + "numpy_matmul(a,b)" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "8b62edde-40ac-4d10-abc6-7fc77b057916", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Matrix multiplication of size (500x500) took 3.0 milliseconds\n", + "Matrix multiplication of size (1000x1000) took 17.031 milliseconds\n", + "Matrix multiplication of size (2000x2000) took 111.501 milliseconds\n", + "Matrix multiplication of size (3000x3000) took 359.307 milliseconds\n", + "Matrix multiplication of size (4000x4000) took 835.614 milliseconds\n", + "Matrix multiplication of size (5000x5000) took 1611.042 milliseconds\n" + ] + } + ], + "source": [ + "SIZE = [500,1000,2000,3000,4000,5000]\n", + "for s in SIZE:\n", + " a = np.random.beta(1.0,2.0,size=(s,s))\n", + " b = np.random.beta(1.0,2.0,size=(s,s))\n", + " t = numpy_matmul(a,b)\n", + " print(f\"Matrix multiplication of size ({s}x{s}) took {t} milliseconds\")" + ] + }, + { + "cell_type": "markdown", + "id": "68f93384-d0c4-45d8-bf44-ac2657089d2d", + "metadata": {}, + "source": [ + "## Throwing an ML estimator into the mix" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "28dc6ecb-4a34-473d-b3a4-a607afd4e8de", + "metadata": {}, + "outputs": [], + "source": [ + "def time_estimator(func):\n", + " @wraps(func)\n", + " def wrap(*args, **kw):\n", + " ts = time()\n", + " result = func(*args, **kw)\n", + " te = time()\n", + " tdelta = round(1000*(te-ts),3)\n", + " return (tdelta, result)\n", + " return wrap" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "id": "6cbe46a4-0a4d-4cb8-866e-14646442e918", + "metadata": {}, + "outputs": [], + "source": [ + "@time_estimator\n", + "def classifier_accuracy(estimator,x,y):\n", + " X_train, X_test, y_train, y_test = train_test_split(x, y, \n", + " test_size=0.33, \n", + " random_state=42)\n", + " 
estimator.fit(X_train,y_train)\n", + " score = estimator.score(X_test,y_test)\n", + " return round(score,3)" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "id": "50dd6f15-7df4-4627-a62f-0f55e5e90d74", + "metadata": {}, + "outputs": [], + "source": [ + "data = make_classification(n_samples=1000,n_features=20,n_informative=20,n_redundant=0,\n", + " flip_y=0.05,class_sep=1.5)\n", + "x,y = data[0],data[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "id": "4da09873-f5c3-4edf-8347-c995d95eb043", + "metadata": {}, + "outputs": [], + "source": [ + "log_model = LogisticRegressionCV()" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "id": "39c5e844-b61c-4763-9ccb-c9d058e7be63", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(312.083, 0.836)" + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier_accuracy(log_model,x,y)" + ] + }, + { + "cell_type": "markdown", + "id": "b987ae48-52e9-449b-9fd3-663459465097", + "metadata": {}, + "source": [ + "## Change the data and record execution time" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "id": "f21f6487-9c84-48cf-8d43-414b8aa88428", + "metadata": {}, + "outputs": [], + "source": [ + "SIZE = [1000+500*i for i in range(21)]\n", + "log_model = LogisticRegressionCV()\n", + "model_time, model_acc = [],[]\n", + "\n", + "for s in SIZE:\n", + " data = make_classification(n_samples=s,n_features=20,n_informative=20,n_redundant=0,\n", + " flip_y=0.05,class_sep=1.5)\n", + " x,y = data[0],data[1]\n", + " m_time, m_acc = classifier_accuracy(log_model,x,y)\n", + " model_time.append(m_time)\n", + " model_acc.append(m_acc)" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "id": "593abe3d-4d07-404e-87c5-7374379689cb", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig,ax = plt.subplots(1,2,figsize=(12,4))\n", + "ax[0].scatter(SIZE,model_acc,edgecolor='k',s=100)\n", + "ax[0].plot(SIZE,model_acc)\n", + "ax[0].set_title(\"Accuracy score with data size\",fontsize=15)\n", + "ax[0].set_xlabel(\"Data size\",fontsize=14)\n", + "ax[0].grid(True)\n", + "ax[1].scatter(SIZE,model_time,edgecolor='k',s=100)\n", + "ax[1].plot(SIZE,model_time)\n", + "ax[1].set_title(\"Training time (msec) with data size\",fontsize=15)\n", + "ax[1].set_xlabel(\"Data size\",fontsize=14)\n", + "ax[1].grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "102ece6b-e0be-4ff3-8ee2-7fa3012c3741", + "metadata": {}, + "source": [ + "## Change the model and optimize" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "id": "52a0dca9-879c-4f8c-9942-8960a734a210", + "metadata": {}, + "outputs": [], + "source": [ + "num_trees = [5*x for x in range(1,21)]\n", + "model_time, model_acc = [],[]\n", + "data = make_classification(n_samples=1000, n_features=20, \n", + " n_informative=20, n_redundant=0, \n", + " flip_y=0.05,class_sep=1.0)\n", + "x,y = data[0],data[1]\n", + "for n in num_trees:\n", + " rf_model = RandomForestClassifier(n_estimators=n)\n", + " m_time, m_acc = classifier_accuracy(rf_model,x,y)\n", + " model_time.append(m_time)\n", + " model_acc.append(m_acc)" + ] + }, + { + "cell_type": "code", + "execution_count": 232, + "id": "ce3efbe4-e817-4296-9227-75bd3905e41d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig,ax = plt.subplots(1,2,figsize=(12,4))\n", + "ax[0].scatter(num_trees,model_acc,edgecolor='k',s=100)\n", + "ax[0].plot(num_trees,model_acc)\n", + "ax[0].set_title(\"Accuracy score with Random Forest\",fontsize=15)\n", + "ax[0].set_xlabel(\"Number of trees\",fontsize=14)\n", + "ax[0].grid(True)\n", + "ax[1].scatter(num_trees,model_time,edgecolor='k',s=100)\n", + "ax[1].plot(num_trees,model_time)\n", + "ax[1].set_title(\"Training time (msec) with Random Forest\",fontsize=15)\n", + "ax[1].set_xlabel(\"Number of trees\",fontsize=14)\n", + "ax[1].grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 233, + "id": "e2db0158-f14b-46d9-81fe-f5698ceb9be7", + "metadata": {}, + "outputs": [], + "source": [ + "model_time = np.array(model_time)\n", + "model_acc = np.array(model_acc)\n", + "model_opt = model_acc + 1/model_time" + ] + }, + { + "cell_type": "code", + "execution_count": 236, + "id": "9e2568d9-9ec5-4a6f-8540-6a286c6252b8", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(dpi=120)\n", + "plt.title(\"Model optimization with number of trees\", fontsize=15)\n", + "plt.plot(num_trees,model_opt)\n", + "plt.scatter(num_trees,model_opt,s=100,edgecolor='k')\n", + "plt.xlabel(\"Number of trees\",fontsize=14)\n", + "plt.grid(True)\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}