# HowTo_Pandas_01_NA.py -- working with missing values (NA / NaN) in pandas
import pandas as pd
import random as rd
# How-to: creating, counting, dropping, filling and comparing missing values.
# The bare expressions below are meant to be evaluated interactively; only
# `na` and `d` are bound, and `d` itself is never modified.

# --- create NA ---
pd.NA  # pandas' missing-value marker, shown as <NA>; mixing it into a list of numbers gives an 'object' column
# to_numeric() converts pd.NA to np.nan, which is a float. NumPy-backed int
# columns cannot hold a missing value (they are upcast to float64); the
# nullable 'Int64' extension dtype is the alternative when ints must be kept.
na = pd.to_numeric(pd.NA)
d = pd.DataFrame({
    "A": ['A', 'A', 'A', 'A'],
    "B": ['B', 'B', na, 'B'],
    "V1": [12, na, na, 32],
    "V2": [1.23, 1.75, 1.54, na],
})

# --- count NA ---
d.shape                                   # (rows, columns)
d.info()                                  # prints non-NA counts per column
d.count()                                 # non-NA count per column
d.notna().sum()                           # same
d.isna().sum()                            # NA count per column
d.isna().agg(['sum', 'count'])            # NA count next to the total row count
d.isna().agg(['sum', 'count'], axis=1)    # same, computed per row
d.notna().agg(['sum', 'count'])           # non-NA count next to the total

# --- drop NA ---
d.dropna()                        # drop rows containing any NA (returns a copy)
d.dropna(how='all')               # drop only rows that consist entirely of NA
d.dropna(thresh=4)                # keep rows with at least 4 non-NA cells (do not combine with how=)
d.dropna(subset=['B', 'V1'])      # only look for NA in the listed columns
d[d['B'].notna()]                 # boolean-mask equivalent of d.dropna(subset=['B'])
d[~d['B'].isna()]                 # same

# --- fill NA ---
d.fillna(0)                       # replace every NA with 0
# fillna(method=...) is deprecated; ffill()/bfill() are the replacements.
d.ffill()                         # carry the last valid value forward (bfill(): next valid value backward)
d.ffill(limit=3)                  # fill at most 3 consecutive NA per gap
d['V1'].fillna(d['V2'])           # fill from another column, aligned on the index
# Interpolation is only defined for numeric data, so restrict to those columns.
d.select_dtypes('number').interpolate()                            # linear interpolation
d.select_dtypes('number').interpolate(method='spline', order=1)    # spline needs an explicit order; uses SciPy

# --- compare NA ---
d.equals(d)                       # True: equals() treats NaNs in the same location as equal
d == d                            # False in the NaN cells: NaN never compares equal
d.select_dtypes('number') > 0     # False in the NaN cells

# --- NA in groupby ---
d.groupby('B').sum()              # rows whose key is NA are left out by default (dropna=True)

# --- NA in count values ---
d.value_counts(['A', 'B'], dropna=False)  # keep NA keys; like data.table's d[, .N, k=.(A,B)]

# --- NA in reduction ---
# numeric_only=True is required: the string columns cannot be averaged.
d.mean(numeric_only=True)                 # skipna=True is the default
d.mean(numeric_only=True, skipna=False)   # a column containing NA yields NaN