Skip to content

Commit c192162

Browse files
committed
har_trees: Tool for reading labels from LabelStudio
1 parent 098198b commit c192162

File tree

2 files changed

+163
-12
lines changed

2 files changed

+163
-12
lines changed

examples/har_trees/read_data.py

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,12 @@
55
import pandas
66
import numpy
77

8-
def load_har_record(path, samplerate):
8+
def load_har_record(path,
9+
samplerate=100,
10+
sensitivity=2.0,
11+
maxvalue=32767,
12+
):
13+
914
suffix = '.npy'
1015

1116
files = []
@@ -19,29 +24,72 @@ def load_har_record(path, samplerate):
1924
print(e)
2025
continue
2126

22-
df = pandas.DataFrame(data.T, columns=['x', 'y', 'z'])
27+
df = pandas.DataFrame(data, columns=['x', 'y', 'z'])
28+
29+
# Scale values into physical units (g)
30+
df = df.astype(float) / maxvalue * sensitivity
31+
32+
# Add a time column, use as index
2333
t = numpy.arange(0, len(df)) * (1.0/samplerate)
2434
df['time'] = t
2535
df = df.set_index('time')
36+
2637
classname = f.split('_')[1].rstrip(suffix)
38+
39+
# Remove :, special character on Windows
40+
filename = f.replace(':', '')
2741

28-
files.append(dict(data=df, filename=f, classname=classname))
42+
files.append(dict(data=df, filename=filename, classname=classname))
2943

3044
#print(f, data.shape)
3145

3246
out = pandas.DataFrame.from_records(files)
3347
out = out.set_index('filename')
3448
return out
3549

36-
p = '/home/jon/temp/micropython-test-esptool/har_record'
37-
data = load_har_record(p, samplerate=100)
50+
def main():
    """Load recorded HAR sessions and export one CSV per recording,
    ready for import into LabelStudio for labeling.

    Reads .npy accelerometer recordings via load_har_record(), prints
    summary statistics, and writes each recording as a time-indexed CSV
    into the 'to_label' directory.
    """

    # NOTE(review): hard-coded input location — assumes the dataset was
    # unpacked under ./data; consider making this a CLI argument.
    p = './data/har_record_excercises/har_record'
    data = load_har_record(p, samplerate=100)

    print(data.head())
    print(data.shape)

    import plotly.express

    print(data.classname.value_counts())

    out_dir = 'to_label'
    # Fix: ensure the output directory exists, otherwise to_csv() below
    # raises FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    # Interactive plotting is disabled; flip to True to inspect recordings.
    show_plots = False

    for idx, f in data.iterrows():
        # Sort by the time index so the exported CSV is monotonic in time
        d = f.data.sort_index()

        out_path = os.path.join(out_dir, idx+'.csv')
        d.to_csv(out_path)
        print('Wrote', out_path)

        # Per-axis median, used to center the data for plotting
        m = d.median()
        print(list(m))

        if show_plots:
            rel = (d - m)

            fig = plotly.express.line(rel.reset_index(),
                x='time',
                y=['x', 'y', 'z'],
                title=f'{f.classname}: {idx}',
            )
            fig.show()

            #fig = plotly.express.scatter_3d(d.diff(-1), x='x', y='y', z='z', title=f'{f.classname}: {idx}')
            #fig.show()

            #fig = plotly.express.line(d.reset_index(), x='time', y=['x', 'y', 'z'])
            #fig.show()


if __name__ == '__main__':
    main()
4194

42-
import plotly.express
4395

44-
for idx, f in data.iterrows():
45-
d = f.data.reset_index()
46-
fig = plotly.express.line(d, x='time', y=['x', 'y', 'z'])
47-
fig.show()

examples/har_trees/read_labels.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
2+
import re
3+
import os
4+
5+
import pandas
6+
7+
8+
9+
from read_data import load_har_record
10+
11+
def extract_filename(url):
    """Return the original filename from a LabelStudio data URL.

    LabelStudio prefixes uploaded files with an "<id>-" marker on import;
    strip the first such prefix from the basename.
    """

    base = os.path.basename(url)
    # Drop only the first "<word>-" prefix (the LabelStudio import ID)
    return re.sub(r'(\w+)-', '', base, count=1)
19+
20+
def read_labels(path):
    """Load a LabelStudio annotation export (CSV) as a DataFrame.

    Returns a DataFrame indexed by 'file' with a single 'activity'
    column holding the modal label across annotations for that file.
    """

    annotations = pandas.read_csv(path)

    # Normalize LabelStudio column names; missing columns are left alone
    renames = {
        'trend_forecast': 'activity',
        'timeseriesUrl': 'data_url',
    }
    annotations = annotations.rename(columns=renames, errors='ignore')

    # Extract data identifier, used for correlating with data
    annotations['file'] = annotations['data_url'].apply(extract_filename)

    labeled = annotations[['file', 'activity']]

    # Convert to a single label per file
    # even though there may be multiple annotations
    # NOTE(review): pandas.Series.mode yields an array on ties — confirm
    # downstream handles that (the 'mixed' filter suggests it is expected).
    return labeled.groupby('file').agg(pandas.Series.mode)
41+
42+
43+
44+
def main():
    """Join LabelStudio labels with recorded sensor data and write
    a single labeled parquet dataset.

    Pipeline: read labels -> drop 'mixed' files -> downsample the
    'other' class to the median class size -> attach sensor data per
    file -> concatenate and write parquet -> read back as sanity check.
    """

    out_path = 'har_exercise_1.parquet'
    labels_path = 'project-3-at-2024-12-01-17-29-ba296417.csv'
    data_path = 'data/har_record_excercises/har_record'
    seed = 1  # fixed RNG seed so the downsampling is reproducible

    labels = read_labels(labels_path)

    # Drop files with a mix of class labels
    labels = labels[~labels.activity.isin(['mixed'])]

    # Balance the 'other' category, by downsampling it
    # to the median per-class file count of the real classes
    without_other = labels[labels.activity != 'other']
    class_occurrence = int(without_other.value_counts().median())
    only_other = labels[labels.activity == 'other']
    other_downsampled = only_other.sample(n=class_occurrence, random_state=seed)
    labels = pandas.concat([without_other, other_downsampled])

    print('\nFiles after balancing:')
    print(labels.activity.value_counts())

    # Lookup data
    data = load_har_record(data_path)

    # Merge in label data
    dfs = []
    for filename, row in labels.iterrows():
        classname = row.activity

        # Bug fix: rstrip('.csv') strips any trailing '.', 'c', 's', 'v'
        # characters (a character set, not a suffix), which corrupts
        # filenames whose stem ends in those letters. Remove the
        # extension as a proper suffix instead.
        if filename.endswith('.csv'):
            filename = filename[:-len('.csv')]

        try:
            d = data.loc[filename].data
        except KeyError as e:
            # Label refers to a recording we do not have; skip it
            print('Load error', e)
            continue

        d = d.reset_index()
        d['subject'] = 'unknown'  # subject identity not tracked in recordings
        d['file'] = filename
        d['activity'] = classname
        dfs.append(d)

    out = pandas.concat(dfs, ignore_index=True)
    print(out)
    out.to_parquet(out_path)

    # Sanity check: read the dataset back and summarize it
    df = pandas.read_parquet(out_path)
    print(df.columns)
    print(df.activity.value_counts())
    print(df.file.value_counts())
    print(df.head())


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)