Skip to content

Commit c192162

Browse files
committed
har_trees: Tool for reading labels from LabelStudio
1 parent 098198b commit c192162

File tree

2 files changed

+163
-12
lines changed

2 files changed

+163
-12
lines changed

examples/har_trees/read_data.py

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,12 @@
55
import pandas
66
import numpy
77

8-
def load_har_record(path, samplerate):
8+
def load_har_record(path,
9+
samplerate=100,
10+
sensitivity=2.0,
11+
maxvalue=32767,
12+
):
13+
914
suffix = '.npy'
1015

1116
files = []
@@ -19,29 +24,72 @@ def load_har_record(path, samplerate):
1924
print(e)
2025
continue
2126

22-
df = pandas.DataFrame(data.T, columns=['x', 'y', 'z'])
27+
df = pandas.DataFrame(data, columns=['x', 'y', 'z'])
28+
29+
# Scale values into physical units (g)
30+
df = df.astype(float) / maxvalue * sensitivity
31+
32+
# Add a time column, use as index
2333
t = numpy.arange(0, len(df)) * (1.0/samplerate)
2434
df['time'] = t
2535
df = df.set_index('time')
36+
2637
classname = f.split('_')[1].rstrip(suffix)
38+
39+
# Remove :, special character on Windows
40+
filename = f.replace(':', '')
2741

28-
files.append(dict(data=df, filename=f, classname=classname))
42+
files.append(dict(data=df, filename=filename, classname=classname))
2943

3044
#print(f, data.shape)
3145

3246
out = pandas.DataFrame.from_records(files)
3347
out = out.set_index('filename')
3448
return out
3549

36-
p = '/home/jon/temp/micropython-test-esptool/har_record'
37-
data = load_har_record(p, samplerate=100)
50+
def main():
    """Load recorded HAR sessions and export one CSV per recording,
    ready for import into LabelStudio for labeling.

    Reads .npy accelerometer recordings via load_har_record(), prints
    summary statistics, and writes each recording as a time-indexed CSV
    into the 'to_label' directory.
    """

    # NOTE(review): hard-coded input location — assumes the dataset was
    # unpacked under ./data; consider making this a CLI argument.
    p = './data/har_record_excercises/har_record'
    data = load_har_record(p, samplerate=100)

    print(data.head())
    print(data.shape)

    import plotly.express

    print(data.classname.value_counts())

    out_dir = 'to_label'
    # Fix: ensure the output directory exists, otherwise to_csv() below
    # raises FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    # Interactive plotting is disabled; flip to True to inspect recordings.
    show_plots = False

    for idx, f in data.iterrows():
        # Sort by the time index so the exported CSV is monotonic in time
        d = f.data.sort_index()

        out_path = os.path.join(out_dir, idx+'.csv')
        d.to_csv(out_path)
        print('Wrote', out_path)

        # Per-axis median, used to center the data for plotting
        m = d.median()
        print(list(m))

        if show_plots:
            rel = (d - m)

            fig = plotly.express.line(rel.reset_index(),
                x='time',
                y=['x', 'y', 'z'],
                title=f'{f.classname}: {idx}',
            )
            fig.show()

            #fig = plotly.express.scatter_3d(d.diff(-1), x='x', y='y', z='z', title=f'{f.classname}: {idx}')
            #fig.show()

            #fig = plotly.express.line(d.reset_index(), x='time', y=['x', 'y', 'z'])
            #fig.show()


if __name__ == '__main__':
    main()
4194

42-
import plotly.express
4395

44-
for idx, f in data.iterrows():
45-
d = f.data.reset_index()
46-
fig = plotly.express.line(d, x='time', y=['x', 'y', 'z'])
47-
fig.show()

examples/har_trees/read_labels.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
2+
import re
3+
import os
4+
5+
import pandas
6+
7+
8+
9+
from read_data import load_har_record
10+
11+
def extract_filename(url):
    """Return the original filename from a LabelStudio data URL.

    LabelStudio prefixes uploaded files with an "<id>-" marker on import;
    strip the first such prefix from the basename.
    """

    base = os.path.basename(url)
    # Drop only the first "<word>-" prefix (the LabelStudio import ID)
    return re.sub(r'(\w+)-', '', base, count=1)
19+
20+
def read_labels(path):
    """Load a LabelStudio annotation export (CSV) as a DataFrame.

    Returns a DataFrame indexed by 'file' with a single 'activity'
    column holding the modal label across annotations for that file.
    """

    annotations = pandas.read_csv(path)

    # Normalize LabelStudio column names; missing columns are left alone
    renames = {
        'trend_forecast': 'activity',
        'timeseriesUrl': 'data_url',
    }
    annotations = annotations.rename(columns=renames, errors='ignore')

    # Extract data identifier, used for correlating with data
    annotations['file'] = annotations['data_url'].apply(extract_filename)

    labeled = annotations[['file', 'activity']]

    # Convert to a single label per file
    # even though there may be multiple annotations
    # NOTE(review): pandas.Series.mode yields an array on ties — confirm
    # downstream handles that (the 'mixed' filter suggests it is expected).
    return labeled.groupby('file').agg(pandas.Series.mode)
41+
42+
43+
44+
def main():
    """Join LabelStudio labels with recorded sensor data and write
    a single labeled parquet dataset.

    Pipeline: read labels -> drop 'mixed' files -> downsample the
    'other' class to the median class size -> attach sensor data per
    file -> concatenate and write parquet -> read back as sanity check.
    """

    out_path = 'har_exercise_1.parquet'
    labels_path = 'project-3-at-2024-12-01-17-29-ba296417.csv'
    data_path = 'data/har_record_excercises/har_record'
    seed = 1  # fixed RNG seed so the downsampling is reproducible

    labels = read_labels(labels_path)

    # Drop files with a mix of class labels
    labels = labels[~labels.activity.isin(['mixed'])]

    # Balance the 'other' category, by downsampling it
    # to the median per-class file count of the real classes
    without_other = labels[labels.activity != 'other']
    class_occurrence = int(without_other.value_counts().median())
    only_other = labels[labels.activity == 'other']
    other_downsampled = only_other.sample(n=class_occurrence, random_state=seed)
    labels = pandas.concat([without_other, other_downsampled])

    print('\nFiles after balancing:')
    print(labels.activity.value_counts())

    # Lookup data
    data = load_har_record(data_path)

    # Merge in label data
    dfs = []
    for filename, row in labels.iterrows():
        classname = row.activity

        # Bug fix: rstrip('.csv') strips any trailing '.', 'c', 's', 'v'
        # characters (a character set, not a suffix), which corrupts
        # filenames whose stem ends in those letters. Remove the
        # extension as a proper suffix instead.
        if filename.endswith('.csv'):
            filename = filename[:-len('.csv')]

        try:
            d = data.loc[filename].data
        except KeyError as e:
            # Label refers to a recording we do not have; skip it
            print('Load error', e)
            continue

        d = d.reset_index()
        d['subject'] = 'unknown'  # subject identity not tracked in recordings
        d['file'] = filename
        d['activity'] = classname
        dfs.append(d)

    out = pandas.concat(dfs, ignore_index=True)
    print(out)
    out.to_parquet(out_path)

    # Sanity check: read the dataset back and summarize it
    df = pandas.read_parquet(out_path)
    print(df.columns)
    print(df.activity.value_counts())
    print(df.file.value_counts())
    print(df.head())


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)