@@ -446,6 +446,55 @@ def gisette(dataset_dir: Path) -> bool:
     return True
 
 
+def hepmass_150K(dataset_dir: Path) -> bool:
+    """
+    HEPMASS dataset from UCI machine learning repository (
+    https://archive.ics.uci.edu/ml/datasets/HEPMASS).
+
+    Classification task. n_classes = 2.
+    hepmass_150K X train dataset (100000, 28)
+    hepmass_150K y train dataset (100000, 1)
+    hepmass_150K X test dataset (50000, 28)
+    hepmass_150K y test dataset (50000, 1)
+    """
+    dataset_name = 'hepmass_150K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_test.csv.gz'
+    url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_train.csv.gz'
+
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
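+    # Only the first 100000 training and 50000 test rows of the much larger
+    # HEPMASS archives are parsed, which gives this 150K-sample variant.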
+    nrows_train, nrows_test, dtype = 100000, 50000, np.float32
+    data_test: Any = pd.read_csv(local_url_test, delimiter=",",
+                                 compression="gzip", dtype=dtype,
+                                 nrows=nrows_test)
+    data_train: Any = pd.read_csv(local_url_train, delimiter=",",
+                                  compression="gzip", dtype=dtype,
+                                  nrows=nrows_train)
+
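+    # The first CSV column holds the binary class label; the remaining
+    # 28 columns are the features.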
+    x_test = np.ascontiguousarray(data_test.values[:nrows_test, 1:], dtype=dtype)
+    y_test = np.ascontiguousarray(data_test.values[:nrows_test, 0], dtype=dtype)
+    x_train = np.ascontiguousarray(data_train.values[:nrows_train, 1:], dtype=dtype)
+    y_train = np.ascontiguousarray(data_train.values[:nrows_train, 0], dtype=dtype)
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def higgs(dataset_dir: Path) -> bool:
     """
     Higgs dataset from UCI machine learning repository
@@ -637,3 +686,43 @@ def skin_segmentation(dataset_dir: Path) -> bool:
         np.save(os.path.join(dataset_dir, filename), data)
     logging.info(f'dataset {dataset_name} is ready.')
     return True
+
+
+def susy(dataset_dir: Path) -> bool:
+    """
+    SUSY dataset from UCI machine learning repository (
+    https://archive.ics.uci.edu/ml/datasets/SUSY).
+
+    Classification task. n_classes = 2.
+    susy X train dataset (4500000, 18)
+    susy y train dataset (4500000, 1)
+    susy X test dataset (500000, 18)
+    susy y test dataset (500000, 1)
+    """
+    dataset_name = 'susy'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz'
+    local_url = os.path.join(dataset_dir, os.path.basename(url))
+    if not os.path.isfile(local_url):
+        logging.info(f'Started loading {dataset_name}')
+        retrieve(url, local_url)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    nrows_train, nrows_test, dtype = 4500000, 500000, np.float32
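+    # SUSY.csv.gz has no header row, hence header=None; the train and test
+    # rows (5M in total) are read in a single pass and split below.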
+    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
+                            compression="gzip", dtype=dtype,
+                            nrows=nrows_train + nrows_test)
+
+    X = data[data.columns[1:]]
+    y = data[data.columns[0:1]]
+
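+    # shuffle=False keeps the original row order: the first 4.5M rows become
+    # the training set and the following 500K rows the test set.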
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
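
A minimal usage sketch for these loaders, assuming the surrounding module (with its retrieve helper and its os/logging/numpy/pandas imports) is importable; the output path below is illustrative, not part of the change:

    from pathlib import Path
    import numpy as np

    dataset_dir = Path('data/hepmass_150K')   # hypothetical output directory
    hepmass_150K(dataset_dir)                 # downloads the CSVs and writes the .npy files
    x_train = np.load(dataset_dir / 'hepmass_150K_x_train.npy')
    y_train = np.load(dataset_dir / 'hepmass_150K_y_train.npy')
    print(x_train.shape)                      # (100000, 28)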