Skip to content

Commit 7182a43

Browse files
maskani-mohagramfort
authored and committed
[MRG+1] Take over PR scikit-learn#7647 - Add a "filename" attribute to datasets that have a CSV file (scikit-learn#9101)
* add filename attribute for load_iris * add filename attribute for load_boston * add filename attribute for load_linnerud
1 parent c62338f commit 7182a43

File tree

3 files changed

+51
-22
lines changed

3 files changed

+51
-22
lines changed

doc/tutorial/basic/tutorial.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ learn::
136136
<sphx_glr_auto_examples_classification_plot_digits_classification.py>` illustrates how starting
137137
from the original problem one can shape the data for consumption in
138138
scikit-learn.
139-
139+
140140
.. topic:: Loading from external datasets
141141

142142
To load from an external dataset, please refer to :ref:`loading external datasets <external_datasets>`.
@@ -401,4 +401,4 @@ is similarly possible for an instance to be assigned multiple labels::
401401
In this case, the classifier is fit upon instances each assigned multiple labels.
402402
The :class:`MultiLabelBinarizer <sklearn.preprocessing.MultiLabelBinarizer>` is
403403
used to binarize the 2d array of multilabels to ``fit`` upon. As a result,
404-
``predict()`` returns a 2d array with multiple predicted labels for each instance.
404+
``predict()`` returns a 2d array with multiple predicted labels for each instance.

sklearn/datasets/base.py

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -352,8 +352,9 @@ def load_iris(return_X_y=False):
352352
Dictionary-like object, the interesting attributes are:
353353
'data', the data to learn, 'target', the classification labels,
354354
'target_names', the meaning of the labels, 'feature_names', the
355-
meaning of the features, and 'DESCR', the
356-
full description of the dataset.
355+
meaning of the features, 'DESCR', the full description of
356+
the dataset, 'filename', the physical location of
357+
iris csv dataset (added in version `0.20`).
357358
358359
(data, target) : tuple if ``return_X_y`` is True
359360
@@ -373,6 +374,7 @@ def load_iris(return_X_y=False):
373374
"""
374375
module_path = dirname(__file__)
375376
data, target, target_names = load_data(module_path, 'iris.csv')
377+
iris_csv_filename = join(module_path, 'data', 'iris.csv')
376378

377379
with open(join(module_path, 'descr', 'iris.rst')) as rst_file:
378380
fdescr = rst_file.read()
@@ -384,7 +386,8 @@ def load_iris(return_X_y=False):
384386
target_names=target_names,
385387
DESCR=fdescr,
386388
feature_names=['sepal length (cm)', 'sepal width (cm)',
387-
'petal length (cm)', 'petal width (cm)'])
389+
'petal length (cm)', 'petal width (cm)'],
390+
filename=iris_csv_filename)
388391

389392

390393
def load_breast_cancer(return_X_y=False):
@@ -415,8 +418,9 @@ def load_breast_cancer(return_X_y=False):
415418
Dictionary-like object, the interesting attributes are:
416419
'data', the data to learn, 'target', the classification labels,
417420
'target_names', the meaning of the labels, 'feature_names', the
418-
meaning of the features, and 'DESCR', the
419-
full description of the dataset.
421+
meaning of the features, 'DESCR', the full description of
422+
the dataset, and 'filename', the physical location of
423+
breast cancer csv dataset (added in version `0.20`).
420424
421425
(data, target) : tuple if ``return_X_y`` is True
422426
@@ -440,6 +444,7 @@ def load_breast_cancer(return_X_y=False):
440444
"""
441445
module_path = dirname(__file__)
442446
data, target, target_names = load_data(module_path, 'breast_cancer.csv')
447+
csv_filename = join(module_path, 'data', 'breast_cancer.csv')
443448

444449
with open(join(module_path, 'descr', 'breast_cancer.rst')) as rst_file:
445450
fdescr = rst_file.read()
@@ -466,7 +471,8 @@ def load_breast_cancer(return_X_y=False):
466471
return Bunch(data=data, target=target,
467472
target_names=target_names,
468473
DESCR=fdescr,
469-
feature_names=feature_names)
474+
feature_names=feature_names,
475+
filename=csv_filename)
470476

471477

472478
def load_digits(n_class=10, return_X_y=False):
@@ -573,18 +579,21 @@ def load_diabetes(return_X_y=False):
573579
-------
574580
data : Bunch
575581
Dictionary-like object, the interesting attributes are:
576-
'data', the data to learn and 'target', the regression target for each
577-
sample.
582+
'data', the data to learn, 'target', the regression target for each
583+
sample, 'data_filename', the physical location
584+
of diabetes data csv dataset, and 'target_filename', the physical
585+
location of diabetes targets csv dataset (added in version `0.20`).
578586
579587
(data, target) : tuple if ``return_X_y`` is True
580588
581589
.. versionadded:: 0.18
582590
"""
583-
584591
module_path = dirname(__file__)
585592
base_dir = join(module_path, 'data')
586-
data = np.loadtxt(join(base_dir, 'diabetes_data.csv.gz'))
587-
target = np.loadtxt(join(base_dir, 'diabetes_target.csv.gz'))
593+
data_filename = join(base_dir, 'diabetes_data.csv.gz')
594+
data = np.loadtxt(data_filename)
595+
target_filename = join(base_dir, 'diabetes_target.csv.gz')
596+
target = np.loadtxt(target_filename)
588597

589598
with open(join(module_path, 'descr', 'diabetes.rst')) as rst_file:
590599
fdescr = rst_file.read()
@@ -594,7 +603,9 @@ def load_diabetes(return_X_y=False):
594603

595604
return Bunch(data=data, target=target, DESCR=fdescr,
596605
feature_names=['age', 'sex', 'bmi', 'bp',
597-
's1', 's2', 's3', 's4', 's5', 's6'])
606+
's1', 's2', 's3', 's4', 's5', 's6'],
607+
data_filename=data_filename,
608+
target_filename=target_filename)
598609

599610

600611
def load_linnerud(return_X_y=False):
@@ -622,21 +633,29 @@ def load_linnerud(return_X_y=False):
622633
'targets', the two multivariate datasets, with 'data' corresponding to
623634
the exercise and 'targets' corresponding to the physiological
624635
measurements, as well as 'feature_names' and 'target_names'.
636+
In addition, you will also have access to 'data_filename',
637+
the physical location of linnerud data csv dataset, and
638+
'target_filename', the physical location of
639+
linnerud targets csv dataset (added in version `0.20`).
625640
626641
(data, target) : tuple if ``return_X_y`` is True
627642
628643
.. versionadded:: 0.18
629644
"""
630645
base_dir = join(dirname(__file__), 'data/')
646+
data_filename = join(base_dir, 'linnerud_exercise.csv')
647+
target_filename = join(base_dir, 'linnerud_physiological.csv')
648+
631649
# Read data
632-
data_exercise = np.loadtxt(base_dir + 'linnerud_exercise.csv', skiprows=1)
633-
data_physiological = np.loadtxt(base_dir + 'linnerud_physiological.csv',
634-
skiprows=1)
650+
data_exercise = np.loadtxt(data_filename, skiprows=1)
651+
data_physiological = np.loadtxt(target_filename, skiprows=1)
652+
635653
# Read header
636-
with open(base_dir + 'linnerud_exercise.csv') as f:
654+
with open(data_filename) as f:
637655
header_exercise = f.readline().split()
638-
with open(base_dir + 'linnerud_physiological.csv') as f:
656+
with open(target_filename) as f:
639657
header_physiological = f.readline().split()
658+
640659
with open(dirname(__file__) + '/descr/linnerud.rst') as f:
641660
descr = f.read()
642661

@@ -646,7 +665,9 @@ def load_linnerud(return_X_y=False):
646665
return Bunch(data=data_exercise, feature_names=header_exercise,
647666
target=data_physiological,
648667
target_names=header_physiological,
649-
DESCR=descr)
668+
DESCR=descr,
669+
data_filename=data_filename,
670+
target_filename=target_filename)
650671

651672

652673
def load_boston(return_X_y=False):
@@ -672,7 +693,9 @@ def load_boston(return_X_y=False):
672693
data : Bunch
673694
Dictionary-like object, the interesting attributes are:
674695
'data', the data to learn, 'target', the regression targets,
675-
and 'DESCR', the full description of the dataset.
696+
'DESCR', the full description of the dataset,
697+
and 'filename', the physical location of boston
698+
csv dataset (added in version `0.20`).
676699
677700
(data, target) : tuple if ``return_X_y`` is True
678701
@@ -713,7 +736,8 @@ def load_boston(return_X_y=False):
713736
target=target,
714737
# last column is target value
715738
feature_names=feature_names[:-1],
716-
DESCR=descr_text)
739+
DESCR=descr_text,
740+
filename=data_file_name)
717741

718742

719743
def load_sample_images():

sklearn/datasets/tests/test_base.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,8 @@ def test_load_linnerud():
197197
assert_equal(res.target.shape, (20, 3))
198198
assert_equal(len(res.target_names), 3)
199199
assert_true(res.DESCR)
200+
assert_true(os.path.exists(res.data_filename))
201+
assert_true(os.path.exists(res.target_filename))
200202

201203
# test return_X_y option
202204
X_y_tuple = load_linnerud(return_X_y=True)
@@ -212,6 +214,7 @@ def test_load_iris():
212214
assert_equal(res.target.size, 150)
213215
assert_equal(res.target_names.size, 3)
214216
assert_true(res.DESCR)
217+
assert_true(os.path.exists(res.filename))
215218

216219
# test return_X_y option
217220
X_y_tuple = load_iris(return_X_y=True)
@@ -242,6 +245,7 @@ def test_load_breast_cancer():
242245
assert_equal(res.target.size, 569)
243246
assert_equal(res.target_names.size, 2)
244247
assert_true(res.DESCR)
248+
assert_true(os.path.exists(res.filename))
245249

246250
# test return_X_y option
247251
X_y_tuple = load_breast_cancer(return_X_y=True)
@@ -257,6 +261,7 @@ def test_load_boston():
257261
assert_equal(res.target.size, 506)
258262
assert_equal(res.feature_names.size, 13)
259263
assert_true(res.DESCR)
264+
assert_true(os.path.exists(res.filename))
260265

261266
# test return_X_y option
262267
X_y_tuple = load_boston(return_X_y=True)

0 commit comments

Comments
 (0)