Skip to content

Commit 18c36b9

Browse files
authored
Mnist Dataset (#326)
* Implement MnistDataset * Add MNIST dataset documentation
1 parent 8ac013b commit 18c36b9

File tree

10 files changed

+164
-1
lines changed

10 files changed

+164
-1
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
112112
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
113113
* [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/)
114114
* [SVM](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/svm-dataset/)
115+
* [MNIST](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/mnist-dataset/)
115116
* Ready to use:
116117
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
117118
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)

docs/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
9393
* [CSV](machine-learning/datasets/csv-dataset.md)
9494
* [Files](machine-learning/datasets/files-dataset.md)
9595
* [SVM](machine-learning/datasets/svm-dataset.md)
96+
* [MNIST](machine-learning/datasets/mnist-dataset.md)
9697
* Ready to use:
9798
* [Iris](machine-learning/datasets/demo/iris.md)
9899
* [Wine](machine-learning/datasets/demo/wine.md)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# MnistDataset
2+
3+
Helper class that loads data from the MNIST dataset: [http://yann.lecun.com/exdb/mnist/](http://yann.lecun.com/exdb/mnist/)
4+
5+
> The MNIST database of handwritten digits, available from this page, has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image.
6+
It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting.
7+
8+
### Constructor Parameters
9+
10+
* $imagePath - (string) path to image file
11+
* $labelPath - (string) path to label file
12+
13+
```
14+
use Phpml\Dataset\MnistDataset;
15+
16+
$trainDataset = new MnistDataset('train-images-idx3-ubyte', 'train-labels-idx1-ubyte');
17+
```
18+
19+
### Samples and labels
20+
21+
To get samples or labels you can use getters:
22+
23+
```
24+
$trainDataset->getSamples();
25+
$trainDataset->getTargets();
26+
```

mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ pages:
3939
- CSV Dataset: machine-learning/datasets/csv-dataset.md
4040
- Files Dataset: machine-learning/datasets/files-dataset.md
4141
- SVM Dataset: machine-learning/datasets/svm-dataset.md
42+
- MNIST Dataset: machine-learning/datasets/mnist-dataset.md
4243
- Ready to use datasets:
4344
- Iris: machine-learning/datasets/demo/iris.md
4445
- Wine: machine-learning/datasets/demo/wine.md

phpstan.neon

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ includes:
66
parameters:
77
ignoreErrors:
88
- '#Property Phpml\\Clustering\\KMeans\\Cluster\:\:\$points \(iterable\<Phpml\\Clustering\\KMeans\\Point\>\&SplObjectStorage\) does not accept SplObjectStorage#'
9-
- '#Phpml\\Dataset\\FilesDataset::__construct\(\) does not call parent constructor from Phpml\\Dataset\\ArrayDataset#'
9+
- '#Phpml\\Dataset\\(.*)Dataset::__construct\(\) does not call parent constructor from Phpml\\Dataset\\ArrayDataset#'
1010

1111
# wide range cases
1212
- '#Parameter \#1 \$coordinates of class Phpml\\Clustering\\KMeans\\Point constructor expects array, array<int>\|Phpml\\Clustering\\KMeans\\Point given#'

src/Dataset/MnistDataset.php

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Phpml\Dataset;
6+
7+
use Phpml\Exception\InvalidArgumentException;
8+
9+
/**
 * MNIST dataset: http://yann.lecun.com/exdb/mnist/
 * original mnist dataset reader: https://github.com/AndrewCarterUK/mnist-neural-network-plain-php
 *
 * Loads an image file and a label file and exposes them as samples and
 * targets. Each sample is a flat list of IMAGE_ROWS * IMAGE_COLS pixel values,
 * each divided by 255; each target is the label byte as an integer.
 */
final class MnistDataset extends ArrayDataset
{
    /** Expected first header field of an image file. */
    private const MAGIC_IMAGE = 0x00000803;

    /** Expected first header field of a label file. */
    private const MAGIC_LABEL = 0x00000801;

    /** Only images with exactly this many rows are accepted. */
    private const IMAGE_ROWS = 28;

    /** Only images with exactly this many columns are accepted. */
    private const IMAGE_COLS = 28;

    /** Upper bound for a single fread() call, so a corrupt size field cannot force one huge allocation. */
    private const READ_CHUNK = 8192;

    /**
     * @param string $imagePath path to the image file
     * @param string $labelPath path to the label file
     *
     * @throws InvalidArgumentException when a file cannot be opened, has an invalid or
     *                                  truncated header, holds truncated image data, or
     *                                  when the number of images and labels differ
     */
    public function __construct(string $imagePath, string $labelPath)
    {
        // The samples/targets properties are assigned directly; the parent
        // constructor is not called and the count check is performed here.
        $this->samples = $this->readImages($imagePath);
        $this->targets = $this->readLabels($labelPath);

        if (count($this->samples) !== count($this->targets)) {
            throw new InvalidArgumentException('Must have the same number of images and labels');
        }
    }

    /**
     * Reads every image announced by the header of the image file.
     *
     * @return array[] one flat list of pixel values (byte / 255) per image
     *
     * @throws InvalidArgumentException on an unreadable, malformed or truncated file
     */
    private function readImages(string $imagePath): array
    {
        $stream = fopen($imagePath, 'rb');

        if ($stream === false) {
            throw new InvalidArgumentException('Could not open file: '.$imagePath);
        }

        $images = [];

        try {
            // Four 32-bit big-endian unsigned fields: magic, image count, rows, cols.
            $fields = $this->readHeader($stream, 'Nmagic/Nsize/Nrows/Ncols', 16, $imagePath);

            if ($fields['magic'] !== self::MAGIC_IMAGE) {
                throw new InvalidArgumentException('Invalid magic number: '.$imagePath);
            }

            if ($fields['rows'] !== self::IMAGE_ROWS) {
                throw new InvalidArgumentException('Invalid number of image rows: '.$imagePath);
            }

            if ($fields['cols'] !== self::IMAGE_COLS) {
                throw new InvalidArgumentException('Invalid number of image cols: '.$imagePath);
            }

            // Loop invariant: rows/cols were just validated against the constants.
            $imageSize = self::IMAGE_ROWS * self::IMAGE_COLS;

            for ($i = 0; $i < $fields['size']; ++$i) {
                $imageBytes = $this->readBytes($stream, $imageSize);

                // A short read means the file ends before the announced number
                // of images; fail instead of returning incomplete samples.
                if (strlen($imageBytes) !== $imageSize) {
                    throw new InvalidArgumentException('Unexpected end of image data: '.$imagePath);
                }

                // Scale each pixel byte (0..255) into the range 0..1.
                $images[] = array_map(static function (int $b) {
                    return $b / 255;
                }, array_values(unpack('C*', $imageBytes)));
            }
        } finally {
            fclose($stream);
        }

        return $images;
    }

    /**
     * Reads the labels stored in the label file.
     *
     * At most the number of labels announced by the header is read. If the
     * file holds fewer, only those present are returned and the count check in
     * the constructor reports the mismatch against the images.
     *
     * @return int[] one integer (0..255) per label byte
     *
     * @throws InvalidArgumentException on an unreadable or malformed file
     */
    private function readLabels(string $labelPath): array
    {
        $stream = fopen($labelPath, 'rb');

        if ($stream === false) {
            throw new InvalidArgumentException('Could not open file: '.$labelPath);
        }

        try {
            // Two 32-bit big-endian unsigned fields: magic, label count.
            $fields = $this->readHeader($stream, 'Nmagic/Nsize', 8, $labelPath);

            if ($fields['magic'] !== self::MAGIC_LABEL) {
                throw new InvalidArgumentException('Invalid magic number: '.$labelPath);
            }

            // readBytes() never calls fread() with a zero length, so a header
            // announcing zero labels is handled without warnings or errors.
            $labelBytes = $this->readBytes($stream, $fields['size']);
        } finally {
            fclose($stream);
        }

        if ($labelBytes === '') {
            return [];
        }

        return array_values(unpack('C*', $labelBytes));
    }

    /**
     * Reads and unpacks a fixed-size header.
     *
     * @param resource $stream open stream positioned at the start of the header
     * @param string   $format unpack() format describing the header fields
     * @param int      $length header size in bytes
     * @param string   $path   file path, used in error messages only
     *
     * @return int[] header fields keyed by the names given in $format
     *
     * @throws InvalidArgumentException when fewer than $length bytes are available
     */
    private function readHeader($stream, string $format, int $length, string $path): array
    {
        $header = $this->readBytes($stream, $length);

        if (strlen($header) !== $length) {
            throw new InvalidArgumentException('Invalid header: '.$path);
        }

        $fields = unpack($format, $header);

        if ($fields === false) {
            throw new InvalidArgumentException('Invalid header: '.$path);
        }

        return $fields;
    }

    /**
     * Reads up to $length bytes, looping over short reads until the requested
     * amount is available or the stream ends.
     *
     * @param resource $stream
     *
     * @return string the bytes read; shorter than $length only at end of stream or on a read error
     */
    private function readBytes($stream, int $length): string
    {
        $data = '';

        while (strlen($data) < $length && !feof($stream)) {
            $chunk = fread($stream, min($length - strlen($data), self::READ_CHUNK));

            if ($chunk === false || $chunk === '') {
                break;
            }

            $data .= $chunk;
        }

        return $data;
    }
}

tests/Dataset/MnistDatasetTest.php

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Phpml\Tests\Dataset;
6+
7+
use Phpml\Dataset\MnistDataset;
8+
use Phpml\Exception\InvalidArgumentException;
9+
use PHPUnit\Framework\TestCase;
10+
11+
class MnistDatasetTest extends TestCase
{
    /**
     * Loading the image and label fixtures yields ten samples and ten targets.
     */
    public function testSimpleMnistDataset(): void
    {
        $imagePath = __DIR__.'/Resources/mnist/images-idx-ubyte';
        $labelPath = __DIR__.'/Resources/mnist/labels-idx-ubyte';

        $dataset = new MnistDataset($imagePath, $labelPath);

        $samples = $dataset->getSamples();
        $targets = $dataset->getTargets();

        self::assertCount(10, $samples);
        self::assertCount(10, $targets);
    }

    /**
     * Constructing the dataset from the image fixture and the "labels-11"
     * fixture must be rejected with an InvalidArgumentException.
     */
    public function testCheckSamplesAndTargetsCountMatch(): void
    {
        $imagePath = __DIR__.'/Resources/mnist/images-idx-ubyte';
        $mismatchedLabelPath = __DIR__.'/Resources/mnist/labels-11-idx-ubyte';

        // The expectation has to be registered before the constructor runs.
        $this->expectException(InvalidArgumentException::class);

        new MnistDataset($imagePath, $mismatchedLabelPath);
    }
}
7.67 KB
Binary file not shown.
19 Bytes
Binary file not shown.
18 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)