-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataset.js
107 lines (98 loc) · 2.67 KB
/
dataset.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
'use strict';
/*
* Copyright (c) 2017 Topcoder, Inc. All rights reserved.
*/
/*
* Dataset service
*/
const config = require('config');
const {
acceptedProgramCodes,
acceptedKeywords,
acceptedFormat,
ignoredNames
} = require('../constants');
const csv = require('csv-parse');
const _ = require('lodash');
const rp = require('request-promise');
const fs = require('fs');
/**
 * Get available data sets.
 *
 * Downloads the catalogue from `config.dataset_url`, de-duplicates its entries
 * by `identifier` and keeps every entry that
 *  - shares at least one program code with `acceptedProgramCodes`,
 *  - shares at least one keyword with `acceptedKeywords`,
 *  - has a distribution in `acceptedFormat` with a non-empty `downloadURL`, and
 *  - does not contain any of `ignoredNames` in its title (case-insensitive).
 *
 * @returns {Array<Object>} `{ downloadURL, title }` objects, one per accepted entry
 */
function* getAvailableDatasets() {
  const catalogue = yield rp(config.dataset_url, { json: true });

  // Lower-case the ignore list once instead of once per catalogue entry
  const ignoredFragments = _.map(ignoredNames, name => _.toLower(name));

  const filteredDatasets = [];
  _.uniqBy(catalogue.dataset, 'identifier').forEach((entry) => {
    const hasAcceptedProgramCode = _.intersection(entry.programCode, acceptedProgramCodes).length > 0;
    const hasAcceptedKeyword = _.intersection(entry.keyword, acceptedKeywords).length > 0;
    if (!hasAcceptedProgramCode || !hasAcceptedKeyword) return;

    // Look the distribution up once; it provides both the format check and the URL
    const distribution = _.find(entry.distribution, d => d.format === acceptedFormat);
    if (!distribution || !distribution.downloadURL) return;

    // _.toLower turns a missing title into '' so one malformed entry
    // cannot abort the whole listing with a TypeError
    const lowerCasedTitle = _.toLower(entry.title);
    const shouldIgnore = _.some(ignoredFragments, fragment => _.includes(lowerCasedTitle, fragment));
    if (shouldIgnore) return;

    filteredDatasets.push({
      downloadURL: distribution.downloadURL,
      title: entry.title
    });
  });
  return filteredDatasets;
}
/**
 * Download a dataset and save it in the configured download directory.
 *
 * The file is named after the last path segment of the URL and written to
 * `config.downloadPath`, which is created first when it does not exist.
 *
 * @param {string} url - The dataset URL
 * @throws {Error} If no file name can be derived from the URL (e.g. it ends with '/')
 */
function* downloadDataset(url) {
  // The last '/'-separated segment of the URL doubles as the local file name
  const fileName = _.last(_.split(url, '/'));
  if (!fileName) {
    // Fail before downloading instead of letting writeFile fail on the directory path
    throw new Error(`Cannot derive a file name from dataset URL: ${url}`);
  }

  const csvStr = yield rp(url);
  const targetDirectory = `${config.downloadPath}`;

  yield new Promise((resolve, reject) => {
    // Create directory if not exists. A synchronous throw inside the executor
    // rejects the promise. `recursive` (Node.js 10.12+) also creates missing
    // parent directories, so a nested download path works.
    if (!fs.existsSync(targetDirectory)) {
      fs.mkdirSync(targetDirectory, { recursive: true });
    }
    // Save file to configured directory
    fs.writeFile(`${targetDirectory}/${fileName}`, csvStr, (err) => {
      if (err) {
        reject(err);
      } else {
        resolve();
      }
    });
  });
}
/**
 * Read a CSV file from disk and parse it with csv-parse.
 * @param {string} path - The file path
 * @returns {Array} The parsed rows (delivered as the generator's return value)
 */
function* readCSV(path) {
  // Lenient parser settings: tolerate rows with differing column counts
  // and stray quotes, and drop empty lines
  const parserOptions = { relax_column_count: true, relax: true, skip_empty_lines: true };

  const rows = yield new Promise((resolve, reject) => {
    fs.readFile(path, (readError, contents) => {
      if (readError) {
        reject(readError);
        return;
      }
      csv(contents, parserOptions, (parsingError, parsedRows) => {
        if (parsingError) {
          reject(parsingError);
          return;
        }
        // Drop the reference to the raw file contents now that parsing is done
        contents = null;
        resolve(parsedRows);
      });
    });
  });
  return rows;
}
/**
 * List the names of the entries found in a directory.
 * @param {string} directory - The target directory
 * @returns {Array} The names reported by a synchronous directory read
 */
function getFilenames(directory) {
  // Read synchronously and hand back a fresh array of the reported names
  const entries = fs.readdirSync(directory);
  return Array.from(entries);
}
// Public API of the dataset service. The first three entries are generator
// functions; getFilenames is a plain synchronous function.
module.exports = {
getAvailableDatasets,
downloadDataset,
readCSV,
getFilenames
};