| 
12 | 12 | import codecs  | 
13 | 13 | 
 
  | 
14 | 14 | pages = {  | 
15 |  | -    u'ar': u'http://ar.wikipedia.org/wiki/%D9%88%D9%8A%D9%83%D9%8A%D8%A8%D9%8A%D8%AF%D9%8A%D8%A7',  | 
16 |  | -    u'de': u'http://de.wikipedia.org/wiki/Wikipedia',  | 
17 |  | -    u'en': u'https://en.wikipedia.org/wiki/Wikipedia',  | 
18 |  | -    u'es': u'http://es.wikipedia.org/wiki/Wikipedia',  | 
19 |  | -    u'fr': u'http://fr.wikipedia.org/wiki/Wikip%C3%A9dia',  | 
20 |  | -    u'it': u'http://it.wikipedia.org/wiki/Wikipedia',  | 
21 |  | -    u'ja': u'http://ja.wikipedia.org/wiki/Wikipedia',  | 
22 |  | -    u'nl': u'http://nl.wikipedia.org/wiki/Wikipedia',  | 
23 |  | -    u'pl': u'http://pl.wikipedia.org/wiki/Wikipedia',  | 
24 |  | -    u'pt': u'http://pt.wikipedia.org/wiki/Wikip%C3%A9dia',  | 
25 |  | -    u'ru': u'http://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D0%BF%D0%B5%D0%B4%D0%B8%D1%8F',  | 
 | 15 | +    'ar': 'http://ar.wikipedia.org/wiki/%D9%88%D9%8A%D9%83%D9%8A%D8%A8%D9%8A%D8%AF%D9%8A%D8%A7',   # noqa: E501  | 
 | 16 | +    'de': 'http://de.wikipedia.org/wiki/Wikipedia',  | 
 | 17 | +    'en': 'https://en.wikipedia.org/wiki/Wikipedia',  | 
 | 18 | +    'es': 'http://es.wikipedia.org/wiki/Wikipedia',  | 
 | 19 | +    'fr': 'http://fr.wikipedia.org/wiki/Wikip%C3%A9dia',  | 
 | 20 | +    'it': 'http://it.wikipedia.org/wiki/Wikipedia',  | 
 | 21 | +    'ja': 'http://ja.wikipedia.org/wiki/Wikipedia',  | 
 | 22 | +    'nl': 'http://nl.wikipedia.org/wiki/Wikipedia',  | 
 | 23 | +    'pl': 'http://pl.wikipedia.org/wiki/Wikipedia',  | 
 | 24 | +    'pt': 'http://pt.wikipedia.org/wiki/Wikip%C3%A9dia',  | 
 | 25 | +    'ru': 'http://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D0%BF%D0%B5%D0%B4%D0%B8%D1%8F',  # noqa: E501  | 
26 | 26 | #    u'zh': u'http://zh.wikipedia.org/wiki/Wikipedia',  | 
27 | 27 | }  | 
28 | 28 | 
 
  | 
29 |  | -html_folder = u'html'  | 
30 |  | -text_folder = u'paragraphs'  | 
31 |  | -short_text_folder = u'short_paragraphs'  | 
 | 29 | +html_folder = 'html'  | 
 | 30 | +text_folder = 'paragraphs'  | 
 | 31 | +short_text_folder = 'short_paragraphs'  | 
32 | 32 | n_words_per_short_text = 5  | 
33 | 33 | 
 
  | 
34 | 34 | 
 
  | 
 | 
88 | 88 |         groups = np.array_split(words, n_groups)  | 
89 | 89 | 
 
  | 
90 | 90 |         for group in groups:  | 
91 |  | -            small_content = u" ".join(group)  | 
 | 91 | +            small_content = " ".join(group)  | 
92 | 92 | 
 
  | 
93 | 93 |             short_text_filename = os.path.join(short_text_lang_folder,  | 
94 | 94 |                                                '%s_%04d.txt' % (lang, j))  | 
 | 
0 commit comments