@@ -21,27 +21,27 @@ class Data_Factory():
21
21
22
22
def load(self, path):
    """Load the preprocessed rating and document data pickled under *path*.

    Parameters
    ----------
    path : str
        Directory containing ``ratings.all`` and ``document.all``
        (as written by :meth:`save`).

    Returns
    -------
    tuple
        ``(R, D_all)`` — the rating data and the document data.

    NOTE(review): ``pickle.load`` executes arbitrary code when fed an
    untrusted file; only load data this program itself produced.
    """
    ratings_file = path + "/ratings.all"
    # Context manager closes the handle deterministically; the original
    # passed an anonymous open() to pickle and leaked the descriptor.
    with open(ratings_file, "rb") as f:
        R = pickle.load(f)
    print("Load preprocessed rating data - %s" % ratings_file)

    document_file = path + "/document.all"
    with open(document_file, "rb") as f:
        D_all = pickle.load(f)
    print("Load preprocessed document data - %s" % document_file)

    return R, D_all
28
28
29
29
def save(self, path, R, D_all):
    """Pickle the rating data *R* and document data *D_all* under *path*.

    Creates *path* (including parents) when it does not exist, then
    writes ``ratings.all`` and ``document.all`` so that :meth:`load`
    can restore them later.

    Parameters
    ----------
    path : str
        Output directory.
    R : object
        Rating data (pickled to ``ratings.all``).
    D_all : object
        Document data (pickled to ``document.all``).
    """
    if not os.path.exists(path):
        os.makedirs(path)

    ratings_file = path + "/ratings.all"
    print("Saving preprocessed rating data - %s" % ratings_file)
    # Context manager guarantees flush + close; the original handed an
    # anonymous open() to pickle.dump and never closed it, so the write
    # was only completed whenever the GC collected the handle.
    with open(ratings_file, "wb") as f:
        pickle.dump(R, f)
    print("Done!")

    document_file = path + "/document.all"
    print("Saving preprocessed document data - %s" % document_file)
    with open(document_file, "wb") as f:
        pickle.dump(D_all, f)
    print("Done!")
38
38
39
39
def read_rating (self , path ):
40
40
results = []
41
41
if os .path .isfile (path ):
42
42
raw_ratings = open (path , 'r' )
43
43
else :
44
- print ("Path (preprocessed) is wrong!" )
44
+ print ("Path (preprocessed) is wrong!" )
45
45
sys .exit ()
46
46
index_list = []
47
47
rating_list = []
@@ -109,7 +109,7 @@ def read_pretrained_word2vec(self, path, vocab, dim):
109
109
return W
110
110
111
111
def split_data (self , ratio , R ):
112
- print ("Randomly splitting rating data into training set (%.1f) and test set (%.1f)..." % (1 - ratio , ratio ))
112
+ print ("Randomly splitting rating data into training set (%.1f) and test set (%.1f)..." % (1 - ratio , ratio ))
113
113
train = []
114
114
for i in range (R .shape [0 ]):
115
115
user_rating = R [i ].nonzero ()[1 ]
@@ -132,7 +132,7 @@ def split_data(self, ratio, R):
132
132
133
133
num_addition = int ((1 - ratio ) * total_size ) - len (train )
134
134
if num_addition < 0 :
135
- print ('this ratio cannot be handled' )
135
+ print ('this ratio cannot be handled' )
136
136
sys .exit ()
137
137
else :
138
138
train .extend (remain_rating_list [:num_addition ])
@@ -247,7 +247,7 @@ def generate_train_valid_test_file_from_R(self, path, R, ratio):
247
247
f_train_user .close ()
248
248
f_valid_user .close ()
249
249
f_test_user .close ()
250
- print ("\t train_user.dat, valid_user.dat, test_user.dat files are generated." )
250
+ print ("\t train_user.dat, valid_user.dat, test_user.dat files are generated." )
251
251
252
252
f_train_item = open (path + "/train_item.dat" , "w" )
253
253
f_valid_item = open (path + "/valid_item.dat" , "w" )
@@ -331,34 +331,37 @@ def preprocess(self, path_rating, path_itemtext, min_rating,
331
331
# Validate data paths
332
332
if os .path .isfile (path_rating ):
333
333
raw_ratings = open (path_rating , 'r' )
334
- print ("Path - rating data: %s" % path_rating )
334
+ print ("Path - rating data: %s" % path_rating )
335
335
else :
336
- print ("Path(rating) is wrong!" )
336
+ print ("Path(rating) is wrong!" )
337
337
sys .exit ()
338
338
339
339
if os .path .isfile (path_itemtext ):
340
340
raw_content = open (path_itemtext , 'r' )
341
- print ("Path - document data: %s" % path_itemtext )
341
+ print ("Path - document data: %s" % path_itemtext )
342
342
else :
343
- print ("Path(item text) is wrong!" )
343
+ print ("Path(item text) is wrong!" )
344
344
sys .exit ()
345
345
346
346
# 1st scan document file to filter items which have documents
347
347
tmp_id_plot = set ()
348
348
all_line = raw_content .read ().splitlines ()
349
- #content format:(1::a little boy |)
349
+ # content format:(1::a little boy |)
350
350
for line in all_line :
351
351
tmp = line .split ('::' )
352
352
i = tmp [0 ]
353
- tmp_plot = tmp [1 ].split ('|' )
353
+ try :
354
+ tmp_plot = tmp [1 ].split ('|' )
355
+ except :
356
+ print (tmp [0 ])
354
357
if tmp_plot [0 ] == '' :
355
358
continue
356
- #tmp_id_plot to remove rating that has no content
359
+ # tmp_id_plot to remove rating that has no content
357
360
tmp_id_plot .add (i )
358
361
raw_content .close ()
359
362
360
- print ("Preprocessing rating data..." )
361
- print ("\t Counting # ratings of each user and removing users having less than %d ratings..." % min_rating )
363
+ print ("Preprocessing rating data..." )
364
+ print ("\t Counting # ratings of each user and removing users having less than %d ratings..." % min_rating )
362
365
# 1st scan rating file to check # ratings of each user
363
366
all_line = raw_ratings .read ().splitlines ()
364
367
tmp_user = {}
@@ -420,11 +423,11 @@ def preprocess(self, path_rating, path_itemtext, min_rating,
420
423
# sparse matrix
421
424
R = csr_matrix ((rating , (user , item )))
422
425
423
- print ("Finish preprocessing rating data - # user: %d, # item: %d, # ratings: %d" % (R .shape [0 ], R .shape [1 ], R .nnz ))
426
+ print ("Finish preprocessing rating data - # user: %d, # item: %d, # ratings: %d" % (R .shape [0 ], R .shape [1 ], R .nnz ))
424
427
425
428
# 2nd scan document file to make idx2plot dictionary according to
426
429
# indices of items in rating matrix
427
- print ("Preprocessing item document..." )
430
+ print ("Preprocessing item document..." )
428
431
429
432
# Read Document File
430
433
raw_content = open (path_itemtext , 'r' )
@@ -439,8 +442,8 @@ def preprocess(self, path_rating, path_itemtext, min_rating,
439
442
eachid_plot = (' ' .join (tmp_plot )).split ()[:max_length ]
440
443
map_idtoplot [i ] = ' ' .join (eachid_plot )
441
444
442
- print ("\t Removing stop words..." )
443
- print ("\t Filtering words by TF-IDF score with max_df: %.1f, vocab_size: %d" % (_max_df , _vocab_size ))
445
+ print ("\t Removing stop words..." )
446
+ print ("\t Filtering words by TF-IDF score with max_df: %.1f, vocab_size: %d" % (_max_df , _vocab_size ))
444
447
445
448
# Make vocabulary by document
446
449
vectorizer = TfidfVectorizer (max_df = _max_df , stop_words = {
@@ -466,6 +469,6 @@ def preprocess(self, path_rating, path_itemtext, min_rating,
466
469
'X_vocab' : X_vocab ,
467
470
}
468
471
469
- print ("Finish preprocessing document data!" )
472
+ print ("Finish preprocessing document data!" )
470
473
471
474
return R , D_all
0 commit comments