Commit cf88cfa

Author: captain
Commit message: remove useless code
1 parent 75a2b81 commit cf88cfa

16 files changed, +94 -29171 lines

Untitled.ipynb

Lines changed: 0 additions & 318 deletions
This file was deleted.

__pycache__/models.cpython-36.pyc

-169 Bytes
Binary file not shown.

ml-1m/0.2/test_item.dat

Lines changed: 0 additions & 3544 deletions
This file was deleted.

ml-1m/0.2/test_user.dat

Lines changed: 0 additions & 6040 deletions
This file was deleted.

ml-1m/0.2/train_item.dat

Lines changed: 0 additions & 3544 deletions
This file was deleted.

ml-1m/0.2/train_user.dat

Lines changed: 0 additions & 6040 deletions
This file was deleted.

ml-1m/0.2/valid_item.dat

Lines changed: 0 additions & 3544 deletions
This file was deleted.

ml-1m/0.2/valid_user.dat

Lines changed: 0 additions & 6040 deletions
This file was deleted.

ml-1m/Untitled.ipynb

Lines changed: 0 additions & 68 deletions
This file was deleted.

ml-1m/document.all

-13.5 MB
Binary file not shown.

ml-1m/ratings.all

-11.4 MB
Binary file not shown.

models.py

Lines changed: 30 additions & 18 deletions
@@ -6,8 +6,17 @@
 from util import eval_RMSE
 import math
 import numpy as np
-from cnn_model import CNN
+from text_analysis.cnn_model import CNN
 from torch.autograd import Variable
+import torch
+
+'''
+Unresolved issues:
+1. word_embedding
+2. batch
+3. What exactly is give_item_weight used for?
+4. How is seed used in the model?
+'''
 
 
 def ConvMF(res_dir, train_user, train_item, valid_user, test_user,
@@ -31,6 +40,7 @@ def ConvMF(res_dir, train_user, train_item, valid_user, test_user,
     Test_R = test_user[1]
     Valid_R = valid_user[1]
 
+    # What exactly is this part used for?
     if give_item_weight is True:
         item_weight = np.array([math.sqrt(len(i))
                                 for i in Train_R_J], dtype=float)
@@ -40,17 +50,15 @@ def ConvMF(res_dir, train_user, train_item, valid_user, test_user,
 
     pre_val_eval = 1e10
 
-    # dimension: latent of dimension for users and items
-    # emb_dim: Size of latent dimension for word vectors
+    # dimension: dimensionality of the user and item latent features
+    # emb_dim: dimensionality of the word vectors
     cnn_module = CNN(dimension, vocab_size, dropout_rate,
-                    emb_dim, max_len, num_kernel_per_ws, init_W)
+                     emb_dim, max_len, num_kernel_per_ws, init_W)
 
     # return the output of CNN
-    # size of V is (dimension, num_item)
-    theta = cnn_module(Variable(CNN_X))
+    # size of V is (num_item, dimension)
+    cnn_module = cnn_module.cuda()
     theta = cnn_module.get_projection_layer(CNN_X)
-    np.random.seed(133)
-    # dimension is the k
     U = np.random.uniform(size=(num_user, dimension))
     V = theta
 
@@ -98,25 +106,29 @@ def ConvMF(res_dir, train_user, train_item, valid_user, test_user,
         loss = loss + np.sum(sub_loss)
         seed = np.random.randint(100000)
 
-        # important
-        history = cnn_module.train(CNN_X, V, item_weight, seed)
+        # train the CNN model on V, then update V
+        cnn_module.train(CNN_X, V)
         theta = cnn_module.get_projection_layer(CNN_X)
 
-        cnn_loss = history.history['loss'][-1]
+        # TODO: add the computation of the CNN model's loss here
+        # cnn_loss = history.history['loss'][-1]
 
-        loss -= 0.5 * lambda_v * cnn_loss * num_item
+        # loss -= 0.5 * lambda_v * cnn_loss * num_item
 
         tr_eval = eval_RMSE(Train_R_I, U, V, train_user[0])
         val_eval = eval_RMSE(Valid_R, U, V, valid_user[0])
         te_eval = eval_RMSE(Test_R, U, V, test_user[0])
 
+        # time a single iteration
         toc = time.time()
         elapsed = toc - tic
 
+        # compute the rate at which the loss decreases
        converge = abs((loss - PREV_LOSS) / PREV_LOSS)
 
+        # save the model parameters
        if val_eval < pre_val_eval:
-            cnn_module.save_model(res_dir + '/CNN_weights.hdf5')
+            # cnn_module.save_model(res_dir + '/CNN_weights.hdf5')
             np.savetxt(res_dir + '/U.dat', U)
             np.savetxt(res_dir + '/V.dat', V)
             np.savetxt(res_dir + '/theta.dat', theta)
@@ -125,12 +137,12 @@ def ConvMF(res_dir, train_user, train_item, valid_user, test_user,
 
         pre_val_eval = val_eval
 
-        print("Loss: %.5f Elpased: %.4fs Converge: %.6f Tr: %.5f Val: %.5f Te: %.5f" % (
-            loss, elapsed, converge, tr_eval, val_eval, te_eval))
-        f1.write("Loss: %.5f Elpased: %.4fs Converge: %.6f Tr: %.5f Val: %.5f Te: %.5f\n" % (
-            loss, elapsed, converge, tr_eval, val_eval, te_eval))
+        print("Elapsed: %.4fs Converge: %.6f Tr: %.5f Val: %.5f Te: %.5f" % (
+            elapsed, converge, tr_eval, val_eval, te_eval))
+        f1.write("Elapsed: %.4fs Converge: %.6f Tr: %.5f Val: %.5f Te: %.5f\n" % (
+            elapsed, converge, tr_eval, val_eval, te_eval))
 
-        # endure_count = 5
+        # quit the training loop once the count exceeds five
         if count == endure_count:
             break

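With this change the commented-out cnn_loss term is lost, since the PyTorch port has no Keras-style history object. If the dropped term is wanted back, a minimal sketch (hypothetical helper, not in this commit) can recover it directly from theta and V, following the ConvMF objective's penalty that ties item factors to the CNN projections:

import numpy as np

def cnn_reg_loss(V, theta, lambda_v):
    # ConvMF ties item factors V to the CNN projections theta;
    # this is the (lambda_v / 2) * sum_j ||v_j - theta_j||^2 penalty.
    return 0.5 * lambda_v * np.sum(np.square(V - theta))

The loop could then restore the term as loss -= cnn_reg_loss(V, theta, lambda_v) in place of the commented-out line.
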
run.py

Lines changed: 11 additions & 3 deletions
@@ -1,9 +1,10 @@
-#coding:utf-8
+# coding:utf-8
 
 import argparse
 import sys
 import os
 from data_manager import Data_Factory
+import numpy as np
 
 parser = argparse.ArgumentParser()
 
@@ -39,7 +40,7 @@
 parser.add_argument("-e", "--emb_dim", type=int,
                     help="Size of latent dimension for word vectors (default: 200)", default=200)
 parser.add_argument("-p", "--pretrain_w2v", type=str,
-                    help="Path to pretrain word embedding model to initialize word vectors")
+                    help="Path to pretrain word embedding model to initialize word vectors", default=None)
 parser.add_argument("-g", "--give_item_weight", type=bool,
                     help="True or False to give item weight of ConvMF (default = False)", default=True)
 parser.add_argument("-k", "--dimension", type=int,
@@ -136,7 +137,14 @@
 valid_user = data_factory.read_rating(data_path + '/valid_user.dat')
 test_user = data_factory.read_rating(data_path + '/test_user.dat')
 
+# pad CNN_X to handle texts of different lengths
+input_array = np.full((len(CNN_X), 300), 8000)
+for i in range(len(CNN_X)):
+    for j in range(len(CNN_X[i])):
+        input_array[i][j] = CNN_X[i][j]
+
+# use the newly padded text data
 ConvMF(max_iter=max_iter, res_dir=res_dir,
        lambda_u=lambda_u, lambda_v=lambda_v, dimension=dimension, vocab_size=vocab_size, init_W=init_W,
-       give_item_weight=give_item_weight, CNN_X=CNN_X, emb_dim=emb_dim, num_kernel_per_ws=num_kernel_per_ws,
+       give_item_weight=give_item_weight, CNN_X=input_array, emb_dim=emb_dim, num_kernel_per_ws=num_kernel_per_ws,
        train_user=train_user, train_item=train_item, valid_user=valid_user, test_user=test_user, R=R)

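The padding loop added here assumes every document fits in 300 tokens and would raise IndexError on a longer one. A more defensive sketch (hypothetical helper; 300 and 8000 are this commit's assumed max length and padding index) that also truncates over-long documents:

import numpy as np

def pad_cnn_input(CNN_X, max_len=300, pad_idx=8000):
    # Fill a (num_docs, max_len) matrix with the padding index,
    # then copy each sequence into its row, truncating anything
    # longer than max_len instead of crashing.
    out = np.full((len(CNN_X), max_len), pad_idx, dtype=np.int64)
    for i, doc in enumerate(CNN_X):
        doc = doc[:max_len]
        out[i, :len(doc)] = doc
    return out
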
run_test_ConvMF.sh

Lines changed: 1 addition & 1 deletion
@@ -4,13 +4,13 @@ python ./run.py \
 -a ./data/preprocessed/ml-1m/ \
 -o ./result/ml-1m/1_100_200 \
 -e 50 \
--p ./data/glove/glove.6B.50d.txt \
 -u 10 \
 -v 100 \
 -g True
 
 
 ##!/usr/bin/env bash
+# -p ./data/glove.6B/glove.6B.50d.txt \
 #python ./run.py \
 #-d ./data/preprocessed/aiv/0.2/ \
 #-a ./data/preprocessed/aiv/ \

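Dropping the -p flag means pretrain_w2v falls back to run.py's new default=None, so the CNN keeps its randomly initialized nn.Embedding. If pretrained vectors are wanted again later, a minimal loader sketch for the standard glove.6B text format could look like this (the vocab dict and the extra padding row are assumptions mirroring the commit's vocab_size + 1 embedding):

import numpy as np

def load_glove(path, vocab, emb_dim=50):
    # vocab maps word -> index; the final row is left for the padding token.
    # Unknown words keep their random initialization.
    W = np.random.normal(scale=0.1, size=(len(vocab) + 1, emb_dim))
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if parts[0] in vocab:
                W[vocab[parts[0]]] = np.asarray(parts[1:], dtype=np.float32)
    return W
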
cnn_model.py renamed to text_analysis/cnn_model.py

Lines changed: 51 additions & 11 deletions
@@ -4,14 +4,18 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import numpy as np
 import torch.optim as optim
 from torch.autograd import Variable
 
 
 class CNN(nn.Module):
+    batch_size = 128
+    # more epochs than this easily over-fit on our data sets
+    nb_epoch = 5
+
     def __init__(self, output_dimesion, vocab_size, dropout_rate, emb_dim, max_len, n_filters, init_W=None):
-        # number_filters
-        print(type(self))
+        # n_filters is the number of convolution kernels
         super(CNN, self).__init__()
 
         self.max_len = max_len
@@ -21,9 +25,10 @@ def __init__(self, output_dimesion, vocab_size, dropout_rate, emb_dim, max_len,
         self.qual_conv_set = {}
 
         '''Embedding Layer'''
-        if init_W is None:
-            # try a randomly initialized embedding first
-            self.embedding = nn.Embedding(vocab_size, emb_dim)
+        # if init_W is None:
+        #     # the last index is the padding token
+        #     # try randomly generated word vectors first
+        self.embedding = nn.Embedding(vocab_size + 1, emb_dim)
 
         self.conv1 = nn.Sequential(
             # activation function of the convolution layer
@@ -53,22 +58,57 @@ def __init__(self, output_dimesion, vocab_size, dropout_rate, emb_dim, max_len,
         # output_layer = Dense(projection_dimension, activation='tanh')(layer)
         self.output_layer = nn.Linear(vanila_dimension, projection_dimension)
 
-    def forward(self, input):
-        embeds = self.embedding(input)
+    def forward(self, inputs):
+        size = len(inputs)
+        embeds = self.embedding(inputs)
+
         # before the conv layers, the tensor's second dimension must become emb_dim, the number of convolution channels
         embeds = embeds.view([len(embeds), self.emb_dim, -1])
         # concatenate the tensors
         x = self.conv1(embeds)
         y = self.conv2(embeds)
         z = self.conv3(embeds)
-        flatten = torch.cat((x.view(-1), y.view(-1), z.view(-1)), 1)
+        flatten = torch.cat((x.view(size, -1), y.view(size, -1), z.view(size, -1)), 1)
 
         out = F.tanh(self.layer(flatten))
         out = self.dropout(out)
         out = F.tanh(self.output_layer(out))
 
-    def train(self, X_train, V, item_weight, seed):
-        pass
+        return out
+
+    def train(self, X_train, V):
+
+        # learning rate fixed at 0.001 for now
+        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
+
+        for epoch in range(1, self.nb_epoch + 1):
+
+            print('<---epoch' + str(epoch))
+            n_batch = len(X_train) // self.batch_size
+
+            # this misses part of the training set; leave it this way for now
+            for i in range(n_batch):
+                begin_idx, end_idx = i * self.batch_size, (i + 1) * self.batch_size
+                feature = X_train[begin_idx:end_idx][...]
+                target = V[begin_idx:end_idx][...]
+
+                feature = Variable(torch.from_numpy(feature.astype('int64')).long())
+                target = Variable(torch.from_numpy(target))
+                feature, target = feature.cuda(), target.cuda()
+
+                optimizer.zero_grad()
+                logit = self(feature)
+
+                loss = F.mse_loss(logit, target)
+                loss.backward()
+                optimizer.step()
+
+    def get_projection_layer(self, X_train):
+        inputs = Variable(torch.from_numpy(X_train.astype('int64')).long())
+        inputs = inputs.cuda()
+        outputs = self(inputs)
+        return outputs.cpu().data.numpy()
 
 
 # get the output of the CNN model
 
@@ -78,7 +118,7 @@ def train(self, X_train, V, item_weight, seed):
 # np.random.seed(seed)
 # X_train = np.random.permutation(X_train)
 # np.random.seed(seed)
-# V = np.random.permutation(V)
+# V = np.random.permutation(V)ojecti
 # np.random.seed(seed)
 # item_weight = np.random.permutation(item_weight)
 #

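Two side notes on this class: defining train(self, X_train, V) shadows PyTorch's built-in nn.Module.train(mode), and the loop's own comment admits it drops the trailing partial batch. A sketch of a batching helper that covers every sample (hypothetical, not part of this commit):

def iterate_minibatches(X, V, batch_size=128):
    # Yield feature/target slices covering the whole training set,
    # including the final partial batch the commit's loop skips.
    for begin in range(0, len(X), batch_size):
        yield X[begin:begin + batch_size], V[begin:begin + batch_size]

Relatedly, nn.Embedding(vocab_size + 1, emb_dim, padding_idx=vocab_size) would pin the reserved padding row to zeros instead of letting it train like an ordinary word.
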
text_analysis/models.py

Lines changed: 1 addition & 0 deletions
@@ -139,5 +139,6 @@ def train(self, X_train, V, item_weight, seed):
 
     def get_projection_layer(self, X_train):
         X_train = sequence.pad_sequences(X_train, maxlen=self.max_len)
+        X_train = X_train.cuda()
         Y = self.model.predict(X_train, batch_size=len(X_train))
         return Y

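One caveat on the line added here: sequence.pad_sequences returns a NumPy array, which has no .cuda() method, so this Keras-style get_projection_layer would raise AttributeError at runtime. Assuming the intent was simply to predict on the padded batch, the surrounding two lines already do that, so the method could read:

    def get_projection_layer(self, X_train):
        # Keras consumes NumPy arrays directly; no device transfer is needed
        X_train = sequence.pad_sequences(X_train, maxlen=self.max_len)
        return self.model.predict(X_train, batch_size=len(X_train))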