Using neural networks for spam detection: LSTM or CNN (1-D convolution) both work well [the code has problems, pass]

Overview:
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from sklearn.neural_network import MLPClassifier
from tflearn.layers.normalization import local_response_normalization
from tensorflow.contrib import learn


max_features=500
max_document_length=1024



def load_one_file(filename):
    x=""
    with open(filename) as f:
        for line in f:
            line=line.strip('\n')
            line = line.strip('\r')
            x+=line
    return x

def load_files_from_dir(rootdir):
    # Read every regular file directly under rootdir into a list of strings.
    x=[]
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            v=load_one_file(path)
            x.append(v)
    return x

def load_all_files():
    ham=[]
    spam=[]
    for i in range(1,5):
        path="../data/mail/enron%d/ham/" % i
        print("Load %s" % path)
        ham+=load_files_from_dir(path)
        path="../data/mail/enron%d/spam/" % i
        print("Load %s" % path)
        spam+=load_files_from_dir(path)
    return ham,spam

def get_features_by_wordbag():
    ham, spam=load_all_files()
    x=ham+spam
    y=[0]*len(ham)+[1]*len(spam)
    vectorizer = CountVectorizer(
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1 )
    print(vectorizer)
    x=vectorizer.fit_transform(x)
    x=x.toarray()
    return x,y

def show_diffrent_max_features():
    global max_features
    a=[]
    b=[]
    for i in range(1000,20000,2000):
        max_features=i
        print("max_features=%d" % i)
        x, y = get_features_by_wordbag()
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
        gnb = GaussianNB()
        gnb.fit(x_train, y_train)
        y_pred = gnb.predict(x_test)
        score=metrics.accuracy_score(y_test, y_pred)
        a.append(max_features)
        b.append(score)
    plt.plot(a, b, 'r', label="GaussianNB")
    plt.xlabel("max_features")
    plt.ylabel("metrics.accuracy_score")
    plt.title("metrics.accuracy_score VS max_features")
    plt.legend()
    plt.show()

def do_nb_wordbag(x_train, x_test, y_train, y_test):
    print("NB and wordbag")
    gnb = GaussianNB()
    gnb.fit(x_train,y_train)
    y_pred=gnb.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

def do_svm_wordbag(x_train, x_test, y_train, y_test):
    print("SVM and wordbag")
    clf = svm.SVC()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

def get_features_by_wordbag_tfidf():
    ham, spam=load_all_files()
    x=ham+spam
    y=[0]*len(ham)+[1]*len(spam)
    vectorizer = CountVectorizer(binary=True,
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1 )
    print(vectorizer)
    x=vectorizer.fit_transform(x)
    x=x.toarray()
    transformer = TfidfTransformer(smooth_idf=False)
    print(transformer)
    tfidf = transformer.fit_transform(x)
    x = tfidf.toarray()
    return  x,y


def do_cnn_wordbag(trainX, testX, trainY, testY):
    global max_document_length
    print("CNN and tf")

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None,max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100,run_id="spam")

def do_rnn_wordbag(trainX, testX, trainY, testY):
    global max_document_length
    print("RNN and wordbag")

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10,run_id="spm-run",n_epoch=5)


def do_dnn_wordbag(x_train, x_test, y_train, y_test):
    print("DNN and wordbag")

    # Building deep neural network
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes = (5, 2),
                        random_state = 1)
    print(clf)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))



def  get_features_by_tf():
    global  max_document_length
    x=[]
    y=[]
    ham, spam=load_all_files()
    x=ham+spam
    y=[0]*len(ham)+[1]*len(spam)
    vp=tflearn.data_utils.VocabularyProcessor(max_document_length=max_document_length,
                                              min_frequency=0,
                                              vocabulary=None,
                                              tokenizer_fn=None)
    x=vp.fit_transform(x, unused_y=None)
    x=np.array(list(x))
    return x,y



if __name__ == "__main__":
    print("Hello spam-mail")
    #print "get_features_by_wordbag"
    #x,y=get_features_by_wordbag()
    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)

    #print "get_features_by_wordbag_tfidf"
    #x,y=get_features_by_wordbag_tfidf()
    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
    #NB
    #do_nb_wordbag(x_train, x_test, y_train, y_test)
    #show_diffrent_max_features()

    #SVM
    #do_svm_wordbag(x_train, x_test, y_train, y_test)

    #DNN
    #do_dnn_wordbag(x_train, x_test, y_train, y_test)

    print("get_features_by_tf")
    x,y=get_features_by_tf()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
    #CNN
    do_cnn_wordbag(x_train, x_test, y_train, y_test)


    #RNN
    #do_rnn_wordbag(x_train, x_test, y_train, y_test)

When you write your own detection code, remember to compare several algorithms on the same features as well; a minimal comparison sketch follows.
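
As a rough, hypothetical sketch (not from the original post), the loop below trains GaussianNB, an SVM, and an MLPClassifier on the same bag-of-words features and prints their accuracies side by side; it assumes the get_features_by_wordbag() function from the listing above.

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import svm, metrics

def compare_classifiers():
    # Reuse the bag-of-words features defined in the listing above.
    x, y = get_features_by_wordbag()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    candidates = {
        "GaussianNB": GaussianNB(),
        "SVM": svm.SVC(),
        "MLP": MLPClassifier(solver='lbfgs', alpha=1e-5,
                             hidden_layer_sizes=(5, 2), random_state=1),
    }
    for name, clf in candidates.items():
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        print("%s accuracy: %.4f" % (name, metrics.accuracy_score(y_test, y_pred)))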

#1 [author bonelee, 2017-12-11 16:12]
Something feels off here: an LSTM needs to remember positions within a sequence, but the bag-of-words model destroys the original word order!
http://www.cnblogs.com/bonelee/p/7639920.html
See this DGA LSTM article; that should be the correct approach!
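
A minimal sketch illustrating this point (added here as an example, assuming scikit-learn and the TF 1.x contrib module are available): a bag-of-words vector is identical for two texts that contain the same words in a different order, while VocabularyProcessor keeps the order as a sequence of word ids.

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.contrib import learn
import numpy as np

docs = ["free money click now", "now click money free"]  # same words, different order

# Bag of words: both documents map to exactly the same count vector.
bow = CountVectorizer().fit_transform(docs).toarray()
print(bow)    # two identical rows; the order information is gone

# VocabularyProcessor: each document becomes an ordered sequence of word ids.
vp = learn.preprocessing.VocabularyProcessor(max_document_length=4)
seqs = np.array(list(vp.fit_transform(docs)))
print(seqs)   # two different rows; the order is preserved for the LSTM/CNN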
  
#2 [author bonelee, 2017-12-11 16:23]
The code really does have problems!
Both the RNN and the CNN should encode the raw text directly! Use:
Vocabulary Processor

tflearn.data_utils.VocabularyProcessor (max_document_length, min_frequency=0, vocabulary=None, tokenizer_fn=None)

Maps documents to sequences of word ids.



tf.contrib.learn.preprocessing.VocabularyProcessor (max_document_length, min_frequency=0, vocabulary=None, tokenizer_fn=None)

Parameters:
max_document_length: maximum document length. Texts longer than this are truncated; shorter ones are padded with 0.
min_frequency: minimum word frequency; words that occur fewer times than this are not added to the vocabulary.
vocabulary: a CategoricalVocabulary object.
tokenizer_fn: tokenizer function.

Code:

from tensorflow.contrib import learn
import numpy as np
max_document_length = 4
x_text =[
'i love you',
'me too'
]
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
vocab_processor.fit(x_text)
print(next(vocab_processor.transform(['i me too'])).tolist())
x = np.array(list(vocab_processor.fit_transform(x_text)))
print(x)

[1, 4, 5, 0]
[[1 2 3 0]
[4 5 0 0]]

Documentation: http://tflearn.org/data_utils/

This article was reposted from the 张昺华-sky blog on cnblogs; original link: http://www.cnblogs.com/bonelee/p/7908573.html. Please contact the original author before reprinting.

