朴素贝叶斯(离散型+连续型)

本文介绍了朴素贝叶斯算法在处理数据集时,如何应对既有离散特征又有连续特征的情况。通过正态分布假设每个连续特征的条件概率,并利用训练集的均值和方差进行估算。此外,提到了将连续特征转化为离散特征的另一种方法,但因简单而未予实现。

讲道理上次写完离散型朴素贝叶斯的实现,这次得写连续型的了,考虑到还有离散型+连续型的情况(也考虑到我懒),即数据集里的特征既有离散特征又有连续特征,就一并写了吧o(* ̄▽ ̄*)ブ


上次讲到了朴素贝叶斯的思想,本质上就是假设各个特征在给定类别的条件下相互独立;对于连续型特征,我们再假设它在每个类别下的条件概率服从正态分布;

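于是乎对于连续型的特征,我们可以用它在训练集上按类别分别计算出的均值 $\mu_{ik}$ 和标准差 $\sigma_{ik}$ 去估算新来样本的条件概率:$p(x_i \mid y=k) = \frac{1}{\sqrt{2\pi}\,\sigma_{ik}} \exp\left(-\frac{(x_i-\mu_{ik})^2}{2\sigma_{ik}^2}\right)$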


然后就和离散型一样啦~


import numpy as np
import math

# Module-level shorthand for math.pi.
pi = math.pi

class NaiveBayesClassifier(object):
    """Naive Bayes classifier for mixed discrete and continuous features.

    Discrete features are modelled with Laplace-smoothed categorical
    distributions; continuous features are modelled with one Gaussian per
    (feature, class) pair whose mean and standard deviation are estimated
    from the training data.

    Attributes set by ``fit``:
        x: raw feature matrix (strings) read from the training file.
        y: raw label vector (strings) read from the training file.
        feat_dics: one ``{category: index}`` dict per discrete feature.
        label_dic: ``{label: class index}``.
        dic_label: ``{class index: label}``.
        pri: smoothed class priors, ordered by class index.
        con: nested lists ``con[class][feature][category]`` holding the
            smoothed conditional probability of each discrete category.
        is_continue: boolean mask over feature columns (True = continuous),
            or None when every feature is discrete.
        cont_con: float32 array ``cont_con[feature][class] = (mean, std)``
            for the continuous features, or None.
    """

    # Lower bound on a Gaussian's standard deviation so that a feature that
    # is constant within a class does not cause a division by zero.
    _MIN_STD = 1e-6

    def __init__(self):
        # Separate list objects: chained assignment would alias x and y.
        self.x = []
        self.y = []
        self.feat_dics = self.label_dic = self.dic_label = self.pri = None
        self.con = []
        self.is_continue = self.cont_con = None

    def pre(self, x, y):
        """Encode discrete features and labels as integer indices.

        Builds ``feat_dics``, ``label_dic`` and ``dic_label`` as a side
        effect. Categories and labels are sorted before numbering so the
        encoding is deterministic across runs.

        Args:
            x: 2-D array-like of discrete feature values, one row per sample.
            y: 1-D array-like of labels.

        Returns:
            Tuple ``(x_encoded, y_encoded)`` of integer NumPy arrays.
        """
        columns = list(zip(*x))
        self.feat_dics = [
            {value: idx for idx, value in enumerate(sorted(set(column)))}
            for column in columns
        ]
        x = np.array(
            [[self.feat_dics[i][value] for i, value in enumerate(sample)]
             for sample in x],
            dtype=np.int64,
        )
        # Keep the (n_samples, n_features) shape even with zero features.
        x = x.reshape(len(x), len(self.feat_dics))
        self.label_dic = {
            label: idx for idx, label in enumerate(sorted(set(y)))
        }
        # int64 rather than int8 so more than 127 classes do not overflow.
        y = np.array([self.label_dic[label] for label in y], dtype=np.int64)
        self.dic_label = {idx: label for label, idx in self.label_dic.items()}
        return x, y

    def fit(self, path, lb=1, is_continue=None):
        """Estimate the model parameters from a comma-separated file.

        Every non-blank line is one sample; the last column is the label.

        Args:
            path: path of the UTF-8 encoded training file.
            lb: additive (Laplace) smoothing constant.
            is_continue: optional boolean sequence with one entry per
                feature column (label excluded); True marks a continuous
                feature. None means all features are discrete.

        Raises:
            ValueError: if the file has no samples, rows differ in length,
                or ``is_continue`` does not match the number of features.
        """
        # Read into a local list so that calling fit() again starts clean.
        rows = []
        with open(path, "r", encoding="UTF-8") as file:
            for line in file:
                line = line.strip()
                if line:
                    rows.append(line.split(","))
        if not rows:
            raise ValueError("training file contains no samples")
        if len({len(row) for row in rows}) != 1:
            raise ValueError("all rows must have the same number of columns")

        tar_idx = -1
        self.y = np.array([row.pop(tar_idx) for row in rows])
        self.x = np.array(rows).reshape(len(rows), -1)

        if is_continue is not None:
            # Force bool dtype so that ~ is a logical (not bitwise) negation.
            self.is_continue = np.array(is_continue, dtype=bool)
            if self.is_continue.shape != (self.x.shape[1],):
                raise ValueError(
                    "is_continue must have one entry per feature column")
            x = self.x[:, ~self.is_continue]
        else:
            # Drop any mask left over from a previous fit.
            self.is_continue = None
            x = self.x

        x, y = self.pre(x, self.y)
        n_class = len(self.label_dic)
        class_counts = np.bincount(y, minlength=n_class)

        self.pri = [
            (class_counts[k] + lb) / (len(y) + lb * n_class)
            for k in range(n_class)
        ]

        # P(feature = category | class): the denominator is the number of
        # samples of that class, so each distribution sums to one. Kept as
        # nested lists because features may have different cardinalities.
        self.con = []
        for k in range(n_class):
            x_k = x[y == k]
            per_feature = []
            for i, feat_dic in enumerate(self.feat_dics):
                counts = np.bincount(x_k[:, i], minlength=len(feat_dic))
                probs = (counts + lb) / (len(x_k) + lb * len(feat_dic))
                per_feature.append(probs.tolist())
            self.con.append(per_feature)

        if self.is_continue is not None:
            x_cont = np.asarray(self.x[:, self.is_continue], dtype=np.float32)
            stats = []
            for i in range(x_cont.shape[1]):
                per_class = []
                for k in range(n_class):
                    values = x_cont[y == k, i]
                    per_class.append([np.mean(values), np.std(values)])
                stats.append(per_class)
            self.cont_con = np.asarray(stats, dtype=np.float32)
        else:
            self.cont_con = None

    def predict(self, x):
        """Predict the label of each sample.

        Args:
            x: 2-D array-like of raw feature values with the same column
                layout as the training features (a 1-D input is treated as
                a single sample).

        Returns:
            List with the predicted label of every sample.

        Raises:
            RuntimeError: if the classifier has not been fitted.
            KeyError: if a discrete feature holds a category that was not
                seen during training.
        """
        if self.label_dic is None:
            raise RuntimeError("fit() must be called before predict()")

        x = np.asarray(x)
        if x.ndim == 1:
            x = x.reshape(1, -1)

        if self.is_continue is not None:
            x_cont = np.asarray(x[:, self.is_continue], dtype=np.float64)
            x_disc = x[:, ~self.is_continue]
        else:
            x_cont = None
            x_disc = x

        try:
            encoded = [
                [self.feat_dics[j][value] for j, value in enumerate(sample)]
                for sample in x_disc
            ]
        except KeyError as err:
            raise KeyError(
                "unseen category {!r} in a discrete feature".format(
                    err.args[0])) from None
        encoded = np.array(encoded, dtype=np.int64).reshape(
            len(x), len(self.feat_dics))

        n_class = len(self.label_dic)
        # Work in log space: products of many small probabilities underflow.
        # log(0) = -inf is acceptable here (it can only occur with lb == 0).
        with np.errstate(divide="ignore"):
            log_post = np.tile(
                np.log(np.asarray(self.pri, dtype=np.float64)), (len(x), 1))
            for k in range(n_class):
                for j in range(len(self.feat_dics)):
                    log_probs = np.log(
                        np.asarray(self.con[k][j], dtype=np.float64))
                    log_post[:, k] += log_probs[encoded[:, j]]

        if x_cont is not None:
            for i in range(x_cont.shape[1]):
                # cont_con is laid out as [feature][class][mean, std].
                mean = self.cont_con[i, :, 0].astype(np.float64)
                std = np.maximum(
                    self.cont_con[i, :, 1].astype(np.float64), self._MIN_STD)
                z = (x_cont[:, i, None] - mean[None, :]) / std[None, :]
                # Log of the normal density N(x; mean, std**2).
                log_post += (-0.5 * z ** 2
                             - np.log(std)[None, :]
                             - 0.5 * np.log(2 * np.pi))

        return [self.dic_label[int(np.argmax(row))] for row in log_post]



这里输入的is_continue是一个bool型的数组,长度等于特征的个数(不包含最后一列的标签),用以标记哪些特征是连续型特征(True表示连续型,False表示离散型)


多提一句,其实这只是连续型的处理方法之一,另一种方式是设定阈值将连续型特征转变成离散型特征(for example, 我们设定年龄,40岁以下为年龄小,40岁以上为成熟)

考虑到这种方法实现起来比较简单(我懒*2),就不实现了( ̄▽ ̄)"


收工

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值