Apriori算法中的辅助函数(二)

2021-03-22 16:25

阅读:405

(1)承接上文,如果有需要的话点击链接(https://www.cnblogs.com/xero/p/13771331.html)
(2)加粗部分是新添加的内容,有颜色的部分是改正的地方。
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
# @Time : 2020/10/5 14:54
# @Author : 沐蓉
# @Version:V 0.1
# @File : apriori.py
# @desc : apriori算法

import copy

# Build the initial demo data set.
def loadDataSet():
    """Return the small demo transaction database used by this script.

    Returns:
        A newly built list of four transactions, each a list of integer
        item ids.
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


# Build the collection of all candidate itemsets of size 1.
def creatC1(dataSet):
    """Build the candidate 1-itemsets (C1) from a transaction database.

    Args:
        dataSet: Iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list of single-item frozensets, one per distinct item, ordered by
        item value. A list (rather than a lazy ``map`` object) is returned so
        the candidates can be traversed more than once.
    """
    # A set removes duplicates in O(1) per item instead of scanning a list.
    unique_items = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(unique_items)]


# creatC1(loadDataSet())

# Scan every transaction and compute the support of each candidate itemset.
def scanD(D, Ck, minSupport):
    """Count candidate itemsets over the transactions and keep frequent ones.

    Args:
        D: Iterable of transactions (sets). It is snapshotted through a deep
            copy instead of being consumed, so a single-pass iterator such
            as ``map(set, data)`` can be handed to several calls.
        Ck: Iterable of candidate itemsets (frozensets); snapshotted the
            same way.
        minSupport: Minimum fraction of transactions that must contain an
            itemset for it to be reported as frequent.

    Returns:
        A ``(retList, supportData)`` tuple. ``retList`` lists the itemsets
        whose support is at least ``minSupport``, in reverse order of first
        occurrence. ``supportData`` maps every candidate that occurred in at
        least one transaction to its support, whether or not it is frequent.
    """
    # Snapshot both inputs exactly once. Copying inside the transaction loop
    # would redo the same work for every record.
    transactions = list(copy.deepcopy(D))
    candidates = list(copy.deepcopy(Ck))

    ssCnt = {}
    for tid in transactions:  # tid is one transaction record
        for can in candidates:  # can is one candidate itemset
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = len(transactions)
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems  # support of this itemset
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Appending then reversing yields the same order as inserting each
    # frequent itemset at the front, without the O(n) shift per insert.
    retList.reverse()
    return retList, supportData


# Build candidate k-itemsets by joining frequent (k-1)-itemsets.
# Idea: two itemsets are merged into one k-itemset when their first k-2
# items (in sorted order) are identical.
def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Args:
        Lk: Iterable of frozensets, each holding k-1 mutually orderable
            items.
        k: Size of the itemsets to generate.

    Returns:
        A list with the union of every pair of itemsets from ``Lk`` whose
        first ``k - 2`` items, taken in sorted order, are equal. Pairs are
        visited in index order (i < j).
    """
    itemsets = list(Lk)
    prefix_len = k - 2
    # Sort BEFORE slicing: set iteration order is arbitrary, so slicing the
    # raw iteration order could compare unrelated items and miss candidates.
    # Computing each prefix once also avoids re-sorting inside the pair loop.
    prefixes = [sorted(itemset)[:prefix_len] for itemset in itemsets]

    retList = []
    for i, left_prefix in enumerate(prefixes):
        for j in range(i + 1, len(itemsets)):
            if left_prefix == prefixes[j]:
                retList.append(itemsets[i] | itemsets[j])
    return retList


def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of a transaction database.

    Args:
        dataSet: Iterable of transactions, each an iterable of hashable
            items.
        minSupport: Minimum support (fraction of transactions) an itemset
            needs in order to be kept.

    Returns:
        A ``(L, supportData)`` tuple. ``L[k - 1]`` is the list of frequent
        k-itemsets; the final entry of ``L`` is always an empty list, which
        is what ends the search. ``supportData`` accumulates the support
        dictionaries returned by every ``scanD`` pass.
    """
    # Materialise the transactions once: a bare map() object is single-pass
    # in Python 3, and this also lets ``dataSet`` itself be a one-shot
    # iterable.
    D = [set(transaction) for transaction in dataSet]
    C1 = creatC1(D)
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while L[k - 2]:  # stop once the previous level has no frequent itemsets
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData


def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: List of frequent-itemset lists; ``L[0]`` (the 1-itemsets) is
            skipped because a rule needs at least two items.
        supportData: Mapping from itemset (frozenset) to its support.
        minConf: Minimum confidence a rule must reach to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            # Start from every single item as a possible consequent.
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Three or more items: consequents may grow beyond one item.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                # Two items: only single-item consequents are possible.
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList


def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Evaluate candidate consequents of one frequent itemset.

    For each consequent in ``H`` the rule ``(freqSet - conseq) -> conseq`` is
    scored as ``support(freqSet) / support(freqSet - conseq)``. Rules that
    reach ``minConf`` are printed and appended to ``br1``.

    Args:
        freqSet: The frequent itemset (frozenset) the rules are built from.
        H: Iterable of candidate consequents (frozensets).
        supportData: Mapping from itemset to support; must contain
            ``freqSet`` and every ``freqSet - conseq``.
        br1: List that receives ``(antecedent, consequent, confidence)``
            tuples; mutated in place.
        minConf: Minimum confidence for a rule to be kept.

    Returns:
        The consequents from ``H`` whose rule met ``minConf``.

    Raises:
        KeyError: If a required itemset is missing from ``supportData``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            print(antecedent, '--->', conseq, 'conf:', round(conf, 2))
            br1.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Recursively generate rules with progressively larger consequents.

    Args:
        freqSet: The frequent itemset (frozenset) the rules are built from.
        H: List of candidate consequents, all of the same size.
        supportData: Mapping from itemset to support.
        br1: List that receives the accepted rules; mutated in place.
        minConf: Minimum confidence for a rule to be kept.

    Returns:
        None. Accepted rules are appended to ``br1`` by ``calcConf``.
    """
    if not H:
        # Nothing to extend; also avoids indexing into an empty list.
        return

    m = len(H[0])
    if m == 1:
        # Single-item consequents have not been scored yet: evaluate them
        # and keep only those that passed.
        H = calcConf(freqSet, H, supportData, br1, minConf)

    if len(freqSet) > m + 1:
        # Grow the surviving consequents by one item. On recursive calls
        # (m > 1) ``H`` already holds the consequents that passed, so it is
        # used directly instead of a name bound only in the m == 1 branch.
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)


if __name__ == '__main__':
    dataSet = loadDataSet()

    # Mine frequent itemsets with the default minimum support (0.5).
    L, suppData = apriori(dataSet)

    # Derive rules with a minimum confidence of 0.5; calcConf prints each
    # rule that is accepted.
    bigRuleList = generateRules(L, suppData, 0.5)

给定最小支持度阈值0.5、最小置信度阈值0.5,运行结果如下:


评论


亲,登录后才可以留言!