楼主: shadowaver
1656 0

[程序分享] 利用python的apriori算法做购物篮关联分析 [推广有奖]

随心所欲不逾矩

已卖:1243份资源

教授

56%

还不是VIP/贵宾

-

威望
0
论坛币
8309 个
通用积分
689.8825
学术水平
18 点
热心指数
22 点
信用等级
13 点
经验
29709 点
帖子
890
精华
0
在线时间
1382 小时
注册时间
2007-9-27
最后登录
2026-2-6

楼主
shadowaver 在职认证  发表于 2023-4-20 16:10:48 |AI写论文

+2 论坛币
k人 参与回答

经管之家送您一份

应届毕业生专属福利!

求职就业群
赵安豆老师微信:zhaoandou666

经管之家联合CDA

送您一个全额奖学金名额~ !

感谢您参与论坛问题回答

经管之家送您两个论坛币!

+2 论坛币
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 29 15:44:11 2023

@author: tony.song
"""

import pandas as pd
import numpy as np

ndarr=np.array(([1,2,3],[4,5,6]))
print(ndarr)

ndarr1=np.arange(0,20).reshape(5,4)

ndarr2=np.ones((5,5))

ndarr2 not in ndarr1

ndarr1.T.dot(ndarr2)

ndarr3=np.linspace(0,100,20)

type(ndarr3)

np.random.random()
np.random.random(3)

a=[1,2,3]
aa=np.array(a)

list(a)
b='123'
list(b)

#ndarray强复制与浅复制
#强复制
#浅复制

import matplotlib.pyplot as plt
import pandas as pd

df=pd.read_excel(r'C:\Users\tony.song\Desktop\for py.xlsx')




plt.plot()

plt.xlabel('trade_time')
plt.ylabel('real_amount')
plt.title('line')
plt.xticks(rotation='45')



fig=plt.figure(figsize=[20,5])
df2=df.head(100)
df2['trade_time']=pd.to_datetime(df2['trade_time'])
df2['trade_time']=sorted(df2['trade_time'])

fig=plt.plot(df2['trade_time'],df2['real_amount'])

import numpy as np
x=np.random.random(100)
y=np.random.random(100)
fig2=plt.figure(figsize=[12,5])
fig2=plt.scatter(x,y)
fig2=plt.scatter(sorted(x),sorted(y))

fig3=plt.figure(figsize=[12,5])
fig3=plt.plot(sorted(x),sorted(y))

plt.plot(np.random.randint(1,30,30),np.random.random(30))
plt.scatter(np.random.randint(1,30,30),np.random.random(30)\
            ,c='red')

x=np.random.randint(1,300,150)
y=np.random.random(150)

plt.plot(x,y,c='blue')
plt.scatter(x,y,c='red')
plt.legend('best')
plt.plot(x,y,x-0.5,y-0.5)
plt.legend('best')

import matplotlib.pyplot as plt

fig=plt.figure(figsize=(3,3))
ax1=fig.add_subplot(3,3,1)
ax3=fig.add_subplot(3,3,3)
ax5=fig.add_subplot(3,3,5)
ax9=fig.add_subplot(3,3,9)
ax4=fig.add_subplot(3,3,4)

ax5.plot(x,y,c='red', alpha=0.4)
ax9.plot(x,y,c='blue', alpha=0.4)

plt.legend('best')
x1=[]
for i in range(30):
    x1=x1+x
    print(x1)

import pandas as pd
df00=pd.read_excel(r'C:\Users\tony.song\Desktop\basket for py.xlsx')
df01=df00.pivot_table(index=['store_A','store_B'],values='AB_purchaser',aggfunc='sum')

import matplotlib.pyplot as plt
hist_confidence=plt.hist(df00['confidence'])
hist_lift=plt.hist(df00['lift'])
scatter=plt.scatter(df00['confidence'],df00['lift'],c='red')

fig1=plt.figure(figsize=(2,3))
ax1=fig1.add_subplot(2,3,1)
ax2=fig1.add_subplot(2,3,2)
ax3=fig1.add_subplot(2,3,3)


ax4=fig1.add_subplot(2,3,4)
ax5=fig1.add_subplot(2,3,5)
ax6=fig1.add_subplot(2,3,6)

ax1.hist(df00['confidence'])
ax2.hist(df00['lift'])
ax3.scatter(df00['confidence'],df00['lift'],c='red')


import seaborn as sbn
import numpy as np

x=np.linspace(1,20,40)

sbn.heatmap(np.random.random(size=(10,10)))
sbn.heatmap( df00[['confidence']].head(10))
x=df00['confidence'].head(10).values
y=df00['lift'].head(10).values
z=np.random.random(size=(10,10))
sbn.heatmap( df00[['confidence','lift']].head(60))

sbn.set()

#########################################
#案例分析
#########################################
import pandas as pd
df00=pd.read_excel(r'C:\Users\tony.song\Desktop\basket for py.xlsx')

#1-fundmental desc analysis
rst1=df00.describe().T
rst2=df00[df00['AB_purchaser']==1125]

df00.dtypes #查看各列的数据类型
col_list=df00.columns.tolist() #将各个列名称提取并存成列表类型
#DataFrame格式的groupby方法,目的是聚合计算,必须待聚合函数

#按store_A分组汇总AB_purchaser
df00['AB_purchaser'].groupby(df00['store_A']).sum()
#or
df00.groupby(df00['store_A']).AB_purchaser.sum()

#有多少家店铺
df00['store_A'].groupby(df00['store_A']).count()
#or
df00['store_A'].value_counts()

rst3=df00['AB_purchaser'].groupby(df00['store_A']).sum()

#2- data splice
col_list=df00.columns.tolist()
for col in col_list:
    print(col)
   
import missingno as msno

msno.matrix(df00)
df00['store_A_cate'].notnull().value_counts().loc[True]

import pandas as pd

#交叉表,只用来计数,比groupby好用,排除缺失值
#crosstab可以配合seaborn中的heatmap可视化
bb=pd.crosstab(df00['store_A_cate'],df00['store_B_cate'])

import seaborn as sbn

sbn.heatmap(bb)

#pd.unique()字段值类别统计,=select count(distinct Field),缺失值也会统计
#pd.value_counts()相当于分组汇总且计数,= select Field,count(*),缺失值不统计

#pd的分段函数pd.qcut()
pd.qcut(df00['confidence'],4) #平均分成四段
pd.qcut(df00['confidence'],4).value_counts()
a=pd.qcut(df00['confidence'],4) #平均分成四段
#将结果写到df表中的办法
df00['confidence_cate']=pd.qcut(df00['confidence'],4)

#查看df各个字段的数值类型
df00.dtypes

##########################################
import warnings
import pandas as pd

bi=pd.read_excel(r'C:\Users\tony.song\Desktop\BI_s.xlsx')
bi['year']=bi.biz_month.dt.year
pd.crosstab(bi.project_name,bi.year,values=bi.sales,aggfunc='sum')

#透视表的3种实现方式
rst1=pd.crosstab(bi.project_name,bi.year,values=bi.sales,aggfunc='sum')
rst2=bi.sales.groupby([bi.project_name,bi.year]).sum()
rst3=bi.pivot_table(index='project_name',columns='year',values='sales',aggfunc='sum')

import pandas as pd
pd.merge(b,c,how='inner',left_index=True,right_index=True)
pd.merge(b,c,how='outer',left_index=True,right_index=True,sort=False)
d=pd.merge(b,c,how='outer',left_index=True,right_index=True,sort=False)


#案例分析
#通过临时重写配置文件的方法,可以解决 Matplotlib 显示中文乱码的问题
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题

#data source
import pymssql
def conn():
    connect=pymssql.connect(
            server='-.sqlser.com',\
            user='',\
            password='',\
            database='crm_db',\
            charset='cp936')  #字段值是中文不会出现乱码
    if connect:
        print('connect success')
   
    return connect

conn()

sql_code="select * from cons_bonus_history \
where year(trade_time)=2023 and month(trade_time)<=4"
sql_code

import pandas as pd
df_sql =pd.read_sql(sql_code,con=conn())
#[1824350 rows x 51 columns]
df_sql.head()
df_sql.shape
df_sql.dtypes

conn().close()

#df_sql.to_csv(r'C:\Users\t\Desktop\py_base.csv'\
#              ,sep=',',index=False,header=True,encoding='cp936')

#df_sql=pd.read_csv(r'C:\Users\tony.song\Desktop\py_base.csv')

#TPQ part
df_sql_tpq=df_sql[df_sql.mall_name=='上海新天地']

#基本描述统计分析
#(165283, 51)
df_sql_tpq.shape
desc_rst=df_sql_tpq.describe().T
desc_rst

#missing value stats
#缺失值统计

miss_stat=pd.DataFrame({'col':[],'not_null_rate':[]})
#miss_stat=pd.DataFrame()

col=df_sql_tpq.columns.tolist()

for col in col:
    print(col,':',df_sql_tpq['%s'%col].notnull().sum()*1.0/len(df_sql_tpq['%s'%col]))
    a=[]
    b=[]
    a.append(col)
    b.append(df_sql_tpq['%s'%col].notnull().sum()*1.0/len(df_sql_tpq['%s'%col]))
    miss_stat=miss_stat.append(pd.DataFrame({'col':a,'not_null_rate':b}))

#提取数值型字段或者必要字段
type_df_sql_tpq=df_sql_tpq.dtypes

df_sql_tpq_part=type_df_sql_tpq[(type_df_sql_tpq.values=='int64')|(type_df_sql_tpq=='float64')|(type_df_sql_tpq=='datetime64[ns]')]
df_sql_tpq_part

plt_missing=pd.merge(miss_stat,pd.DataFrame(df_sql_tpq_part),left_on=miss_stat.col,right_index=True,how='inner')

import matplotlib.pyplot as plt

import seaborn as sbn
sbn.set_style
plt.xticks(rotation=45)
plt.legend('best')
plt.bar(plt_missing.col,plt_missing.not_null_rate)
sbn.barplot(plt_missing.col,plt_missing.not_null_rate)
plt.xticks(rotation=45)

#构造稀疏矩阵
#(58722, 323)
sparse_matrix=pd.crosstab(df_sql_tpq.user_id,df_sql_tpq.shop_name,values='real_amount',aggfunc='count')

sparse_matrix_desc=sparse_matrix.describe().T
sparse_matrix_desc

#np.nan值替换为0
import numpy as np
sparse_matrix.replace(np.nan,0,inplace=True)
#购买多次的标记为1

sparse_matrix2=pd.DataFrame()
for col in sparse_matrix.columns:
    sparse_matrix2['%s'%col]=sparse_matrix['%s'%col].apply(lambda x:1 if x>1 else 0 )

sparse_matrix2.describe().T
sparse_matrix2.values.max()

#统计每家店铺的购买人数
shop_purchaser=sparse_matrix2.sum().sort_values(ascending=False)
shop_purchaser.values*1.0/sparse_matrix2.shape[0]
shop_trans=sparse_matrix.sum().sort_values(ascending=False)

#pd.DataFrame([shop_purchaser.values,shop_purchaser.values*1.0/sparse_matrix2.shape[0]]\
#                            ,index=shop_purchaser.index
#                            ,columns=['purchaser','pct'])

#通过临时重写配置文件的方法,可以解决 Matplotlib 显示中文乱码的问题
import matplotlib.pyplot as plt

#plt.bar(shop_purchaser.head(20).index,shop_purchaser.head(20).values)
import seaborn as sbn
sbn.set() #套用seaborn风格模版画图
# 设置主题色
sbn.set_style('whitegrid') # darkgrid, whitegrid, dark, white, ticks -- sn.set-style()
plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
plt.rcParams['font.sans-serif'] = ['Microsoft Yahei']

fig=plt.figure(figsize=(2,1))

ax1=fig.add_subplot(2,1,1)
ax1.bar(shop_trans.head(20).index,shop_trans.head(20).values,width=0.8)
plt.xticks(rotation=-15,fontsize =8)
plt.yticks(fontsize =8)
#plt.xlabel('shop_name',fontsize =10)
plt.ylabel('trans',fontsize =12)
plt.title("best seller single shop")
#plt.legend('best')  # 显示两组柱状图的标签


ax2=fig.add_subplot(2,1,2)
ax2.bar(shop_purchaser.head(20).index,shop_purchaser.head(20).values,width=0.8)
plt.xticks(rotation=-15,fontsize =8)
plt.yticks(fontsize =8)
plt.xlabel('shop_name',fontsize =10)
plt.ylabel('purchaser',fontsize =12)
#plt.legend('best')  # 显示两组柱状图的标签

plt.show()


#关联分析
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
#frequent_itemsets = apriori(sparse_matrix2.sample(100000), min_support=0.001, use_colnames=True)
frequent_itemsets = apriori(sparse_matrix2.sample(10000), min_support=0.0005, use_colnames=True)
frequent_itemsets['brand_count'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets
#frequent_itemsets.sort_values(by=frequent_itemsets.support,ascending=False)

temp=frequent_itemsets[ (frequent_itemsets['brand_count'] >= 2) & (frequent_itemsets['support'] >= 0.0001000) ]

rules_by_confidence = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3) #按置信度>=0.7提取规则
rules_by_lift = association_rules(frequent_itemsets, metric="lift", min_threshold=1) #按照提升度>=1提取规则
rules_by_confidence.head(300)
rules_by_lift.head(300)

#########################################################
for i in range(1,len(df_sample.real_amount)):
    print(df_sample.real_amount.loc,\
          'True' if df_sample.real_amount.loc>15 else 'False')
二维码

扫码加我 拉你入群

请注明:姓名-公司-职位

以便审核进群资格,未注明则拒绝

关键词:Apriori算法 Apriori Priori python PRIOR 利用python的apriori算法做购物篮关联分析

shadowaver@163.com
QQ 540722048

您需要登录后才可以回帖 登录 | 我要注册

本版微信群
加好友,备注cda
拉您进交流群
GMT+8, 2026-2-13 09:43