# -*- coding: utf-8 -*-
"""
Created on Wed Mar 29 15:44:11 2023
@author: tony.song
"""
import pandas as pd
import numpy as np
ndarr=np.array(([1,2,3],[4,5,6]))
print(ndarr)
ndarr1=np.arange(0,20).reshape(5,4)
ndarr2=np.ones((5,5))
ndarr2 not in ndarr1
ndarr1.T.dot(ndarr2)
ndarr3=np.linspace(0,100,20)
type(ndarr3)
np.random.random()
np.random.random(3)
a=[1,2,3]
aa=np.array(a)
list(a)
b='123'
list(b)
#ndarray强复制与浅复制
#强复制
#浅复制
import matplotlib.pyplot as plt
import pandas as pd
df=pd.read_excel(r'C:\Users\tony.song\Desktop\for py.xlsx')
plt.plot()
plt.xlabel('trade_time')
plt.ylabel('real_amount')
plt.title('line')
plt.xticks(rotation='45')
fig=plt.figure(figsize=[20,5])
df2=df.head(100)
df2['trade_time']=pd.to_datetime(df2['trade_time'])
df2['trade_time']=sorted(df2['trade_time'])
fig=plt.plot(df2['trade_time'],df2['real_amount'])
import numpy as np
x=np.random.random(100)
y=np.random.random(100)
fig2=plt.figure(figsize=[12,5])
fig2=plt.scatter(x,y)
fig2=plt.scatter(sorted(x),sorted(y))
fig3=plt.figure(figsize=[12,5])
fig3=plt.plot(sorted(x),sorted(y))
plt.plot(np.random.randint(1,30,30),np.random.random(30))
plt.scatter(np.random.randint(1,30,30),np.random.random(30)\
,c='red')
x=np.random.randint(1,300,150)
y=np.random.random(150)
plt.plot(x,y,c='blue')
plt.scatter(x,y,c='red')
plt.legend('best')
plt.plot(x,y,x-0.5,y-0.5)
plt.legend('best')
import matplotlib.pyplot as plt
fig=plt.figure(figsize=(3,3))
ax1=fig.add_subplot(3,3,1)
ax3=fig.add_subplot(3,3,3)
ax5=fig.add_subplot(3,3,5)
ax9=fig.add_subplot(3,3,9)
ax4=fig.add_subplot(3,3,4)
ax5.plot(x,y,c='red', alpha=0.4)
ax9.plot(x,y,c='blue', alpha=0.4)
plt.legend('best')
x1=[]
for i in range(30):
x1=x1+x
print(x1)
import pandas as pd
df00=pd.read_excel(r'C:\Users\tony.song\Desktop\basket for py.xlsx')
df01=df00.pivot_table(index=['store_A','store_B'],values='AB_purchaser',aggfunc='sum')
import matplotlib.pyplot as plt
hist_confidence=plt.hist(df00['confidence'])
hist_lift=plt.hist(df00['lift'])
scatter=plt.scatter(df00['confidence'],df00['lift'],c='red')
fig1=plt.figure(figsize=(2,3))
ax1=fig1.add_subplot(2,3,1)
ax2=fig1.add_subplot(2,3,2)
ax3=fig1.add_subplot(2,3,3)
ax4=fig1.add_subplot(2,3,4)
ax5=fig1.add_subplot(2,3,5)
ax6=fig1.add_subplot(2,3,6)
ax1.hist(df00['confidence'])
ax2.hist(df00['lift'])
ax3.scatter(df00['confidence'],df00['lift'],c='red')
import seaborn as sbn
import numpy as np
x=np.linspace(1,20,40)
sbn.heatmap(np.random.random(size=(10,10)))
sbn.heatmap( df00[['confidence']].head(10))
x=df00['confidence'].head(10).values
y=df00['lift'].head(10).values
z=np.random.random(size=(10,10))
sbn.heatmap( df00[['confidence','lift']].head(60))
sbn.set()
#########################################
#案例分析
#########################################
import pandas as pd
df00=pd.read_excel(r'C:\Users\tony.song\Desktop\basket for py.xlsx')
#1-fundmental desc analysis
rst1=df00.describe().T
rst2=df00[df00['AB_purchaser']==1125]
df00.dtypes #查看各列的数据类型
col_list=df00.columns.tolist() #将各个列名称提取并存成列表类型
#DataFrame格式的groupby方法,目的是聚合计算,必须待聚合函数
#按store_A分组汇总AB_purchaser
df00['AB_purchaser'].groupby(df00['store_A']).sum()
#or
df00.groupby(df00['store_A']).AB_purchaser.sum()
#有多少家店铺
df00['store_A'].groupby(df00['store_A']).count()
#or
df00['store_A'].value_counts()
rst3=df00['AB_purchaser'].groupby(df00['store_A']).sum()
#2- data splice
col_list=df00.columns.tolist()
for col in col_list:
print(col)
import missingno as msno
msno.matrix(df00)
df00['store_A_cate'].notnull().value_counts().loc[True]
import pandas as pd
#交叉表,只用来计数,比groupby好用,排除缺失值
#crosstab可以配合seaborn中的heatmap可视化
bb=pd.crosstab(df00['store_A_cate'],df00['store_B_cate'])
import seaborn as sbn
sbn.heatmap(bb)
#pd.unique()字段值类别统计,=select count(distinct Field),缺失值也会统计
#pd.value_counts()相当于分组汇总且计数,= select Field,count(*),缺失值不统计
#pd的分段函数pd.qcut()
pd.qcut(df00['confidence'],4) #平均分成四段
pd.qcut(df00['confidence'],4).value_counts()
a=pd.qcut(df00['confidence'],4) #平均分成四段
#将结果写到df表中的办法
df00['confidence_cate']=pd.qcut(df00['confidence'],4)
#查看df各个字段的数值类型
df00.dtypes
##########################################
import warnings
import pandas as pd
bi=pd.read_excel(r'C:\Users\tony.song\Desktop\BI_s.xlsx')
bi['year']=bi.biz_month.dt.year
pd.crosstab(bi.project_name,bi.year,values=bi.sales,aggfunc='sum')
#透视表的3种实现方式
rst1=pd.crosstab(bi.project_name,bi.year,values=bi.sales,aggfunc='sum')
rst2=bi.sales.groupby([bi.project_name,bi.year]).sum()
rst3=bi.pivot_table(index='project_name',columns='year',values='sales',aggfunc='sum')
import pandas as pd
pd.merge(b,c,how='inner',left_index=True,right_index=True)
pd.merge(b,c,how='outer',left_index=True,right_index=True,sort=False)
d=pd.merge(b,c,how='outer',left_index=True,right_index=True,sort=False)
#案例分析
#通过临时重写配置文件的方法,可以解决 Matplotlib 显示中文乱码的问题
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
#data source
import pymssql
def conn():
connect=pymssql.connect(
server='-.sqlser.com',\
user='',\
password='',\
database='crm_db',\
charset='cp936') #字段值是中文不会出现乱码
if connect:
print('connect success')
return connect
conn()
sql_code="select * from cons_bonus_history \
where year(trade_time)=2023 and month(trade_time)<=4"
sql_code
import pandas as pd
df_sql =pd.read_sql(sql_code,con=conn())
#[1824350 rows x 51 columns]
df_sql.head()
df_sql.shape
df_sql.dtypes
conn().close()
#df_sql.to_csv(r'C:\Users\t\Desktop\py_base.csv'\
# ,sep=',',index=False,header=True,encoding='cp936')
#df_sql=pd.read_csv(r'C:\Users\tony.song\Desktop\py_base.csv')
#TPQ part
df_sql_tpq=df_sql[df_sql.mall_name=='上海新天地']
#基本描述统计分析
#(165283, 51)
df_sql_tpq.shape
desc_rst=df_sql_tpq.describe().T
desc_rst
#missing value stats
#缺失值统计
miss_stat=pd.DataFrame({'col':[],'not_null_rate':[]})
#miss_stat=pd.DataFrame()
col=df_sql_tpq.columns.tolist()
for col in col:
print(col,':',df_sql_tpq['%s'%col].notnull().sum()*1.0/len(df_sql_tpq['%s'%col]))
a=[]
b=[]
a.append(col)
b.append(df_sql_tpq['%s'%col].notnull().sum()*1.0/len(df_sql_tpq['%s'%col]))
miss_stat=miss_stat.append(pd.DataFrame({'col':a,'not_null_rate':b}))
#提取数值型字段或者必要字段
type_df_sql_tpq=df_sql_tpq.dtypes
df_sql_tpq_part=type_df_sql_tpq[(type_df_sql_tpq.values=='int64')|(type_df_sql_tpq=='float64')|(type_df_sql_tpq=='datetime64[ns]')]
df_sql_tpq_part
plt_missing=pd.merge(miss_stat,pd.DataFrame(df_sql_tpq_part),left_on=miss_stat.col,right_index=True,how='inner')
import matplotlib.pyplot as plt
import seaborn as sbn
sbn.set_style
plt.xticks(rotation=45)
plt.legend('best')
plt.bar(plt_missing.col,plt_missing.not_null_rate)
sbn.barplot(plt_missing.col,plt_missing.not_null_rate)
plt.xticks(rotation=45)
#构造稀疏矩阵
#(58722, 323)
sparse_matrix=pd.crosstab(df_sql_tpq.user_id,df_sql_tpq.shop_name,values='real_amount',aggfunc='count')
sparse_matrix_desc=sparse_matrix.describe().T
sparse_matrix_desc
#np.nan值替换为0
import numpy as np
sparse_matrix.replace(np.nan,0,inplace=True)
#购买多次的标记为1
sparse_matrix2=pd.DataFrame()
for col in sparse_matrix.columns:
sparse_matrix2['%s'%col]=sparse_matrix['%s'%col].apply(lambda x:1 if x>1 else 0 )
sparse_matrix2.describe().T
sparse_matrix2.values.max()
#统计每家店铺的购买人数
shop_purchaser=sparse_matrix2.sum().sort_values(ascending=False)
shop_purchaser.values*1.0/sparse_matrix2.shape[0]
shop_trans=sparse_matrix.sum().sort_values(ascending=False)
#pd.DataFrame([shop_purchaser.values,shop_purchaser.values*1.0/sparse_matrix2.shape[0]]\
# ,index=shop_purchaser.index
# ,columns=['purchaser','pct'])
#通过临时重写配置文件的方法,可以解决 Matplotlib 显示中文乱码的问题
import matplotlib.pyplot as plt
#plt.bar(shop_purchaser.head(20).index,shop_purchaser.head(20).values)
import seaborn as sbn
sbn.set() #套用seaborn风格模版画图
# 设置主题色
sbn.set_style('whitegrid') # darkgrid, whitegrid, dark, white, ticks -- sn.set-style()
plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
plt.rcParams['font.sans-serif'] = ['Microsoft Yahei']
fig=plt.figure(figsize=(2,1))
ax1=fig.add_subplot(2,1,1)
ax1.bar(shop_trans.head(20).index,shop_trans.head(20).values,width=0.8)
plt.xticks(rotation=-15,fontsize =8)
plt.yticks(fontsize =8)
#plt.xlabel('shop_name',fontsize =10)
plt.ylabel('trans',fontsize =12)
plt.title("best seller single shop")
#plt.legend('best') # 显示两组柱状图的标签
ax2=fig.add_subplot(2,1,2)
ax2.bar(shop_purchaser.head(20).index,shop_purchaser.head(20).values,width=0.8)
plt.xticks(rotation=-15,fontsize =8)
plt.yticks(fontsize =8)
plt.xlabel('shop_name',fontsize =10)
plt.ylabel('purchaser',fontsize =12)
#plt.legend('best') # 显示两组柱状图的标签
plt.show()
#关联分析
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
#frequent_itemsets = apriori(sparse_matrix2.sample(100000), min_support=0.001, use_colnames=True)
frequent_itemsets = apriori(sparse_matrix2.sample(10000), min_support=0.0005, use_colnames=True)
frequent_itemsets['brand_count'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets
#frequent_itemsets.sort_values(by=frequent_itemsets.support,ascending=False)
temp=frequent_itemsets[ (frequent_itemsets['brand_count'] >= 2) & (frequent_itemsets['support'] >= 0.0001000) ]
rules_by_confidence = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3) #按置信度>=0.7提取规则
rules_by_lift = association_rules(frequent_itemsets, metric="lift", min_threshold=1) #按照提升度>=1提取规则
rules_by_confidence.head(300)
rules_by_lift.head(300)
#########################################################
for i in range(1,len(df_sample.real_amount)):
print(df_sample.real_amount.loc,\
'True' if df_sample.real_amount.loc>15 else 'False')


雷达卡




京公网安备 11010802022788号







