相似文件
换一批
经管之家送您一份
应届毕业生专属福利!
求职就业群
感谢您参与论坛问题回答
经管之家送您两个论坛币!
+2 论坛币
# coding: utf-8
# # 用分箱数据拟合
# In[2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
# In[119]:
# Load the binned data set from the author's local workbook.
df = pd.read_excel(io=r'C:\Users\tony.song\Desktop\pareto_data.xlsx')

# In[121]:
# Notebook-style inspection; in a plain script these expressions are
# evaluated and their results discarded.
df.head()
df.sample(10)
df.columns
df.dtypes
df.describe().transpose()

# In[140]:
# Keep only the rows whose lost_Prob is not exactly 0 (the original note
# labels this step "missing values process").
df1 = df.loc[~df['lost_Prob'].eq(0)]
df1.describe().transpose()
# In[141]:
# Three stacked scatter plots of lost_Prob against Freq_per_year:
# every row, then rows with Freq_per_year <= 50, then rows with <= 35.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(15, 7))
ax[0].scatter(df1['Freq_per_year'], df1['lost_Prob'])
for panel, cap in zip(ax[1:], (50, 35)):
    capped = df1[df1['Freq_per_year'] <= cap]
    panel.scatter(capped['Freq_per_year'], capped['lost_Prob'])
plt.show()

# In[146]:
# Working subset used by the following cells: Freq_per_year <= 35.
df2 = df1[df1['Freq_per_year'] <= 35]
df2.describe().transpose()
# In[147]:
import powerlaw

# Fit a power law to the lost_Prob column of df2.
# The keyword is spelled `discrete`; the previous spelling `descrete` did not
# name that parameter, so it never selected anything.
fit = powerlaw.Fit(df2.lost_Prob, discrete=False)

# In[151]:
print(fit.alpha,'\n',fit.xmin)

# In[172]:
# Compare the power-law fit against an exponential fit. This cell has to run
# after `fit` has been created, so it now follows the fitting cell (in the
# exported script it came first and raised NameError).
fit.distribution_compare('power_law','exponential')
# help(fit.distribution_compare)
# distribution_compare(dist1, dist2, nested=None, **kwargs) method of powerlaw.Fit instance
# Returns the loglikelihood ratio, and its p-value, between the two
# distribution fits, assuming the candidate distributions are nested.
# Returns
# -------
# R : float
# Loglikelihood ratio of the two distributions' fit to the data. If
# greater than 0, the first distribution is preferred. If less than
# 0, the second distribution is preferred.
# p : float
# Significance of R
# Recorded result from an earlier run: 5.508748079440679 > 0
# In[162]:
# Draw the empirical PDF (red) and CDF (blue) of df2.lost_Prob, each on its
# own 7x3 figure, showing one figure at a time.
for draw, colour in ((powerlaw.plot_pdf, 'r'), (powerlaw.plot_cdf, 'b')):
    plt.figure(figsize=(7, 3))
    draw(df2.lost_Prob, color=colour)
    plt.show()
# In[188]:
# Power-law pdf:
#   p(x) = C * x**(-alpha),  with  C = (alpha - 1) / xmin**(-alpha + 1)
# df2 only holds rows with Freq_per_year <= 35.
# NOTE(review): `fit` was estimated on lost_Prob, while the formula below is
# evaluated on Freq_per_year -- confirm this is the intended model.

# df2 was produced by boolean filtering of df1; take an explicit copy so that
# adding columns writes to an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
df2 = df2.copy()
df2['pred_lost_Prob'] = (
    (fit.alpha - 1) / fit.xmin ** (-fit.alpha + 1)
    * df2.Freq_per_year ** -fit.alpha
)
# Residual: observed minus predicted.
df2['pred_error'] = df2.lost_Prob - df2.pred_lost_Prob
print(df2.pred_error.describe())
# In[207]:
# Predicted curve on the left y-axis, observed values on a second y-axis
# that shares the same x-axis.
fig, ax = plt.subplots(figsize=(18, 10))
ax.plot(
    df2['Freq_per_year'],
    df2['pred_lost_Prob'],
    color='r',
    label='predict',
)
ax.legend(loc='upper left')

# Twin axis sharing x
ax2 = ax.twinx()
ax2.plot(df2['Freq_per_year'], df2['lost_Prob'], label='actual')
ax2.legend(loc='upper right')

plt.tight_layout()
# plt.grid()
plt.show()
# In[8]:
# df.lost_Prob.plot()
# plt.show()
# plt.scatter(df.Freq_per_year,df.lost_Prob)
# plt.show()

# First 30 rows of the binned data.
df30 = df.head(30)
# df30
# plt.scatter(df30.Freq_per_year,df30.lost_Prob)
# plt.show()

# With a single row or a single column of subplots, the axes come back as a
# 1-D array and are addressed as ax[0], ax[1], ...
fig, ax = plt.subplots(1, 3, figsize=(18, 3))
ax[0].scatter(df30.Freq_per_year, df30.lost_Prob, c='r')
ax[1].plot(df30.Freq_per_year, df30.lost_Prob)
ax[2].plot(df.Freq_per_year, df.lost_Prob)
# tight_layout must be called; the bare attribute reference did nothing.
plt.tight_layout()
plt.show()
# In[46]:
# !pip install powerlaw
import powerlaw
powerlaw
# help(powerlaw)

# In[67]:
# Fit a power law to lost_Prob of the first 30 rows.
# The keyword is spelled `discrete`; the previous spelling `descrete` did not
# name that parameter, so it never selected anything.
fit = powerlaw.Fit(df30.lost_Prob, discrete=False)
# fit=powerlaw.Fit(df.lost_Prob)
# help(powerlaw.Fit)

# In[85]:
print('alpha=',fit.alpha,'\n')
# help(fit)
print('xmin=',fit.xmin,'\n')
df30.lost_Prob.describe().T

# In[152]:
# Compare the power-law fit against an exponential fit.
fit.distribution_compare('power_law','exponential')
# help(fit.distribution_compare)
# distribution_compare(dist1, dist2, nested=None, **kwargs) method of powerlaw.Fit instance
# Returns the loglikelihood ratio, and its p-value, between the two
# distribution fits, assuming the candidate distributions are nested.
# In[86]:
# Empirical PDF (red) and CDF (blue) of df30.lost_Prob, drawn one after the
# other while a single 9x4 figure is current, then shown once.
plt.figure(figsize=(9, 4))
for draw, colour in ((powerlaw.plot_pdf, 'r'), (powerlaw.plot_cdf, 'b')):
    draw(df30.lost_Prob, color=colour)
plt.show()
# # Fitting with the un-binned data
# In[1]:
import pandas as pd

# Load the un-binned data set from the author's local workbook.
df_raw = pd.read_excel(r'C:\Users\tony.song\Desktop\df_raw.xlsx')
df_raw.head()

# In[2]:
# Loss ratio per row: lost / (lost + existing).
# Computed column-wise instead of with a row-by-row apply(axis=1); the
# arithmetic is the same but runs as a single vectorised operation.
df_raw['lost_prob'] = df_raw['lost'] / (df_raw['lost'] + df_raw['existing'])
df_raw.head()
# In[108]:
## considering missing and zero values
print(df_raw.describe(),'\n','vs')
df_raw[df_raw.lost_prob==0]

# Drop the rows whose lost_prob is 0.
df_raw1 = df_raw.loc[~(df_raw.lost_prob == 0)]
# Alternative: drop every row that contains a 0 in any column.
# df_raw[~(df_raw==0).any(axis=1)]
print(df_raw1.describe())
# df_raw2 (the Freq < 300 subset) is only created in a later cell, so printing
# it here raised NameError; describe the same subset directly instead.
print(df_raw1[df_raw1.Freq < 300].describe())
# In[87]:
import matplotlib.pyplot as plt
import seaborn as sbn
sbn.set_style('whitegrid')

# Four stacked panels: all rows, non-zero rows, non-zero rows with
# Freq < 300, and a 15-bin histogram of lost_prob for that last subset.
fig, ax = plt.subplots(nrows=4, ncols=1, figsize=(15, 9))
below_300 = df_raw1[df_raw1['Freq'] < 300]
ax[0].scatter(df_raw['Freq'], df_raw['lost_prob'])
ax[1].scatter(df_raw1['Freq'], df_raw1['lost_prob'])
ax[2].scatter(below_300['Freq'], below_300['lost_prob'])
ax[3].hist(below_300['lost_prob'], bins=15)
plt.show()
# Author's observation: while the purchase frequency stays below 300, the
# loss probability shows no oscillating tail, so a power-law fit is usable.
# In[115]:
# Subset with Freq < 300. An explicit copy is taken so that the prediction
# column added below is written to an independent frame rather than to a
# filtered slice of df_raw1 (avoids pandas' SettingWithCopyWarning).
df_raw2 = df_raw1[df_raw1.Freq < 300].copy()
print(df_raw2.head())
import powerlaw
# The keyword is spelled `discrete`; the previous spelling `descrete` did not
# name that parameter, so it never selected anything.
fit = powerlaw.Fit(df_raw2.lost_prob, discrete=False)

# In[117]:
print(fit.alpha)
print(fit.xmin)

# In[113]:
# Power-law pdf:
#   p(x) = C * x**(-alpha),  with  C = (alpha - 1) / xmin**(-alpha + 1)
# Predicted relation between loss probability and purchase activity. The
# author's earlier note records it as
#   lost_prob = (3.56-1)/0.0476**(-3.56+1) * Freq**-3.56
# The fitted parameters are used directly here instead of those hand-copied,
# rounded constants, so the prediction always matches the current fit.
df_raw2['pred_lost_prob'] = (
    ((fit.alpha - 1) / fit.xmin ** (-fit.alpha + 1))
    * df_raw2.Freq ** -fit.alpha
)
df_raw2
# In[114]:
# Compare predicted and actual values (only the prediction is drawn; the
# line for the actual values is left disabled).
# plt.plot(df_raw2.lost_prob)
plt.figure(figsize=(16, 3))
plt.plot(df_raw2['pred_lost_prob'], color='r')
plt.show()
# In[29]:
# Draw 10**5 (100,000) samples uniformly from [0, 1).
# (The original note said 10,000; the code has always drawn 10**5.)
rnd_list = np.random.uniform(low=0, high=1, size=10 ** 5)

# In[31]:
rnd_list.shape
# In[44]:
# a is the shape passed to np.random.pareto; m rescales the shifted draws.
a = 3
m = 2
s = m * (np.random.pareto(a, size=1000) + 1)
# Histogram of a second, independent batch of 1000 raw draws.
plt.hist(np.random.pareto(a, size=1000), bins=10)
plt.show()

# In[45]:
# The rescaled sample, first in draw order and then as a 15-bin histogram.
plt.plot(s)
plt.show()
plt.hist(s, bins=15)
plt.show()
# # X与alpha值域对应的曲线形状模拟
# In[1]:
import numpy as np
import matplotlib.pyplot as plt
# In[30]:
# Shapes of x**3 and x**-3 over three x ranges, one curve per panel.
x1 = np.linspace(0, 1, 10)
x2 = np.linspace(1, 3, 10)
x3 = np.linspace(-3, 0, 10)
fig, ax = plt.subplots(nrows=6, ncols=1, figsize=(15, 20))
# (x values, exponent) for each panel, top to bottom.
panels = (
    (x1, 3),
    (x2, 3),
    (x1, -3),
    (x2, -3),
    (x3, 3),
    (x3, -3),
)
for axis, (xs, power) in zip(ax, panels):
    axis.plot(xs, xs ** power)
plt.show()
# In[17]:
扫码加我 拉你入群
请注明:姓名-公司-职位
以便审核进群资格,未注明则拒绝