[程序分享] 基于python的幂律分布中帕累托分布拟合 [推广有奖]

17关注
11粉丝

随心所欲不逾矩

教授

还不是VIP/贵宾

威望: 0 级
论坛币: 8766 个
通用积分: 717.1512
学术水平: 18 点
热心指数: 19 点
信用等级: 13 点
经验: 28386 点
帖子: 781
精华: 0
在线时间: 927 小时
注册时间: 2007-9-27
最后登录: 2024-4-28

楼主

shadowaver

发表于 2023-9-25 10:53:13 |只看作者 |坛友微信交流群|倒序 |AI写论文

相似文件

换一批

是否 +2 论坛币

k人参与回答

经管之家送您一份

应届毕业生专属福利!

求职就业群

赵安豆老师微信：zhaoandou666

经管之家联合CDA

送您一个全额奖学金名额~ !

立即领取

感谢您参与论坛问题回答

经管之家送您两个论坛币！

+2 论坛币

# coding: utf-8

# # 用分箱数据拟合

# In[2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn

# In[119]:

# Load the binned data set from the workbook on the author's desktop.
df = pd.read_excel(r'C:\Users\tony.song\Desktop\pareto_data.xlsx')

# In[121]:

# Notebook-style inspection: each expression is evaluated and its value is
# only displayed when run as a notebook cell (the last one is shown).
df.head(5)
df.sample(n=10)
df.columns
df.dtypes
df.describe().transpose()

# In[140]:

# Zero-value handling: keep only the rows whose lost_Prob is not 0.
df1 = df[df.lost_Prob != 0]
df1.describe().transpose()

# In[141]:

# Three stacked scatter plots of lost_Prob against Freq_per_year:
# all rows first, then only rows with Freq_per_year <= 50 and <= 35.
fig, ax = plt.subplots(3, 1, figsize=(15, 7))
ax[0].scatter(df1.Freq_per_year, df1.lost_Prob)
for panel, freq_cap in zip(ax[1:], (50, 35)):
    capped = df1[df1.Freq_per_year <= freq_cap]
    panel.scatter(capped.Freq_per_year, capped.lost_Prob)
plt.show()

# In[146]:

# Keep the rows with Freq_per_year <= 35.  An explicit copy is taken because
# prediction columns are assigned to df2 further down; assigning into a plain
# slice of df1 triggers pandas' SettingWithCopyWarning.
df2 = df1[df1.Freq_per_year <= 35].copy()
df2.describe().T

# In[147]:

import powerlaw

# Fit a continuous power law to the observed lost_Prob values.
# The keyword is `discrete`; the earlier spelling `descrete` is not a
# parameter of powerlaw.Fit.
fit = powerlaw.Fit(df2.lost_Prob, discrete=False)

# In[151]:

print(fit.alpha, '\n', fit.xmin)

# In[172]:

# This comparison needs `fit`, so it has to run after the cell that creates
# it (in the exported cell order it came first and raised NameError).
fit.distribution_compare('power_law', 'exponential')
# help(fit.distribution_compare)
# distribution_compare(dist1, dist2, nested=None, **kwargs) method of powerlaw.Fit instance
#    Returns the loglikelihood ratio, and its p-value, between the two
#    distribution fits, assuming the candidate distributions are nested.
#    Returns
#    -------
#    R : float
#       Loglikelihood ratio of the two distributions' fit to the data. If
#       greater than 0, the first distribution is preferred. If less than
#       0, the second distribution is preferred.
#    p : float
#       Significance of R
# Value noted in the original notebook: R = 5.508748079440679 > 0,
# i.e. the first distribution (power_law) is preferred.

# In[162]:

plt.figure(figsize=(7, 3))
powerlaw.plot_pdf(df2.lost_Prob, color='r')
plt.show()
plt.figure(figsize=(7, 3))
powerlaw.plot_cdf(df2.lost_Prob, color='b')
plt.show()

# In[188]:

# Power-law pdf:
#    p(x) = C * x**-alpha,  with  C = (alpha - 1) / xmin**(-alpha + 1)
# evaluated at Freq_per_year using the fitted alpha and xmin.
pdf_constant = (fit.alpha - 1) / fit.xmin ** (-fit.alpha + 1)
df2['pred_lost_Prob'] = pdf_constant * df2.Freq_per_year ** -fit.alpha
# Residual: observed minus predicted.
df2['pred_error'] = df2.lost_Prob - df2.pred_lost_Prob
print(df2.pred_error.describe())

# In[207]:

# Predicted and actual curves on one figure; each gets its own y axis.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18, 10))
freq_axis = df2.Freq_per_year
ax.plot(freq_axis, df2.pred_lost_Prob, color='r', label='predict')
ax.legend(loc='upper left')
# Second y axis sharing the same x axis.
ax2 = ax.twinx()
ax2.plot(freq_axis, df2.lost_Prob, label='actual')
ax2.legend(loc='upper right')
plt.tight_layout()
# plt.grid()
plt.show()

# In[8]:

# df.lost_Prob.plot()
# plt.show()
# plt.scatter(df.Freq_per_year,df.lost_Prob)
# plt.show()

# First 30 rows of the binned data.
df30 = df.head(30)
# df30
# plt.scatter(df30.Freq_per_year,df30.lost_Prob)
# plt.show()

# With a single row (or a single column) of subplots, the axes come back as a
# 1-D array and are addressed as ax[0], ax[1], ...
fig, ax = plt.subplots(1, 3, figsize=(18, 3))
ax[0].scatter(df30.Freq_per_year, df30.lost_Prob, c='r')
ax[1].plot(df30.Freq_per_year, df30.lost_Prob)
ax[2].plot(df.Freq_per_year, df.lost_Prob)
# tight_layout must be called; the bare attribute reference did nothing.
plt.tight_layout()
plt.show()

# In[46]:

# !pip install powerlaw
import powerlaw
# help(powerlaw)

# In[67]:

# Fit a continuous power law to lost_Prob of the first 30 rows.
# The keyword is `discrete`; the earlier spelling `descrete` is not a
# parameter of powerlaw.Fit.
fit = powerlaw.Fit(df30.lost_Prob, discrete=False)
# fit=powerlaw.Fit(df.lost_Prob)
# help(powerlaw.Fit)

# In[85]:

# Report the fitted exponent and lower cut-off.
print('alpha=', fit.alpha, '\n')
# help(fit)
print('xmin=', fit.xmin, '\n')
df30.lost_Prob.describe().T

# In[152]:

# Loglikelihood-ratio comparison of the power-law and exponential fits.
fit.distribution_compare('power_law', 'exponential')
# help(fit.distribution_compare)
# distribution_compare(dist1, dist2, nested=None, **kwargs) method of powerlaw.Fit instance
#    Returns the loglikelihood ratio, and its p-value, between the two
#    distribution fits, assuming the candidate distributions are nested.

# In[86]:

# Draw the pdf (red) and the cdf (blue) of lost_Prob into the same figure.
plt.figure(figsize=(9, 4))
first30_loss = df30.lost_Prob
for draw, colour in ((powerlaw.plot_pdf, 'r'), (powerlaw.plot_cdf, 'b')):
    draw(first30_loss, color=colour)
plt.show()

# # 用不分箱数据拟合

# In[1]:

import pandas as pd

# Load the un-binned data set from the workbook on the author's desktop.
df_raw = pd.read_excel(r'C:\Users\tony.song\Desktop\df_raw.xlsx')
df_raw.head()

# In[2]:

# Loss frequency used as a stand-in for probability: lost / (lost + existing).
# Computed column-wise instead of with a row-by-row apply(): one vectorised
# pass, and a zero denominator yields NaN/inf instead of possibly raising.
df_raw['lost_prob'] = df_raw['lost'] / (df_raw['lost'] + df_raw['existing'])
df_raw.head()

# In[108]:

## considering missing and zero values
print(df_raw.describe(), '\n', 'vs')
df_raw[df_raw.lost_prob == 0]
# Drop the rows whose lost_prob is 0.
df_raw1 = df_raw.loc[~(df_raw.lost_prob == 0)]
# Alternative: drop every row that contains a 0 in any column.
# df_raw[~(df_raw==0).any(axis=1)]
print(df_raw1.describe())
# df_raw2 is only assigned further down the script, so printing it here
# raised NameError; describe the same Freq < 300 subset directly instead.
print(df_raw1[df_raw1.Freq < 300].describe())

# In[87]:

import matplotlib.pyplot as plt
import seaborn as sbn

sbn.set_style('whitegrid')

# Four stacked panels: all rows, non-zero rows, non-zero rows with Freq < 300,
# and a 15-bin histogram of lost_prob for that last subset.
fig, ax = plt.subplots(4, 1, figsize=(15, 9))
below_300 = df_raw1[df_raw1.Freq < 300]
ax[0].scatter(df_raw.Freq, df_raw.lost_prob)
ax[1].scatter(df_raw1.Freq, df_raw1.lost_prob)
ax[2].scatter(below_300.Freq, below_300.lost_prob)
ax[3].hist(below_300.lost_prob, bins=15)
plt.show()
# Author's observation: while the purchase frequency stays below 300 the loss
# probability shows no oscillating tail, so a power-law fit can be used.

# In[115]:

# Rows with Freq < 300.  Copied so that the prediction column assigned below
# lands in an independent frame rather than a slice of df_raw1.
df_raw2 = df_raw1[df_raw1.Freq < 300].copy()
print(df_raw2.head())
import powerlaw

# Fit a continuous power law to lost_prob.  The keyword is `discrete`; the
# earlier spelling `descrete` is not a parameter of powerlaw.Fit.
fit = powerlaw.Fit(df_raw2.lost_prob, discrete=False)

# In[117]:

print(fit.alpha)
print(fit.xmin)

# In[113]:

# Power-law pdf:
#    p(x) = C * x**-alpha,  with  C = (alpha - 1) / xmin**(-alpha + 1)
# Predicted loss probability as a function of purchase frequency:
#    lost_prob = (alpha - 1) / xmin**(-alpha + 1) * Freq**-alpha
# The original notebook hard-coded alpha = 3.56 and xmin = 0.0476 from one
# run; the fitted values are used here so the prediction follows the data.
pdf_constant = (fit.alpha - 1) / fit.xmin ** (-fit.alpha + 1)
df_raw2['pred_lost_prob'] = pdf_constant * df_raw2.Freq ** -fit.alpha
df_raw2

# In[114]:

# Compare predicted with actual values (the actual curve is left disabled).
# plt.plot(df_raw2.lost_prob)
plt.figure(figsize=(16, 3))
predicted = df_raw2.pred_lost_prob
plt.plot(predicted, color='r')
plt.show()

# In[29]:

# Draw 10**5 (100,000) uniformly distributed random numbers between 0 and 1.
rnd_list = np.random.uniform(low=0, high=1, size=10**5)

# In[31]:

rnd_list.shape

# In[44]:

# Shape parameter a and scale m for the sampling demo.
a = 3
m = 2
# 1000 draws from np.random.pareto, shifted by 1 and scaled by m.
s = m * (np.random.pareto(a, 1000) + 1)
# Histogram of a fresh, unshifted sample of the same size.
plt.hist(np.random.pareto(a, 1000), bins=10)
plt.show()

# In[45]:

# The shifted/scaled sample in draw order, then its 15-bin histogram.
plt.plot(s)
plt.show()
plt.hist(s, bins=15)
plt.show()

# # X与alpha值域对应的曲线形状模拟

# In[1]:

import numpy as np
import matplotlib.pyplot as plt

# In[30]:

# Sample grids on [0, 1], [1, 3] and [-3, 0], ten points each.
x1 = np.linspace(0, 1, 10)
x2 = np.linspace(1, 3, 10)
x3 = np.linspace(-3, 0, 10)

# One panel per (grid, exponent) pair, top to bottom.
# NOTE: x1 and x3 contain 0, so their x**-3 panels include an infinite value.
fig, ax = plt.subplots(6, 1, figsize=(15, 20))
curve_specs = [(x1, 3), (x2, 3), (x1, -3), (x2, -3), (x3, 3), (x3, -3)]
for panel, (grid, exponent) in zip(ax, curve_specs):
    panel.plot(grid, grid**exponent)
plt.show()

# In[17]: