【IWR_231226】Python餐饮业用户流失预测

2023-12-26 16:51:54

import random
import string
from datetime import datetime

def generate_random_string(length=3):
    characters = string.ascii_uppercase
    return ''.join(random.choice(characters) for _ in range(length))

def generate_timestamped_string(separator='_'):
    timestamp = datetime.now().strftime('%y%m%d') # %H%M%S
    random_part = generate_random_string(length=3)
    return random_part+separator+timestamp

timestamped_string = generate_timestamped_string()
print('【{0}】'.format(timestamped_string))

【Talk is cheap】

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# 读取数据
users = pd.read_csv('./data/user_loss.csv', encoding='gbk')
info = pd.read_csv('./data/info_new.csv', encoding='utf-8')
print('客户信息表的维数:', users.shape)
print('订单详情表的维数:', info.shape)
客户信息表的维数: (2431, 36)
订单详情表的维数: (6611, 21)
users.head()
USER_IDMYIDACCOUNTNAMEORGANIZE_IDORGANIZE_NAMEDUTY_IDTITLE_IDPASSWORDEMAIL...MODIFYERTELstuNoqqweixinsexpooaddressagetype
03300NaN艾朵a2328330统计17级NaNNaN202cb962ac59075b964b07152d234b1752NaN...NaN186888824522019351NaNNaN广东汕尾广州26非流失
13497NaN艾国真a2525330统计17级NaNNaN202cb962ac59075b964b07152d234b1949NaN...NaN186888826492019548NaNNaN广东汕尾广州43非流失
22863NaN艾锦程a1891330统计17级NaNNaN202cb962ac59075b964b07152d234b1315NaN...NaN186888820152018914NaNNaN广东汕尾佛山58已流失
33006NaN艾穹a2034330统计17级NaNNaN202cb962ac59075b964b07152d234b1458NaN...NaN186888821582019057NaNNaN四川广安广州22已流失
43160NaN艾融乐a2188330统计17级NaNNaN202cb962ac59075b964b07152d234b1612NaN...NaN186888823122019211NaNNaN广东中山广州20非流失

5 rows × 36 columns

info.head()
info_idemp_idnumber_consumersmodedining_table_iddining_table_nameexpendituredishes_countaccounts_payableuse_start_time...lock_timecashier_idpc_idorder_numberorg_idprint_doc_bill_numlock_table_infoorder_statusphonename
0132635564NaN14851006423134232016/2/5 19:08...2016/2/5 19:15NaNNaNNaN330NaNNaN118688882708麻庶汐
1132718747NaN1516103811012911012016/1/4 11:51...2016/1/4 12:09NaNNaNNaN330NaNNaN118688881026濮明智
2132834845NaN15041010437204372016/1/29 13:31...2016/1/29 13:37NaNNaNNaN330NaNNaN118688882636姜萌萌
3132936392NaN1482100325182512016/1/19 12:02...2016/1/19 12:14NaNNaNNaN330NaNNaN118688882791封振翔
4133038352NaN1480100236363632016/7/18 12:35...2016/7/18 12:45NaNNaNNaN330NaNNaN118688882987白子晨

5 rows × 21 columns

# 将时间转为时间格式
users['CREATED'] = pd.to_datetime(users['CREATED'])
info['use_start_time'] = pd.to_datetime(info['use_start_time'])
info['lock_time'] = pd.to_datetime(info['lock_time'])

# 客户流失与年龄的关系
a = users.loc[users['type'] == '已流失', ['age', 'type']]['age'].value_counts().sort_index()
b = users.loc[users['type'] == '非流失', ['age', 'type']]['age'].value_counts().sort_index()
c = users.loc[users['type'] == '准流失', ['age', 'type']]['age'].value_counts().sort_index()

df = pd.DataFrame({'已流失': a.values,
                   '非流失': b.values,
                   '准流失': c.values},
                   index=range(20, 61, 1))
df.head()
已流失非流失准流失
2071125
21242954
22173249
23132548
24152033
plt.rcParams['font.sans-serif'] = 'SimHei'  # 设置中文显示
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(8, 4))
sns.lineplot(data=df)
plt.xlabel('年龄(岁)')
plt.ylabel('客户流失数量(人)')
plt.title('客户流失数量与年龄的关系')
Text(0.5, 1.0, '客户流失数量与年龄的关系')

count1 = pd.DataFrame(users[users['sex']=='男']['type'].value_counts())
count1.columns=['数量(人)']
count2 = pd.DataFrame(users[users['sex']=='女']['type'].value_counts())
count2.columns=['数量(人)']
index1 = count1.index
index2 = count2.index
fig,axes=plt.subplots(1, 2, figsize=(8, 4))
sns.barplot(x=index1, y=count1['数量(人)'], ax=axes[0])
axes[0].set_title('男性客户流失状态数量')
sns.barplot(x=index2, y=count2['数量(人)'], ax=axes[1])
axes[1].set_title('女性客户流失状态数量')
Text(0.5, 1.0, '女性客户流失状态数量')

print('订单详情表重复值个数:',
     info.duplicated(subset=['name', 'use_start_time']).sum())
订单详情表重复值个数: 0
ind = info[info.duplicated(['dining_table_id', 'use_start_time'])].index
print('同一时间桌子被不同人使用的订单:\n',
     info[(info['dining_table_id'] == info.iloc[ind[1], :]['dining_table_id']) & 
     (info['use_start_time'] == info.iloc[ind[1], :]['use_start_time'])]
     [['info_id', 'dining_table_id','use_start_time']])
同一时间桌子被不同人使用的订单:
       info_id  dining_table_id      use_start_time
2052     3392             1484 2016-03-26 21:55:00
2140     3480             1484 2016-03-26 21:55:00
info.drop(index=ind, inplace=True)
info = info.reset_index(drop=True)
print('异常值个数:', len(ind))
print('去除异常值订单详情表维数:', info.shape)
异常值个数: 17
去除异常值订单详情表维数: (6594, 21)
print('客户信息表缺失值个数:', info.isnull().sum().sum())
print('订单详情表缺失值个数:', users.isnull().sum().sum())
客户信息表缺失值个数: 46158
订单详情表缺失值个数: 50842
info.head()
USER_IDnumber_consumersexpenditure
035564423
1187471101
234845437
336392251
438352363
for i in range(len(users)):
    info1 = info.iloc[info[info['name'] == users.iloc[i, 2]].index.tolist(), :]
    if sum(info['name'] == users.iloc[i, 2]) != 0:
        users.iloc[i, 14] = max(info1['use_start_time'])
# 获取订单状态为1的订单
info = info.loc[info['order_status'] == 1, ['emp_id', 'number_consumers', 'expenditure']]
info = info.rename(columns={'emp_id': 'USER_ID'})  # 修改列名
user = users[['USER_ID', 'LAST_VISITS', 'type']]

# 合并两个表
info_user = pd.merge(user, info, left_on='USER_ID', right_on='USER_ID', how='left')
print('合并表缺失值个数:\n', info_user.isnull().sum())
info_user.dropna(inplace=True)

info_user.to_csv('./tmp/info_user.csv', index=False, encoding='utf-8')

print('处理缺失值数据维度:\n', info_user.shape)
合并表缺失值个数:
 USER_ID               0
LAST_VISITS         152
type                  0
number_consumers      4
expenditure           4
dtype: int64
处理缺失值数据维度:
 (6460, 5)
# 构建特征
info_user = pd.read_csv('./tmp/info_user.csv', encoding='utf-8')

# 统计每个客户的用餐次数
info_user1 = info_user['USER_ID'].value_counts()
info_user1 = info_user1.reset_index()
info_user1.columns = ['USER_ID', 'frequence']  # 修改列名

# 求出每个客户的消费总金额
# 分组求和
info_user2 = info_user[['number_consumers',
                        'expenditure']].groupby(info_user['USER_ID']).sum()
info_user2 = info_user2.reset_index()
info_user2.columns = ['USER_ID', 'numbers', 'amount']
# 合并客户的用餐次数和消费总金额
data_new = pd.merge(info_user1, info_user2,
                      left_on='USER_ID', right_on='USER_ID', how='left')

# 提取数据
info_user = info_user.iloc[:, :4]
info_user = info_user.groupby(['USER_ID']).last()
info_user = info_user.reset_index()
# 合并
info_user_new = pd.merge(data_new, info_user,
                      left_on='USER_ID', right_on='USER_ID', how='left')
print(info_user_new.head())

# 求人均消费金额,并保留2为小数
info_user_new['average'] = info_user_new['amount'] / info_user_new['numbers']
info_user_new['average'] = info_user_new['average'].apply(lambda x: '%.2f' % x)

# 计算每个客户最近一次点餐的时间距离观测窗口结束的天数
# 修改时间列,改为日期
info_user_new['LAST_VISITS'] = pd.to_datetime(info_user_new['LAST_VISITS'])
datefinally = pd.to_datetime('2016-7-31')  # 观测窗口结束时间
time = datefinally - info_user_new['LAST_VISITS']
info_user_new['recently'] = time.apply(lambda x: x.days)  # 计算时间差
# 特征选取
info_user_new = info_user_new.loc[:, ['USER_ID', 'frequence',
                             'amount', 'average', 'recently', 'type']]
info_user_new.to_csv('./tmp/info_user_clear.csv', index=False, encoding='gbk')
print(info_user_new.head())
USER_ID  frequence  numbers   amount     LAST_VISITS type  number_consumers
0     2361         41    237.0  34784.0  2016/7/8 11:31  非流失               7.0
1     3478         37    231.0  33570.0  2016/7/7 12:54  非流失               5.0
2     3430         34    224.0  31903.0  2016/7/9 12:53  非流失               5.0
3     3762         33    208.0  30394.0  2016/7/7 12:51  非流失              10.0
4     3307         33    199.0  30400.0  2016/7/9 12:56  非流失               2.0
   USER_ID  frequence   amount average  recently type
0     2361         41  34784.0  146.77        22  非流失
1     3478         37  33570.0  145.32        23  非流失
2     3430         34  31903.0  142.42        21  非流失
3     3762         33  30394.0  146.12        23  非流失
4     3307         33  30400.0  152.76        21  非流失
import pandas as pd
from sklearn.metrics import confusion_matrix

# 自定义评价函数
def test_pre(pred):
    # 混淆矩阵
    hx = confusion_matrix(y_te, pred, labels=['非流失', '准流失'])
    print('混淆矩阵:\n', hx)
    # 精确率
    P = hx[1, 1] / (hx[0, 1] + hx[1, 1])
    print('精确率:', round(P, 3))
    # 召回率
    R = hx[1, 1] / (hx[1, 0] + hx[1, 1])
    print('召回率:', round(R, 3))
    # F1值
    F1 = 2 * P * R / (P + R)
    print('F1值:', round(F1, 3))
# 读取数据
info_user = pd.read_csv('./tmp/info_user_clear.csv', encoding='gbk')
# 删除流失客户
info_user = info_user[info_user['type'] != '已流失']
model_data = info_user.iloc[:, [1, 2, 3, 4, 5]]
# 划分测试集、训练集
from sklearn.model_selection import train_test_split

# 构建模型
from sklearn.tree import DecisionTreeClassifier as DTC
dtc = DTC(random_state=12345)
dtc.fit(x_tr, y_tr)  # 训练模型
pre = dtc.predict(x_te)
# 评价模型
test_pre(pre)

print('真实值:\n', y_te[:10].to_list())
print('预测结果:\n', pre[:10])
混淆矩阵:
 [[159   8]
 [ 17 198]]
精确率: 0.961
召回率: 0.921
F1值: 0.941
真实值:
 ['准流失', '非流失', '准流失', '非流失', '准流失', '准流失', '准流失', '准流失', '准流失', '准流失']
预测结果:
 ['准流失' '非流失' '准流失' '非流失' '准流失' '准流失' '准流失' '准流失' '准流失' '准流失']
from sklearn.svm import LinearSVC
svc = LinearSVC(random_state=123)
svc.fit(x_tr, y_tr)
pre = svc.predict(x_te)
test_pre(pre)

print('真实值:\n', y_te[:10].to_list())
print('预测结果:\n', pre[:10])
混淆矩阵:
 [[113  54]
 [  0 215]]
精确率: 0.799
召回率: 1.0
F1值: 0.888
真实值:
 ['准流失', '非流失', '准流失', '非流失', '准流失', '准流失', '准流失', '准流失', '准流失', '准流失']
预测结果:
 ['准流失' '准流失' '准流失' '非流失' '准流失' '准流失' '准流失' '准流失' '准流失' '准流失']


F:\python3.9\Scripts\venv_pdfgpt\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(

文章来源:https://blog.csdn.net/baidu_22713341/article/details/135223087
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。