数据挖掘目标(价格预测挑战)
2023-12-13 05:46:02
import time import numpy as np import pandas as pd from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.linear_model import Ridge from sklearn.pipeline import FeatureUnion
In [3]:
# Load the training and test sets; both files are tab-separated.
train_data = pd.read_csv('../data/4/train.csv', sep="\t")
test_data = pd.read_csv('../data/4/test.csv',sep='\t')
# Disabled experiment that stacked five copies of each frame. It refers to
# train_data_1 / pre_data_1, which are not defined in the visible code.
# train_data = pd.concat([train_data_1, train_data_1, train_data_1, train_data_1, train_data_1], axis=0)
# pre_data = pd.concat([pre_data_1, pre_data_1, pre_data_1, pre_data_1, pre_data_1], axis=0)
In [5]:
# Print the column names, non-null counts and dtypes of the training frame.
train_data.info()
# Column legend:
#   train_id          - training row id
#   name              - product name
#   item_condition_id - current condition of the item
#   brand_name        - brand name
#   shipping          - whether shipping is free
#   item_description  - product description
#   category_name     - product category
#   price             - product price
<class 'pandas.core.frame.DataFrame'> RangeIndex: 474710 entries, 0 to 474709 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 train_id 474710 non-null int64 1 name 474710 non-null object 2 item_condition_id 474710 non-null int64 3 category_name 472655 non-null object 4 brand_name 272297 non-null object 5 price 474710 non-null float64 6 shipping 474710 non-null int64 7 item_description 474708 non-null object dtypes: float64(1), int64(3), object(4) memory usage: 29.0+ MB
In [7]:
# Stack the training rows on top of the test rows so that both are vectorized
# with one shared vocabulary. Columns that exist in only one frame are filled
# with NaN for the rows of the other frame.
# sort=False is passed explicitly: the two frames do not share all columns, and
# older pandas versions otherwise sort the columns and emit a FutureWarning.
df = pd.concat([train_data, test_data], axis=0, sort=False)
In [6]:
def featureProcessing(df):
    """Prepare the raw listing frame for text vectorization.

    The columns with missing values are all string-typed, so missing entries
    are replaced with a placeholder token, and the integer-coded columns are
    converted to strings so that every remaining column can be tokenized.

    Args:
        df: DataFrame with at least the columns ``category_name``,
            ``brand_name``, ``item_description``, ``shipping`` and
            ``item_condition_id``. The input frame is not modified.

    Returns:
        A new DataFrame without the ``price``, ``test_id`` and ``train_id``
        columns, with the text and categorical columns cleaned as above.
    """
    # Remove the columns that are not used as features. errors='ignore' lets
    # the function also accept a train-only or test-only frame, which lacks
    # some of these columns.
    df = df.drop(['price', 'test_id', 'train_id'], axis=1, errors='ignore')

    # Replace missing values with a placeholder token and make sure every
    # text field really holds strings (the preprocessor calls str methods).
    df['category_name'] = df['category_name'].fillna('MISS').astype(str)
    df['brand_name'] = df['brand_name'].fillna('missing').astype(str)
    df['item_description'] = df['item_description'].fillna('No').astype(str)

    # int -> str so these columns can go through a CountVectorizer too.
    df['shipping'] = df['shipping'].astype(str)
    df['item_condition_id'] = df['item_condition_id'].astype(str)

    return df
In [4]:
# df = pd.concat([train_data, test_data], axis=0)
c:\users\skd621\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. """Entry point for launching an IPython kernel.
In [8]:
# Drop the id/price columns, fill the missing text fields and cast the
# integer-coded columns to strings.
df = featureProcessing(df)
In [10]:
# Regression target is log(1 + price); the predictions are mapped back to the
# price scale with np.expm1 further down.
y_train = np.log1p(train_data['price'])
In [11]:
# Preprocessing callable of a default-configured CountVectorizer; it is reused
# by every per-column preprocessor built with build_preprocessor_1.
default_preprocessor = CountVectorizer().build_preprocessor()
In [12]:
def build_preprocessor_1(field):
    """Build a preprocessor that extracts one column from a row of ``df.values``.

    The position of ``field`` is looked up once, in the module-level ``df``,
    when this factory is called. The returned callable takes a whole row,
    picks the value at that position and runs it through
    ``default_preprocessor``.

    Args:
        field: Name of a column of the module-level ``df``.

    Returns:
        A one-argument callable suitable as a vectorizer ``preprocessor``.

    Raises:
        ValueError: If ``field`` is not one of ``df``'s columns.
    """
    column_position = list(df.columns).index(field)

    def preprocess_row(row):
        return default_preprocessor(row[column_position])

    return preprocess_row
In [13]:
# One sparse feature block per column, concatenated side by side by the
# FeatureUnion. Every transformer receives whole rows; its preprocessor picks
# out the single column it is responsible for.
vectorizer = FeatureUnion([
    # Product name: counts of word 1- and 2-grams, capped at 50000 features.
    ('name', CountVectorizer(
        ngram_range=(1, 2),
        max_features=50000,
        preprocessor=build_preprocessor_1('name'))),
    # Category and brand: '.+' treats the value as a single token (up to any
    # newline), so each distinct value gets its own column.
    ('category_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor_1('category_name'))),
    ('brand_name', CountVectorizer(
        token_pattern='.+',
        preprocessor=build_preprocessor_1('brand_name'))),
    # Integer-coded fields (already cast to str): each run of digits is a token.
    # Raw strings are used because '\d' is an invalid escape sequence in a
    # normal string literal and triggers a warning on recent Python versions.
    ('shipping', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_preprocessor_1('shipping'))),
    ('item_condition_id', CountVectorizer(
        token_pattern=r'\d+',
        preprocessor=build_preprocessor_1('item_condition_id'))),
    # Free-text description: TF-IDF over word 1- to 3-grams, capped at 100000
    # features.
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=100000,
        preprocessor=build_preprocessor_1('item_description'))),
])
In [19]:
# Fit every vectorizer on the combined train+test rows and transform them in
# one pass. The vocabulary and IDF weights therefore also reflect test rows.
X = vectorizer.fit_transform(df.values)

# Number of training rows. df was built by stacking train_data on top of
# test_data, so the first nrow_train rows of X are the training rows.
nrow_train = len(train_data)

# Leading rows -> processed training data, remaining rows -> processed test data.
X_train, X_test = X[:nrow_train], X[nrow_train:]
In [22]:
def ridgeClassify(train_data, train_label):
    """Fit a Ridge model on the given features and targets.

    Despite the name, this trains a ``Ridge`` regressor, not a classifier.

    Args:
        train_data: Feature matrix passed to ``Ridge.fit``.
        train_label: Target values passed to ``Ridge.fit``.

    Returns:
        The fitted ``Ridge`` estimator.
    """
    model = Ridge(
        alpha=0.5,
        fit_intercept=True,
        solver='auto',
        max_iter=500,
        tol=0.05,
    )
    # The original notebook left a ``normalize=False`` argument commented out.

    # Train the model.
    model.fit(train_data, train_label)
    return model
In [24]:
ridgeClf = ridgeClassify(X_train, y_train) # 结果预测 test_price = np.expm1(ridgeClf.predict(X_test))
In [25]:
# Ground-truth test prices, read from the tab-separated label file into a list.
true_price = pd.read_csv("../data/4/label_test.csv", sep="\t").price.tolist()
In [26]:
from sklearn.metrics import mean_squared_log_error
In [27]:
# Mean squared logarithmic error of the predicted prices against the true
# test prices (arguments are passed as true values first, predictions second).
mean_squared_log_error(true_price, test_price)
Out[27]:
0.2398692547251235
In [28]:
def score(predict_label, true_label):
    """Return the mean squared logarithmic error of two value sequences.

    Computes ``mean((log(1 + p) - log(1 + t)) ** 2)`` over paired elements of
    ``predict_label`` and ``true_label``.

    Args:
        predict_label: Sequence of predicted values.
        true_label: Sequence of true values, the same shape as
            ``predict_label``.

    Returns:
        The mean squared logarithmic error, averaged over the first axis.

    Raises:
        ValueError: If the inputs differ in shape or are empty.
    """
    predicted = np.asarray(predict_label, dtype=float)
    actual = np.asarray(true_label, dtype=float)

    # A pairwise loop over zip() would silently drop the unmatched tail and
    # still divide by the full length, so reject mismatched inputs up front.
    if predicted.shape != actual.shape:
        raise ValueError(
            "predict_label and true_label must have the same shape, got "
            f"{predicted.shape} and {actual.shape}"
        )
    if predicted.size == 0:
        raise ValueError("score() requires at least one value")

    # log1p(x) is log(1 + x), evaluated accurately for small x.
    squared_log_error = (np.log1p(predicted) - np.log1p(actual)) ** 2
    return squared_log_error.sum(axis=0) / predicted.shape[0]
文章来源:https://blog.csdn.net/LiYao1103/article/details/134943737
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!